#!/usr/bin/env python
# -*- coding: utf-8 -*-
# 1/26/04 (Mon)
#   I made some assumptions on cgi program character coding.
#     - *.html files should be in iso-2022-jp
#     - *.cgi files should be in euc-jp
#     - http output in iso-2022-jp.
#   This may cause a difficulty when I try to devise a cgi tool
#   that handles the html files. Hmmmm
# 2/1/04 (Sun) 1.0 released.
# 2/6/04 (Fri) 1.01 exception for included
# 4/1/04 (Thu) 1.02 removed exception, change order of menu, added home.
# 4/4/04 (Sun) 1.03 removed most of the redundant variables
#                   relaxed the requirement for being heading
# 2007-04-28 (Sat): 1.04 japanese.iso- => .iso-
# 2007-04-28 (Sat): 1.05 insert "search" to navi menu div
# 2010-03-20 (Sat): utf-8 output
"""Re-encode one HTML file to UTF-8 and regenerate its navigation bars.

Usage: script.py FILE > converted.html

The file named by the first argument is read, its old navigation bars are
dropped, runs of blank lines are collapsed, and the result is written to
standard output encoded as ``CODING``.  A navigation bar is written after
every anchor line whose anchor was collected in front of a heading.  A short
report is appended to ``conversion.log`` in the current directory.

This is Python 2 code (it relies on the ``unicode`` builtin).

NOTE(review): several string literals below look truncated -- they appear to
have lost the HTML markup they once contained -- and ``navi_end`` is never
assigned.  They are reproduced exactly as found and marked individually;
they must be restored from a pristine copy before the script can process a
non-empty file.
"""

import sys, re, string, datetime

#CODING = "euc-jp"
CODING = "utf-8"


def main(argv):
    """Convert the HTML file named by ``argv[1]`` and write it to stdout.

    argv -- command-line style list: ``argv[0]`` is the program name and
            ``argv[1]`` the path of the HTML file to convert.

    Exits with status 1 (``SystemExit``) when no file name is given, when the
    input file or ``conversion.log`` cannot be opened, or when a line cannot
    be decoded with the encoding in effect.
    """
    if len(argv) < 2:
        sys.stderr.write("usage: python <script> FILE.html\n")
        sys.exit(1)

    try:
        fs = open(argv[1])
    except IOError:
        sys.stdout.write("Can't open '" + argv[1] + "'\n")
        sys.exit(1)
    try:
        lines = fs.readlines()
    finally:
        fs.close()

    log_name = "conversion.log"
    try:
        fd = open(log_name, "a")
    except IOError:
        # Report the name that was actually opened.
        sys.stdout.write("Can't open '%s'\n" % log_name)
        sys.exit(1)

    # try/finally so the log is closed on every path, including sys.exit().
    try:
        fd.write("\n%s: %s\n" % (datetime.date.today().isoformat(), argv[1]))

        # ---- pass 1: fix references, decode, strip the old nav bars ----
        # A tag whose charset attribute is euc-jp.
        charset_patt = re.compile(r'<[^>]*charset\s?=\s?euc-jp\s?"\s?>')
        # NOTE(review): the start of this pattern looks truncated.
        css_patt = re.compile(r']* href="style/default_u.css".*>')
        # NOTE(review): empty pattern (matches every line); presumably it
        # once matched the menu_div_u.html reference -- confirm.
        menu_patt = re.compile('')
        # NOTE(review): empty pattern (matches every line), and ``navi_end``
        # used below is not assigned anywhere in this file; both need to be
        # restored together.
        navi_start = re.compile('')

        nonav_lines = []
        in_navi = False
        # Encoding used to decode the input.  It switches to euc-jp only once
        # the charset declaration is seen, so earlier lines are decoded as
        # utf-8.
        s_encode = 'utf-8'
        for line in lines:
            if charset_patt.search(line):
                line = line.replace('euc-jp', 'utf-8')
                s_encode = 'euc-jp'
            if css_patt.search(line):
                line = line.replace('default_u.css', 'default.css')
                fd.write("default_u to default\n")
            if menu_patt.search(line):
                line = line.replace('menu_div_u.html', 'menu_div.html')

            try:
                uline = unicode(line, s_encode)  ## 2007-09-23 (Sun):
            except UnicodeDecodeError:
                fd.write('DecodeError: in "%s"\n' % line)
                sys.exit(1)

            if navi_start.search(uline):
                in_navi = True
            if in_navi:
                # Lines of an old nav bar, its closing line included, are
                # dropped.
                if navi_end.search(uline):
                    in_navi = False
                continue

            # 4/4/04 (Sun) remove succeeding blank lines.  The emptiness
            # check on nonav_lines keeps a leading blank line from indexing
            # into an empty list.
            if len(uline) <= 1 and nonav_lines and len(nonav_lines[-1]) <= 1:
                continue
            nonav_lines.append(uline)

        fd.write("coding was '%s', and being converted to utf-8\n" % s_encode)

        # ---- pass 2: collect the anchors that precede a heading ----
        # NOTE(review): this pattern has no capture group although group(1)
        # is read from its matches below -- the literal looks truncated.
        anch_patt = re.compile(r'\s*')
        # 4/4/04 (Sun) relaxed the heading match: h3 only (the old pattern
        # kept below) became h2 or h3.
        # subj_patt = re.compile('<[Hh]3\s*.*>\s*(.*)\s*', re.DOTALL)
        subj_start = re.compile(r'<[Hh][23]\s*.*>')
        # NOTE(review): empty pattern (matches every line) -- looks truncated.
        subj_end = re.compile('')

        # NOTE(review): the nesting of this loop was reconstructed from a
        # flattened copy of the file; confirm against the original layout.
        anchors = []
        anchor = ""
        in_subj = False
        for uline in nonav_lines:
            anch_ob = anch_patt.search(uline)
            if anch_ob:  # if found
                anchor = anch_ob.group(1)
                continue
            if subj_start.search(uline):
                in_subj = True
            if in_subj and subj_end.search(uline):
                # modified to ignore consecutive headings 4/4/04 (Sun):
                # the pending anchor is cleared once a heading ends, so a
                # following heading without its own anchor records nothing.
                if anchor:
                    anchors.append(anchor)
                in_subj = False
                anchor = ""

        # NOTE(review): both templates look truncated -- nav_form is filled
        # with two values below but contains no format placeholders.
        # nav_form_bak is not referenced anywhere in this function.
        nav_form_bak = u'''\n'''
        nav_form = u'''\n'''

        # ---- pass 3: emit the page, adding a nav bar after each anchor ----
        anchlen = len(anchors)
        for line in nonav_lines:
            sys.stdout.write(line.encode(CODING))
            anch_ob = anch_patt.search(line)
            if not anch_ob:
                continue
            anchor_key = anch_ob.group(1)
            sys.stderr.write(anchor_key + "\n")  ##
            try:
                ia = anchors.index(anchor_key)
            except ValueError:
                sys.stderr.write(anchor_key + "\n")  ##
                # if there is no corresponding heading, no navigator is put
                continue
            # First value: the following anchor (wrapping to the first one
            # after the last); second value: the preceding anchor (index -1
            # wraps to the last one).
            navbar = nav_form % (anchors[(ia + 1) % anchlen], anchors[ia - 1])
            sys.stdout.write(navbar.encode(CODING))
    finally:
        fd.close()


if __name__ == "__main__":
    main(sys.argv)