X-Git-Url: http://git.cyclocoop.org/clavettes/images/siteon3.jpg?a=blobdiff_plain;f=maintenance%2Flanguage%2Fzhtable%2FMakefile.py;h=5924c66270e294aaa2cbffcb601ab681f9afc369;hb=bfa365a8ac4abe25a696a4ff337cb26c60748262;hp=4ab57d402ca6a643aa229658581e2c80fa015d71;hpb=4f21e6be661f7305abc223bb31600970a3fb5326;p=lhc%2Fweb%2Fwiklou.git diff --git a/maintenance/language/zhtable/Makefile.py b/maintenance/language/zhtable/Makefile.py index 4ab57d402c..5924c66270 100755 --- a/maintenance/language/zhtable/Makefile.py +++ b/maintenance/language/zhtable/Makefile.py @@ -1,9 +1,13 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # @author Philip -import tarfile as tf -import zipfile as zf -import os, re, shutil, sys, platform +import os +import platform +import re +import shutil +import sys +import tarfile +import zipfile pyversion = platform.python_version() islinux = platform.system().lower() == 'linux' @@ -18,16 +22,18 @@ if pyversion[:3] in ['2.6', '2.7']: if i < 0x10000: return _unichr(i) else: - return _unichr( 0xD7C0 + ( i>>10 ) ) + _unichr( 0xDC00 + ( i & 0x3FF ) ) + return _unichr(0xD7C0 + (i >> 10)) + _unichr(0xDC00 + (i & 0x3FF)) elif pyversion[:2] == '3.': import urllib.request as urllib_request unichr = chr -def unichr2( *args ): - return [unichr( int( i.split('<')[0][2:], 16 ) ) for i in args] -def unichr3( *args ): - return [unichr( int( i[2:7], 16 ) ) for i in args if i[2:7]] +def unichr2(*args): + return [unichr(int(i.split('<')[0][2:], 16)) for i in args] + + +def unichr3(*args): + return [unichr(int(i[2:7], 16)) for i in args if i[2:7]] # DEFINE UNIHAN_VER = '6.3.0' @@ -37,189 +43,201 @@ SCIM_PINYIN_VER = '0.5.92' LIBTABE_VER = '0.2.3' # END OF DEFINE -def download( url, dest ): - if os.path.isfile( dest ): - print( 'File %s is up to date.' % dest ) + +def download(url, dest): + if os.path.isfile(dest): + print('File %s is up to date.' % dest) return global islinux if islinux: - # we use wget instead urlretrieve under Linux, + # we use wget instead urlretrieve under Linux, # because wget could display details like download progress - os.system( 'wget %s -O %s' % ( url, dest ) ) + os.system('wget %s -O %s' % (url, dest)) else: - print( 'Downloading from [%s] ...' % url ) - urllib_request.urlretrieve( url, dest ) - print( 'Download complete.\n' ) + print('Downloading from [%s] ...' % url) + urllib_request.urlretrieve(url, dest) + print('Download complete.\n') return -def uncompress( fp, member, encoding = 'U8' ): - name = member.rsplit( '/', 1 )[-1] - print( 'Extracting %s ...' % name ) - fp.extract( member ) - shutil.move( member, name ) + +def uncompress(fp, member, encoding='U8'): + name = member.rsplit('/', 1)[-1] + print('Extracting %s ...' % name) + fp.extract(member) + shutil.move(member, name) if '/' in member: - shutil.rmtree( member.split( '/', 1 )[0] ) + shutil.rmtree(member.split('/', 1)[0]) if pyversion[:1] in ['2']: - fc = open( name, 'rb', encoding, 'ignore' ) + fc = open(name, 'rb', encoding, 'ignore') else: - fc = open( name, 'r', encoding = encoding, errors = 'ignore' ) + fc = open(name, 'r', encoding=encoding, errors='ignore') return fc unzip = lambda path, member, encoding = 'U8': \ - uncompress( zf.ZipFile( path ), member, encoding ) + uncompress(zipfile.ZipFile(path), member, encoding) untargz = lambda path, member, encoding = 'U8': \ - uncompress( tf.open( path, 'r:gz' ), member, encoding ) + uncompress(tarfile.open(path, 'r:gz'), member, encoding) + -def parserCore( fp, pos, beginmark = None, endmark = None ): +def parserCore(fp, pos, beginmark=None, endmark=None): if beginmark and endmark: start = False - else: start = True + else: + start = True mlist = set() for line in fp: - if beginmark and line.startswith( beginmark ): + if beginmark and line.startswith(beginmark): start = True continue - elif endmark and line.startswith( endmark ): + elif endmark and line.startswith(endmark): break - if start and not line.startswith( '#' ): + if start and not line.startswith('#'): elems = line.split() - if len( elems ) < 2: + if len(elems) < 2: continue - elif len( elems[0] ) > 1 and \ - len( elems[pos] ) > 1: # words only - mlist.add( elems[pos] ) + elif len(elems[0]) > 1 and len(elems[pos]) > 1: # words only + mlist.add(elems[pos]) return mlist -def tablesParser( path, name ): + +def tablesParser(path, name): """ Read file from scim-tables and parse it. """ global SCIM_TABLES_VER - src = 'scim-tables-%s/tables/zh/%s' % ( SCIM_TABLES_VER, name ) - fp = untargz( path, src, 'U8' ) - return parserCore( fp, 1, 'BEGIN_TABLE', 'END_TABLE' ) + src = 'scim-tables-%s/tables/zh/%s' % (SCIM_TABLES_VER, name) + fp = untargz(path, src, 'U8') + return parserCore(fp, 1, 'BEGIN_TABLE', 'END_TABLE') + +ezbigParser = lambda path: tablesParser(path, 'EZ-Big.txt.in') +wubiParser = lambda path: tablesParser(path, 'Wubi.txt.in') +zrmParser = lambda path: tablesParser(path, 'Ziranma.txt.in') -ezbigParser = lambda path: tablesParser( path, 'EZ-Big.txt.in' ) -wubiParser = lambda path: tablesParser( path, 'Wubi.txt.in' ) -zrmParser = lambda path: tablesParser( path, 'Ziranma.txt.in' ) -def phraseParser( path ): +def phraseParser(path): """ Read phrase_lib.txt and parse it. """ global SCIM_PINYIN_VER src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER - dst = 'phrase_lib.txt' - fp = untargz( path, src, 'U8' ) - return parserCore( fp, 0 ) + fp = untargz(path, src, 'U8') + return parserCore(fp, 0) -def tsiParser( path ): + +def tsiParser(path): """ Read tsi.src and parse it. """ src = 'libtabe/tsi-src/tsi.src' - dst = 'tsi.src' - fp = untargz( path, src, 'big5hkscs' ) - return parserCore( fp, 0 ) + fp = untargz(path, src, 'big5hkscs') + return parserCore(fp, 0) + -def unihanParser( path ): +def unihanParser(path): """ Read Unihan_Variants.txt and parse it. """ - fp = unzip( path, 'Unihan_Variants.txt', 'U8' ) + fp = unzip(path, 'Unihan_Variants.txt', 'U8') t2s = dict() s2t = dict() for line in fp: - if line.startswith( '#' ): + if line.startswith('#'): continue else: elems = line.split() - if len( elems ) < 3: + if len(elems) < 3: continue - type = elems.pop( 1 ) - elems = unichr2( *elems ) + type = elems.pop(1) + elems = unichr2(*elems) if type == 'kTraditionalVariant': s2t[elems[0]] = elems[1:] elif type == 'kSimplifiedVariant': t2s[elems[0]] = elems[1:] fp.close() - return ( t2s, s2t ) + return (t2s, s2t) -def applyExcludes( mlist, path ): + +def applyExcludes(mlist, path): """ Apply exclude rules from path to mlist. """ if pyversion[:1] in ['2']: - excludes = open( path, 'rb', 'U8' ).read().split() + excludes = open(path, 'rb', 'U8').read().split() else: - excludes = open( path, 'r', encoding = 'U8' ).read().split() - excludes = [word.split( '#' )[0].strip() for word in excludes] - excludes = '|'.join( excludes ) - excptn = re.compile( '.*(?:%s).*' % excludes ) - diff = [mword for mword in mlist if excptn.search( mword )] - mlist.difference_update( diff ) + excludes = open(path, 'r', encoding='U8').read().split() + excludes = [word.split('#')[0].strip() for word in excludes] + excludes = '|'.join(excludes) + excptn = re.compile('.*(?:%s).*' % excludes) + diff = [mword for mword in mlist if excptn.search(mword)] + mlist.difference_update(diff) return mlist -def charManualTable( path ): - fp = open( path, 'r', encoding = 'U8' ) - for line in fp: - elems = line.split( '#' )[0].split( '|' ) - elems = unichr3( *elems ) - if len( elems ) > 1: - yield elems[0], elems[1:] - -def toManyRules( src_table ): + +def charManualTable(path): + fp = open(path, 'r', encoding='U8') + for line in fp: + elems = line.split('#')[0].split('|') + elems = unichr3(*elems) + if len(elems) > 1: + yield elems[0], elems[1:] + + +def toManyRules(src_table): tomany = set() if pyversion[:1] in ['2']: - for ( f, t ) in src_table.iteritems(): - for i in range( 1, len( t ) ): - tomany.add( t[i] ) + for (f, t) in src_table.iteritems(): + for i in range(1, len(t)): + tomany.add(t[i]) else: - for ( f, t ) in src_table.items(): - for i in range( 1, len( t ) ): - tomany.add( t[i] ) + for (f, t) in src_table.items(): + for i in range(1, len(t)): + tomany.add(t[i]) return tomany -def removeRules( path, table ): - fp = open( path, 'r', encoding = 'U8' ) + +def removeRules(path, table): + fp = open(path, 'r', encoding='U8') texc = list() for line in fp: - elems = line.split( '=>' ) + elems = line.split('=>') f = t = elems[0].strip() - if len( elems ) == 2: + if len(elems) == 2: t = elems[1].strip() f = f.strip('"').strip("'") t = t.strip('"').strip("'") if f: try: - table.pop( f ) + table.pop(f) except: pass if t: - texc.append( t ) - texcptn = re.compile( '^(?:%s)$' % '|'.join( texc ) ) + texc.append(t) + texcptn = re.compile('^(?:%s)$' % '|'.join(texc)) if pyversion[:1] in ['2']: for (tmp_f, tmp_t) in table.copy().iteritems(): - if texcptn.match( tmp_t ): - table.pop( tmp_f ) + if texcptn.match(tmp_t): + table.pop(tmp_f) else: for (tmp_f, tmp_t) in table.copy().items(): - if texcptn.match( tmp_t ): - table.pop( tmp_f ) + if texcptn.match(tmp_t): + table.pop(tmp_f) return table -def customRules( path ): - fp = open( path, 'r', encoding = 'U8' ) + +def customRules(path): + fp = open(path, 'r', encoding='U8') ret = dict() for line in fp: - line = line.rstrip( '\r\n' ) + line = line.rstrip('\r\n') if '#' in line: - line = line.split( '#' )[0].rstrip() - elems = line.split( '\t' ) - if len( elems ) > 1: + line = line.split('#')[0].rstrip() + elems = line.split('\t') + if len(elems) > 1: ret[elems[0]] = elems[1] return ret -def dictToSortedList( src_table, pos ): - return sorted( src_table.items(), key = lambda m: ( m[pos], m[1 - pos] ) ) -def translate( text, conv_table ): +def dictToSortedList(src_table, pos): + return sorted(src_table.items(), key=lambda m: (m[pos], m[1 - pos])) + + +def translate(text, conv_table): i = 0 - while i < len( text ): - for j in range( len( text ) - i, 0, -1 ): + while i < len(text): + for j in range(len(text) - i, 0, -1): f = text[i:][:j] - t = conv_table.get( f ) + t = conv_table.get(f) if t: text = text[:i] + t + text[i:][j:] i += len(t) - 1 @@ -227,42 +245,44 @@ def translate( text, conv_table ): i += 1 return text -def manualWordsTable( path, conv_table, reconv_table ): - fp = open( path, 'r', encoding = 'U8' ) + +def manualWordsTable(path, conv_table, reconv_table): + fp = open(path, 'r', encoding='U8') reconv_table = {} - wordlist = [line.split( '#' )[0].strip() for line in fp] - wordlist = list( set( wordlist ) ) - wordlist.sort( key = lambda w: ( len(w), w ), reverse = True ) + wordlist = [line.split('#')[0].strip() for line in fp] + wordlist = list(set(wordlist)) + wordlist.sort(key=lambda w: (len(w), w), reverse=True) while wordlist: word = wordlist.pop() - new_word = translate( word, conv_table ) - rcv_word = translate( word, reconv_table ) + new_word = translate(word, conv_table) + rcv_word = translate(word, reconv_table) if word != rcv_word: reconv_table[word] = word reconv_table[new_word] = word return reconv_table -def defaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ): - wordlist = list( src_wordlist ) - wordlist.sort( key = lambda w: ( len(w), w ), reverse = True ) + +def defaultWordsTable(src_wordlist, src_tomany, char_conv_table, + char_reconv_table): + wordlist = list(src_wordlist) + wordlist.sort(key=lambda w: (len(w), w), reverse=True) word_conv_table = {} word_reconv_table = {} conv_table = char_conv_table.copy() reconv_table = char_reconv_table.copy() - tomanyptn = re.compile( '(?:%s)' % '|'.join( src_tomany ) ) + tomanyptn = re.compile('(?:%s)' % '|'.join(src_tomany)) while wordlist: - conv_table.update( word_conv_table ) - reconv_table.update( word_reconv_table ) + conv_table.update(word_conv_table) + reconv_table.update(word_reconv_table) word = wordlist.pop() - new_word_len = word_len = len( word ) + new_word_len = word_len = len(word) while new_word_len == word_len: - add = False - test_word = translate( word, reconv_table ) - new_word = translate( word, conv_table ) - if not reconv_table.get( new_word ) \ - and ( test_word != word \ - or ( tomanyptn.search( word ) \ - and word != translate( new_word, reconv_table ) ) ): + test_word = translate(word, reconv_table) + new_word = translate(word, conv_table) + if not reconv_table.get(new_word) and \ + (test_word != word or + (tomanyptn.search(word) and + word != translate(new_word, reconv_table))): word_conv_table[word] = new_word word_reconv_table[new_word] = word try: @@ -272,109 +292,117 @@ def defaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_ta new_word_len = len(word) return word_reconv_table -def PHPArray( table ): + +def PHPArray(table): lines = ['\'%s\' => \'%s\',' % (f, t) for (f, t) in table if f and t] return '\n'.join(lines) + def main(): - #Get Unihan.zip: + # Get Unihan.zip: url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER han_dest = 'Unihan-%s.zip' % UNIHAN_VER - download( url, han_dest ) - + download(url, han_dest) + + sfurlbase = 'http://%s.dl.sourceforge.net/sourceforge/' % SF_MIRROR + # Get scim-tables-$(SCIM_TABLES_VER).tar.gz: - url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER ) + url = sfurlbase + 'scim/scim-tables-%s.tar.gz' % SCIM_TABLES_VER tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER - download( url, tbe_dest ) - + download(url, tbe_dest) + # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz: - url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER ) + url = sfurlbase + 'scim/scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER - download( url, pyn_dest ) - + download(url, pyn_dest) + # Get libtabe-$(LIBTABE_VER).tgz: - url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER ) + url = sfurlbase + 'libtabe/libtabe-%s.tgz' % LIBTABE_VER lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER - download( url, lbt_dest ) - + download(url, lbt_dest) + # Unihan.txt - ( t2s_1tomany, s2t_1tomany ) = unihanParser( han_dest ) + (t2s_1tomany, s2t_1tomany) = unihanParser(han_dest) + + t2s_1tomany.update(charManualTable('symme_supp.manual')) + t2s_1tomany.update(charManualTable('trad2simp.manual')) + s2t_1tomany.update((t[0], [f]) for (f, t) in charManualTable('symme_supp.manual')) + s2t_1tomany.update(charManualTable('simp2trad.manual')) - t2s_1tomany.update( charManualTable( 'symme_supp.manual' ) ) - t2s_1tomany.update( charManualTable( 'trad2simp.manual' ) ) - s2t_1tomany.update( ( t[0], [f] ) for ( f, t ) in charManualTable( 'symme_supp.manual' ) ) - s2t_1tomany.update( charManualTable( 'simp2trad.manual' ) ) - if pyversion[:1] in ['2']: - t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.iteritems()] ) - s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.iteritems()] ) + t2s_1to1 = dict([(f, t[0]) for (f, t) in t2s_1tomany.iteritems()]) + s2t_1to1 = dict([(f, t[0]) for (f, t) in s2t_1tomany.iteritems()]) else: - t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.items()] ) - s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.items()] ) - - s_tomany = toManyRules( t2s_1tomany ) - t_tomany = toManyRules( s2t_1tomany ) + t2s_1to1 = dict([(f, t[0]) for (f, t) in t2s_1tomany.items()]) + s2t_1to1 = dict([(f, t[0]) for (f, t) in s2t_1tomany.items()]) + + s_tomany = toManyRules(t2s_1tomany) + t_tomany = toManyRules(s2t_1tomany) # noconvert rules - t2s_1to1 = removeRules( 'trad2simp_noconvert.manual', t2s_1to1 ) - s2t_1to1 = removeRules( 'simp2trad_noconvert.manual', s2t_1to1 ) - + t2s_1to1 = removeRules('trad2simp_noconvert.manual', t2s_1to1) + s2t_1to1 = removeRules('simp2trad_noconvert.manual', s2t_1to1) + # the supper set for word to word conversion t2s_1to1_supp = t2s_1to1.copy() s2t_1to1_supp = s2t_1to1.copy() - t2s_1to1_supp.update( customRules( 'trad2simp_supp_set.manual' ) ) - s2t_1to1_supp.update( customRules( 'simp2trad_supp_set.manual' ) ) - + t2s_1to1_supp.update(customRules('trad2simp_supp_set.manual')) + s2t_1to1_supp.update(customRules('simp2trad_supp_set.manual')) + # word to word manual rules - t2s_word2word_manual = manualWordsTable( 'simpphrases.manual', s2t_1to1_supp, t2s_1to1_supp ) - t2s_word2word_manual.update( customRules( 'toSimp.manual' ) ) - s2t_word2word_manual = manualWordsTable( 'tradphrases.manual', t2s_1to1_supp, s2t_1to1_supp ) - s2t_word2word_manual.update( customRules( 'toTrad.manual' ) ) + t2s_word2word_manual = manualWordsTable('simpphrases.manual', + s2t_1to1_supp, t2s_1to1_supp) + t2s_word2word_manual.update(customRules('toSimp.manual')) + s2t_word2word_manual = manualWordsTable('tradphrases.manual', + t2s_1to1_supp, s2t_1to1_supp) + s2t_word2word_manual.update(customRules('toTrad.manual')) # word to word rules from input methods t_wordlist = set() s_wordlist = set() - t_wordlist.update( ezbigParser( tbe_dest ), - tsiParser( lbt_dest ) ) - s_wordlist.update( wubiParser( tbe_dest ), - zrmParser( tbe_dest ), - phraseParser( pyn_dest ) ) + t_wordlist.update(ezbigParser(tbe_dest), + tsiParser(lbt_dest)) + s_wordlist.update(wubiParser(tbe_dest), + zrmParser(tbe_dest), + phraseParser(pyn_dest)) # exclude - s_wordlist = applyExcludes( s_wordlist, 'simpphrases_exclude.manual' ) - t_wordlist = applyExcludes( t_wordlist, 'tradphrases_exclude.manual' ) + s_wordlist = applyExcludes(s_wordlist, 'simpphrases_exclude.manual') + t_wordlist = applyExcludes(t_wordlist, 'tradphrases_exclude.manual') s2t_supp = s2t_1to1_supp.copy() - s2t_supp.update( s2t_word2word_manual ) + s2t_supp.update(s2t_word2word_manual) t2s_supp = t2s_1to1_supp.copy() - t2s_supp.update( t2s_word2word_manual ) + t2s_supp.update(t2s_word2word_manual) # parse list to dict - t2s_word2word = defaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp ) - t2s_word2word.update( t2s_word2word_manual ) - s2t_word2word = defaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp ) - s2t_word2word.update( s2t_word2word_manual ) - + t2s_word2word = defaultWordsTable(s_wordlist, s_tomany, + s2t_1to1_supp, t2s_supp) + t2s_word2word.update(t2s_word2word_manual) + s2t_word2word = defaultWordsTable(t_wordlist, t_tomany, + t2s_1to1_supp, s2t_supp) + s2t_word2word.update(s2t_word2word_manual) + # Final tables # sorted list toHans if pyversion[:1] in ['2']: - t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.iteritems() if f != t] ) + t2s_1to1 = dict([(f, t) for (f, t) in t2s_1to1.iteritems() if f != t]) else: - t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.items() if f != t] ) - toHans = dictToSortedList( t2s_1to1, 0 ) + dictToSortedList( t2s_word2word, 1 ) + t2s_1to1 = dict([(f, t) for (f, t) in t2s_1to1.items() if f != t]) + toHans = dictToSortedList(t2s_1to1, 0) + dictToSortedList(t2s_word2word, 1) # sorted list toHant if pyversion[:1] in ['2']: - s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.iteritems() if f != t] ) + s2t_1to1 = dict([(f, t) for (f, t) in s2t_1to1.iteritems() if f != t]) else: - s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.items() if f != t] ) - toHant = dictToSortedList( s2t_1to1, 0 ) + dictToSortedList( s2t_word2word, 1 ) + s2t_1to1 = dict([(f, t) for (f, t) in s2t_1to1.items() if f != t]) + toHant = dictToSortedList(s2t_1to1, 0) + dictToSortedList(s2t_word2word, 1) # sorted list toCN - toCN = dictToSortedList( customRules( 'toCN.manual' ), 1 ) + toCN = dictToSortedList(customRules('toCN.manual'), 1) # sorted list toHK - toHK = dictToSortedList( customRules( 'toHK.manual' ), 1 ) + toHK = dictToSortedList(customRules('toHK.manual'), 1) # sorted list toTW - toTW = dictToSortedList( customRules( 'toTW.manual' ), 1 ) - + toTW = dictToSortedList(customRules('toTW.manual'), 1) + # Get PHP Array php = '''