#!/usr/bin/python
# -*- coding: utf-8 -*-
# @author Philip
import tarfile as tf
import zipfile as zf
import os, re, shutil, sys, platform

pyversion = platform.python_version()
islinux = platform.system().lower() == 'linux'

if pyversion[:3] in ['2.6', '2.7']:
    import urllib as urllib_request
    import codecs
    open = codecs.open
    _unichr = unichr
    if sys.maxunicode < 0x10000:
        # Narrow build: emulate a wide unichr() by returning the UTF-16
        # surrogate pair for code points beyond the BMP.
        def unichr( i ):
            if i < 0x10000:
                return _unichr( i )
            else:
                return _unichr( 0xD7C0 + ( i >> 10 ) ) + _unichr( 0xDC00 + ( i & 0x3FF ) )
elif pyversion[:2] == '3.':
    import urllib.request as urllib_request
    import codecs
    # The positional open( name, mode, encoding, errors ) calls below match
    # codecs.open(), not the Python 3 builtin, so alias it here as well.
    open = codecs.open
    unichr = chr

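# Quick sanity check for the shim above (hypothetical REPL session, narrow
# build only): unichr( 0x20000 ) == u'\ud840\udc00', the UTF-16 surrogate
# pair that represents U+20000 in a 16-bit unicode string.
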
def unichr2( *args ):
    # Turn Unihan-style tokens such as 'U+4E07' (optionally followed by a
    # '<source' tag) into unicode characters.
    return [unichr( int( i.split('<')[0][2:], 16 ) ) for i in args]

def unichr3( *args ):
    # Same, but read at most five hex digits and skip empty tokens.
    return [unichr( int( i[2:7], 16 ) ) for i in args if i[2:7]]

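# Illustrative values (tokens follow the Unihan 'U+XXXX' convention; the
# exact strings here are hypothetical):
#   unichr2( 'U+4E07<kTraditionalVariant' )  ->  [u'万']
#   unichr3( 'U+20000' )                     ->  [u'\U00020000']
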
# DEFINE
SF_MIRROR = 'easynews'
SCIM_TABLES_VER = '0.5.9'
SCIM_PINYIN_VER = '0.5.91'
LIBTABE_VER = '0.2.3'
# END OF DEFINE

def download( url, dest ):
    if os.path.isfile( dest ):
        print( 'File %s up to date.' % dest )
        return
    global islinux
    if islinux:
        # Use wget instead of urlretrieve under Linux, because wget
        # can display details such as download progress.
        os.system( 'wget %s' % url )
    else:
        print( 'Downloading from [%s] ...' % url )
        urllib_request.urlretrieve( url, dest )
        print( 'Download complete.\n' )
    return

def uncompress( fp, member, encoding = 'U8' ):
    name = member.rsplit( '/', 1 )[-1]
    print( 'Extracting %s ...' % name )
    fp.extract( member )
    shutil.move( member, name )
    if '/' in member:
        shutil.rmtree( member.split( '/', 1 )[0] )
    return open( name, 'rb', encoding, 'ignore' )

unzip = lambda path, member, encoding = 'U8': \
    uncompress( zf.ZipFile( path ), member, encoding )

untargz = lambda path, member, encoding = 'U8': \
    uncompress( tf.open( path, 'r:gz' ), member, encoding )

def parserCore( fp, pos, beginmark = None, endmark = None ):
    if beginmark and endmark:
        start = False
    else:
        start = True
    mlist = set()
    for line in fp:
        if beginmark and line.startswith( beginmark ):
            start = True
            continue
        elif endmark and line.startswith( endmark ):
            break
        if start and not line.startswith( '#' ):
            elems = line.split()
            if len( elems ) < 2:
                continue
            elif len( elems[0] ) > 1:
                mlist.add( elems[pos] )
    return mlist

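# parserCore() collects one whitespace-separated column from a table file.
# For a scim-tables fragment like the (assumed) format below,
#   BEGIN_TABLE
#   wanshi  万事  536870912
#   END_TABLE
# parserCore( fp, 1, 'BEGIN_TABLE', 'END_TABLE' ) yields set([u'万事']);
# rows whose first column is a single character are skipped.
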
def tablesParser( path, name ):
    """ Read a file from scim-tables and parse it. """
    global SCIM_TABLES_VER
    src = 'scim-tables-%s/tables/zh/%s' % ( SCIM_TABLES_VER, name )
    fp = untargz( path, src, 'U8' )
    return parserCore( fp, 1, 'BEGIN_TABLE', 'END_TABLE' )

ezbigParser = lambda path: tablesParser( path, 'EZ-Big.txt.in' )
wubiParser = lambda path: tablesParser( path, 'Wubi.txt.in' )
zrmParser = lambda path: tablesParser( path, 'Ziranma.txt.in' )

def phraseParser( path ):
    """ Read phrase_lib.txt and parse it. """
    global SCIM_PINYIN_VER
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    fp = untargz( path, src, 'U8' )
    return parserCore( fp, 0 )

def tsiParser( path ):
    """ Read tsi.src and parse it. """
    src = 'libtabe/tsi-src/tsi.src'
    fp = untargz( path, src, 'big5hkscs' )
    return parserCore( fp, 0 )

def unihanParser( path ):
    """ Read Unihan_Variants.txt and parse it. """
    fp = unzip( path, 'Unihan_Variants.txt', 'U8' )
    t2s = dict()
    s2t = dict()
    for line in fp:
        if line.startswith( '#' ):
            continue
        elems = line.split()
        if len( elems ) < 3:
            continue
        vtype = elems.pop( 1 )    # renamed from 'type' to avoid shadowing the builtin
        elems = unichr2( *elems )
        if vtype == 'kTraditionalVariant':
            s2t[elems[0]] = elems[1:]
        elif vtype == 'kSimplifiedVariant':
            t2s[elems[0]] = elems[1:]
    fp.close()
    return ( t2s, s2t )

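# Unihan_Variants.txt entries are tab-separated (value lists may also carry
# '<source' tags, which unichr2() strips), e.g.:
#   U+4E07	kTraditionalVariant	U+842C
# The loop above turns that line into s2t[u'万'] = [u'萬'].
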
def applyExcludes( mlist, path ):
    """ Apply exclude rules from path to mlist. """
    excludes = open( path, 'rb', 'U8' ).read().split()
    excludes = [word.split( '#' )[0].strip() for word in excludes]
    # skip empty tokens so a stray comment marker cannot yield an empty
    # alternative that matches every phrase
    excludes = '|'.join( [e for e in excludes if e] )
    excptn = re.compile( '.*(?:%s).*' % excludes )
    diff = [mword for mword in mlist if excptn.search( mword )]
    mlist.difference_update( diff )
    return mlist

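# E.g. an exclude file containing the (hypothetical) pattern u'甚么' removes
# from mlist every phrase that contains u'甚么' as a substring.
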
def charManualTable( path ):
    fp = open( path, 'rb', 'U8' )
    ret = {}
    for line in fp:
        elems = line.split( '#' )[0].split( '|' )
        elems = unichr3( *elems )
        if len( elems ) > 1:
            ret[elems[0]] = elems[1:]
    return ret

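# Manual character tables are assumed to hold '|'-separated code points with
# optional '#' comments, first the source, then its variants in preference
# order, e.g. a hypothetical line:
#   U+4E7E|U+5E72          # maps u'乾' -> [u'干']
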
def toManyRules( src_table ):
    tomany = set()
    # items() rather than iteritems() so the script also runs on Python 3
    for ( f, t ) in src_table.items():
        for i in range( 1, len( t ) ):
            tomany.add( t[i] )
    return tomany

def removeRules( path, table ):
    fp = open( path, 'rb', 'U8' )
    texc = list()
    for line in fp:
        elems = line.split( '=>' )
        f = t = elems[0].strip()
        if len( elems ) == 2:
            t = elems[1].strip()
        f = f.strip('"').strip("'")
        t = t.strip('"').strip("'")
        if f:
            try:
                table.pop( f )
            except KeyError:
                pass
        if t:
            texc.append( t )
    texcptn = re.compile( '^(?:%s)$' % '|'.join( texc ) )
    for ( tmp_f, tmp_t ) in table.copy().items():
        if texcptn.match( tmp_t ):
            table.pop( tmp_f )
    return table

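# Rule files for removeRules() are assumed to hold one rule per line:
#   "后"           -- drop the mapping for u'后' and any rule targeting u'后'
#   "后" => "後"   -- drop the mapping for u'后' and any rule targeting u'後'
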
def customRules( path ):
    fp = open( path, 'rb', 'U8' )
    ret = dict()
    for line in fp:
        elems = line.split( '#' )[0].split()
        if len( elems ) > 1:
            ret[elems[0]] = elems[1]
    return ret

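# Custom rule files are assumed to be two whitespace-separated columns with
# optional '#' comments, e.g. a hypothetical line:
#   裏面 里面    # adds ret[u'裏面'] = u'里面'
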
def dictToSortedList( src_table, pos ):
    # Sort the (from, to) pairs by source (pos 0) or target (pos 1) string.
    return sorted( src_table.items(), key = lambda m: m[pos] )

def translate( text, conv_table ):
    i = 0
    while i < len( text ):
        # Greedy longest match: try the longest substring starting at i first.
        for j in range( len( text ) - i, 0, -1 ):
            f = text[i:][:j]
            t = conv_table.get( f )
            if t:
                text = text[:i] + t + text[i:][j:]
                i += len( t ) - 1
                break
        i += 1
    return text

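# Worked example with a hypothetical table: given
#   conv_table = { u'里面': u'裏面', u'里': u'裡' }
# translate( u'里面村', conv_table ) returns u'裏面村' -- at each position the
# longest matching key wins, so u'里面' is converted before u'里' is tried.
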
def manualWordsTable( path, conv_table, reconv_table ):
    fp = open( path, 'rb', 'U8' )
    # Note: the incoming reconv_table is immediately replaced by a fresh
    # dict, so only the manual word list feeds the result.
    reconv_table = {}
    wordlist = [line.split( '#' )[0].strip() for line in fp]
    wordlist = list( set( wordlist ) )
    wordlist.sort( key = len, reverse = True )
    while wordlist:
        word = wordlist.pop()
        new_word = translate( word, conv_table )
        rcv_word = translate( word, reconv_table )
        if word != rcv_word:
            reconv_table[word] = word
        reconv_table[new_word] = word
    return reconv_table

def defaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ):
    wordlist = list( src_wordlist )
    wordlist.sort( key = len, reverse = True )
    word_conv_table = {}
    word_reconv_table = {}
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    tomanyptn = re.compile( '(?:%s)' % '|'.join( src_tomany ) )
    while wordlist:
        conv_table.update( word_conv_table )
        reconv_table.update( word_reconv_table )
        word = wordlist.pop()
        new_word_len = word_len = len( word )
        # Process the run of words sharing this length with the tables
        # frozen, so same-length words cannot influence each other.
        while new_word_len == word_len:
            test_word = translate( word, reconv_table )
            new_word = translate( word, conv_table )
            if not reconv_table.get( new_word ) \
                    and ( test_word != word \
                    or ( tomanyptn.search( word ) \
                    and word != translate( new_word, reconv_table ) ) ):
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
            try:
                word = wordlist.pop()
            except IndexError:
                break
            new_word_len = len( word )
    return word_reconv_table

def PHPArray( table ):
    lines = ['\'%s\' => \'%s\',' % ( f, t ) for ( f, t ) in table if f and t]
    return '\n'.join( lines )

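# PHPArray() renders (from, to) pairs as the body of a PHP array literal,
# one "'from' => 'to'," line each, e.g.:
#   '里' => '裡',
# main() below wraps these blocks in "$zh2Hant = array( ... );" and friends.
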
def main():
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/UNIDATA/Unihan.zip'
    han_dest = 'Unihan.zip'
    download( url, han_dest )

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER )
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    download( url, tbe_dest )

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER )
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    download( url, pyn_dest )

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER )
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    download( url, lbt_dest )

    # Unihan.txt
    ( t2s_1tomany, s2t_1tomany ) = unihanParser( han_dest )

    t2s_1tomany.update( charManualTable( 'trad2simp.manual' ) )
    s2t_1tomany.update( charManualTable( 'simp2trad.manual' ) )

    t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.items()] )
    s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.items()] )

    s_tomany = toManyRules( t2s_1tomany )
    t_tomany = toManyRules( s2t_1tomany )

    # noconvert rules
    t2s_1to1 = removeRules( 'trad2simp_noconvert.manual', t2s_1to1 )
    s2t_1to1 = removeRules( 'simp2trad_noconvert.manual', s2t_1to1 )

    # the superset for word-to-word conversion
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    t2s_1to1_supp.update( customRules( 'trad2simp_supp_set.manual' ) )
    s2t_1to1_supp.update( customRules( 'simp2trad_supp_set.manual' ) )

    # word-to-word manual rules
    t2s_word2word_manual = manualWordsTable( 'simpphrases.manual', s2t_1to1_supp, t2s_1to1_supp )
    t2s_word2word_manual.update( customRules( 'toSimp.manual' ) )
    s2t_word2word_manual = manualWordsTable( 'tradphrases.manual', t2s_1to1_supp, s2t_1to1_supp )
    s2t_word2word_manual.update( customRules( 'toTrad.manual' ) )

    # word-to-word rules from input methods
    t_wordlist = set()
    s_wordlist = set()
    t_wordlist.update( ezbigParser( tbe_dest ),
                       tsiParser( lbt_dest ) )
    s_wordlist.update( wubiParser( tbe_dest ),
                       zrmParser( tbe_dest ),
                       phraseParser( pyn_dest ) )

    # exclude
    s_wordlist = applyExcludes( s_wordlist, 'simpphrases_exclude.manual' )
    t_wordlist = applyExcludes( t_wordlist, 'tradphrases_exclude.manual' )

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update( s2t_word2word_manual )
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update( t2s_word2word_manual )

    # parse list to dict
    t2s_word2word = defaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp )
    t2s_word2word.update( t2s_word2word_manual )
    s2t_word2word = defaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp )
    s2t_word2word.update( s2t_word2word_manual )

    # Final tables
    # sorted list toHans
    t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.items() if f != t] )
    toHans = dictToSortedList( t2s_1to1, 0 ) + dictToSortedList( t2s_word2word, 1 )
    # sorted list toHant
    s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.items() if f != t] )
    toHant = dictToSortedList( s2t_1to1, 0 ) + dictToSortedList( s2t_word2word, 1 )
    # sorted list toCN
    toCN = dictToSortedList( customRules( 'toCN.manual' ), 1 )
    # sorted list toHK
    toHK = dictToSortedList( customRules( 'toHK.manual' ), 1 )
    # sorted list toSG
    toSG = dictToSortedList( customRules( 'toSG.manual' ), 1 )
    # sorted list toTW
    toTW = dictToSortedList( customRules( 'toTW.manual' ), 1 )

    # Get PHP Array
    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in includes/zhtable/
 * Do not modify directly!
 *
 * @file
 */

$zh2Hant = array(\n'''
    php += PHPArray( toHant ) \
        + '\n);\n\n$zh2Hans = array(\n' \
        + PHPArray( toHans ) \
        + '\n);\n\n$zh2TW = array(\n' \
        + PHPArray( toTW ) \
        + '\n);\n\n$zh2HK = array(\n' \
        + PHPArray( toHK ) \
        + '\n);\n\n$zh2CN = array(\n' \
        + PHPArray( toCN ) \
        + '\n);\n\n$zh2SG = array(\n' \
        + PHPArray( toSG ) \
        + '\n);'

    f = open( 'ZhConversion.php', 'wb', encoding = 'utf8' )
    print( 'Writing ZhConversion.php ... ' )
    f.write( php )
    f.close()

    # Remove temp files
    print( 'Deleting temp files ... ' )
    os.remove( 'EZ-Big.txt.in' )
    os.remove( 'phrase_lib.txt' )
    os.remove( 'tsi.src' )
    os.remove( 'Unihan_Variants.txt' )
    os.remove( 'Wubi.txt.in' )
    os.remove( 'Ziranma.txt.in' )


if __name__ == '__main__':
    main()