2 # -*- coding: utf-8 -*-
4 import tarfile
, zipfile
5 import os
, re
, shutil
, sys
, platform
7 pyversion
= platform
.python_version()
8 if pyversion
[:3] in ['2.5', '2.6', '2.7']:
9 import urllib
as urllib_request
13 if sys
.maxunicode
>= 0x10000 or i
< 0x10000:
16 return unichr(0xD7C0+(i
>>10)) + unichr(0xDC00+(i
&0x3FF))
17 elif pyversion
[:2] == '3.':
18 import urllib
.request
as urllib_request
# Download configuration: which SourceForge mirror to fetch from, and the
# exact upstream release versions of the SCIM data tarballs used below.
SF_MIRROR = 'easynews'       # SourceForge mirror host prefix (http://<mirror>.dl.sourceforge.net/...)
SCIM_TABLES_VER = '0.5.9'    # scim-tables release providing EZ-Big/Wubi/Ziranma tables
SCIM_PINYIN_VER = '0.5.91'   # scim-pinyin release providing phrase_lib.txt
29 def GetFileFromURL( url
, dest
):
30 if os
.path
.isfile(dest
):
31 print( 'File %s up to date.' % dest
)
33 print( 'Downloading from [%s] ...' % url
)
34 urllib_request
.urlretrieve( url
, dest
)
35 print( 'Download complete.\n' )
38 def GetFileFromUnihan( path
):
39 print( 'Extracting files from %s ...' % path
)
40 text
= zipfile
.ZipFile(path
).read('Unihan_Variants.txt')
41 uhfile
= uniopen('Unihan_Variants.txt', 'w')
def GetFileFromTar( path, member, rename ):
    """Extract a single member from a gzipped tarball into the current
    directory, rename it, and remove the extracted directory tree.

    path   -- the .tar.gz archive to read
    member -- archive member path (e.g. 'pkg-1.0/data/file.txt')
    rename -- destination filename for the extracted member
    """
    print( 'Extracting %s from %s ...' % (rename, path) )
    # Open/close explicitly: the original leaked the TarFile handle, and
    # this script still supports Python 2.5/2.6 where TarFile is not a
    # context manager, so use try/finally instead of 'with'.
    tar = tarfile.open(path, 'r:gz')
    try:
        tar.extract(member)
    finally:
        tar.close()
    shutil.move(member, rename)
    # Drop the now-useless top-level directory the member was nested in.
    tree_rmv = member.split('/')[0]
    shutil.rmtree(tree_rmv)
54 def ReadBIG5File( dest
):
55 print( 'Reading and decoding %s ...' % dest
)
56 f1
= uniopen( dest
, 'r', encoding
='big5hkscs', errors
='replace' )
58 text
= text
.replace( '\ufffd', '\n' )
60 f2
= uniopen( dest
, 'w', encoding
='utf8' )
66 print( 'Reading and decoding %s ...' % dest
)
67 f
= uniopen( dest
, 'r', encoding
='utf8' )
72 def ReadUnihanFile( dest
):
73 print( 'Reading and decoding %s ...' % dest
)
74 f
= uniopen( dest
, 'r', encoding
='utf8' )
80 if line
.startswith('#'):
82 elif not line
.find('kSimplifiedVariant') == -1:
83 temp
= line
.split('kSimplifiedVariant')
84 t2s_code
.append( ( temp
[0].strip(), temp
[1].strip() ) )
85 elif not line
.find('kTraditionalVariant') == -1:
86 temp
= line
.split('kTraditionalVariant')
87 s2t_code
.append( ( temp
[0].strip(), temp
[1].strip() ) )
91 return ( t2s_code
, s2t_code
)
93 def RemoveRows( text
, num
):
94 text
= re
.sub( '.*\s*', '', text
, num
)
97 def RemoveOneCharConv( text
):
98 preg
= re
.compile('^.\s*$', re
.MULTILINE
)
99 text
= preg
.sub( '', text
)
def ConvertToChar( code ):
    """Turn a Unihan codepoint token like 'U+4E00' (optionally annotated
    with a trailing '<...' qualifier) into the corresponding character."""
    # Strip any '<...' annotation, then parse the hex digits after 'U+'.
    point = code.split( '<' )[0]
    return unichr2( int( point[2:], 16 ) )
106 def GetDefaultTable( code_table
):
108 for ( f
, t
) in code_table
:
110 from_char
= ConvertToChar( f
)
111 to_chars
= [ConvertToChar( code
) for code
in t
.split()]
112 char_table
[from_char
] = to_chars
115 def GetManualTable( dest
):
116 text
= ReadFile( dest
)
120 elem
= elem
.strip('|')
122 temp2
= elem
.split( '|', 1 )
123 from_char
= unichr2( int( temp2
[0][2:7], 16 ) )
124 to_chars
= [unichr2( int( code
[2:7], 16 ) ) for code
in temp2
[1].split('|')]
125 char_table
[from_char
] = to_chars
128 def GetValidTable( src_table
):
130 for f
, t
in src_table
.items():
131 valid_table
[f
] = t
[0]
134 def GetToManyRules( src_table
):
136 for f
, t
in src_table
.items():
137 for i
in range(1, len(t
)):
138 tomany_table
[t
[i
]] = True
141 def RemoveRules( dest
, table
):
142 text
= ReadFile( dest
)
147 elem
= elem
.strip().replace( '"', '' ).replace( '\'', '' )
149 if elem
.startswith( '=>' ):
150 t
= elem
.replace( '=>', '' ).strip()
151 elif elem
.endswith( '=>' ):
152 f
= elem
.replace( '=>', '' ).strip()
154 temp2
= elem
.split( '=>' )
170 for temp_f
, temp_t
in table
.copy().items():
def DictToSortedList1( src_table ):
    """Return src_table's (key, value) pairs as a list sorted by key."""
    pairs = list( src_table.items() )
    pairs.sort( key = lambda pair: pair[0] )
    return pairs
def DictToSortedList2( src_table ):
    """Return src_table's (key, value) pairs as a list sorted by value."""
    pairs = list( src_table.items() )
    pairs.sort( key = lambda pair: pair[1] )
    return pairs
181 def Converter( string
, conv_table
):
183 while i
< len(string
):
184 for j
in range(len(string
) - i
, 0, -1):
186 t
= conv_table
.get( f
)
188 string
= string
[:i
] + t
+ string
[i
:][j
:]
194 def GetDefaultWordsTable( src_wordlist
, src_tomany
, char_conv_table
, char_reconv_table
):
195 wordlist
= list( set( src_wordlist
) )
196 wordlist
.sort( key
= len, reverse
= True )
198 word_reconv_table
= {}
202 conv_table
.update( word_conv_table
)
203 conv_table
.update( char_conv_table
)
204 reconv_table
.update( word_reconv_table
)
205 reconv_table
.update( char_reconv_table
)
206 word
= wordlist
.pop()
207 new_word_len
= word_len
= len(word
)
208 while new_word_len
== word_len
:
211 rvt_test
= rvt_test
or src_tomany
.get(char
)
212 test_word
= Converter( word
, reconv_table
)
213 new_word
= Converter( word
, conv_table
)
214 if not reconv_table
.get( new_word
):
215 if not test_word
== word
:
216 word_conv_table
[word
] = new_word
217 word_reconv_table
[new_word
] = word
219 rvt_word
= Converter( new_word
, reconv_table
)
220 if not rvt_word
== word
:
221 word_conv_table
[word
] = new_word
222 word_reconv_table
[new_word
] = word
224 word
= wordlist
.pop()
227 new_word_len
= len(word
)
228 return word_reconv_table
230 def GetManualWordsTable( src_wordlist
, conv_table
):
231 src_wordlist
= [items
.split('#')[0].strip() for items
in src_wordlist
]
232 wordlist
= list( set( src_wordlist
) )
233 wordlist
.sort( key
= len, reverse
= True )
236 word
= wordlist
.pop()
237 new_word
= Converter( word
, conv_table
)
238 reconv_table
[new_word
] = word
241 def CustomRules( dest
):
242 text
= ReadFile( dest
)
245 for i
in range( 0, len( temp
), 2 ):
246 ret
[temp
[i
]] = temp
[i
+ 1]
def GetPHPArray( table ):
    """Render an iterable of (from, to) pairs as the body of a PHP array:
    one "'from' => 'to'," entry per line."""
    rendered = []
    for src, dst in table:
        rendered.append( "'%s' => '%s'," % ( src, dst ) )
    return '\n'.join( rendered )
254 def RemoveSameChar( src_table
):
256 for f
, t
in src_table
.items():
263 url
= 'http://www.unicode.org/Public/UNIDATA/Unihan.zip'
264 han_dest
= 'Unihan.zip'
265 GetFileFromURL( url
, han_dest
)
267 # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
268 url
= 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR
, SCIM_TABLES_VER
)
269 tbe_dest
= 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
270 GetFileFromURL( url
, tbe_dest
)
272 # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
273 url
= 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR
, SCIM_PINYIN_VER
)
274 pyn_dest
= 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
275 GetFileFromURL( url
, pyn_dest
)
277 # Get libtabe-$(LIBTABE_VER).tgz:
278 url
= 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR
, LIBTABE_VER
)
279 lbt_dest
= 'libtabe-%s.tgz' % LIBTABE_VER
280 GetFileFromURL( url
, lbt_dest
)
# Extract the needed files from the compressed archives
284 # Unihan.txt Simp. & Trad
285 GetFileFromUnihan( han_dest
)
292 src
= 'scim-tables-%s/tables/zh/EZ-Big.txt.in' % SCIM_TABLES_VER
294 GetFileFromTar( tbe_dest
, src
, dst
)
295 text
= ReadFile( dst
)
296 text
= text
.split( 'BEGIN_TABLE' )[1].strip()
297 text
= text
.split( 'END_TABLE' )[0].strip()
298 text
= re
.sub( '.*\t', '', text
)
299 text
= RemoveOneCharConv( text
)
300 t_wordlist
.extend( text
.split() )
303 src
= 'scim-tables-%s/tables/zh/Wubi.txt.in' % SCIM_TABLES_VER
305 GetFileFromTar( tbe_dest
, src
, dst
)
306 text
= ReadFile( dst
)
307 text
= text
.split( 'BEGIN_TABLE' )[1].strip()
308 text
= text
.split( 'END_TABLE' )[0].strip()
309 text
= re
.sub( '.*\t(.*?)\t\d*', '\g<1>', text
)
310 text
= RemoveOneCharConv( text
)
311 s_wordlist
.extend( text
.split() )
313 # Ziranma.txt.in Simp
314 src
= 'scim-tables-%s/tables/zh/Ziranma.txt.in' % SCIM_TABLES_VER
315 dst
= 'Ziranma.txt.in'
316 GetFileFromTar( tbe_dest
, src
, dst
)
317 text
= ReadFile( dst
)
318 text
= text
.split( 'BEGIN_TABLE' )[1].strip()
319 text
= text
.split( 'END_TABLE' )[0].strip()
320 text
= re
.sub( '.*\t(.*?)\t\d*', '\g<1>', text
)
321 text
= RemoveOneCharConv( text
)
322 s_wordlist
.extend( text
.split() )
324 # phrase_lib.txt Simp
325 src
= 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
326 dst
= 'phrase_lib.txt'
327 GetFileFromTar( pyn_dest
, src
, dst
)
328 text
= ReadFile( 'phrase_lib.txt' )
329 text
= re
.sub( '(.*)\t\d\d*.*', '\g<1>', text
)
330 text
= RemoveRows( text
, 5 )
331 text
= RemoveOneCharConv( text
)
332 s_wordlist
.extend( text
.split() )
335 src
= 'libtabe/tsi-src/tsi.src'
337 GetFileFromTar( lbt_dest
, src
, dst
)
338 text
= ReadBIG5File( 'tsi.src' )
339 text
= re
.sub( ' \d.*', '', text
.replace('# ', ''))
340 text
= RemoveOneCharConv( text
)
341 t_wordlist
.extend( text
.split() )
343 # remove duplicate elements
344 t_wordlist
= list( set( t_wordlist
) )
345 s_wordlist
= list( set( s_wordlist
) )
347 # simpphrases_exclude.manual Simp
348 text
= ReadFile( 'simpphrases_exclude.manual' )
350 s_string
= '\n'.join( s_wordlist
)
352 s_string
= re
.sub( '.*%s.*\n' % elem
, '', s_string
)
353 s_wordlist
= s_string
.split('\n')
355 # tradphrases_exclude.manual Trad
356 text
= ReadFile( 'tradphrases_exclude.manual' )
358 t_string
= '\n'.join( t_wordlist
)
360 t_string
= re
.sub( '.*%s.*\n' % elem
, '', t_string
)
361 t_wordlist
= t_string
.split('\n')
# Make char to char conversion table
364 # Unihan.txt, dict t2s_code, s2t_code = { 'U+XXXX': 'U+YYYY( U+ZZZZ) ... ', ... }
365 ( t2s_code
, s2t_code
) = ReadUnihanFile( 'Unihan_Variants.txt' )
366 # dict t2s_1tomany = { '\uXXXX': '\uYYYY\uZZZZ ... ', ... }
368 t2s_1tomany
.update( GetDefaultTable( t2s_code
) )
369 t2s_1tomany
.update( GetManualTable( 'trad2simp.manual' ) )
372 s2t_1tomany
.update( GetDefaultTable( s2t_code
) )
373 s2t_1tomany
.update( GetManualTable( 'simp2trad.manual' ) )
374 # dict t2s_1to1 = { '\uXXXX': '\uYYYY', ... }; t2s_trans = { 'ddddd': '', ... }
375 t2s_1to1
= GetValidTable( t2s_1tomany
)
376 s_tomany
= GetToManyRules( t2s_1tomany
)
377 # dict s2t_1to1; s2t_trans
378 s2t_1to1
= GetValidTable( s2t_1tomany
)
379 t_tomany
= GetToManyRules( s2t_1tomany
)
380 # remove noconvert rules
381 t2s_1to1
= RemoveRules( 'trad2simp_noconvert.manual', t2s_1to1
)
382 s2t_1to1
= RemoveRules( 'simp2trad_noconvert.manual', s2t_1to1
)
# Make word to word conversion table
385 t2s_1to1_supp
= t2s_1to1
.copy()
386 s2t_1to1_supp
= s2t_1to1
.copy()
387 # trad2simp_supp_set.manual
388 t2s_1to1_supp
.update( CustomRules( 'trad2simp_supp_set.manual' ) )
389 # simp2trad_supp_set.manual
390 s2t_1to1_supp
.update( CustomRules( 'simp2trad_supp_set.manual' ) )
392 text
= ReadFile( 'simpphrases.manual' )
393 s_wordlist_manual
= text
.split('\n')
394 t2s_word2word_manual
= GetManualWordsTable(s_wordlist_manual
, s2t_1to1_supp
)
395 t2s_word2word_manual
.update( CustomRules( 'toSimp.manual' ) )
397 text
= ReadFile( 'tradphrases.manual' )
398 t_wordlist_manual
= text
.split('\n')
399 s2t_word2word_manual
= GetManualWordsTable(t_wordlist_manual
, t2s_1to1_supp
)
400 s2t_word2word_manual
.update( CustomRules( 'toTrad.manual' ) )
402 s2t_supp
= s2t_1to1_supp
.copy()
403 s2t_supp
.update( s2t_word2word_manual
)
404 t2s_supp
= t2s_1to1_supp
.copy()
405 t2s_supp
.update( t2s_word2word_manual
)
406 t2s_word2word
= GetDefaultWordsTable( s_wordlist
, s_tomany
, s2t_1to1_supp
, t2s_supp
)
408 t2s_word2word
.update( t2s_word2word_manual
)
410 s2t_word2word
= GetDefaultWordsTable( t_wordlist
, t_tomany
, t2s_1to1_supp
, s2t_supp
)
412 s2t_word2word
.update( s2t_word2word_manual
)
416 t2s_1to1
= RemoveSameChar( t2s_1to1
)
417 s2t_1to1
= RemoveSameChar( s2t_1to1
)
418 toHans
= DictToSortedList1( t2s_1to1
) + DictToSortedList2( t2s_word2word
)
420 toHant
= DictToSortedList1( s2t_1to1
) + DictToSortedList2( s2t_word2word
)
422 toCN
= DictToSortedList2( CustomRules( 'toCN.manual' ) )
424 toHK
= DictToSortedList2( CustomRules( 'toHK.manual' ) )
426 toSG
= DictToSortedList2( CustomRules( 'toSG.manual' ) )
428 toTW
= DictToSortedList2( CustomRules( 'toTW.manual' ) )
433 * Simplified / Traditional Chinese conversion tables
435 * Automatically generated using code and data in includes/zhtable/
436 * Do not modify directly!
439 $zh2Hant = array(\n'''
440 php
+= GetPHPArray( toHant
)
441 php
+= '\n);\n\n$zh2Hans = array(\n'
442 php
+= GetPHPArray( toHans
)
443 php
+= '\n);\n\n$zh2TW = array(\n'
444 php
+= GetPHPArray( toTW
)
445 php
+= '\n);\n\n$zh2HK = array(\n'
446 php
+= GetPHPArray( toHK
)
447 php
+= '\n);\n\n$zh2CN = array(\n'
448 php
+= GetPHPArray( toCN
)
449 php
+= '\n);\n\n$zh2SG = array(\n'
450 php
+= GetPHPArray( toSG
)
453 f
= uniopen( 'ZhConversion.php', 'w', encoding
= 'utf8' )
454 print ('Writing ZhConversion.php ... ')
459 print ('Deleting temp files ... ')
460 os
.remove('EZ.txt.in')
461 os
.remove('phrase_lib.txt')
463 os
.remove('Unihan_Variants.txt')
464 os
.remove('Wubi.txt.in')
465 os
.remove('Ziranma.txt.in')
468 if __name__
== '__main__':