# You should run this script UNDER python 3000.
# (i.e. any Python 3.x interpreter -- the code below uses print() as a
# function and urllib.request, neither of which exists in Python 2.)

# Standard-library modules used throughout the table-generation script.
import tarfile
import zipfile
import os
import re
import shutil
import urllib.request

# Versions of the upstream SCIM data packages the word lists are taken from.
SCIM_TABLES_VER = '0.5.9'
SCIM_PINYIN_VER = '0.5.91'
def GetFileFromURL( url, dest ):
    """Download *url* to the local file *dest*, skipping the download when
    *dest* already exists.

    NOTE(review): the original mangled source omits the line between the
    'up to date' message and the download; an early return is restored here,
    since without it the file would be re-downloaded on every run.
    """
    if os.path.isfile(dest):
        print( 'File %s up to date.' % dest )
        return
    print( 'Downloading from [%s] ...' % url )
    urllib.request.urlretrieve( url, dest )
    print( 'Download complete.\n' )
def GetFileFromZip( path ):
    """Extract every member of the zip archive *path* into the current
    working directory."""
    print( 'Extracting files from %s ...' % path )
    # Context manager ensures the archive handle is closed (the original
    # left the ZipFile object unclosed).
    with zipfile.ZipFile(path) as archive:
        archive.extractall()
def GetFileFromTar( path, member, rename ):
    """Extract *member* from the gzipped tarball *path* into the current
    directory, move it to *rename*, and delete the leftover extracted tree.

    *member* is a relative path like 'pkg/sub/file'; its top-level
    directory ('pkg') is removed after the file has been moved out.
    """
    print( 'Extracting %s from %s ...' % (rename, path) )
    # Context manager ensures the tar handle is closed (the original left
    # the TarFile object unclosed).
    with tarfile.open(path, 'r:gz') as archive:
        archive.extract(member)
    shutil.move(member, rename)
    # Remove the now-empty directory tree created by extract().
    tree_rmv = member.split('/')[0]
    shutil.rmtree(tree_rmv)
def ReadBIG5File( dest ):
    """Read *dest* as Big5-HKSCS text, rewrite it in place as UTF-8, and
    return the decoded text.

    Undecodable bytes become U+FFFD while reading (errors='replace') and
    are then turned into newlines so broken entries land on their own line.

    NOTE(review): the read/close/write/return lines are missing from the
    mangled source and are reconstructed here.
    """
    print( 'Reading and decoding %s ...' % dest )
    with open( dest, 'r', encoding='big5hkscs', errors='replace' ) as f1:
        text = f1.read()
    # Replacement characters mark undecodable input; push them to new lines.
    text = text.replace( '\ufffd', '\n' )
    # Re-save the cleaned text as UTF-8 so later passes can read it plainly.
    with open( dest, 'w', encoding='utf8' ) as f2:
        f2.write( text )
    return text
def ReadFile( dest ):
    """Return the full contents of the UTF-8 text file *dest*.

    NOTE(review): the 'def' line is absent from the mangled source; the
    name and contract are reconstructed from the many ReadFile() call
    sites elsewhere in this script.
    """
    print( 'Reading and decoding %s ...' % dest )
    with open( dest, 'r', encoding='utf8' ) as f:
        text = f.read()
    return text
def ReadUnihanFile( dest ):
    """Parse the Unihan database file *dest*.

    Returns a pair (t2s_code, s2t_code) of lists of (code, codes) string
    tuples: the entries tagged kSimplifiedVariant (traditional source) and
    kTraditionalVariant (simplified source) respectively.

    NOTE(review): the accumulator initialisation, loop header and close
    are missing from the mangled source and are reconstructed here.
    """
    print( 'Reading and decoding %s ...' % dest )
    t2s_code = []
    s2t_code = []
    with open( dest, 'r', encoding='utf8' ) as f:
        for line in f:
            if line.startswith('#'):
                # Skip comment lines.
                continue
            elif not line.find('kSimplifiedVariant') == -1:
                temp = line.split('kSimplifiedVariant')
                t2s_code.append( ( temp[0].strip(), temp[1].strip() ) )
            elif not line.find('kTraditionalVariant') == -1:
                temp = line.split('kTraditionalVariant')
                s2t_code.append( ( temp[0].strip(), temp[1].strip() ) )
    return ( t2s_code, s2t_code )
def RemoveRows( text, num ):
    """Delete the first *num* lines (a run of non-newline characters plus
    the following whitespace) from *text* and return the result.

    NOTE(review): the return line is missing from the mangled source and
    is restored here.
    """
    # Raw string avoids the invalid-escape warning for '\s'; count= is
    # passed by keyword (positional count is deprecated in modern Python).
    text = re.sub( r'.*\s*', '', text, count=num )
    return text
def RemoveOneCharConv( text ):
    """Blank out every line of *text* that consists of a single character
    (plus trailing whitespace), returning the filtered text.

    Presumably single-character entries are dropped because they are
    handled by the char-to-char tables rather than the word lists --
    TODO confirm against the callers.

    NOTE(review): the return line is missing from the mangled source and
    is restored here.
    """
    preg = re.compile(r'^.\s*$', re.MULTILINE)
    text = preg.sub( '', text )
    return text
def ConvertToChar( code ):
    """Turn a Unihan code-point reference such as 'U+4E00' (optionally
    followed by a '<source' annotation) into the actual character."""
    # Drop any '<...' annotation, then decode the hex digits after 'U+'.
    bare = code.split('<')[0]
    codepoint = int( bare[2:], 16 )
    return chr( codepoint )
def GetDefaultTable( code_table ):
    """Build a char -> [chars] mapping from Unihan (code, codes) pairs.

    *code_table* is a list of ('U+XXXX', 'U+YYYY U+ZZZZ ...') tuples as
    produced by ReadUnihanFile().

    NOTE(review): the table initialisation, a guard line and the return
    are missing from the mangled source; an empty-target guard and
    'return char_table' are assumed here -- confirm against upstream.
    """
    char_table = {}
    for ( f, t ) in code_table:
        if t:
            from_char = ConvertToChar( f )
            to_chars = [ConvertToChar( code ) for code in t.split()]
            char_table[from_char] = to_chars
    return char_table
def GetManualTable( dest ):
    """Parse a manual '|'-separated conversion file into a char -> [chars]
    table.

    Each useful line looks like '|X|Y|Z|': the first field names the
    source character, the remaining fields its conversion candidates.

    NOTE(review): the line-splitting, table initialisation, empty-line
    guard and return are missing from the mangled source and are
    reconstructed here -- confirm against upstream.
    """
    text = ReadFile( dest )
    char_table = {}
    for elem in text.split('\n'):
        elem = elem.strip('|')
        if elem:
            temp2 = elem.split( '|', 1 )
            # [-1:] keeps only the final character of each field.
            from_char = temp2[0][-1:]
            to_chars = [code[-1:] for code in temp2[1].split('|')]
            char_table[from_char] = to_chars
    return char_table
def GetValidTable( src_table ):
    """Reduce a one-to-many table to one-to-one by keeping only the first
    (default) candidate for every source character.

    NOTE(review): the table initialisation and return are missing from the
    mangled source and are restored here.
    """
    valid_table = {}
    for f, t in src_table.items():
        valid_table[f] = t[0]
    return valid_table
def GetToManyRules( src_table ):
    """Return a set-like dict of every non-default conversion candidate.

    A character appearing as a second-or-later candidate of any
    one-to-many rule maps to True; all other characters are absent.

    NOTE(review): the table initialisation and return are missing from the
    mangled source and are restored here.
    """
    tomany_table = {}
    for f, t in src_table.items():
        for i in range(1, len(t)):
            tomany_table[t[i]] = True
    return tomany_table
# NOTE(review): this block is line-mangled and many original lines (124-127,
# 129, 134, 136-150, 152-155) are absent, so only comments are added; the
# surviving fragments below are kept byte-for-byte.
# Purpose (from the call sites below): drop from *table* the conversion rules
# listed in the manual file *dest* -- entries written 'f => t', or a bare
# '=> t' / 'f =>' half-rule -- and return the pruned table.
122 def RemoveRules( dest
, table
):
123 text
= ReadFile( dest
)
# Strip quoting characters from each element of the rules file ...
128 elem
= elem
.strip().replace( '"', '' ).replace( '\'', '' )
# ... then classify the element by where its '=>' marker sits.
130 if elem
.startswith( '=>' ):
# Leading '=>': only a target character is given.
131 t
= elem
.replace( '=>', '' ).strip()
132 elif elem
.endswith( '=>' ):
# Trailing '=>': only a source character is given.
133 f
= elem
.replace( '=>', '' ).strip()
# Otherwise a full 'from => to' pair; split it.
135 temp2
= elem
.split( '=>' )
# Iterate over a copy so entries can be popped from *table* while looping.
151 for temp_f
, temp_t
in table
.copy().items():
def DictToSortedList1( src_table ):
    """Return the (key, value) pairs of *src_table* ordered by key."""
    pairs = list( src_table.items() )
    pairs.sort( key = lambda pair: pair[0] )
    return pairs
def DictToSortedList2( src_table ):
    """Return the (key, value) pairs of *src_table* ordered by value."""
    pairs = list( src_table.items() )
    pairs.sort( key = lambda pair: pair[1] )
    return pairs
def Converter( string, conv_table ):
    """Greedy longest-match replacement over *string*.

    At each position the longest remaining substring is tried first; when
    a substring is a key of *conv_table*, its replacement is spliced in
    and scanning continues after the inserted text.

    NOTE(review): the cursor initialisation, candidate slice, None test
    and advance/break/return lines are missing from the mangled source;
    the standard greedy-scan skeleton is reconstructed around the
    surviving fragments -- confirm against upstream.
    """
    i = 0
    while i < len(string):
        # Longest candidate first: whole remaining tail down to one char.
        for j in range(len(string) - i, 0, -1):
            f = string[i:][:j]
            t = conv_table.get( f )
            if t is not None:
                string = string[:i] + t + string[i:][j:]
                # Land on the last char of the replacement; the unconditional
                # i += 1 below then steps past it.
                i += len(t) - 1
                break
        i += 1
    return string
# NOTE(review): this block is line-mangled and several original lines (178,
# 180-182, 190-191, 199, 204, 206-207) are absent, so only comments are
# added; the surviving fragments below are kept byte-for-byte.
# Purpose (from the call sites below): build a word-level conversion table
# from the raw word list, registering only words whose round-trip through
# the char tables would otherwise be ambiguous or lossy.
175 def GetDefaultWordsTable( src_wordlist
, src_tomany
, char_conv_table
, char_reconv_table
):
# De-duplicate the word list, then sort by length so same-length words are
# processed together by the inner while loop below.
176 wordlist
= list( set( src_wordlist
) )
177 wordlist
.sort( key
= len, reverse
= True )
179 word_reconv_table
= {}
# Refresh the working tables with everything learned so far.
183 conv_table
.update( word_conv_table
)
184 conv_table
.update( char_conv_table
)
185 reconv_table
.update( word_reconv_table
)
186 reconv_table
.update( char_reconv_table
)
187 word
= wordlist
.pop()
188 new_word_len
= word_len
= len(word
)
# Process one run of equal-length words per pass.
189 while new_word_len
== word_len
:
# rvt_test becomes truthy when any character of the word is a
# non-default (one-to-many) conversion candidate.
192 rvt_test
= rvt_test
or src_tomany
.get(char
)
193 test_word
= Converter( word
, reconv_table
)
194 new_word
= Converter( word
, conv_table
)
# Only register words whose converted form is not already a reverse key.
195 if not reconv_table
.get( new_word
):
196 if not test_word
== word
:
197 word_conv_table
[word
] = new_word
198 word_reconv_table
[new_word
] = word
# Otherwise check whether converting back reproduces the original word.
200 rvt_word
= Converter( new_word
, reconv_table
)
201 if not rvt_word
== word
:
202 word_conv_table
[word
] = new_word
203 word_reconv_table
[new_word
] = word
205 word
= wordlist
.pop()
208 new_word_len
= len(word
)
209 return word_reconv_table
def GetManualWordsTable( src_wordlist, conv_table ):
    """Convert every word of *src_wordlist* with *conv_table* and return
    the reverse mapping {converted_word: original_word}.

    NOTE(review): the table initialisation, loop header and return are
    missing from the mangled source and are reconstructed here -- confirm
    against upstream.
    """
    wordlist = list( set( src_wordlist ) )
    wordlist.sort( key = len, reverse = True )
    reconv_table = {}
    while wordlist:
        # pop() consumes from the end of the reverse-sorted list,
        # i.e. shortest words first.
        word = wordlist.pop()
        new_word = Converter( word, conv_table )
        reconv_table[new_word] = word
    return reconv_table
def CustomRules( dest ):
    """Read the manual rules file *dest* and return it as a dict.

    The file is a flat whitespace-separated token stream consumed
    pairwise: from1 to1 from2 to2 ...

    NOTE(review): the tokenising line and the return are missing from the
    mangled source; pairwise consumption is implied by the surviving dict
    comprehension and is reconstructed here.
    """
    text = ReadFile( dest )
    temp = text.split()
    ret = {temp[i]: temp[i + 1] for i in range( 0, len( temp ), 2 )}
    return ret
def GetPHPArray( table ):
    """Render (from, to) pairs as the body of a PHP array literal, one
    single-quoted "'from' => 'to'," entry per line."""
    entries = []
    for (f, t) in table:
        entries.append( '\'%s\' => \'%s\',' % (f, t) )
    return '\n'.join( entries )
def RemoveSameChar( src_table ):
    """Return a copy of *src_table* without identity mappings.

    NOTE(review): the loop body and return are missing from the mangled
    source; dropping f == t entries is inferred from the function name and
    its use on the 1-to-1 tables just before output generation -- confirm
    against upstream.
    """
    dst_table = {}
    for f, t in src_table.items():
        if not f == t:
            dst_table[f] = t
    return dst_table
# ---------------------------------------------------------------------------
# Top-level build script. This region is line-mangled and many original
# lines are missing (section comments, list initialisations, the opener of
# the php = '''...''' string, and the main() body past line 432), so only
# comments are added; the surviving fragments are kept byte-for-byte.
# NOTE(review): SF_MIRROR and LIBTABE_VER are referenced below but their
# definitions (original lines ~10-12) are absent from this view.
# Phase 1: download the source data sets.
# ---------------------------------------------------------------------------
241 url
= 'ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip'
242 han_dest
= 'Unihan.zip'
243 GetFileFromURL( url
, han_dest
)
245 # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
246 url
= 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR
, SCIM_TABLES_VER
)
247 tbe_dest
= 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
248 GetFileFromURL( url
, tbe_dest
)
250 # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
251 url
= 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR
, SCIM_PINYIN_VER
)
252 pyn_dest
= 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
253 GetFileFromURL( url
, pyn_dest
)
255 # Get libtabe-$(LIBTABE_VER).tgz:
256 url
= 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR
, LIBTABE_VER
)
257 lbt_dest
= 'libtabe-%s.tgz' % LIBTABE_VER
258 GetFileFromURL( url
, lbt_dest
)
# ---------------------------------------------------------------------------
# Phase 2: extract word lists from the downloaded archives.
# ---------------------------------------------------------------------------
260 # Extract the needed files from the compressed archives
262 # Unihan.txt Simp. & Trad
263 GetFileFromZip( han_dest
)
# EZ-Big.txt.in: words feed t_wordlist (traditional).
270 src
= 'scim-tables-%s/tables/zh/EZ-Big.txt.in' % SCIM_TABLES_VER
272 GetFileFromTar( tbe_dest
, src
, dst
)
273 text
= ReadFile( dst
)
# Keep only the table body between the BEGIN_TABLE / END_TABLE markers.
274 text
= text
.split( 'BEGIN_TABLE' )[1].strip()
275 text
= text
.split( 'END_TABLE' )[0].strip()
# Strip the input-code column, keeping the phrase column.
276 text
= re
.sub( '.*\t', '', text
)
277 text
= RemoveOneCharConv( text
)
278 t_wordlist
.extend( text
.split() )
# Wubi.txt.in: words feed s_wordlist (simplified).
281 src
= 'scim-tables-%s/tables/zh/Wubi.txt.in' % SCIM_TABLES_VER
283 GetFileFromTar( tbe_dest
, src
, dst
)
284 text
= ReadFile( dst
)
285 text
= text
.split( 'BEGIN_TABLE' )[1].strip()
286 text
= text
.split( 'END_TABLE' )[0].strip()
# Keep the middle (phrase) column of the code<TAB>phrase<TAB>freq rows.
287 text
= re
.sub( '.*\t(.*?)\t\d*', '\g<1>', text
)
288 text
= RemoveOneCharConv( text
)
289 s_wordlist
.extend( text
.split() )
291 # Ziranma.txt.in Simp
292 src
= 'scim-tables-%s/tables/zh/Ziranma.txt.in' % SCIM_TABLES_VER
293 dst
= 'Ziranma.txt.in'
294 GetFileFromTar( tbe_dest
, src
, dst
)
295 text
= ReadFile( dst
)
296 text
= text
.split( 'BEGIN_TABLE' )[1].strip()
297 text
= text
.split( 'END_TABLE' )[0].strip()
298 text
= re
.sub( '.*\t(.*?)\t\d*', '\g<1>', text
)
299 text
= RemoveOneCharConv( text
)
300 s_wordlist
.extend( text
.split() )
302 # phrase_lib.txt Simp
303 src
= 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
304 dst
= 'phrase_lib.txt'
305 GetFileFromTar( pyn_dest
, src
, dst
)
306 text
= ReadFile( 'phrase_lib.txt' )
# Drop the frequency column, the header rows, and one-char entries.
307 text
= re
.sub( '(.*)\t\d\d*.*', '\g<1>', text
)
308 text
= RemoveRows( text
, 5 )
309 text
= RemoveOneCharConv( text
)
310 s_wordlist
.extend( text
.split() )
# tsi.src (libtabe): Big5-encoded traditional word list.
313 src
= 'libtabe/tsi-src/tsi.src'
315 GetFileFromTar( lbt_dest
, src
, dst
)
316 text
= ReadBIG5File( 'tsi.src' )
317 text
= re
.sub( ' \d.*', '', text
.replace('# ', ''))
318 text
= RemoveOneCharConv( text
)
319 t_wordlist
.extend( text
.split() )
321 # remove duplicate elements
322 t_wordlist
= list( set( t_wordlist
) )
323 s_wordlist
= list( set( s_wordlist
) )
# Apply the manual exclusion lists: drop any word containing an excluded
# element.
325 # simpphrases_exclude.manual Simp
326 text
= ReadFile( 'simpphrases_exclude.manual' )
328 s_string
= '\n'.join( s_wordlist
)
330 s_string
= re
.sub( '.*%s.*\n' % elem
, '', s_string
)
331 s_wordlist
= s_string
.split('\n')
333 # tradphrases_exclude.manual Trad
334 text
= ReadFile( 'tradphrases_exclude.manual' )
336 t_string
= '\n'.join( t_wordlist
)
338 t_string
= re
.sub( '.*%s.*\n' % elem
, '', t_string
)
339 t_wordlist
= t_string
.split('\n')
# ---------------------------------------------------------------------------
# Phase 3: build char-to-char conversion tables from Unihan + manual data.
# ---------------------------------------------------------------------------
341 # Make char to char conversion table
342 # Unihan.txt, dict t2s_code, s2t_code = { 'U+XXXX': 'U+YYYY( U+ZZZZ) ... ', ... }
343 ( t2s_code
, s2t_code
) = ReadUnihanFile( 'Unihan.txt' )
344 # dict t2s_1tomany = { '\uXXXX': '\uYYYY\uZZZZ ... ', ... }
346 t2s_1tomany
.update( GetDefaultTable( t2s_code
) )
347 t2s_1tomany
.update( GetManualTable( 'trad2simp.manual' ) )
350 s2t_1tomany
.update( GetDefaultTable( s2t_code
) )
351 s2t_1tomany
.update( GetManualTable( 'simp2trad.manual' ) )
352 # dict t2s_1to1 = { '\uXXXX': '\uYYYY', ... }; t2s_trans = { 'ddddd': '', ... }
353 t2s_1to1
= GetValidTable( t2s_1tomany
)
354 s_tomany
= GetToManyRules( t2s_1tomany
)
355 # dict s2t_1to1; s2t_trans
356 s2t_1to1
= GetValidTable( s2t_1tomany
)
357 t_tomany
= GetToManyRules( s2t_1tomany
)
358 # remove noconvert rules
359 t2s_1to1
= RemoveRules( 'trad2simp_noconvert.manual', t2s_1to1
)
360 s2t_1to1
= RemoveRules( 'simp2trad_noconvert.manual', s2t_1to1
)
362 # Make word to word conversion table
363 t2s_1to1_supp
= t2s_1to1
.copy()
364 s2t_1to1_supp
= s2t_1to1
.copy()
365 # trad2simp_supp_set.manual
366 t2s_1to1_supp
.update( CustomRules( 'trad2simp_supp_set.manual' ) )
367 # simp2trad_supp_set.manual
368 s2t_1to1_supp
.update( CustomRules( 'simp2trad_supp_set.manual' ) )
# Manual word lists plus explicit override rules.
370 text
= ReadFile( 'simpphrases.manual' )
371 s_wordlist_manual
= text
.split()
372 t2s_word2word_manual
= GetManualWordsTable(s_wordlist_manual
, s2t_1to1_supp
)
373 t2s_word2word_manual
.update( CustomRules( 'toSimp.manual' ) )
375 text
= ReadFile( 'tradphrases.manual' )
376 t_wordlist_manual
= text
.split()
377 s2t_word2word_manual
= GetManualWordsTable(t_wordlist_manual
, t2s_1to1_supp
)
378 s2t_word2word_manual
.update( CustomRules( 'toTrad.manual' ) )
# Combine char tables with manual word tables, then derive the automatic
# word tables from the harvested word lists.
380 s2t_supp
= s2t_1to1_supp
.copy()
381 s2t_supp
.update( s2t_word2word_manual
)
382 t2s_supp
= t2s_1to1_supp
.copy()
383 t2s_supp
.update( t2s_word2word_manual
)
384 t2s_word2word
= GetDefaultWordsTable( s_wordlist
, s_tomany
, s2t_1to1_supp
, t2s_supp
)
386 t2s_word2word
.update( t2s_word2word_manual
)
388 s2t_word2word
= GetDefaultWordsTable( t_wordlist
, t_tomany
, t2s_1to1_supp
, s2t_supp
)
390 s2t_word2word
.update( s2t_word2word_manual
)
# ---------------------------------------------------------------------------
# Phase 4: final cleanup and PHP output generation.
# ---------------------------------------------------------------------------
394 t2s_1to1
= RemoveSameChar( t2s_1to1
)
395 s2t_1to1
= RemoveSameChar( s2t_1to1
)
396 toHans
= DictToSortedList1( t2s_1to1
) + DictToSortedList2( t2s_word2word
)
398 toHant
= DictToSortedList1( s2t_1to1
) + DictToSortedList2( s2t_word2word
)
400 toCN
= DictToSortedList2( CustomRules( 'toCN.manual' ) )
402 toHK
= DictToSortedList2( CustomRules( 'toHK.manual' ) )
404 toSG
= DictToSortedList2( CustomRules( 'toSG.manual' ) )
406 toTW
= DictToSortedList2( CustomRules( 'toTW.manual' ) )
# NOTE(review): the next four fragments are the interior of the
# php = '''...''' header string (its opening line, original ~410, is
# missing). They are runtime text and are left untouched.
411 * Simplified / Traditional Chinese conversion tables
413 * Automatically generated using code and data in includes/zhtable/
414 * Do not modify directly!
417 $zh2Hant = array(\n'''
418 php
+= GetPHPArray( toHant
)
419 php
+= '\n);\n\n$zh2Hans = array(\n'
420 php
+= GetPHPArray( toHans
)
421 php
+= '\n);\n\n$zh2TW = array(\n'
422 php
+= GetPHPArray( toTW
)
423 php
+= '\n);\n\n$zh2HK = array(\n'
424 php
+= GetPHPArray( toHK
)
425 php
+= '\n);\n\n$zh2CN = array(\n'
426 php
+= GetPHPArray( toCN
)
427 php
+= '\n);\n\n$zh2SG = array(\n'
428 php
+= GetPHPArray( toSG
)
# Write the assembled PHP source to disk (the write/close lines past 432
# are missing from this view).
431 f
= open( 'ZhConversion.php', 'w', encoding
= 'utf8' )
432 print ('Writing ZhConversion.php ... ')
# Script entry point; its body (original line 437+) lies beyond this view.
436 if __name__
== '__main__':