# You should run this script UNDER python 3000.
# (i.e. any Python 3.x interpreter -- the code below uses print() as a
# function and urllib.request, neither of which exists in Python 2.)

# Standard-library modules used throughout the table-generation script.
import tarfile
import zipfile
import os
import re
import shutil
import urllib.request

# Versions of the upstream SCIM data packages the word lists are taken from.
SCIM_TABLES_VER = '0.5.9'
SCIM_PINYIN_VER = '0.5.91'
def GetFileFromURL( url, dest ):
    """Download *url* to the local file *dest*, skipping the download when
    *dest* already exists.

    NOTE(review): the original mangled source omits the line between the
    'up to date' message and the download; an early return is restored here,
    since without it the file would be re-downloaded on every run.
    """
    if os.path.isfile(dest):
        print( 'File %s up to date.' % dest )
        return
    print( 'Downloading from [%s] ...' % url )
    urllib.request.urlretrieve( url, dest )
    print( 'Download complete.\n' )
def GetFileFromZip( path ):
    """Extract every member of the zip archive *path* into the current
    working directory."""
    print( 'Extracting files from %s ...' % path )
    # Context manager ensures the archive handle is closed (the original
    # left the ZipFile object unclosed).
    with zipfile.ZipFile(path) as archive:
        archive.extractall()
def GetFileFromTar( path, member, rename ):
    """Extract *member* from the gzipped tarball *path* into the current
    directory, move it to *rename*, and delete the leftover extracted tree.

    *member* is a relative path like 'pkg/sub/file'; its top-level
    directory ('pkg') is removed after the file has been moved out.
    """
    print( 'Extracting %s from %s ...' % (rename, path) )
    # Context manager ensures the tar handle is closed (the original left
    # the TarFile object unclosed).
    with tarfile.open(path, 'r:gz') as archive:
        archive.extract(member)
    shutil.move(member, rename)
    # Remove the now-empty directory tree created by extract().
    tree_rmv = member.split('/')[0]
    shutil.rmtree(tree_rmv)
def ReadBIG5File( dest ):
    """Read *dest* as Big5-HKSCS text, rewrite it in place as UTF-8, and
    return the decoded text.

    Undecodable bytes become U+FFFD while reading (errors='replace') and
    are then turned into newlines so broken entries land on their own line.

    NOTE(review): the read/close/write/return lines are missing from the
    mangled source and are reconstructed here.
    """
    print( 'Reading and decoding %s ...' % dest )
    with open( dest, 'r', encoding='big5hkscs', errors='replace' ) as f1:
        text = f1.read()
    # Replacement characters mark undecodable input; push them to new lines.
    text = text.replace( '\ufffd', '\n' )
    # Re-save the cleaned text as UTF-8 so later passes can read it plainly.
    with open( dest, 'w', encoding='utf8' ) as f2:
        f2.write( text )
    return text
def ReadFile( dest ):
    """Return the full contents of the UTF-8 text file *dest*.

    NOTE(review): the 'def' line is absent from the mangled source; the
    name and contract are reconstructed from the many ReadFile() call
    sites elsewhere in this script.
    """
    print( 'Reading and decoding %s ...' % dest )
    with open( dest, 'r', encoding='utf8' ) as f:
        text = f.read()
    return text
def ReadUnihanFile( dest ):
    """Parse the Unihan database file *dest*.

    Returns a pair (t2s_code, s2t_code) of lists of (code, codes) string
    tuples: the entries tagged kSimplifiedVariant (traditional source) and
    kTraditionalVariant (simplified source) respectively.

    NOTE(review): the accumulator initialisation, loop header and close
    are missing from the mangled source and are reconstructed here.
    """
    print( 'Reading and decoding %s ...' % dest )
    t2s_code = []
    s2t_code = []
    with open( dest, 'r', encoding='utf8' ) as f:
        for line in f:
            if line.startswith('#'):
                # Skip comment lines.
                continue
            elif not line.find('kSimplifiedVariant') == -1:
                temp = line.split('kSimplifiedVariant')
                t2s_code.append( ( temp[0].strip(), temp[1].strip() ) )
            elif not line.find('kTraditionalVariant') == -1:
                temp = line.split('kTraditionalVariant')
                s2t_code.append( ( temp[0].strip(), temp[1].strip() ) )
    return ( t2s_code, s2t_code )
def RemoveRows( text, num ):
    """Delete the first *num* lines (a run of non-newline characters plus
    the following whitespace) from *text* and return the result.

    NOTE(review): the return line is missing from the mangled source and
    is restored here.
    """
    # Raw string avoids the invalid-escape warning for '\s'; count= is
    # passed by keyword (positional count is deprecated in modern Python).
    text = re.sub( r'.*\s*', '', text, count=num )
    return text
def RemoveOneCharConv( text ):
    """Blank out every line of *text* that consists of a single character
    (plus trailing whitespace), returning the filtered text.

    Presumably single-character entries are dropped because they are
    handled by the char-to-char tables rather than the word lists --
    TODO confirm against the callers.

    NOTE(review): the return line is missing from the mangled source and
    is restored here.
    """
    preg = re.compile(r'^.\s*$', re.MULTILINE)
    text = preg.sub( '', text )
    return text
def ConvertToChar( code ):
    """Turn a Unihan code-point reference such as 'U+4E00' (optionally
    followed by a '<source' annotation) into the actual character."""
    # Drop any '<...' annotation, then decode the hex digits after 'U+'.
    bare = code.split('<')[0]
    codepoint = int( bare[2:], 16 )
    return chr( codepoint )
def GetDefaultTable( code_table ):
    """Build a char -> [chars] mapping from Unihan (code, codes) pairs.

    *code_table* is a list of ('U+XXXX', 'U+YYYY U+ZZZZ ...') tuples as
    produced by ReadUnihanFile().

    NOTE(review): the table initialisation, a guard line and the return
    are missing from the mangled source; an empty-target guard and
    'return char_table' are assumed here -- confirm against upstream.
    """
    char_table = {}
    for ( f, t ) in code_table:
        if t:
            from_char = ConvertToChar( f )
            to_chars = [ConvertToChar( code ) for code in t.split()]
            char_table[from_char] = to_chars
    return char_table
def GetManualTable( dest ):
    """Parse a manual '|'-separated conversion file into a char -> [chars]
    table.

    Each useful line looks like '|X|Y|Z|': the first field names the
    source character, the remaining fields its conversion candidates.

    NOTE(review): the line-splitting, table initialisation, empty-line
    guard and return are missing from the mangled source and are
    reconstructed here -- confirm against upstream.
    """
    text = ReadFile( dest )
    char_table = {}
    for elem in text.split('\n'):
        elem = elem.strip('|')
        if elem:
            temp2 = elem.split( '|', 1 )
            # [-1:] keeps only the final character of each field.
            from_char = temp2[0][-1:]
            to_chars = [code[-1:] for code in temp2[1].split('|')]
            char_table[from_char] = to_chars
    return char_table
def GetValidTable( src_table ):
    """Reduce a one-to-many table to one-to-one by keeping only the first
    (default) candidate for every source character.

    NOTE(review): the table initialisation and return are missing from the
    mangled source and are restored here.
    """
    valid_table = {}
    for f, t in src_table.items():
        valid_table[f] = t[0]
    return valid_table
def GetToManyRules( src_table ):
    """Return a set-like dict of every non-default conversion candidate.

    A character appearing as a second-or-later candidate of any
    one-to-many rule maps to True; all other characters are absent.

    NOTE(review): the table initialisation and return are missing from the
    mangled source and are restored here.
    """
    tomany_table = {}
    for f, t in src_table.items():
        for i in range(1, len(t)):
            tomany_table[t[i]] = True
    return tomany_table
# NOTE(review): this block is line-mangled and many original lines (124-127,
# 129, 134, 136-150, 152-155) are absent, so only comments are added; the
# surviving fragments below are kept byte-for-byte.
# Purpose (from the call sites below): drop from *table* the conversion rules
# listed in the manual file *dest* -- entries written 'f => t', or a bare
# '=> t' / 'f =>' half-rule -- and return the pruned table.
122 def RemoveRules( dest
, table
):
123 text
= ReadFile( dest
)
# Strip quoting characters from each element of the rules file ...
128 elem
= elem
.strip().replace( '"', '' ).replace( '\'', '' )
# ... then classify the element by where its '=>' marker sits.
130 if elem
.startswith( '=>' ):
# Leading '=>': only a target character is given.
131 t
= elem
.replace( '=>', '' ).strip()
132 elif elem
.endswith( '=>' ):
# Trailing '=>': only a source character is given.
133 f
= elem
.replace( '=>', '' ).strip()
# Otherwise a full 'from => to' pair; split it.
135 temp2
= elem
.split( '=>' )
# Iterate over a copy so entries can be popped from *table* while looping.
151 for temp_f
, temp_t
in table
.copy().items():
def DictToSortedList1( src_table ):
    """Return the (key, value) pairs of *src_table* ordered by key."""
    pairs = list( src_table.items() )
    pairs.sort( key = lambda pair: pair[0] )
    return pairs
def DictToSortedList2( src_table ):
    """Return the (key, value) pairs of *src_table* ordered by value."""
    pairs = list( src_table.items() )
    pairs.sort( key = lambda pair: pair[1] )
    return pairs
def Converter( string, conv_table ):
    """Greedy longest-match replacement over *string*.

    At each position the longest remaining substring is tried first; when
    a substring is a key of *conv_table*, its replacement is spliced in
    and scanning continues after the inserted text.

    NOTE(review): the cursor initialisation, candidate slice, None test
    and advance/break/return lines are missing from the mangled source;
    the standard greedy-scan skeleton is reconstructed around the
    surviving fragments -- confirm against upstream.
    """
    i = 0
    while i < len(string):
        # Longest candidate first: whole remaining tail down to one char.
        for j in range(len(string) - i, 0, -1):
            f = string[i:][:j]
            t = conv_table.get( f )
            if t is not None:
                string = string[:i] + t + string[i:][j:]
                # Land on the last char of the replacement; the unconditional
                # i += 1 below then steps past it.
                i += len(t) - 1
                break
        i += 1
    return string
# NOTE(review): this block is line-mangled and several original lines (178,
# 180-182, 190-191, 199, 204, 206-207) are absent, so only comments are
# added; the surviving fragments below are kept byte-for-byte.
# Purpose (from the call sites below): build a word-level conversion table
# from the raw word list, registering only words whose round-trip through
# the char tables would otherwise be ambiguous or lossy.
175 def GetDefaultWordsTable( src_wordlist
, src_tomany
, char_conv_table
, char_reconv_table
):
# De-duplicate the word list, then sort by length so same-length words are
# processed together by the inner while loop below.
176 wordlist
= list( set( src_wordlist
) )
177 wordlist
.sort( key
= len, reverse
= True )
179 word_reconv_table
= {}
# Refresh the working tables with everything learned so far.
183 conv_table
.update( word_conv_table
)
184 conv_table
.update( char_conv_table
)
185 reconv_table
.update( word_reconv_table
)
186 reconv_table
.update( char_reconv_table
)
187 word
= wordlist
.pop()
188 new_word_len
= word_len
= len(word
)
# Process one run of equal-length words per pass.
189 while new_word_len
== word_len
:
# rvt_test becomes truthy when any character of the word is a
# non-default (one-to-many) conversion candidate.
192 rvt_test
= rvt_test
or src_tomany
.get(char
)
193 test_word
= Converter( word
, reconv_table
)
194 new_word
= Converter( word
, conv_table
)
# Only register words whose converted form is not already a reverse key.
195 if not reconv_table
.get( new_word
):
196 if not test_word
== word
:
197 word_conv_table
[word
] = new_word
198 word_reconv_table
[new_word
] = word
# Otherwise check whether converting back reproduces the original word.
200 rvt_word
= Converter( new_word
, reconv_table
)
201 if not rvt_word
== word
:
202 word_conv_table
[word
] = new_word
203 word_reconv_table
[new_word
] = word
205 word
= wordlist
.pop()
208 new_word_len
= len(word
)
209 return word_reconv_table
def GetManualWordsTable( src_wordlist, conv_table ):
    """Convert every word of *src_wordlist* with *conv_table* and return
    the reverse mapping {converted_word: original_word}.

    NOTE(review): the table initialisation, loop header and return are
    missing from the mangled source and are reconstructed here -- confirm
    against upstream.
    """
    wordlist = list( set( src_wordlist ) )
    wordlist.sort( key = len, reverse = True )
    reconv_table = {}
    while wordlist:
        # pop() consumes from the end of the reverse-sorted list,
        # i.e. shortest words first.
        word = wordlist.pop()
        new_word = Converter( word, conv_table )
        reconv_table[new_word] = word
    return reconv_table
def CustomRules( dest ):
    """Read the manual rules file *dest* and return it as a dict.

    The file is a flat whitespace-separated token stream consumed
    pairwise: from1 to1 from2 to2 ...

    NOTE(review): the tokenising line and the return are missing from the
    mangled source; pairwise consumption is implied by the surviving dict
    comprehension and is reconstructed here.
    """
    text = ReadFile( dest )
    temp = text.split()
    ret = {temp[i]: temp[i + 1] for i in range( 0, len( temp ), 2 )}
    return ret
def GetPHPArray( table ):
    """Render (from, to) pairs as the body of a PHP array literal, one
    single-quoted "'from' => 'to'," entry per line."""
    entries = []
    for (f, t) in table:
        entries.append( '\'%s\' => \'%s\',' % (f, t) )
    return '\n'.join( entries )
def RemoveSameChar( src_table ):
    """Return a copy of *src_table* without identity mappings.

    NOTE(review): the loop body and return are missing from the mangled
    source; dropping f == t entries is inferred from the function name and
    its use on the 1-to-1 tables just before output generation -- confirm
    against upstream.
    """
    dst_table = {}
    for f, t in src_table.items():
        if not f == t:
            dst_table[f] = t
    return dst_table
# ---------------------------------------------------------------------------
# Top-level build script. This region is line-mangled and many original
# lines are missing (section comments, list initialisations, the opener of
# the php = '''...''' string, and the main() body past line 432), so only
# comments are added; the surviving fragments are kept byte-for-byte.
# NOTE(review): SF_MIRROR and LIBTABE_VER are referenced below but their
# definitions (original lines ~10-12) are absent from this view.
# Phase 1: download the source data sets.
# ---------------------------------------------------------------------------
241 url
= 'ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip'
242 han_dest
= 'Unihan.zip'
243 GetFileFromURL( url
, han_dest
)
245 # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
246 url
= 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR
, SCIM_TABLES_VER
)
247 tbe_dest
= 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
248 GetFileFromURL( url
, tbe_dest
)
250 # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
251 url
= 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR
, SCIM_PINYIN_VER
)
252 pyn_dest
= 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
253 GetFileFromURL( url
, pyn_dest
)
255 # Get libtabe-$(LIBTABE_VER).tgz:
256 url
= 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR
, LIBTABE_VER
)
257 lbt_dest
= 'libtabe-%s.tgz' % LIBTABE_VER
258 GetFileFromURL( url
, lbt_dest
)
# ---------------------------------------------------------------------------
# Phase 2: extract word lists from the downloaded archives.
# ---------------------------------------------------------------------------
260 # Extract the needed files from the compressed archives
262 # Unihan.txt Simp. & Trad
263 GetFileFromZip( han_dest
)
# EZ-Big.txt.in: words feed t_wordlist (traditional).
270 src
= 'scim-tables-%s/tables/zh/EZ-Big.txt.in' % SCIM_TABLES_VER
272 GetFileFromTar( tbe_dest
, src
, dst
)
273 text
= ReadFile( dst
)
# Keep only the table body between the BEGIN_TABLE / END_TABLE markers.
274 text
= text
.split( 'BEGIN_TABLE' )[1].strip()
275 text
= text
.split( 'END_TABLE' )[0].strip()
# Strip the input-code column, keeping the phrase column.
276 text
= re
.sub( '.*\t', '', text
)
277 text
= RemoveOneCharConv( text
)
278 t_wordlist
.extend( text
.split() )
# Wubi.txt.in: words feed s_wordlist (simplified).
281 src
= 'scim-tables-%s/tables/zh/Wubi.txt.in' % SCIM_TABLES_VER
283 GetFileFromTar( tbe_dest
, src
, dst
)
284 text
= ReadFile( dst
)
285 text
= text
.split( 'BEGIN_TABLE' )[1].strip()
286 text
= text
.split( 'END_TABLE' )[0].strip()
# Keep the middle (phrase) column of the code<TAB>phrase<TAB>freq rows.
287 text
= re
.sub( '.*\t(.*?)\t\d*', '\g<1>', text
)
288 text
= RemoveOneCharConv( text
)
289 s_wordlist
.extend( text
.split() )
291 # Ziranma.txt.in Simp
292 src
= 'scim-tables-%s/tables/zh/Ziranma.txt.in' % SCIM_TABLES_VER
293 dst
= 'Ziranma.txt.in'
294 GetFileFromTar( tbe_dest
, src
, dst
)
295 text
= ReadFile( dst
)
296 text
= text
.split( 'BEGIN_TABLE' )[1].strip()
297 text
= text
.split( 'END_TABLE' )[0].strip()
298 text
= re
.sub( '.*\t(.*?)\t\d*', '\g<1>', text
)
299 text
= RemoveOneCharConv( text
)
300 s_wordlist
.extend( text
.split() )
302 # phrase_lib.txt Simp
303 src
= 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
304 dst
= 'phrase_lib.txt'
305 GetFileFromTar( pyn_dest
, src
, dst
)
306 text
= ReadFile( 'phrase_lib.txt' )
# Drop the frequency column, the header rows, and one-char entries.
307 text
= re
.sub( '(.*)\t\d\d*.*', '\g<1>', text
)
308 text
= RemoveRows( text
, 5 )
309 text
= RemoveOneCharConv( text
)
310 s_wordlist
.extend( text
.split() )
# tsi.src (libtabe): Big5-encoded traditional word list.
313 src
= 'libtabe/tsi-src/tsi.src'
315 GetFileFromTar( lbt_dest
, src
, dst
)
316 text
= ReadBIG5File( 'tsi.src' )
317 text
= re
.sub( ' \d.*', '', text
.replace('# ', ''))
318 text
= RemoveOneCharConv( text
)
319 t_wordlist
.extend( text
.split() )
321 # remove duplicate elements
322 t_wordlist
= list( set( t_wordlist
) )
323 s_wordlist
= list( set( s_wordlist
) )
# Apply the manual exclusion lists: drop any word containing an excluded
# element.
325 # simpphrases_exclude.manual Simp
326 text
= ReadFile( 'simpphrases_exclude.manual' )
328 s_string
= '\n'.join( s_wordlist
)
330 s_string
= re
.sub( '.*%s.*\n' % elem
, '', s_string
)
331 s_wordlist
= s_string
.split('\n')
333 # tradphrases_exclude.manual Trad
334 text
= ReadFile( 'tradphrases_exclude.manual' )
336 t_string
= '\n'.join( t_wordlist
)
338 t_string
= re
.sub( '.*%s.*\n' % elem
, '', t_string
)
339 t_wordlist
= t_string
.split('\n')
# ---------------------------------------------------------------------------
# Phase 3: build char-to-char conversion tables from Unihan + manual data.
# ---------------------------------------------------------------------------
341 # Make char to char conversion table
342 # Unihan.txt, dict t2s_code, s2t_code = { 'U+XXXX': 'U+YYYY( U+ZZZZ) ... ', ... }
343 ( t2s_code
, s2t_code
) = ReadUnihanFile( 'Unihan.txt' )
344 # dict t2s_1tomany = { '\uXXXX': '\uYYYY\uZZZZ ... ', ... }
346 t2s_1tomany
.update( GetDefaultTable( t2s_code
) )
347 t2s_1tomany
.update( GetManualTable( 'trad2simp.manual' ) )
350 s2t_1tomany
.update( GetDefaultTable( s2t_code
) )
351 s2t_1tomany
.update( GetManualTable( 'simp2trad.manual' ) )
352 # dict t2s_1to1 = { '\uXXXX': '\uYYYY', ... }; t2s_trans = { 'ddddd': '', ... }
353 t2s_1to1
= GetValidTable( t2s_1tomany
)
354 s_tomany
= GetToManyRules( t2s_1tomany
)
355 # dict s2t_1to1; s2t_trans
356 s2t_1to1
= GetValidTable( s2t_1tomany
)
357 t_tomany
= GetToManyRules( s2t_1tomany
)
358 # remove noconvert rules
359 t2s_1to1
= RemoveRules( 'trad2simp_noconvert.manual', t2s_1to1
)
360 s2t_1to1
= RemoveRules( 'simp2trad_noconvert.manual', s2t_1to1
)
362 # Make word to word conversion table
363 t2s_1to1_supp
= t2s_1to1
.copy()
364 s2t_1to1_supp
= s2t_1to1
.copy()
365 # trad2simp_supp_set.manual
366 t2s_1to1_supp
.update( CustomRules( 'trad2simp_supp_set.manual' ) )
367 # simp2trad_supp_set.manual
368 s2t_1to1_supp
.update( CustomRules( 'simp2trad_supp_set.manual' ) )
# Manual word lists plus explicit override rules.
370 text
= ReadFile( 'simpphrases.manual' )
371 s_wordlist_manual
= text
.split()
372 t2s_word2word_manual
= GetManualWordsTable(s_wordlist_manual
, s2t_1to1_supp
)
373 t2s_word2word_manual
.update( CustomRules( 'toSimp.manual' ) )
375 text
= ReadFile( 'tradphrases.manual' )
376 t_wordlist_manual
= text
.split()
377 s2t_word2word_manual
= GetManualWordsTable(t_wordlist_manual
, t2s_1to1_supp
)
378 s2t_word2word_manual
.update( CustomRules( 'toTrad.manual' ) )
# Combine char tables with manual word tables, then derive the automatic
# word tables from the harvested word lists.
380 s2t_supp
= s2t_1to1_supp
.copy()
381 s2t_supp
.update( s2t_word2word_manual
)
382 t2s_supp
= t2s_1to1_supp
.copy()
383 t2s_supp
.update( t2s_word2word_manual
)
384 t2s_word2word
= GetDefaultWordsTable( s_wordlist
, s_tomany
, s2t_1to1_supp
, t2s_supp
)
386 t2s_word2word
.update( t2s_word2word_manual
)
388 s2t_word2word
= GetDefaultWordsTable( t_wordlist
, t_tomany
, t2s_1to1_supp
, s2t_supp
)
390 s2t_word2word
.update( s2t_word2word_manual
)
# ---------------------------------------------------------------------------
# Phase 4: final cleanup and PHP output generation.
# ---------------------------------------------------------------------------
394 t2s_1to1
= RemoveSameChar( t2s_1to1
)
395 s2t_1to1
= RemoveSameChar( s2t_1to1
)
396 toHans
= DictToSortedList1( t2s_1to1
) + DictToSortedList2( t2s_word2word
)
398 toHant
= DictToSortedList1( s2t_1to1
) + DictToSortedList2( s2t_word2word
)
400 toCN
= DictToSortedList2( CustomRules( 'toCN.manual' ) )
402 toHK
= DictToSortedList2( CustomRules( 'toHK.manual' ) )
404 toSG
= DictToSortedList2( CustomRules( 'toSG.manual' ) )
406 toTW
= DictToSortedList2( CustomRules( 'toTW.manual' ) )
# NOTE(review): the next four fragments are the interior of the
# php = '''...''' header string (its opening line, original ~410, is
# missing). They are runtime text and are left untouched.
411 * Simplified / Traditional Chinese conversion tables
413 * Automatically generated using code and data in includes/zhtable/
414 * Do not modify directly!
417 $zh2Hant = array(\n'''
418 php
+= GetPHPArray( toHant
)
419 php
+= '\n);\n\n$zh2Hans = array(\n'
420 php
+= GetPHPArray( toHans
)
421 php
+= '\n);\n\n$zh2TW = array(\n'
422 php
+= GetPHPArray( toTW
)
423 php
+= '\n);\n\n$zh2HK = array(\n'
424 php
+= GetPHPArray( toHK
)
425 php
+= '\n);\n\n$zh2CN = array(\n'
426 php
+= GetPHPArray( toCN
)
427 php
+= '\n);\n\n$zh2SG = array(\n'
428 php
+= GetPHPArray( toSG
)
# Write the assembled PHP source to disk (the write/close lines past 432
# are missing from this view).
431 f
= open( 'ZhConversion.php', 'w', encoding
= 'utf8' )
432 print ('Writing ZhConversion.php ... ')
# Script entry point; its body (original line 437+) lies beyond this view.
436 if __name__
== '__main__':