2 # -*- coding: utf-8 -*-
4 import tarfile
, zipfile
5 import os
, re
, shutil
, sys
, platform
7 pyversion
= platform
.python_version()
8 if pyversion
[:3] in ['2.5', '2.6', '2.7']:
9 import urllib
as urllib_request
13 if sys
.maxunicode
>= 0x10000 or i
< 0x10000:
16 return unichr(0xD7C0+(i
>>10)) + unichr(0xDC00+(i
&0x3FF))
17 elif pyversion
[:2] == '3.':
18 import urllib
.request
as urllib_request
# Download configuration: which SourceForge mirror to fetch from, and the
# exact upstream release versions of the SCIM data tarballs used below.
SF_MIRROR = 'easynews'       # SourceForge mirror host prefix (http://<mirror>.dl.sourceforge.net/...)
SCIM_TABLES_VER = '0.5.9'    # scim-tables release providing EZ-Big/Wubi/Ziranma tables
SCIM_PINYIN_VER = '0.5.91'   # scim-pinyin release providing phrase_lib.txt
29 def GetFileFromURL( url
, dest
):
30 if os
.path
.isfile(dest
):
31 print( 'File %s up to date.' % dest
)
33 print( 'Downloading from [%s] ...' % url
)
34 urllib_request
.urlretrieve( url
, dest
)
35 print( 'Download complete.\n' )
38 def GetFileFromUnihan( path
):
39 print( 'Extracting files from %s ...' % path
)
40 text
= zipfile
.ZipFile(path
).read('Unihan_Variants.txt')
41 uhfile
= uniopen('Unihan_Variants.txt', 'w')
def GetFileFromTar( path, member, rename ):
    """Extract a single member from a gzipped tarball into the current
    directory, rename it, and remove the extracted directory tree.

    path   -- the .tar.gz archive to read
    member -- archive member path (e.g. 'pkg-1.0/data/file.txt')
    rename -- destination filename for the extracted member
    """
    print( 'Extracting %s from %s ...' % (rename, path) )
    # Open/close explicitly: the original leaked the TarFile handle, and
    # this script still supports Python 2.5/2.6 where TarFile is not a
    # context manager, so use try/finally instead of 'with'.
    tar = tarfile.open(path, 'r:gz')
    try:
        tar.extract(member)
    finally:
        tar.close()
    shutil.move(member, rename)
    # Drop the now-useless top-level directory the member was nested in.
    tree_rmv = member.split('/')[0]
    shutil.rmtree(tree_rmv)
54 def ReadBIG5File( dest
):
55 print( 'Reading and decoding %s ...' % dest
)
56 f1
= uniopen( dest
, 'r', encoding
='big5hkscs', errors
='replace' )
58 text
= text
.replace( '\ufffd', '\n' )
60 f2
= uniopen( dest
, 'w', encoding
='utf8' )
66 print( 'Reading and decoding %s ...' % dest
)
67 f
= uniopen( dest
, 'r', encoding
='utf8' )
72 def ReadUnihanFile( dest
):
73 print( 'Reading and decoding %s ...' % dest
)
74 f
= uniopen( dest
, 'r', encoding
='utf8' )
80 if line
.startswith('#'):
82 elif not line
.find('kSimplifiedVariant') == -1:
83 temp
= line
.split('kSimplifiedVariant')
84 t2s_code
.append( ( temp
[0].strip(), temp
[1].strip() ) )
85 elif not line
.find('kTraditionalVariant') == -1:
86 temp
= line
.split('kTraditionalVariant')
87 s2t_code
.append( ( temp
[0].strip(), temp
[1].strip() ) )
91 return ( t2s_code
, s2t_code
)
93 def RemoveRows( text
, num
):
94 text
= re
.sub( '.*\s*', '', text
, num
)
97 def RemoveOneCharConv( text
):
98 preg
= re
.compile('^.\s*$', re
.MULTILINE
)
99 text
= preg
.sub( '', text
)
def ConvertToChar( code ):
    """Turn a Unihan codepoint token like 'U+4E00' (optionally annotated
    with a trailing '<...' qualifier) into the corresponding character."""
    # Strip any '<...' annotation, then parse the hex digits after 'U+'.
    point = code.split( '<' )[0]
    return unichr2( int( point[2:], 16 ) )
106 def GetDefaultTable( code_table
):
108 for ( f
, t
) in code_table
:
110 from_char
= ConvertToChar( f
)
111 to_chars
= [ConvertToChar( code
) for code
in t
.split()]
112 char_table
[from_char
] = to_chars
115 def GetManualTable( dest
):
116 text
= ReadFile( dest
)
120 elem
= elem
.strip('|')
122 temp2
= elem
.split( '|', 1 )
123 from_char
= unichr2( int( temp2
[0][2:7], 16 ) )
124 to_chars
= [unichr2( int( code
[2:7], 16 ) ) for code
in temp2
[1].split('|')]
125 char_table
[from_char
] = to_chars
128 def GetValidTable( src_table
):
130 for f
, t
in src_table
.items():
131 valid_table
[f
] = t
[0]
134 def GetToManyRules( src_table
):
136 for f
, t
in src_table
.items():
137 for i
in range(1, len(t
)):
138 tomany_table
[t
[i
]] = True
141 def RemoveRules( dest
, table
):
142 text
= ReadFile( dest
)
147 elem
= elem
.strip().replace( '"', '' ).replace( '\'', '' )
149 if elem
.startswith( '=>' ):
150 t
= elem
.replace( '=>', '' ).strip()
151 elif elem
.endswith( '=>' ):
152 f
= elem
.replace( '=>', '' ).strip()
154 temp2
= elem
.split( '=>' )
170 for temp_f
, temp_t
in table
.copy().items():
def DictToSortedList1( src_table ):
    """Return src_table's (key, value) pairs as a list sorted by key."""
    pairs = list( src_table.items() )
    pairs.sort( key = lambda pair: pair[0] )
    return pairs
def DictToSortedList2( src_table ):
    """Return src_table's (key, value) pairs as a list sorted by value."""
    pairs = list( src_table.items() )
    pairs.sort( key = lambda pair: pair[1] )
    return pairs
181 def Converter( string
, conv_table
):
183 while i
< len(string
):
184 for j
in range(len(string
) - i
, 0, -1):
186 t
= conv_table
.get( f
)
188 string
= string
[:i
] + t
+ string
[i
:][j
:]
194 def GetDefaultWordsTable( src_wordlist
, src_tomany
, char_conv_table
, char_reconv_table
):
195 wordlist
= list( set( src_wordlist
) )
196 wordlist
.sort( key
= len, reverse
= True )
198 word_reconv_table
= {}
202 conv_table
.update( word_conv_table
)
203 conv_table
.update( char_conv_table
)
204 reconv_table
.update( word_reconv_table
)
205 reconv_table
.update( char_reconv_table
)
206 word
= wordlist
.pop()
207 new_word_len
= word_len
= len(word
)
208 while new_word_len
== word_len
:
211 rvt_test
= rvt_test
or src_tomany
.get(char
)
212 test_word
= Converter( word
, reconv_table
)
213 new_word
= Converter( word
, conv_table
)
214 if not reconv_table
.get( new_word
):
215 if not test_word
== word
:
216 word_conv_table
[word
] = new_word
217 word_reconv_table
[new_word
] = word
219 rvt_word
= Converter( new_word
, reconv_table
)
220 if not rvt_word
== word
:
221 word_conv_table
[word
] = new_word
222 word_reconv_table
[new_word
] = word
224 word
= wordlist
.pop()
227 new_word_len
= len(word
)
228 return word_reconv_table
230 def GetManualWordsTable( src_wordlist
, conv_table
):
231 src_wordlist
= [items
.split('#')[0].strip() for items
in src_wordlist
]
232 wordlist
= list( set( src_wordlist
) )
233 wordlist
.sort( key
= len, reverse
= True )
236 word
= wordlist
.pop()
237 new_word
= Converter( word
, conv_table
)
238 reconv_table
[new_word
] = word
241 def CustomRules( dest
):
242 text
= ReadFile( dest
)
245 for i
in range( 0, len( temp
), 2 ):
246 ret
[temp
[i
]] = temp
[i
+ 1]
def GetPHPArray( table ):
    """Render an iterable of (from, to) pairs as the body of a PHP array:
    one "'from' => 'to'," entry per line."""
    rendered = []
    for src, dst in table:
        rendered.append( "'%s' => '%s'," % ( src, dst ) )
    return '\n'.join( rendered )
254 def RemoveSameChar( src_table
):
256 for f
, t
in src_table
.items():
263 url
= 'http://www.unicode.org/Public/UNIDATA/Unihan.zip'
264 han_dest
= 'Unihan.zip'
265 GetFileFromURL( url
, han_dest
)
267 # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
268 url
= 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR
, SCIM_TABLES_VER
)
269 tbe_dest
= 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
270 GetFileFromURL( url
, tbe_dest
)
272 # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
273 url
= 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR
, SCIM_PINYIN_VER
)
274 pyn_dest
= 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
275 GetFileFromURL( url
, pyn_dest
)
277 # Get libtabe-$(LIBTABE_VER).tgz:
278 url
= 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR
, LIBTABE_VER
)
279 lbt_dest
= 'libtabe-%s.tgz' % LIBTABE_VER
280 GetFileFromURL( url
, lbt_dest
)
# Extract the needed files from the compressed archives
284 # Unihan.txt Simp. & Trad
285 GetFileFromUnihan( han_dest
)
292 src
= 'scim-tables-%s/tables/zh/EZ-Big.txt.in' % SCIM_TABLES_VER
294 GetFileFromTar( tbe_dest
, src
, dst
)
295 text
= ReadFile( dst
)
296 text
= text
.split( 'BEGIN_TABLE' )[1].strip()
297 text
= text
.split( 'END_TABLE' )[0].strip()
298 text
= re
.sub( '.*\t', '', text
)
299 text
= RemoveOneCharConv( text
)
300 t_wordlist
.extend( text
.split() )
303 src
= 'scim-tables-%s/tables/zh/Wubi.txt.in' % SCIM_TABLES_VER
305 GetFileFromTar( tbe_dest
, src
, dst
)
306 text
= ReadFile( dst
)
307 text
= text
.split( 'BEGIN_TABLE' )[1].strip()
308 text
= text
.split( 'END_TABLE' )[0].strip()
309 text
= re
.sub( '.*\t(.*?)\t\d*', '\g<1>', text
)
310 text
= RemoveOneCharConv( text
)
311 s_wordlist
.extend( text
.split() )
313 # Ziranma.txt.in Simp
314 src
= 'scim-tables-%s/tables/zh/Ziranma.txt.in' % SCIM_TABLES_VER
315 dst
= 'Ziranma.txt.in'
316 GetFileFromTar( tbe_dest
, src
, dst
)
317 text
= ReadFile( dst
)
318 text
= text
.split( 'BEGIN_TABLE' )[1].strip()
319 text
= text
.split( 'END_TABLE' )[0].strip()
320 text
= re
.sub( '.*\t(.*?)\t\d*', '\g<1>', text
)
321 text
= RemoveOneCharConv( text
)
322 s_wordlist
.extend( text
.split() )
324 # phrase_lib.txt Simp
325 src
= 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
326 dst
= 'phrase_lib.txt'
327 GetFileFromTar( pyn_dest
, src
, dst
)
328 text
= ReadFile( 'phrase_lib.txt' )
329 text
= re
.sub( '(.*)\t\d\d*.*', '\g<1>', text
)
330 text
= RemoveRows( text
, 5 )
331 text
= RemoveOneCharConv( text
)
332 s_wordlist
.extend( text
.split() )
335 src
= 'libtabe/tsi-src/tsi.src'
337 GetFileFromTar( lbt_dest
, src
, dst
)
338 text
= ReadBIG5File( 'tsi.src' )
339 text
= re
.sub( ' \d.*', '', text
.replace('# ', ''))
340 text
= RemoveOneCharConv( text
)
341 t_wordlist
.extend( text
.split() )
343 # remove duplicate elements
344 t_wordlist
= list( set( t_wordlist
) )
345 s_wordlist
= list( set( s_wordlist
) )
347 # simpphrases_exclude.manual Simp
348 text
= ReadFile( 'simpphrases_exclude.manual' )
350 s_string
= '\n'.join( s_wordlist
)
352 s_string
= re
.sub( '.*%s.*\n' % elem
, '', s_string
)
353 s_wordlist
= s_string
.split('\n')
355 # tradphrases_exclude.manual Trad
356 text
= ReadFile( 'tradphrases_exclude.manual' )
358 t_string
= '\n'.join( t_wordlist
)
360 t_string
= re
.sub( '.*%s.*\n' % elem
, '', t_string
)
361 t_wordlist
= t_string
.split('\n')
# Make char to char conversion table
364 # Unihan.txt, dict t2s_code, s2t_code = { 'U+XXXX': 'U+YYYY( U+ZZZZ) ... ', ... }
365 ( t2s_code
, s2t_code
) = ReadUnihanFile( 'Unihan_Variants.txt' )
366 # dict t2s_1tomany = { '\uXXXX': '\uYYYY\uZZZZ ... ', ... }
368 t2s_1tomany
.update( GetDefaultTable( t2s_code
) )
369 t2s_1tomany
.update( GetManualTable( 'trad2simp.manual' ) )
372 s2t_1tomany
.update( GetDefaultTable( s2t_code
) )
373 s2t_1tomany
.update( GetManualTable( 'simp2trad.manual' ) )
374 # dict t2s_1to1 = { '\uXXXX': '\uYYYY', ... }; t2s_trans = { 'ddddd': '', ... }
375 t2s_1to1
= GetValidTable( t2s_1tomany
)
376 s_tomany
= GetToManyRules( t2s_1tomany
)
377 # dict s2t_1to1; s2t_trans
378 s2t_1to1
= GetValidTable( s2t_1tomany
)
379 t_tomany
= GetToManyRules( s2t_1tomany
)
380 # remove noconvert rules
381 t2s_1to1
= RemoveRules( 'trad2simp_noconvert.manual', t2s_1to1
)
382 s2t_1to1
= RemoveRules( 'simp2trad_noconvert.manual', s2t_1to1
)
# Make word to word conversion table
385 t2s_1to1_supp
= t2s_1to1
.copy()
386 s2t_1to1_supp
= s2t_1to1
.copy()
387 # trad2simp_supp_set.manual
388 t2s_1to1_supp
.update( CustomRules( 'trad2simp_supp_set.manual' ) )
389 # simp2trad_supp_set.manual
390 s2t_1to1_supp
.update( CustomRules( 'simp2trad_supp_set.manual' ) )
392 text
= ReadFile( 'simpphrases.manual' )
393 s_wordlist_manual
= text
.split('\n')
394 t2s_word2word_manual
= GetManualWordsTable(s_wordlist_manual
, s2t_1to1_supp
)
395 t2s_word2word_manual
.update( CustomRules( 'toSimp.manual' ) )
397 text
= ReadFile( 'tradphrases.manual' )
398 t_wordlist_manual
= text
.split('\n')
399 s2t_word2word_manual
= GetManualWordsTable(t_wordlist_manual
, t2s_1to1_supp
)
400 s2t_word2word_manual
.update( CustomRules( 'toTrad.manual' ) )
402 s2t_supp
= s2t_1to1_supp
.copy()
403 s2t_supp
.update( s2t_word2word_manual
)
404 t2s_supp
= t2s_1to1_supp
.copy()
405 t2s_supp
.update( t2s_word2word_manual
)
406 t2s_word2word
= GetDefaultWordsTable( s_wordlist
, s_tomany
, s2t_1to1_supp
, t2s_supp
)
408 t2s_word2word
.update( t2s_word2word_manual
)
410 s2t_word2word
= GetDefaultWordsTable( t_wordlist
, t_tomany
, t2s_1to1_supp
, s2t_supp
)
412 s2t_word2word
.update( s2t_word2word_manual
)
416 t2s_1to1
= RemoveSameChar( t2s_1to1
)
417 s2t_1to1
= RemoveSameChar( s2t_1to1
)
418 toHans
= DictToSortedList1( t2s_1to1
) + DictToSortedList2( t2s_word2word
)
420 toHant
= DictToSortedList1( s2t_1to1
) + DictToSortedList2( s2t_word2word
)
422 toCN
= DictToSortedList2( CustomRules( 'toCN.manual' ) )
424 toHK
= DictToSortedList2( CustomRules( 'toHK.manual' ) )
426 toSG
= DictToSortedList2( CustomRules( 'toSG.manual' ) )
428 toTW
= DictToSortedList2( CustomRules( 'toTW.manual' ) )
433 * Simplified / Traditional Chinese conversion tables
435 * Automatically generated using code and data in includes/zhtable/
436 * Do not modify directly!
439 $zh2Hant = array(\n'''
440 php
+= GetPHPArray( toHant
)
441 php
+= '\n);\n\n$zh2Hans = array(\n'
442 php
+= GetPHPArray( toHans
)
443 php
+= '\n);\n\n$zh2TW = array(\n'
444 php
+= GetPHPArray( toTW
)
445 php
+= '\n);\n\n$zh2HK = array(\n'
446 php
+= GetPHPArray( toHK
)
447 php
+= '\n);\n\n$zh2CN = array(\n'
448 php
+= GetPHPArray( toCN
)
449 php
+= '\n);\n\n$zh2SG = array(\n'
450 php
+= GetPHPArray( toSG
)
453 f
= uniopen( 'ZhConversion.php', 'w', encoding
= 'utf8' )
454 print ('Writing ZhConversion.php ... ')
459 print ('Deleting temp files ... ')
460 os
.remove('EZ.txt.in')
461 os
.remove('phrase_lib.txt')
463 os
.remove('Unihan_Variants.txt')
464 os
.remove('Wubi.txt.in')
465 os
.remove('Ziranma.txt.in')
468 if __name__
== '__main__':