2 # -*- coding: utf-8 -*-
6 import os
, re
, shutil
, sys
, platform
# Interpreter version string (e.g. '2.7.18') and OS flag, probed once and
# consulted repeatedly below to pick Python-2 vs Python-3 code paths.
pyversion = platform.python_version()
islinux = platform.system().lower() == 'linux'
11 if pyversion
[:3] in ['2.6', '2.7']:
12 import urllib
as urllib_request
16 if sys
.maxunicode
< 0x10000:
21 return _unichr( 0xD7C0 + ( i
>>10 ) ) + _unichr( 0xDC00 + ( i
& 0x3FF ) )
22 elif pyversion
[:2] == '3.':
23 import urllib
.request
as urllib_request
def unichr2( *args ):
    """Convert 'U+XXXX'-style tokens (optionally carrying a '<source'
    suffix, as in Unihan_Variants.txt) into single characters."""
    return [unichr( int( i.split('<')[0][2:], 16 ) ) for i in args]
def unichr3( *args ):
    """Convert 'U+XXXXX'-style tokens into characters, skipping tokens
    with no hex digits after 'U+'; at most five hex digits are read."""
    return [unichr( int( i[2:7], 16 ) ) for i in args if i[2:7]]
# Versions of the upstream SCIM data packages fetched from SourceForge.
SCIM_TABLES_VER = '0.5.13'
SCIM_PINYIN_VER = '0.5.92'
def download( url, dest ):
    """Fetch *url* into local file *dest*, skipping the transfer entirely
    when *dest* already exists."""
    if os.path.isfile( dest ):
        print( 'File %s is up to date.' % dest )
        return
    if islinux:
        # we use wget instead urlretrieve under Linux,
        # because wget could display details like download progress
        os.system( 'wget %s -O %s' % ( url, dest ) )
    else:
        print( 'Downloading from [%s] ...' % url )
        urllib_request.urlretrieve( url, dest )
        print( 'Download complete.\n' )
    return
def uncompress( fp, member, encoding = 'U8' ):
    """Extract a single *member* from an open archive *fp* into the CWD.

    The member is moved to its bare file name, its leading directory (if
    any) is removed, and an open text-mode handle on the extracted file
    is returned (undecodable bytes are ignored).
    """
    name = member.rsplit( '/', 1 )[-1]
    print( 'Extracting %s ...' % name )
    fp.extract( member )
    shutil.move( member, name )
    if '/' in member:
        # Drop the now-empty top-level directory from the archive layout.
        shutil.rmtree( member.split( '/', 1 )[0] )
    # Robust version check instead of slicing the version string; use
    # codecs.open explicitly rather than relying on a rebound `open`.
    if sys.version_info[0] == 2:
        import codecs
        fc = codecs.open( name, 'rb', encoding, 'ignore' )
    else:
        fc = open( name, 'r', encoding = encoding, errors = 'ignore' )
    return fc
# PEP 8 discourages assigning lambdas to names; plain functions also get
# docstrings and usable tracebacks. Signatures are unchanged.
def unzip( path, member, encoding = 'U8' ):
    """Extract *member* from the .zip at *path*; return an open handle."""
    return uncompress( zf.ZipFile( path ), member, encoding )

def untargz( path, member, encoding = 'U8' ):
    """Extract *member* from the .tar.gz at *path*; return an open handle."""
    return uncompress( tf.open( path, 'r:gz' ), member, encoding )
def parserCore( fp, pos, beginmark = None, endmark = None ):
    """Scan lines of *fp*, collecting column *pos* of multi-column entries.

    When both *beginmark* and *endmark* are given, only lines between
    them are considered. Lines starting with '#' are comments. Only
    entries whose first column and chosen column are both longer than
    one character are kept ("words only"). Returns a set of strings.
    """
    collecting = not ( beginmark and endmark )
    words = set()
    for line in fp:
        if beginmark and line.startswith( beginmark ):
            collecting = True
            continue
        if endmark and line.startswith( endmark ):
            break
        if not collecting or line.startswith( '#' ):
            continue
        elems = line.split()
        if len( elems ) < 2:
            continue
        if len( elems[0] ) > 1 and len( elems[pos] ) > 1: # words only
            words.add( elems[pos] )
    return words
def tablesParser( path, name ):
    """ Read file from scim-tables and parse it. """
    # No `global` statement needed: SCIM_TABLES_VER is only read here.
    src = 'scim-tables-%s/tables/zh/%s' % ( SCIM_TABLES_VER, name )
    fp = untargz( path, src, 'U8' )
    # Column 1 holds the phrase; tables are bracketed by BEGIN/END marks.
    return parserCore( fp, 1, 'BEGIN_TABLE', 'END_TABLE' )
# Named functions instead of lambda assignments (PEP 8 E731); same callables.
def ezbigParser( path ):
    """Parse the EZ-Big (Traditional) input-method table."""
    return tablesParser( path, 'EZ-Big.txt.in' )

def wubiParser( path ):
    """Parse the Wubi (Simplified) input-method table."""
    return tablesParser( path, 'Wubi.txt.in' )

def zrmParser( path ):
    """Parse the Ziranma (Simplified) input-method table."""
    return tablesParser( path, 'Ziranma.txt.in' )
def phraseParser( path ):
    """ Read phrase_lib.txt and parse it. """
    # Dropped an unused local (`dst`) and a read-only `global` statement.
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    fp = untargz( path, src, 'U8' )
    # Column 0: the phrase itself.
    return parserCore( fp, 0 )
def tsiParser( path ):
    """ Read tsi.src and parse it. """
    src = 'libtabe/tsi-src/tsi.src'
    # tsi.src ships encoded as Big5-HKSCS, not UTF-8.
    fp = untargz( path, src, 'big5hkscs' )
    return parserCore( fp, 0 )
def unihanParser( path ):
    """ Read Unihan_Variants.txt and parse it.

    Returns ( t2s, s2t ): dicts mapping one character to the list of its
    simplified / traditional variant characters respectively.
    """
    fp = unzip( path, 'Unihan_Variants.txt', 'U8' )
    t2s = dict()
    s2t = dict()
    for line in fp:
        if line.startswith( '#' ):
            continue
        elems = line.split()
        if len( elems ) < 3:
            continue
        # Field layout: U+XXXX <tag> U+YYYY... — pull the tag out first.
        vtype = elems.pop( 1 )  # renamed from `type`: shadowed the builtin
        elems = unichr2( *elems )
        if vtype == 'kTraditionalVariant':
            s2t[elems[0]] = elems[1:]
        elif vtype == 'kSimplifiedVariant':
            t2s[elems[0]] = elems[1:]
    fp.close()
    return ( t2s, s2t )
def applyExcludes( mlist, path ):
    """ Apply exclude rules from path to mlist.

    Any word in *mlist* containing one of the tokens listed in the file
    (one per line, '#' starts a comment) is removed in place.
    Returns the mutated set.
    """
    # sys.version_info is sturdier than slicing the version string; the
    # handle is closed via `with` instead of being leaked.
    if sys.version_info[0] == 2:
        import codecs
        fp = codecs.open( path, 'rb', 'U8' )
    else:
        fp = open( path, 'r', encoding = 'U8' )
    with fp:
        excludes = fp.read().split()
    excludes = [word.split( '#' )[0].strip() for word in excludes]
    excptn = re.compile( '.*(?:%s).*' % '|'.join( excludes ) )
    diff = [mword for mword in mlist if excptn.search( mword )]
    mlist.difference_update( diff )
    return mlist
def charManualTable( path ):
    """Yield ( char, [variant, ...] ) pairs from a manual char table.

    Each line is 'U+XXXX|U+XXXX|...'; '#' starts a comment. Lines that
    decode to fewer than two characters are skipped.
    """
    # `with` closes the handle when the generator is exhausted or GC'd;
    # the original leaked it.
    with open( path, 'r', encoding = 'U8' ) as fp:
        for line in fp:
            elems = line.split( '#' )[0].split( '|' )
            elems = unichr3( *elems )
            if len( elems ) > 1:
                yield elems[0], elems[1:]
def toManyRules( src_table ):
    """Collect every non-primary variant from a one-to-many table.

    *src_table* maps a char to a list of variants; the first variant is
    the preferred conversion, so only variants after it are "to-many"
    ambiguous targets. Returns them as a set.
    """
    # dict.items() exists on both Python 2 and 3, so the duplicated
    # iteritems()/items() branches collapse into one loop.
    tomany = set()
    for f, t in src_table.items():
        tomany.update( t[1:] )
    return tomany
def removeRules( path, table ):
    """Drop conversion entries according to the rule file at *path*.

    Each line is either `source => target` or a single (optionally
    quoted) token meaning both. Every source key is removed from
    *table*; afterwards any remaining entry whose *value* exactly
    matches a target token is removed too. Mutates and returns *table*.
    """
    texc = list()
    with open( path, 'r', encoding = 'U8' ) as fp:
        for line in fp:
            elems = line.split( '=>' )
            f = t = elems[0].strip()
            if len( elems ) == 2:
                t = elems[1].strip()
            # Tokens may be wrapped in either quote style.
            f = f.strip('"').strip("'")
            t = t.strip('"').strip("'")
            if f:
                # pop with a default replaces the original bare
                # try/except: pass — absent keys are simply ignored.
                table.pop( f, None )
            if t:
                texc.append( t )
    texcptn = re.compile( '^(?:%s)$' % '|'.join( texc ) )
    # items() works on both Python 2 and 3; snapshot before deleting.
    for tmp_f, tmp_t in list( table.items() ):
        if texcptn.match( tmp_t ):
            del table[tmp_f]
    return table
def customRules( path ):
    """Parse a tab-separated rule file into a {source: target} dict.

    '#' starts a comment; lines without a TAB-separated pair are skipped.
    """
    ret = dict()
    # `with` fixes the leaked file handle of the original.
    with open( path, 'r', encoding = 'U8' ) as fp:
        for line in fp:
            line = line.rstrip( '\r\n' )
            if '#' in line:
                line = line.split( '#' )[0].rstrip()
            elems = line.split( '\t' )
            if len( elems ) > 1:
                ret[elems[0]] = elems[1]
    return ret
def dictToSortedList( src_table, pos ):
    """Flatten a mapping into (key, value) pairs ordered primarily by the
    chosen column (*pos* is 0 for key, 1 for value), then by the other."""
    def pair_key( item ):
        return ( item[pos], item[1 - pos] )
    return sorted( src_table.items(), key = pair_key )
def translate( text, conv_table ):
    """Greedy longest-match replacement of substrings of *text* using
    *conv_table*; returns the converted string.

    At each position the longest dictionary hit wins; scanning resumes
    at the last character of the replacement so overlapping conversions
    chain correctly.
    """
    i = 0
    while i < len( text ):
        # Longest candidate first, shrinking down to a single character.
        for j in range( len( text ) - i, 0, -1 ):
            # Single slice text[i:i+j] instead of the original
            # text[i:][:j], which copied the whole tail every probe.
            fragment = text[i:i + j]
            replacement = conv_table.get( fragment )
            if replacement:
                text = text[:i] + replacement + text[i + j:]
                i += len( replacement ) - 1
                break
        i += 1
    return text
def manualWordsTable( path, conv_table, reconv_table ):
    """Build a reverse word table from the manual word list at *path*.

    Words are converted with *conv_table*; each converted word maps back
    to its source. Words whose round trip through the table built so far
    would corrupt them are pinned to themselves.

    NOTE(review): the incoming `reconv_table` argument is immediately
    shadowed by a fresh dict, so only entries accumulated from this file
    participate in the round-trip check — confirm this matches intent.
    """
    # `with` fixes the leaked file handle of the original.
    with open( path, 'r', encoding = 'U8' ) as fp:
        wordlist = [line.split( '#' )[0].strip() for line in fp]
    reconv_table = {}
    wordlist = list( set( wordlist ) )
    # Sorted longest-first, so pop() processes shortest words first.
    wordlist.sort( key = lambda w: ( len(w), w ), reverse = True )
    while wordlist:
        word = wordlist.pop()
        new_word = translate( word, conv_table )
        rcv_word = translate( word, reconv_table )
        if rcv_word != word:
            # Earlier entries would mangle this word: pin it.
            reconv_table[word] = word
        reconv_table[new_word] = word
    return reconv_table
def defaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ):
    """Derive word-level reverse-conversion entries from a word list.

    A word earns an entry when per-character conversion would not round
    trip it (or when it contains an ambiguous one-to-many character and
    converting back disagrees). Words of equal length are processed in
    one batch; newly found entries only take effect for longer words.
    Returns the {converted_word: source_word} table.
    """
    wordlist = list( src_wordlist )
    # Longest-first sort so pop() walks words from shortest to longest.
    wordlist.sort( key = lambda w: ( len(w), w ), reverse = True )
    word_conv_table = {}
    word_reconv_table = {}
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    tomanyptn = re.compile( '(?:%s)' % '|'.join( src_tomany ) )
    while wordlist:
        # Fold entries discovered for shorter words into the working tables.
        conv_table.update( word_conv_table )
        reconv_table.update( word_reconv_table )
        word = wordlist.pop()
        new_word_len = word_len = len( word )
        while new_word_len == word_len:
            test_word = translate( word, reconv_table )
            new_word = translate( word, conv_table )
            if not reconv_table.get( new_word ) \
               and ( test_word != word \
               or ( tomanyptn.search( word ) \
               and word != translate( new_word, reconv_table ) ) ):
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
            try:
                word = wordlist.pop()
            except IndexError:
                break
            new_word_len = len( word )
    return word_reconv_table
def PHPArray( table ):
    """Render (from, to) pairs as the body lines of a PHP associative
    array, skipping pairs where either side is empty."""
    rendered = []
    for src, dst in table:
        if src and dst:
            rendered.append( '\'%s\' => \'%s\',' % ( src, dst ) )
    return '\n'.join( rendered )
281 url
= 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
282 han_dest
= 'Unihan-%s.zip' % UNIHAN_VER
283 download( url
, han_dest
)
285 # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
286 url
= 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR
, SCIM_TABLES_VER
)
287 tbe_dest
= 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
288 download( url
, tbe_dest
)
290 # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
291 url
= 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR
, SCIM_PINYIN_VER
)
292 pyn_dest
= 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
293 download( url
, pyn_dest
)
295 # Get libtabe-$(LIBTABE_VER).tgz:
296 url
= 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR
, LIBTABE_VER
)
297 lbt_dest
= 'libtabe-%s.tgz' % LIBTABE_VER
298 download( url
, lbt_dest
)
301 ( t2s_1tomany
, s2t_1tomany
) = unihanParser( han_dest
)
303 t2s_1tomany
.update( charManualTable( 'symme_supp.manual' ) )
304 t2s_1tomany
.update( charManualTable( 'trad2simp.manual' ) )
305 s2t_1tomany
.update( ( t
[0], [f
] ) for ( f
, t
) in charManualTable( 'symme_supp.manual' ) )
306 s2t_1tomany
.update( charManualTable( 'simp2trad.manual' ) )
308 if pyversion
[:1] in ['2']:
309 t2s_1to1
= dict( [( f
, t
[0] ) for ( f
, t
) in t2s_1tomany
.iteritems()] )
310 s2t_1to1
= dict( [( f
, t
[0] ) for ( f
, t
) in s2t_1tomany
.iteritems()] )
312 t2s_1to1
= dict( [( f
, t
[0] ) for ( f
, t
) in t2s_1tomany
.items()] )
313 s2t_1to1
= dict( [( f
, t
[0] ) for ( f
, t
) in s2t_1tomany
.items()] )
315 s_tomany
= toManyRules( t2s_1tomany
)
316 t_tomany
= toManyRules( s2t_1tomany
)
319 t2s_1to1
= removeRules( 'trad2simp_noconvert.manual', t2s_1to1
)
320 s2t_1to1
= removeRules( 'simp2trad_noconvert.manual', s2t_1to1
)
322 # the supper set for word to word conversion
323 t2s_1to1_supp
= t2s_1to1
.copy()
324 s2t_1to1_supp
= s2t_1to1
.copy()
325 t2s_1to1_supp
.update( customRules( 'trad2simp_supp_set.manual' ) )
326 s2t_1to1_supp
.update( customRules( 'simp2trad_supp_set.manual' ) )
328 # word to word manual rules
329 t2s_word2word_manual
= manualWordsTable( 'simpphrases.manual', s2t_1to1_supp
, t2s_1to1_supp
)
330 t2s_word2word_manual
.update( customRules( 'toSimp.manual' ) )
331 s2t_word2word_manual
= manualWordsTable( 'tradphrases.manual', t2s_1to1_supp
, s2t_1to1_supp
)
332 s2t_word2word_manual
.update( customRules( 'toTrad.manual' ) )
334 # word to word rules from input methods
337 t_wordlist
.update( ezbigParser( tbe_dest
),
338 tsiParser( lbt_dest
) )
339 s_wordlist
.update( wubiParser( tbe_dest
),
340 zrmParser( tbe_dest
),
341 phraseParser( pyn_dest
) )
344 s_wordlist
= applyExcludes( s_wordlist
, 'simpphrases_exclude.manual' )
345 t_wordlist
= applyExcludes( t_wordlist
, 'tradphrases_exclude.manual' )
347 s2t_supp
= s2t_1to1_supp
.copy()
348 s2t_supp
.update( s2t_word2word_manual
)
349 t2s_supp
= t2s_1to1_supp
.copy()
350 t2s_supp
.update( t2s_word2word_manual
)
353 t2s_word2word
= defaultWordsTable( s_wordlist
, s_tomany
, s2t_1to1_supp
, t2s_supp
)
354 t2s_word2word
.update( t2s_word2word_manual
)
355 s2t_word2word
= defaultWordsTable( t_wordlist
, t_tomany
, t2s_1to1_supp
, s2t_supp
)
356 s2t_word2word
.update( s2t_word2word_manual
)
360 if pyversion
[:1] in ['2']:
361 t2s_1to1
= dict( [( f
, t
) for ( f
, t
) in t2s_1to1
.iteritems() if f
!= t
] )
363 t2s_1to1
= dict( [( f
, t
) for ( f
, t
) in t2s_1to1
.items() if f
!= t
] )
364 toHans
= dictToSortedList( t2s_1to1
, 0 ) + dictToSortedList( t2s_word2word
, 1 )
366 if pyversion
[:1] in ['2']:
367 s2t_1to1
= dict( [( f
, t
) for ( f
, t
) in s2t_1to1
.iteritems() if f
!= t
] )
369 s2t_1to1
= dict( [( f
, t
) for ( f
, t
) in s2t_1to1
.items() if f
!= t
] )
370 toHant
= dictToSortedList( s2t_1to1
, 0 ) + dictToSortedList( s2t_word2word
, 1 )
372 toCN
= dictToSortedList( customRules( 'toCN.manual' ), 1 )
374 toHK
= dictToSortedList( customRules( 'toHK.manual' ), 1 )
376 toTW
= dictToSortedList( customRules( 'toTW.manual' ), 1 )
381 * Simplified / Traditional Chinese conversion tables
383 * Automatically generated using code and data in maintenance/language/zhtable/
384 * Do not modify directly!
389 $zh2Hant = array(\n'''
390 php
+= PHPArray( toHant
) \
391 + '\n);\n\n$zh2Hans = array(\n' \
392 + PHPArray( toHans
) \
393 + '\n);\n\n$zh2TW = array(\n' \
395 + '\n);\n\n$zh2HK = array(\n' \
397 + '\n);\n\n$zh2CN = array(\n' \
401 if pyversion
[:1] in ['2']:
402 f
= open( os
.path
.join( '..', '..', '..', 'includes', 'ZhConversion.php' ), 'wb', encoding
= 'utf8' )
404 f
= open( os
.path
.join( '..', '..', '..', 'includes', 'ZhConversion.php' ), 'w', buffering
= 4096, encoding
= 'utf8' )
405 print ('Writing ZhConversion.php ... ')
409 # Remove temporary files
410 print ('Deleting temporary files ... ')
411 os
.remove('EZ-Big.txt.in')
412 os
.remove('phrase_lib.txt')
414 os
.remove('Unihan_Variants.txt')
415 os
.remove('Wubi.txt.in')
416 os
.remove('Ziranma.txt.in')
419 if __name__
== '__main__':