#!/usr/bin/python
# -*- coding: utf-8 -*-
# @author Philip
import tarfile as tf
import zipfile as zf
import os, re, shutil, sys, platform

pyversion = platform.python_version()
islinux = platform.system().lower() == 'linux'

if pyversion[:3] in ['2.6', '2.7']:
    import urllib as urllib_request
    import codecs
    open = codecs.open
    _unichr = unichr
    if sys.maxunicode < 0x10000:
        # Narrow build: emulate a wide unichr() by returning the UTF-16
        # surrogate pair for code points beyond the BMP.
        def unichr( i ):
            if i < 0x10000:
                return _unichr( i )
            else:
                return _unichr( 0xD7C0 + ( i >> 10 ) ) + _unichr( 0xDC00 + ( i & 0x3FF ) )
elif pyversion[:2] == '3.':
    import urllib.request as urllib_request
    import codecs
    # The positional open( name, mode, encoding, errors ) calls below match
    # codecs.open(), not the Python 3 builtin, so alias it here as well.
    open = codecs.open
    unichr = chr

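# Quick sanity check for the shim above (hypothetical REPL session, narrow
# build only): unichr( 0x20000 ) == u'\ud840\udc00', the UTF-16 surrogate
# pair that represents U+20000 in a 16-bit unicode string.
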
def unichr2( *args ):
    # Turn Unihan-style tokens such as 'U+4E07' (optionally followed by a
    # '<source' tag) into unicode characters.
    return [unichr( int( i.split('<')[0][2:], 16 ) ) for i in args]

def unichr3( *args ):
    # Same, but read at most five hex digits and skip empty tokens.
    return [unichr( int( i[2:7], 16 ) ) for i in args if i[2:7]]

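# Illustrative values (tokens follow the Unihan 'U+XXXX' convention; the
# exact strings here are hypothetical):
#   unichr2( 'U+4E07<kTraditionalVariant' )  ->  [u'万']
#   unichr3( 'U+20000' )                     ->  [u'\U00020000']
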
# DEFINE
SF_MIRROR = 'easynews'
SCIM_TABLES_VER = '0.5.9'
SCIM_PINYIN_VER = '0.5.91'
LIBTABE_VER = '0.2.3'
# END OF DEFINE

def download( url, dest ):
    if os.path.isfile( dest ):
        print( 'File %s up to date.' % dest )
        return
    global islinux
    if islinux:
        # Use wget instead of urlretrieve under Linux, because wget
        # can display details such as download progress.
        os.system( 'wget %s' % url )
    else:
        print( 'Downloading from [%s] ...' % url )
        urllib_request.urlretrieve( url, dest )
        print( 'Download complete.\n' )
    return

def uncompress( fp, member, encoding = 'U8' ):
    name = member.rsplit( '/', 1 )[-1]
    print( 'Extracting %s ...' % name )
    fp.extract( member )
    shutil.move( member, name )
    if '/' in member:
        shutil.rmtree( member.split( '/', 1 )[0] )
    return open( name, 'rb', encoding, 'ignore' )

unzip = lambda path, member, encoding = 'U8': \
    uncompress( zf.ZipFile( path ), member, encoding )

untargz = lambda path, member, encoding = 'U8': \
    uncompress( tf.open( path, 'r:gz' ), member, encoding )

def parserCore( fp, pos, beginmark = None, endmark = None ):
    if beginmark and endmark:
        start = False
    else:
        start = True
    mlist = set()
    for line in fp:
        if beginmark and line.startswith( beginmark ):
            start = True
            continue
        elif endmark and line.startswith( endmark ):
            break
        if start and not line.startswith( '#' ):
            elems = line.split()
            if len( elems ) < 2:
                continue
            elif len( elems[0] ) > 1:
                mlist.add( elems[pos] )
    return mlist

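# parserCore() collects one whitespace-separated column from a table file.
# For a scim-tables fragment like the (assumed) format below,
#   BEGIN_TABLE
#   wanshi  万事  536870912
#   END_TABLE
# parserCore( fp, 1, 'BEGIN_TABLE', 'END_TABLE' ) yields set([u'万事']);
# rows whose first column is a single character are skipped.
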
def tablesParser( path, name ):
    """ Read a file from scim-tables and parse it. """
    global SCIM_TABLES_VER
    src = 'scim-tables-%s/tables/zh/%s' % ( SCIM_TABLES_VER, name )
    fp = untargz( path, src, 'U8' )
    return parserCore( fp, 1, 'BEGIN_TABLE', 'END_TABLE' )

ezbigParser = lambda path: tablesParser( path, 'EZ-Big.txt.in' )
wubiParser = lambda path: tablesParser( path, 'Wubi.txt.in' )
zrmParser = lambda path: tablesParser( path, 'Ziranma.txt.in' )

def phraseParser( path ):
    """ Read phrase_lib.txt and parse it. """
    global SCIM_PINYIN_VER
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    fp = untargz( path, src, 'U8' )
    return parserCore( fp, 0 )

def tsiParser( path ):
    """ Read tsi.src and parse it. """
    src = 'libtabe/tsi-src/tsi.src'
    fp = untargz( path, src, 'big5hkscs' )
    return parserCore( fp, 0 )

def unihanParser( path ):
    """ Read Unihan_Variants.txt and parse it. """
    fp = unzip( path, 'Unihan_Variants.txt', 'U8' )
    t2s = dict()
    s2t = dict()
    for line in fp:
        if line.startswith( '#' ):
            continue
        elems = line.split()
        if len( elems ) < 3:
            continue
        vtype = elems.pop( 1 )    # renamed from 'type' to avoid shadowing the builtin
        elems = unichr2( *elems )
        if vtype == 'kTraditionalVariant':
            s2t[elems[0]] = elems[1:]
        elif vtype == 'kSimplifiedVariant':
            t2s[elems[0]] = elems[1:]
    fp.close()
    return ( t2s, s2t )

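# Unihan_Variants.txt entries are tab-separated (value lists may also carry
# '<source' tags, which unichr2() strips), e.g.:
#   U+4E07	kTraditionalVariant	U+842C
# The loop above turns that line into s2t[u'万'] = [u'萬'].
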
def applyExcludes( mlist, path ):
    """ Apply exclude rules from path to mlist. """
    excludes = open( path, 'rb', 'U8' ).read().split()
    excludes = [word.split( '#' )[0].strip() for word in excludes]
    # skip empty tokens so a stray comment marker cannot yield an empty
    # alternative that matches every phrase
    excludes = '|'.join( [e for e in excludes if e] )
    excptn = re.compile( '.*(?:%s).*' % excludes )
    diff = [mword for mword in mlist if excptn.search( mword )]
    mlist.difference_update( diff )
    return mlist

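# E.g. an exclude file containing the (hypothetical) pattern u'甚么' removes
# from mlist every phrase that contains u'甚么' as a substring.
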
def charManualTable( path ):
    fp = open( path, 'rb', 'U8' )
    ret = {}
    for line in fp:
        elems = line.split( '#' )[0].split( '|' )
        elems = unichr3( *elems )
        if len( elems ) > 1:
            ret[elems[0]] = elems[1:]
    return ret

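# Manual character tables are assumed to hold '|'-separated code points with
# optional '#' comments, first the source, then its variants in preference
# order, e.g. a hypothetical line:
#   U+4E7E|U+5E72          # maps u'乾' -> [u'干']
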
def toManyRules( src_table ):
    tomany = set()
    # items() rather than iteritems() so the script also runs on Python 3
    for ( f, t ) in src_table.items():
        for i in range( 1, len( t ) ):
            tomany.add( t[i] )
    return tomany

def removeRules( path, table ):
    fp = open( path, 'rb', 'U8' )
    texc = list()
    for line in fp:
        elems = line.split( '=>' )
        f = t = elems[0].strip()
        if len( elems ) == 2:
            t = elems[1].strip()
        f = f.strip('"').strip("'")
        t = t.strip('"').strip("'")
        if f:
            try:
                table.pop( f )
            except KeyError:
                pass
        if t:
            texc.append( t )
    texcptn = re.compile( '^(?:%s)$' % '|'.join( texc ) )
    for ( tmp_f, tmp_t ) in table.copy().items():
        if texcptn.match( tmp_t ):
            table.pop( tmp_f )
    return table

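# Rule files for removeRules() are assumed to hold one rule per line:
#   "后"           -- drop the mapping for u'后' and any rule targeting u'后'
#   "后" => "後"   -- drop the mapping for u'后' and any rule targeting u'後'
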
def customRules( path ):
    fp = open( path, 'rb', 'U8' )
    ret = dict()
    for line in fp:
        elems = line.split( '#' )[0].split()
        if len( elems ) > 1:
            ret[elems[0]] = elems[1]
    return ret

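# Custom rule files are assumed to be two whitespace-separated columns with
# optional '#' comments, e.g. a hypothetical line:
#   裏面 里面    # adds ret[u'裏面'] = u'里面'
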
def dictToSortedList( src_table, pos ):
    # Sort the (from, to) pairs by source (pos 0) or target (pos 1) string.
    return sorted( src_table.items(), key = lambda m: m[pos] )

def translate( text, conv_table ):
    i = 0
    while i < len( text ):
        # Greedy longest match: try the longest substring starting at i first.
        for j in range( len( text ) - i, 0, -1 ):
            f = text[i:][:j]
            t = conv_table.get( f )
            if t:
                text = text[:i] + t + text[i:][j:]
                i += len( t ) - 1
                break
        i += 1
    return text

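# Worked example with a hypothetical table: given
#   conv_table = { u'里面': u'裏面', u'里': u'裡' }
# translate( u'里面村', conv_table ) returns u'裏面村' -- at each position the
# longest matching key wins, so u'里面' is converted before u'里' is tried.
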
def manualWordsTable( path, conv_table, reconv_table ):
    fp = open( path, 'rb', 'U8' )
    # Note: the incoming reconv_table is immediately replaced by a fresh
    # dict, so only the manual word list feeds the result.
    reconv_table = {}
    wordlist = [line.split( '#' )[0].strip() for line in fp]
    wordlist = list( set( wordlist ) )
    wordlist.sort( key = len, reverse = True )
    while wordlist:
        word = wordlist.pop()
        new_word = translate( word, conv_table )
        rcv_word = translate( word, reconv_table )
        if word != rcv_word:
            reconv_table[word] = word
        reconv_table[new_word] = word
    return reconv_table

def defaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ):
    wordlist = list( src_wordlist )
    wordlist.sort( key = len, reverse = True )
    word_conv_table = {}
    word_reconv_table = {}
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    tomanyptn = re.compile( '(?:%s)' % '|'.join( src_tomany ) )
    while wordlist:
        conv_table.update( word_conv_table )
        reconv_table.update( word_reconv_table )
        word = wordlist.pop()
        new_word_len = word_len = len( word )
        # Process the run of words sharing this length with the tables
        # frozen, so same-length words cannot influence each other.
        while new_word_len == word_len:
            test_word = translate( word, reconv_table )
            new_word = translate( word, conv_table )
            if not reconv_table.get( new_word ) \
                    and ( test_word != word \
                    or ( tomanyptn.search( word ) \
                    and word != translate( new_word, reconv_table ) ) ):
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
            try:
                word = wordlist.pop()
            except IndexError:
                break
            new_word_len = len( word )
    return word_reconv_table

def PHPArray( table ):
    lines = ['\'%s\' => \'%s\',' % ( f, t ) for ( f, t ) in table if f and t]
    return '\n'.join( lines )

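# PHPArray() renders (from, to) pairs as the body of a PHP array literal,
# one "'from' => 'to'," line each, e.g.:
#   '里' => '裡',
# main() below wraps these blocks in "$zh2Hant = array( ... );" and friends.
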
def main():
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/UNIDATA/Unihan.zip'
    han_dest = 'Unihan.zip'
    download( url, han_dest )

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER )
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    download( url, tbe_dest )

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER )
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    download( url, pyn_dest )

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER )
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    download( url, lbt_dest )

    # Unihan.txt
    ( t2s_1tomany, s2t_1tomany ) = unihanParser( han_dest )

    t2s_1tomany.update( charManualTable( 'trad2simp.manual' ) )
    s2t_1tomany.update( charManualTable( 'simp2trad.manual' ) )

    t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.items()] )
    s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.items()] )

    s_tomany = toManyRules( t2s_1tomany )
    t_tomany = toManyRules( s2t_1tomany )

    # noconvert rules
    t2s_1to1 = removeRules( 'trad2simp_noconvert.manual', t2s_1to1 )
    s2t_1to1 = removeRules( 'simp2trad_noconvert.manual', s2t_1to1 )

    # the superset for word-to-word conversion
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    t2s_1to1_supp.update( customRules( 'trad2simp_supp_set.manual' ) )
    s2t_1to1_supp.update( customRules( 'simp2trad_supp_set.manual' ) )

    # word-to-word manual rules
    t2s_word2word_manual = manualWordsTable( 'simpphrases.manual', s2t_1to1_supp, t2s_1to1_supp )
    t2s_word2word_manual.update( customRules( 'toSimp.manual' ) )
    s2t_word2word_manual = manualWordsTable( 'tradphrases.manual', t2s_1to1_supp, s2t_1to1_supp )
    s2t_word2word_manual.update( customRules( 'toTrad.manual' ) )

    # word-to-word rules from input methods
    t_wordlist = set()
    s_wordlist = set()
    t_wordlist.update( ezbigParser( tbe_dest ),
                       tsiParser( lbt_dest ) )
    s_wordlist.update( wubiParser( tbe_dest ),
                       zrmParser( tbe_dest ),
                       phraseParser( pyn_dest ) )

    # exclude
    s_wordlist = applyExcludes( s_wordlist, 'simpphrases_exclude.manual' )
    t_wordlist = applyExcludes( t_wordlist, 'tradphrases_exclude.manual' )

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update( s2t_word2word_manual )
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update( t2s_word2word_manual )

    # parse list to dict
    t2s_word2word = defaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp )
    t2s_word2word.update( t2s_word2word_manual )
    s2t_word2word = defaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp )
    s2t_word2word.update( s2t_word2word_manual )

    # Final tables
    # sorted list toHans
    t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.items() if f != t] )
    toHans = dictToSortedList( t2s_1to1, 0 ) + dictToSortedList( t2s_word2word, 1 )
    # sorted list toHant
    s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.items() if f != t] )
    toHant = dictToSortedList( s2t_1to1, 0 ) + dictToSortedList( s2t_word2word, 1 )
    # sorted list toCN
    toCN = dictToSortedList( customRules( 'toCN.manual' ), 1 )
    # sorted list toHK
    toHK = dictToSortedList( customRules( 'toHK.manual' ), 1 )
    # sorted list toSG
    toSG = dictToSortedList( customRules( 'toSG.manual' ), 1 )
    # sorted list toTW
    toTW = dictToSortedList( customRules( 'toTW.manual' ), 1 )

    # Get PHP Array
    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in includes/zhtable/
 * Do not modify directly!
 *
 * @file
 */

$zh2Hant = array(\n'''
    php += PHPArray( toHant ) \
        + '\n);\n\n$zh2Hans = array(\n' \
        + PHPArray( toHans ) \
        + '\n);\n\n$zh2TW = array(\n' \
        + PHPArray( toTW ) \
        + '\n);\n\n$zh2HK = array(\n' \
        + PHPArray( toHK ) \
        + '\n);\n\n$zh2CN = array(\n' \
        + PHPArray( toCN ) \
        + '\n);\n\n$zh2SG = array(\n' \
        + PHPArray( toSG ) \
        + '\n);'

    f = open( 'ZhConversion.php', 'wb', encoding = 'utf8' )
    print( 'Writing ZhConversion.php ... ' )
    f.write( php )
    f.close()

    # Remove temp files
    print( 'Deleting temp files ... ' )
    os.remove( 'EZ-Big.txt.in' )
    os.remove( 'phrase_lib.txt' )
    os.remove( 'tsi.src' )
    os.remove( 'Unihan_Variants.txt' )
    os.remove( 'Wubi.txt.in' )
    os.remove( 'Ziranma.txt.in' )


if __name__ == '__main__':
    main()