Fix "you are blocked" message for users who were blocked by zero-ID user.
[lhc/web/wiklou.git] / includes / zhtable / Makefile.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @author Philip
import tarfile, zipfile
import os, re, shutil, sys, platform

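# Compatibility shim: on Python 2.5-2.7 use urllib/codecs and, on narrow
# builds, assemble astral characters from surrogate pairs; on Python 3 the
# built-in open() and chr() already cover both cases.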
pyversion = platform.python_version()
if pyversion[:3] in ['2.5', '2.6', '2.7']:
    import urllib as urllib_request
    import codecs
    uniopen = codecs.open
    def unichr2(i):
        if sys.maxunicode >= 0x10000 or i < 0x10000:
            return unichr(i)
        else:
            return unichr(0xD7C0+(i>>10)) + unichr(0xDC00+(i&0x3FF))
elif pyversion[:2] == '3.':
    import urllib.request as urllib_request
    uniopen = open
    unichr2 = chr

# DEFINE
SF_MIRROR = 'easynews'
SCIM_TABLES_VER = '0.5.9'
SCIM_PINYIN_VER = '0.5.91'
LIBTABE_VER = '0.2.3'
# END OF DEFINE

def GetFileFromURL( url, dest ):
    if os.path.isfile(dest):
        print( 'File %s up to date.' % dest )
        return
    print( 'Downloading from [%s] ...' % url )
    urllib_request.urlretrieve( url, dest )
    print( 'Download complete.\n' )
    return

def GetFileFromUnihan( path ):
    print( 'Extracting files from %s ...' % path )
    text = zipfile.ZipFile(path).read('Unihan_Variants.txt')
    # ZipFile.read() returns bytes, so write in binary mode; this works on
    # both Python 2 and Python 3.
    uhfile = open('Unihan_Variants.txt', 'wb')
    uhfile.write(text)
    uhfile.close()
    return

def GetFileFromTar( path, member, rename ):
    print( 'Extracting %s from %s ...' % (rename, path) )
    tarfile.open(path, 'r:gz').extract(member)
    shutil.move(member, rename)
    tree_rmv = member.split('/')[0]
    shutil.rmtree(tree_rmv)
    return

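# Re-encode a Big5-HKSCS source file to UTF-8 in place.  Undecodable bytes
# become U+FFFD and are replaced with line breaks, so damaged entries are
# split apart instead of corrupting neighbouring ones.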
def ReadBIG5File( dest ):
    print( 'Reading and decoding %s ...' % dest )
    f1 = uniopen( dest, 'r', encoding='big5hkscs', errors='replace' )
    text = f1.read()
    text = text.replace( '\ufffd', '\n' )
    f1.close()
    f2 = uniopen( dest, 'w', encoding='utf8' )
    f2.write(text)
    f2.close()
    return text

def ReadFile( dest ):
    print( 'Reading and decoding %s ...' % dest )
    f = uniopen( dest, 'r', encoding='utf8' )
    ret = f.read()
    f.close()
    return ret

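# Unihan_Variants.txt lists variant relations one per line, roughly:
#   U+346F<tab>kSimplifiedVariant<tab>U+3454
# kSimplifiedVariant lines feed the trad->simp table, kTraditionalVariant
# lines feed the simp->trad table; '#' lines are comments.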
def ReadUnihanFile( dest ):
    print( 'Reading and decoding %s ...' % dest )
    f = uniopen( dest, 'r', encoding='utf8' )
    t2s_code = []
    s2t_code = []
    while True:
        line = f.readline()
        if line:
            if line.startswith('#'):
                continue
            elif not line.find('kSimplifiedVariant') == -1:
                temp = line.split('kSimplifiedVariant')
                t2s_code.append( ( temp[0].strip(), temp[1].strip() ) )
            elif not line.find('kTraditionalVariant') == -1:
                temp = line.split('kTraditionalVariant')
                s2t_code.append( ( temp[0].strip(), temp[1].strip() ) )
        else:
            break
    f.close()
    return ( t2s_code, s2t_code )

def RemoveRows( text, num ):
    text = re.sub( r'.*\s*', '', text, num )
    return text

def RemoveOneCharConv( text ):
    preg = re.compile( r'^.\s*$', re.MULTILINE )
    text = preg.sub( '', text )
    return text

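# A Unihan variant value may carry a trailing source annotation introduced
# by '<'; ConvertToChar() discards it and turns the remaining 'U+XXXX' code
# into the actual character.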
def ConvertToChar( code ):
    code = code.split('<')[0]
    return unichr2( int( code[2:], 16 ) )

def GetDefaultTable( code_table ):
    char_table = {}
    for ( f, t ) in code_table:
        if f and t:
            from_char = ConvertToChar( f )
            to_chars = [ConvertToChar( code ) for code in t.split()]
            char_table[from_char] = to_chars
    return char_table

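# The trad2simp.manual / simp2trad.manual overrides are whitespace-separated
# entries of the form U+XXXXX|U+YYYYY|U+ZZZZZ|... (five hex digits per code
# point, as parsed below): the first code point is the source character, the
# remaining ones are its conversion candidates.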
def GetManualTable( dest ):
    text = ReadFile( dest )
    temp1 = text.split()
    char_table = {}
    for elem in temp1:
        elem = elem.strip('|')
        if elem:
            temp2 = elem.split( '|', 1 )
            from_char = unichr2( int( temp2[0][2:7], 16 ) )
            to_chars = [unichr2( int( code[2:7], 16 ) ) for code in temp2[1].split('|')]
            char_table[from_char] = to_chars
    return char_table

def GetValidTable( src_table ):
    valid_table = {}
    for f, t in src_table.items():
        valid_table[f] = t[0]
    return valid_table

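# Collect every non-primary target of a one-to-many mapping.  These
# characters mark places where plain character-level conversion is
# ambiguous; GetDefaultWordsTable() uses them to decide whether a whole
# word needs its own rule.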
def GetToManyRules( src_table ):
    tomany_table = {}
    for f, t in src_table.items():
        for i in range(1, len(t)):
            tomany_table[t[i]] = True
    return tomany_table

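# Drop rules listed in a *_noconvert.manual file from a conversion table.
# As the parsing below implies, an entry is either a bare character (remove
# the rule for that source character) or an 'a => b' pair (remove by source
# and/or by target, depending on which side is given).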
def RemoveRules( dest, table ):
    text = ReadFile( dest )
    temp1 = text.split()
    for elem in temp1:
        f = ''
        t = ''
        elem = elem.strip().replace( '"', '' ).replace( '\'', '' )
        if '=>' in elem:
            if elem.startswith( '=>' ):
                t = elem.replace( '=>', '' ).strip()
            elif elem.endswith( '=>' ):
                f = elem.replace( '=>', '' ).strip()
            else:
                temp2 = elem.split( '=>' )
                f = temp2[0].strip()
                t = temp2[1].strip()
                # pop() with a default never raises, so no try/except is needed
                table.pop( f, t )
                continue
        else:
            f = t = elem
        if f:
            table.pop( f, None )
        if t:
            for temp_f, temp_t in table.copy().items():
                if temp_t == t:
                    table.pop( temp_f )
    return table

def DictToSortedList1( src_table ):
    return sorted( src_table.items(), key = lambda m: m[0] ) #sorted( temp_table, key = lambda m: len( m[0] ) )

def DictToSortedList2( src_table ):
    return sorted( src_table.items(), key = lambda m: m[1] )

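# Greedy longest-match converter: at each position, try the longest
# substring of the remaining text that has an entry in conv_table,
# substitute it, and resume scanning right after the replacement.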
def Converter( string, conv_table ):
    i = 0
    while i < len(string):
        for j in range(len(string) - i, 0, -1):
            f = string[i:][:j]
            t = conv_table.get( f )
            if t:
                string = string[:i] + t + string[i:][j:]
                i += len(t) - 1
                break
        i += 1
    return string

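# Derive word-level rules from a raw word list.  Words are handled in
# batches of equal length, shortest first; between batches the rules found
# so far are merged into the working conversion tables.  Roughly, a word
# only receives its own rule when converting it back does not reproduce the
# original, or when a character flagged in src_tomany (a non-primary
# one-to-many target) makes the round trip unsafe.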
def GetDefaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ):
    wordlist = list( set( src_wordlist ) )
    wordlist.sort( key = len, reverse = True )
    word_conv_table = {}
    word_reconv_table = {}
    while wordlist:
        conv_table = {}
        reconv_table = {}
        conv_table.update( word_conv_table )
        conv_table.update( char_conv_table )
        reconv_table.update( word_reconv_table )
        reconv_table.update( char_reconv_table )
        word = wordlist.pop()
        new_word_len = word_len = len(word)
        while new_word_len == word_len:
            rvt_test = False
            for char in word:
                rvt_test = rvt_test or src_tomany.get(char)
            test_word = Converter( word, reconv_table )
            new_word = Converter( word, conv_table )
            if not reconv_table.get( new_word ):
                if not test_word == word:
                    word_conv_table[word] = new_word
                    word_reconv_table[new_word] = word
            elif rvt_test:
                rvt_word = Converter( new_word, reconv_table )
                if not rvt_word == word:
                    word_conv_table[word] = new_word
                    word_reconv_table[new_word] = word
            try:
                word = wordlist.pop()
            except IndexError:
                break
            new_word_len = len(word)
    return word_reconv_table

def GetManualWordsTable( src_wordlist, conv_table ):
    src_wordlist = [items.split('#')[0].strip() for items in src_wordlist]
    wordlist = list( set( src_wordlist ) )
    wordlist.sort( key = len, reverse = True )
    reconv_table = {}
    while wordlist:
        word = wordlist.pop()
        new_word = Converter( word, conv_table )
        reconv_table[new_word] = word
    return reconv_table

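# A custom-rules file (toCN.manual, toTW.manual, the *_supp_set.manual
# files, ...) is a flat whitespace-separated sequence of "source target"
# pairs; CustomRules() folds it into a dict.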
def CustomRules( dest ):
    text = ReadFile( dest )
    temp = text.split()
    ret = dict()
    for i in range( 0, len( temp ), 2 ):
        ret[temp[i]] = temp[i + 1]
    return ret

def GetPHPArray( table ):
    lines = ['\'%s\' => \'%s\',' % (f, t) for (f, t) in table]
    #lines = ['"%s"=>"%s",' % (f, t) for (f, t) in table]
    return '\n'.join(lines)

def RemoveSameChar( src_table ):
    dst_table = {}
    for f, t in src_table.items():
        if not f == t:
            dst_table[f] = t
    return dst_table

def main():
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/UNIDATA/Unihan.zip'
    han_dest = 'Unihan.zip'
    GetFileFromURL( url, han_dest )

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER )
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    GetFileFromURL( url, tbe_dest )

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER )
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    GetFileFromURL( url, pyn_dest )

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER )
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    GetFileFromURL( url, lbt_dest )

    # Extract the needed files from the compressed archives

    # Unihan_Variants.txt (Simp. & Trad. variant data)
    GetFileFromUnihan( han_dest )

    # Make word lists
    t_wordlist = []
    s_wordlist = []

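    # Each block below pulls one table out of its archive, keeps only the
    # phrase column (for the SCIM input-method tables that means the text
    # between BEGIN_TABLE and END_TABLE), drops single-character entries,
    # and appends the remaining words to the Traditional or Simplified list.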
    # EZ.txt.in Trad
    src = 'scim-tables-%s/tables/zh/EZ-Big.txt.in' % SCIM_TABLES_VER
    dst = 'EZ.txt.in'
    GetFileFromTar( tbe_dest, src, dst )
    text = ReadFile( dst )
    text = text.split( 'BEGIN_TABLE' )[1].strip()
    text = text.split( 'END_TABLE' )[0].strip()
    text = re.sub( r'.*\t', '', text )
    text = RemoveOneCharConv( text )
    t_wordlist.extend( text.split() )

    # Wubi.txt.in Simp
    src = 'scim-tables-%s/tables/zh/Wubi.txt.in' % SCIM_TABLES_VER
    dst = 'Wubi.txt.in'
    GetFileFromTar( tbe_dest, src, dst )
    text = ReadFile( dst )
    text = text.split( 'BEGIN_TABLE' )[1].strip()
    text = text.split( 'END_TABLE' )[0].strip()
    text = re.sub( r'.*\t(.*?)\t\d*', r'\g<1>', text )
    text = RemoveOneCharConv( text )
    s_wordlist.extend( text.split() )

    # Ziranma.txt.in Simp
    src = 'scim-tables-%s/tables/zh/Ziranma.txt.in' % SCIM_TABLES_VER
    dst = 'Ziranma.txt.in'
    GetFileFromTar( tbe_dest, src, dst )
    text = ReadFile( dst )
    text = text.split( 'BEGIN_TABLE' )[1].strip()
    text = text.split( 'END_TABLE' )[0].strip()
    text = re.sub( r'.*\t(.*?)\t\d*', r'\g<1>', text )
    text = RemoveOneCharConv( text )
    s_wordlist.extend( text.split() )

    # phrase_lib.txt Simp
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    dst = 'phrase_lib.txt'
    GetFileFromTar( pyn_dest, src, dst )
    text = ReadFile( 'phrase_lib.txt' )
    text = re.sub( r'(.*)\t\d\d*.*', r'\g<1>', text )
    text = RemoveRows( text, 5 )
    text = RemoveOneCharConv( text )
    s_wordlist.extend( text.split() )

    # tsi.src Trad
    src = 'libtabe/tsi-src/tsi.src'
    dst = 'tsi.src'
    GetFileFromTar( lbt_dest, src, dst )
    text = ReadBIG5File( 'tsi.src' )
    text = re.sub( r' \d.*', '', text.replace('# ', '') )
    text = RemoveOneCharConv( text )
    t_wordlist.extend( text.split() )

    # remove duplicate elements
    t_wordlist = list( set( t_wordlist ) )
    s_wordlist = list( set( s_wordlist ) )

    # simpphrases_exclude.manual Simp
    text = ReadFile( 'simpphrases_exclude.manual' )
    temp = text.split()
    s_string = '\n'.join( s_wordlist )
    for elem in temp:
        s_string = re.sub( '.*%s.*\n' % elem, '', s_string )
    s_wordlist = s_string.split('\n')

    # tradphrases_exclude.manual Trad
    text = ReadFile( 'tradphrases_exclude.manual' )
    temp = text.split()
    t_string = '\n'.join( t_wordlist )
    for elem in temp:
        t_string = re.sub( '.*%s.*\n' % elem, '', t_string )
    t_wordlist = t_string.split('\n')

    # Make the char-to-char conversion tables
    # Unihan_Variants.txt -> lists of ( 'U+XXXX', 'U+YYYY U+ZZZZ ...' ) pairs
    ( t2s_code, s2t_code ) = ReadUnihanFile( 'Unihan_Variants.txt' )
    # dict t2s_1tomany = { '\uXXXX': ['\uYYYY', '\uZZZZ', ...], ... }
    t2s_1tomany = {}
    t2s_1tomany.update( GetDefaultTable( t2s_code ) )
    t2s_1tomany.update( GetManualTable( 'trad2simp.manual' ) )
    # dict s2t_1tomany
    s2t_1tomany = {}
    s2t_1tomany.update( GetDefaultTable( s2t_code ) )
    s2t_1tomany.update( GetManualTable( 'simp2trad.manual' ) )
    # dict t2s_1to1 = { '\uXXXX': '\uYYYY', ... }; s_tomany marks ambiguous simplified chars
    t2s_1to1 = GetValidTable( t2s_1tomany )
    s_tomany = GetToManyRules( t2s_1tomany )
    # dict s2t_1to1; t_tomany marks ambiguous traditional chars
    s2t_1to1 = GetValidTable( s2t_1tomany )
    t_tomany = GetToManyRules( s2t_1tomany )
    # remove noconvert rules
    t2s_1to1 = RemoveRules( 'trad2simp_noconvert.manual', t2s_1to1 )
    s2t_1to1 = RemoveRules( 'simp2trad_noconvert.manual', s2t_1to1 )

    # Make the word-to-word conversion tables
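    # These are built on top of the char tables: the 1-to-1 char rules are
    # first extended with the *_supp_set.manual overrides, the manual phrase
    # lists are converted with those extended tables, and the automatically
    # harvested word lists are then filtered through GetDefaultWordsTable()
    # so that only words whose conversion is ambiguous get their own rule.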
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    # trad2simp_supp_set.manual
    t2s_1to1_supp.update( CustomRules( 'trad2simp_supp_set.manual' ) )
    # simp2trad_supp_set.manual
    s2t_1to1_supp.update( CustomRules( 'simp2trad_supp_set.manual' ) )
    # simpphrases.manual
    text = ReadFile( 'simpphrases.manual' )
    s_wordlist_manual = text.split('\n')
    t2s_word2word_manual = GetManualWordsTable( s_wordlist_manual, s2t_1to1_supp )
    t2s_word2word_manual.update( CustomRules( 'toSimp.manual' ) )
    # tradphrases.manual
    text = ReadFile( 'tradphrases.manual' )
    t_wordlist_manual = text.split('\n')
    s2t_word2word_manual = GetManualWordsTable( t_wordlist_manual, t2s_1to1_supp )
    s2t_word2word_manual.update( CustomRules( 'toTrad.manual' ) )
    # t2s_word2word
    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update( s2t_word2word_manual )
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update( t2s_word2word_manual )
    t2s_word2word = GetDefaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp )
    ## toSimp.manual
    t2s_word2word.update( t2s_word2word_manual )
    # s2t_word2word
    s2t_word2word = GetDefaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp )
    ## toTrad.manual
    s2t_word2word.update( s2t_word2word_manual )

    # Final tables
    # sorted list toHans
    t2s_1to1 = RemoveSameChar( t2s_1to1 )
    s2t_1to1 = RemoveSameChar( s2t_1to1 )
    toHans = DictToSortedList1( t2s_1to1 ) + DictToSortedList2( t2s_word2word )
    # sorted list toHant
    toHant = DictToSortedList1( s2t_1to1 ) + DictToSortedList2( s2t_word2word )
    # sorted list toCN
    toCN = DictToSortedList2( CustomRules( 'toCN.manual' ) )
    # sorted list toHK
    toHK = DictToSortedList2( CustomRules( 'toHK.manual' ) )
    # sorted list toSG
    toSG = DictToSortedList2( CustomRules( 'toSG.manual' ) )
    # sorted list toTW
    toTW = DictToSortedList2( CustomRules( 'toTW.manual' ) )

    # Get PHP Array
    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in includes/zhtable/
 * Do not modify directly!
 */

$zh2Hant = array(\n'''
    php += GetPHPArray( toHant )
    php += '\n);\n\n$zh2Hans = array(\n'
    php += GetPHPArray( toHans )
    php += '\n);\n\n$zh2TW = array(\n'
    php += GetPHPArray( toTW )
    php += '\n);\n\n$zh2HK = array(\n'
    php += GetPHPArray( toHK )
    php += '\n);\n\n$zh2CN = array(\n'
    php += GetPHPArray( toCN )
    php += '\n);'

    f = uniopen( 'ZhConversion.php', 'w', encoding = 'utf8' )
    print( 'Writing ZhConversion.php ... ' )
    f.write( php )
    f.close()

    # Remove temp files
    print( 'Deleting temp files ... ' )
    os.remove('EZ.txt.in')
    os.remove('phrase_lib.txt')
    os.remove('tsi.src')
    os.remove('Unihan_Variants.txt')
    os.remove('Wubi.txt.in')
    os.remove('Ziranma.txt.in')


if __name__ == '__main__':
    main()