Adding support for zh-sg (Singapore) and zh-hk (Hong Kong)
[lhc/web/wiklou.git] / includes / zhtable / Makefile
1 #
2 # Creates the file ZhConversion.php used for Simplified/Traditional
3 # Chinese conversion. It gets the basic conversion table from the Unihan
4 # database, and constructs the phrase tables using phrase libraries in
5 # the SCIM packages. There are also special tables used for adjustment.
6 # Some data in the file simp2trad.manual was taken from the following
7 # paper:
8 # Requirement: you need to set your locale to zh_CN.UTF-8 (or, presumably,
9 # any other UTF-8 locale).
10 #
11 #
# Default goal: the generated conversion table plus the two ".notsure" phrase
# lists. Declared phony so a stray file named "all" cannot mask it.
.PHONY: all
all: ZhConversion.php tradphrases.notsure simpphrases.notsure
13
# Download the Unihan archive and unpack it in the current directory.
# There are no prerequisites, so this only runs while Unihan.txt is missing.
# NOTE(review): assumes Unihan.zip contains a file named Unihan.txt - confirm.
Unihan.txt:
	wget ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip && unzip Unihan.zip
17
# Fetch the scim-tables 0.4.3 tarball, copy zh/EZ.txt.in out of it, then
# remove both the unpacked tree and the tarball (the trailing glob covers both).
EZ.txt.in:
	wget http://freedesktop.org/~suzhe/sources/scim-tables-0.4.3.tar.gz \
	  && tar zxvf scim-tables-0.4.3.tar.gz > /dev/null \
	  && cp scim-tables-0.4.3/zh/EZ.txt.in . \
	  && rm -rf scim-tables-0.4.3*
23
# Fetch the scim-chinese 0.4.2 tarball, copy data/phrase_lib.txt out of it,
# then remove both the unpacked tree and the tarball.
phrase_lib.txt:
	wget http://freedesktop.org/~suzhe/scim-chinese/scim-chinese-0.4.2.tar.gz \
	  && tar zxvf scim-chinese-0.4.2.tar.gz > /dev/null \
	  && cp scim-chinese-0.4.2/data/phrase_lib.txt . \
	  && rm -rf scim-chinese-0.4.2*
29
# Fetch the libtabe 0.2.3 tarball, copy tsi-src/tsi.src out of the unpacked
# "libtabe" directory, then remove everything whose name starts with "libtabe".
tsi.src:
	wget http://unc.dl.sourceforge.net/sourceforge/libtabe/libtabe-0.2.3.tgz \
	  && tar zxvf libtabe-0.2.3.tgz > /dev/null \
	  && cp libtabe/tsi-src/tsi.src . \
	  && rm -rf libtabe*
35
# Compile the helper program that the unihan.*.t rules pipe their data through.
printutf8: printutf8.c
	gcc -o $@ $<
38
# Select the kSimplifiedVariant lines of Unihan.txt, drop any line containing
# '#', strip the field name, and feed what is left through ./printutf8.
unihan.t2s.t: Unihan.txt printutf8
	grep kSimplifiedVariant $< \
	  | sed -e '/#/d' -e 's/kSimplifiedVariant//' \
	  | ./printutf8 > $@
41
# Merge the hand-maintained table with the Unihan-derived one, letting the
# manual entries win: for every word found in the first 10 columns of
# trad2simp.manual, delete the Unihan lines that start with it, then
# concatenate the manual file and the surviving Unihan lines.
#
# Scratch files are named after the target instead of the shared tmp1/tmp2,
# so this rule cannot collide with other rules under `make -j`. The names end
# in ".t", so the clean rule's "*.t" glob still removes them after a failure.
trad2simp.t: trad2simp.manual unihan.t2s.t
	cp unihan.t2s.t $@.cur.t
	for I in `colrm 11 < trad2simp.manual` ; do \
	  sed "/^$$I/d" $@.cur.t > $@.next.t; mv $@.next.t $@.cur.t; \
	done
	cat trad2simp.manual $@.cur.t > $@
	rm -f $@.cur.t
46
# Select the kTraditionalVariant lines of Unihan.txt, drop any line containing
# '#', strip the field name, and feed what is left through ./printutf8.
unihan.s2t.t: Unihan.txt printutf8
	grep kTraditionalVariant $< \
	  | sed -e '/#/d' -e 's/kTraditionalVariant//' \
	  | ./printutf8 > $@
49
# Merge the hand-maintained table with the Unihan-derived one, letting the
# manual entries win: for every word found in the first 10 columns of
# simp2trad.manual, delete the Unihan lines that start with it, then
# concatenate the manual file and the surviving Unihan lines.
#
# Scratch files are named after the target instead of the shared tmp1/tmp2,
# so this rule cannot collide with other rules under `make -j`. The names end
# in ".t", so the clean rule's "*.t" glob still removes them after a failure.
simp2trad.t: unihan.s2t.t simp2trad.manual
	cp unihan.s2t.t $@.cur.t
	for I in `colrm 11 < simp2trad.manual` ; do \
	  sed "/^$$I/d" $@.cur.t > $@.next.t; mv $@.next.t $@.cur.t; \
	done
	cat simp2trad.manual $@.cur.t > $@
	rm -f $@.cur.t
54
# One-to-many entries of the traditional->simplified table: keep only lines of
# at least 19 characters, then rewrite each one by replacing the first
# "U+....." with a quote, the first "|U+....." with "=>", deleting the
# remaining "|U+....." fields, and turning the first remaining '|' into '",'.
t2s_1tomany.t: trad2simp.t
	grep -s ".\{19,\}" $< \
	  | sed 's/U+...../"/' \
	  | sed 's/|U+...../"=>"/' \
	  | sed 's/|U+.....//g' \
	  | sed 's/|/",/' > $@
57
# One-to-one entries of the traditional->simplified table.
#  1. Take the trad2simp.t lines that have fewer than three '|' separators and
#     rewrite them as "key"=>"value", entries.
#  2. For each entry of the reverse one-to-many table (s2t_1tomany.t) whose
#     value is 2, 3 or 4 characters long, append a reversed entry mapping every
#     value character except the first back to the key.
#  3. Sort and de-duplicate the result.
#
# The sorted copy goes to a scratch file named after the target (not the
# shared "t"), so parallel builds cannot clobber it; the ".t" suffix keeps it
# covered by the clean rule's "*.t" glob.
t2s_1to1.t: trad2simp.t s2t_1tomany.t
	sed "/.*|.*|.*|.*/d" trad2simp.t | sed 's/U+[0-9a-z][0-9a-z]*/"/' | sed 's/|U+[0-9a-z][0-9a-z]*/"=>"/' | sed 's/|/",/' > $@
	grep '"."=>"..",' s2t_1tomany.t | sed 's/\("."\)=>".\(.\)",/"\2"=>\1,/' >> $@
	grep '"."=>"...",' s2t_1tomany.t | sed 's/\("."\)=>".\(.\).",/"\2"=>\1,/' >> $@
	grep '"."=>"...",' s2t_1tomany.t | sed 's/\("."\)=>"..\(.\)",/"\2"=>\1,/' >> $@
	grep '"."=>"....",' s2t_1tomany.t | sed 's/\("."\)=>".\(.\)..",/"\2"=>\1,/' >> $@
	grep '"."=>"....",' s2t_1tomany.t | sed 's/\("."\)=>"..\(.\).",/"\2"=>\1,/' >> $@
	grep '"."=>"....",' s2t_1tomany.t | sed 's/\("."\)=>"...\(.\)",/"\2"=>\1,/' >> $@
	sort $@ | uniq > $@.sorted.t
	mv $@.sorted.t $@
68
69
# One-to-many entries of the simplified->traditional table: keep only lines of
# at least 19 characters, then rewrite each one by replacing the first
# "U+....." with a quote, the first "|U+....." with "=>", deleting the
# remaining "|U+....." fields, and turning the first remaining '|' into '",'.
s2t_1tomany.t: simp2trad.t
	grep -s ".\{19,\}" $< \
	  | sed 's/U+...../"/' \
	  | sed 's/|U+...../"=>"/' \
	  | sed 's/|U+.....//g' \
	  | sed 's/|/",/' > $@
72
# One-to-one entries of the simplified->traditional table.
#  1. Take the simp2trad.t lines that have fewer than three '|' separators and
#     rewrite them as "key"=>"value", entries.
#  2. For each entry of the reverse one-to-many table (t2s_1tomany.t) whose
#     value is 2, 3 or 4 characters long, append a reversed entry mapping every
#     value character except the first back to the key.
#  3. Sort and de-duplicate the result.
#
# The sorted copy goes to a scratch file named after the target (not the
# shared "t"), so parallel builds cannot clobber it; the ".t" suffix keeps it
# covered by the clean rule's "*.t" glob.
s2t_1to1.t: simp2trad.t t2s_1tomany.t
	sed "/.*|.*|.*|.*/d" simp2trad.t | sed 's/U+[0-9a-z][0-9a-z]*/"/' | sed 's/|U+[0-9a-z][0-9a-z]*/"=>"/' | sed 's/|/",/' > $@
	grep '"."=>"..",' t2s_1tomany.t | sed 's/\("."\)=>".\(.\)",/"\2"=>\1,/' >> $@
	grep '"."=>"...",' t2s_1tomany.t | sed 's/\("."\)=>".\(.\).",/"\2"=>\1,/' >> $@
	grep '"."=>"...",' t2s_1tomany.t | sed 's/\("."\)=>"..\(.\)",/"\2"=>\1,/' >> $@
	grep '"."=>"....",' t2s_1tomany.t | sed 's/\("."\)=>".\(.\)..",/"\2"=>\1,/' >> $@
	grep '"."=>"....",' t2s_1tomany.t | sed 's/\("."\)=>"..\(.\).",/"\2"=>\1,/' >> $@
	grep '"."=>"....",' t2s_1tomany.t | sed 's/\("."\)=>"...\(.\)",/"\2"=>\1,/' >> $@
	sort $@ | uniq > $@.sorted.t
	mv $@.sorted.t $@
83
# Candidate traditional phrases gathered from two sources:
#  - EZ.txt.in: drop columns 1-8 and all tabs, keep lines that start with 2-4
#    characters followed by a digit, then strip the digits;
#  - tsi.src: convert from Big5 to UTF-8 (-c drops unconvertible characters),
#    cut everything from the first " <digit>" on, remove '#' and spaces, and
#    keep lines that begin with at least two characters.
# The combined list is sorted and de-duplicated.
#
# The intermediate list goes to a scratch file named after the target (the
# original used the shared name "t" and left it behind) and is removed at the
# end; the ".t" suffix keeps it covered by the clean rule's "*.t" glob.
tphrase.t: EZ.txt.in tsi.src
	colrm 1 8 < EZ.txt.in | sed 's/\t//g' | grep "^.\{2,4\}[0-9]" | sed 's/[0-9]//g' > $@.raw.t
	iconv -c -f big5 -t utf8 tsi.src | sed 's/ [0-9].*//g' | sed 's/[# ]//g'| grep "^.\{2,4\}" >> $@.raw.t
	sort $@.raw.t | uniq > $@
	rm -f $@.raw.t
88
# Collect every phrase in tphrase.t that contains one of the characters listed
# in the value part of s2t_1tomany.t (the text after `=>"` minus its first
# character, split into single characters, sorted and de-duplicated).
# `|| true` keeps the rule from failing when the last grep finds no match.
alltradphrases.t: tphrase.t s2t_1tomany.t
	for i in $$(sed 's/.*=>".//' s2t_1tomany.t \
	              | sed 's/"//g' | sed 's/,/\n/' | sed 's/\(.\)/\1\n/g' \
	              | sort | uniq); do \
	  grep -s $$i $<; \
	done > $@ || true
91
92
# The two-character phrases of alltradphrases.t, sorted and de-duplicated.
# ($$ is make's escape for the regex end-of-line anchor '$'.)
tradphrases_2.t: alltradphrases.t
	grep "^..$$" $< | sort | uniq > $@
95
# The three-character phrases of alltradphrases.t, minus every phrase that
# contains one of the two-character phrases: the grep loop collects the
# phrases to discard, and diff keeps the lines present only in the full list.
#
# Fixes over the original:
#  - tradphrases_2.t is read by the recipe, so it is now a declared
#    prerequisite (otherwise `make -j`, or building this target directly,
#    could run before that file exists and silently filter nothing);
#  - scratch files are named after the target instead of the shared "t"/"t3";
#    the ".t" suffix keeps them covered by the clean rule's "*.t" glob.
tradphrases_3.t: alltradphrases.t tradphrases_2.t
	cat alltradphrases.t | grep "^...$$" | sort | uniq > $@
	for i in `cat tradphrases_2.t`; do grep $$i $@ ; done | sort | uniq > $@.hits.t || true
	diff $@.hits.t $@ | grep ">" | sed 's/> //' > $@.kept.t
	mv $@.kept.t $@
	rm -f $@.hits.t
101
102
# The four-character phrases of alltradphrases.t, minus every phrase that
# contains one of the two-character phrases, and then minus every phrase that
# contains one of the (already filtered) three-character phrases. Each pass
# collects the phrases to discard with a grep loop and keeps the lines that
# diff reports as present only in the current list.
#
# Fixes over the original:
#  - tradphrases_2.t and tradphrases_3.t are read by the recipe, so they are
#    now declared prerequisites (needed for `make -j` and for building this
#    target on its own);
#  - scratch files are named after the target instead of the shared "t"/"t3";
#    the ".t" suffix keeps them covered by the clean rule's "*.t" glob.
tradphrases_4.t: alltradphrases.t tradphrases_2.t tradphrases_3.t
	cat alltradphrases.t | grep "^....$$" | sort | uniq > $@
	for i in `cat tradphrases_2.t`; do grep $$i $@ ; done | sort | uniq > $@.hits.t || true
	diff $@.hits.t $@ | grep ">" | sed 's/> //' > $@.kept.t
	mv $@.kept.t $@
	for i in `cat tradphrases_3.t`; do grep $$i $@ ; done | sort | uniq > $@.hits.t || true
	diff $@.hits.t $@ | grep ">" | sed 's/> //' > $@.kept.t
	mv $@.kept.t $@
	rm -f $@.hits.t
111
# Final traditional phrase list: the manual list plus the 2/3/4-character
# lists, sorted and de-duplicated, then reduced to the lines that diff marks
# with '<' when the list is compared against the output of grepping it for
# each key character of t2s_1tomany.t (the character right after the opening
# quote on each line).
# NOTE(review): the grep output is neither sorted nor de-duplicated before the
# diff, so the exact result depends on diff's line matching - confirm this is
# the intended filter.
#
# The filtered copy goes to a scratch file named after the target (not the
# shared "t") so parallel builds cannot clobber it; the ".t" suffix keeps it
# covered by the clean rule's "*.t" glob.
tradphrases.t: tradphrases.manual tradphrases_2.t tradphrases_3.t tradphrases_4.t t2s_1tomany.t
	cat tradphrases.manual tradphrases_2.t tradphrases_3.t tradphrases_4.t |sort | uniq > $@
	for i in `sed 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do grep $$i $@ ; done | diff $@ - | grep '<' | sed 's/< //' > $@.kept.t
	mv $@.kept.t $@
116
# Counterpart of tradphrases.t built from the 2/3/4-character lists only: the
# combined, sorted, de-duplicated list is compared against the output of
# grepping it for each key character of t2s_1tomany.t, and the lines diff
# marks with '>' are written to the target.
# NOTE(review): the grep output is neither sorted nor de-duplicated before the
# diff, so the exact result depends on diff's line matching - confirm intent.
#
# The combined list goes to a scratch file named after the target (the
# original used the shared name "t" and left it behind) and is removed at the
# end; the ".t" suffix keeps it covered by the clean rule's "*.t" glob.
tradphrases.notsure: tradphrases_2.t tradphrases_3.t tradphrases_4.t t2s_1tomany.t
	cat tradphrases_2.t tradphrases_3.t tradphrases_4.t |sort | uniq > $@.all.t
	for i in `sed 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do grep $$i $@.all.t; done | diff $@.all.t - | grep '>' | sed 's/> //' > $@
	rm -f $@.all.t
120
121
# Strip tabs, digits and ASCII letters from phrase_lib.txt and keep the lines
# that are then exactly 2 to 4 characters long.
ph.t: phrase_lib.txt
	sed 's/[\t0-9a-zA-Z]//g' $< \
	  | grep "^.\{2,4\}$$" > $@
124
# Collect every phrase in ph.t that contains one of the characters listed in
# the value part of t2s_1tomany.t (the text after `=>"` minus its first
# character, split into single characters, sorted and de-duplicated).
#
# Fixes over the original:
#  - t2s_1tomany.t is read by the recipe, so it is now a declared
#    prerequisite (it was missing, unlike in the sibling alltradphrases.t);
#  - the loop output is redirected once and followed by `|| true`, matching
#    alltradphrases.t: the target is always created, and a final grep with no
#    match no longer fails the build.
allsimpphrases.t: ph.t t2s_1tomany.t
	for i in `cat t2s_1tomany.t | sed 's/.*=>".//' | sed 's/"//g' | sed 's/,/\n/' | sed 's/\(.\)/\1\n/g' | sort | uniq `; do grep $$i ph.t; done > $@ || true
128
# The two-character phrases of allsimpphrases.t, sorted and de-duplicated.
# ($$ is make's escape for the regex end-of-line anchor '$'.)
simpphrases_2.t: allsimpphrases.t
	grep "^..$$" $< | sort | uniq > $@
131
# The three-character phrases of allsimpphrases.t, minus every phrase that
# contains one of the two-character phrases: the grep loop collects the
# phrases to discard, and diff keeps the lines present only in the full list.
#
# Fixes over the original:
#  - simpphrases_2.t is read by the recipe, so it is now a declared
#    prerequisite (needed for `make -j` and for building this target alone);
#  - scratch files are named after the target instead of the shared "t"/"t3";
#    the ".t" suffix keeps them covered by the clean rule's "*.t" glob.
simpphrases_3.t: allsimpphrases.t simpphrases_2.t
	cat allsimpphrases.t | grep "^...$$" | sort | uniq > $@
	for i in `cat simpphrases_2.t`; do grep $$i $@ ; done | sort | uniq > $@.hits.t || true
	diff $@.hits.t $@ | grep ">" | sed 's/> //' > $@.kept.t
	mv $@.kept.t $@
	rm -f $@.hits.t
137
# The four-character phrases of allsimpphrases.t, minus every phrase that
# contains one of the two-character phrases, and then minus every phrase that
# contains one of the (already filtered) three-character phrases. Each pass
# collects the phrases to discard with a grep loop and keeps the lines that
# diff reports as present only in the current list.
#
# Fixes over the original:
#  - simpphrases_2.t and simpphrases_3.t are read by the recipe, so they are
#    now declared prerequisites;
#  - the first pass pipes the loop straight into sort/uniq (as the second pass
#    and tradphrases_4.t already do) instead of appending to "t" and sorting
#    it afterwards, which failed when no grep ever created "t";
#  - scratch files are named after the target instead of the shared "t"/"t3";
#    the ".t" suffix keeps them covered by the clean rule's "*.t" glob.
simpphrases_4.t: allsimpphrases.t simpphrases_2.t simpphrases_3.t
	cat allsimpphrases.t | grep "^....$$" | sort | uniq > $@
	for i in `cat simpphrases_2.t`; do grep $$i $@ ; done | sort | uniq > $@.hits.t || true
	diff $@.hits.t $@ | grep ">" | sed 's/> //' > $@.kept.t
	mv $@.kept.t $@
	for i in `cat simpphrases_3.t`; do grep $$i $@ ; done | sort | uniq > $@.hits.t || true
	diff $@.hits.t $@ | grep ">" | sed 's/> //' > $@.kept.t
	mv $@.kept.t $@
	rm -f $@.hits.t
148
# Final simplified phrase list: the 2/3/4-character lists concatenated (not
# re-sorted), then reduced to the lines that diff marks with '<' when the list
# is compared against the output of grepping it for each key character of
# t2s_1tomany.t (the character right after the opening quote on each line).
# NOTE(review): the grep output is neither sorted nor de-duplicated before the
# diff, so the exact result depends on diff's line matching - confirm intent.
#
# The filtered copy goes to a scratch file named after the target (not the
# shared "t") so parallel builds cannot clobber it; the ".t" suffix keeps it
# covered by the clean rule's "*.t" glob.
simpphrases.t: simpphrases_2.t simpphrases_3.t simpphrases_4.t t2s_1tomany.t
	cat simpphrases_2.t simpphrases_3.t simpphrases_4.t > $@
	for i in `sed 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do grep $$i $@ ; done | diff $@ - | grep '<' | sed 's/< //' > $@.kept.t
	mv $@.kept.t $@
153
154
# Counterpart of simpphrases.t: the concatenated 2/3/4-character lists are
# compared against the output of grepping them for each key character of
# t2s_1tomany.t, and the lines diff marks with '>' are written to the target.
# NOTE(review): the grep output is neither sorted nor de-duplicated before the
# diff, so the exact result depends on diff's line matching - confirm intent.
#
# The combined list goes to a scratch file named after the target (the
# original used the shared name "t" and left it behind) and is removed at the
# end; the ".t" suffix keeps it covered by the clean rule's "*.t" glob.
simpphrases.notsure: simpphrases_2.t simpphrases_3.t simpphrases_4.t t2s_1tomany.t
	cat simpphrases_2.t simpphrases_3.t simpphrases_4.t > $@.all.t
	for i in `sed 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do grep $$i $@.all.t ; done | diff $@.all.t - | grep '>' | sed 's/> //' > $@
	rm -f $@.all.t
158
# Character-level traditional->simplified table: each one-to-many entry cut
# down to its first seven characters and closed with '",', followed by the
# one-to-one entries.
trad2simp1to1.t: t2s_1tomany.t t2s_1to1.t
	sed 's/\(.......\).*/\1",/' $< > $@
	cat $(word 2,$^) >> $@
162
# Character-level simplified->traditional table: each one-to-many entry cut
# down to its first seven characters and closed with '",', followed by the
# one-to-one entries.
simp2trad1to1.t: s2t_1tomany.t s2t_1to1.t
	sed 's/\(.......\).*/\1",/' $< > $@
	cat $(word 2,$^) >> $@
166
# Generate a throw-away PHP script that stores the character table in
# $trad2simp, the phrase list in $str, and echoes strtr($str, $trad2simp).
# Every "$$" below is make's escape for a literal '$' in the PHP source.
trad2simp.php: trad2simp1to1.t tradphrases.t
	printf '<?php\n$$trad2simp=array(' > $@
	cat $< >> $@
	printf ');\n$$str=\n"' >> $@
	cat $(word 2,$^) >> $@
	printf '";\n$$t=strtr($$str, $$trad2simp);\necho $$t;\n?>' >> $@
173
# Generate a throw-away PHP script that stores the character table in
# $simp2trad, the phrase list in $str, and echoes strtr($str, $simp2trad).
# Every "$$" below is make's escape for a literal '$' in the PHP source.
simp2trad.php: simp2trad1to1.t simpphrases.t
	printf '<?php\n$$simp2trad=array(' > $@
	cat $< >> $@
	printf ');\n$$str=\n"' >> $@
	cat $(word 2,$^) >> $@
	printf '";\n$$t=strtr($$str, $$simp2trad);\necho $$t;\n?>' >> $@
180
# Phrase-level simplified->traditional table: run trad2simp.php to convert
# every traditional phrase, pair each converted line (as the key) with the
# original phrase (as the value) using paste, then append the tab-separated
# pairs of toTW.manual rewritten as "key"=>"value", entries.
#
# The key and value columns go to scratch files named after the target
# instead of the shared tmp1/tmp2 (also used by other rules), so parallel
# builds cannot mix them up; they are removed at the end, and the ".t" suffix
# keeps them covered by the clean rule's "*.t" glob.
simp2trad.phrases.t: trad2simp.php tradphrases.t toTW.manual
	php -f trad2simp.php | sed 's/\(.*\)/"\1" => /' > $@.keys.t
	cat tradphrases.t | sed 's/\(.*\)/"\1",/' > $@.vals.t
	paste $@.keys.t $@.vals.t > $@
	sed 's/\(.*\)\t\(.*\)/"\1"=>"\2",/' toTW.manual >> $@
	rm -f $@.keys.t $@.vals.t
186
# Phrase-level traditional->simplified table: run simp2trad.php to convert
# every simplified phrase, pair each converted line (as the key) with the
# original phrase (as the value) using paste, then append the tab-separated
# pairs of toCN.manual rewritten as "key"=>"value", entries.
#
# The key and value columns go to scratch files named after the target
# instead of the shared tmp1/tmp2 (also used by other rules), so parallel
# builds cannot mix them up; they are removed at the end, and the ".t" suffix
# keeps them covered by the clean rule's "*.t" glob.
trad2simp.phrases.t: simp2trad.php simpphrases.t toCN.manual
	php -f simp2trad.php | sed 's/\(.*\)/"\1" => /' > $@.keys.t
	cat simpphrases.t | sed 's/\(.*\)/"\1",/' > $@.vals.t
	paste $@.keys.t $@.vals.t > $@
	sed 's/\(.*\)\t\(.*\)/"\1"=>"\2",/' toCN.manual >> $@
	rm -f $@.keys.t $@.vals.t
192
# Assemble the final PHP file: a doc-comment header followed by four arrays.
#   $zh2TW - simp2trad1to1.t, then simp2trad.phrases.t
#   $zh2CN - trad2simp1to1.t, then trad2simp.phrases.t
#   $zh2HK - toHK.manual, each tab-separated pair rewritten as "a" => "b",
#   $zh2SG - toSG.manual, rewritten the same way
# Every "$$" below is make's escape for a literal '$' in the PHP source.
#
# The file is assembled in a scratch file and renamed onto the target only
# after the last step. The original appended to ZhConversion.php directly, so
# a failed or interrupted run left a truncated file that make then treated as
# up to date. The scratch name ends in ".t", so the clean rule's "*.t" glob
# removes it after a failure. The bytes written are unchanged.
ZhConversion.php: simp2trad1to1.t simp2trad.phrases.t trad2simp1to1.t trad2simp.phrases.t toHK.manual toSG.manual
	printf '<?php\n/**\n * Simplified/Traditional Chinese conversion tables\n' > $@.part.t
	printf ' *\n * Automatically generated using code and data in includes/zhtable/\n' >> $@.part.t
	printf ' * Do not modify directly! \n *\n * @package MediaWiki\n*/\n\n' >> $@.part.t
	printf '$$zh2TW=array(\n' >> $@.part.t
	cat simp2trad1to1.t >> $@.part.t
	echo >> $@.part.t
	cat simp2trad.phrases.t >> $@.part.t
	echo >> $@.part.t
	echo ');' >> $@.part.t
	echo >> $@.part.t
	echo >> $@.part.t
	printf '$$zh2CN=array(\n' >> $@.part.t
	cat trad2simp1to1.t >> $@.part.t
	echo >> $@.part.t
	cat trad2simp.phrases.t >> $@.part.t
	echo >> $@.part.t
	printf ');' >> $@.part.t
	echo >> $@.part.t
	echo >> $@.part.t
	printf '$$zh2HK=array(\n' >> $@.part.t
	sed 's/\(.*\)\t\(.*\)/"\1" => "\2",/' toHK.manual >> $@.part.t
	echo >> $@.part.t
	printf ');' >> $@.part.t
	echo >> $@.part.t
	echo >> $@.part.t
	printf '$$zh2SG=array(\n' >> $@.part.t
	sed 's/\(.*\)\t\(.*\)/"\1" => "\2",/' toSG.manual >> $@.part.t
	echo >> $@.part.t
	printf ');' >> $@.part.t
	echo >> $@.part.t
	printf '?>' >> $@.part.t
	mv -f $@.part.t $@
225
226
# Remove generated files. Downloaded inputs (Unihan.txt, EZ.txt.in,
# phrase_lib.txt, tsi.src), the printutf8 binary and the *.notsure lists are
# left alone, as in the original rule.
# Declared phony so a stray file named "clean" cannot disable it. The scratch
# file "t", which several recipes write and "*.t" does not match, is now
# removed as well.
.PHONY: clean
clean:
	rm -f ZhConversion.php tmp1 tmp2 tmp3 t t3 *.t trad2simp.php simp2trad.php