- set locale for each individual command, so that the script can be run in any locale.
[lhc/web/wiklou.git] / includes / zhtable / Makefile
1 #
2 # Creates the file ZhConversion.php used for Simplified/Traditional
3 # Chinese conversion. It gets the basic conversion table from the Unihan
4 # database, and constructs the phrase tables using phrase libraries in
5 # the SCIM packages. There are also special tables used for adjustment.
6 # Some data in the file simp2trad.manual was taken from the following
7 # paper:
8 #
9
# Text tools are invoked with a per-command UTF-8 Chinese locale so the build
# does not depend on the caller's locale (presumably so that "." in the
# patterns used by the rules matches one whole multibyte character).
# LC_ALL is used rather than LANG: LANG is only the lowest-priority fallback,
# so an LC_ALL or LC_CTYPE inherited from the environment would override it.
GREP = LC_ALL=zh_CN.UTF8 grep
SED = LC_ALL=zh_CN.UTF8 sed
DIFF = LC_ALL=zh_CN.UTF8 diff

# Installation directory.
# NOTE(review): not referenced by any rule visible in this file.
INSTDIR = /usr/local/share/zhdaemons/
16
# Default goal: the PHP conversion table, the two "not sure" phrase lists,
# the merged word list and the four per-variant dictionaries.
# Declared phony so that a stray file named "all" cannot mask it.
.PHONY: all
all: ZhConversion.php tradphrases.notsure simpphrases.notsure wordlist toCN.dict toTW.dict toHK.dict toSG.dict
18
# Download the Unihan archive and unpack it in the current directory.
# NOTE(review): assumes the archive contains Unihan.txt; the downloaded
# Unihan.zip is left in place afterwards.
Unihan.txt:
	wget ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip && \
	unzip Unihan.zip
22
# Extract EZ.txt.in from the scim-tables 0.4.3 source tarball, then remove
# the tarball and the unpacked tree.
EZ.txt.in:
	wget http://freedesktop.org/~suzhe/sources/scim-tables-0.4.3.tar.gz
	tar zxf scim-tables-0.4.3.tar.gz
	cp scim-tables-0.4.3/zh/EZ.txt.in $@
	rm -rf scim-tables-0.4.3*
28
# Extract phrase_lib.txt from the scim-chinese 0.4.2 source tarball, then
# remove the tarball and the unpacked tree.
phrase_lib.txt:
	wget http://freedesktop.org/~suzhe/scim-chinese/scim-chinese-0.4.2.tar.gz
	tar zxf scim-chinese-0.4.2.tar.gz
	cp scim-chinese-0.4.2/data/phrase_lib.txt $@
	rm -rf scim-chinese-0.4.2*
34
# Extract tsi.src from the libtabe 0.2.3 source tarball, then remove
# everything in the current directory whose name starts with "libtabe".
tsi.src:
	wget http://unc.dl.sourceforge.net/sourceforge/libtabe/libtabe-0.2.3.tgz
	tar zxf libtabe-0.2.3.tgz
	cp libtabe/tsi-src/tsi.src $@
	rm -rf libtabe*
40
# Merge the phrases from the three downloaded libraries into one sorted,
# de-duplicated list with spaces removed:
#   tsi.src        - converted from Big5; "# " markers and everything from
#                    the first " <digit>" onward are dropped
#   phrase_lib.txt - text before the tab + number column, minus the first
#                    five lines
#   EZ.txt.in      - lines after BEGIN_TABLE, minus the first 8 columns and
#                    everything from the first tab; only lines with at least
#                    two characters are kept
# Scratch data lives in files derived from $@ (not a shared name such as "t")
# so that this rule cannot collide with others under "make -j"; their ".t"
# suffix keeps them covered by the *.t glob of the clean rule.  The target
# itself is only created by the final rename.
wordlist: phrase_lib.txt EZ.txt.in tsi.src
	iconv -c -f big5 -t utf8 tsi.src | $(SED) 's/# //g' | $(SED) 's/[ ][0-9].*//' > $@.tmp1.t
	$(SED) 's/\(.*\)\t[0-9][0-9]*.*/\1/' phrase_lib.txt | $(SED) '1,5d' >> $@.tmp1.t
	$(SED) '1,/BEGIN_TABLE/d' EZ.txt.in | colrm 1 8 | $(SED) 's/\t.*//' | $(GREP) "^...*" >> $@.tmp1.t
	sort $@.tmp1.t | uniq | $(SED) 's/ //g' > $@.tmp2.t
	mv -f $@.tmp2.t $@
	rm -f $@.tmp1.t
47
# Build the printutf8 helper, which the unihan.*.t rules use as a stdin
# filter.  Uses the conventional $(CC)/$(CFLAGS) so the compiler can be
# overridden from the command line (e.g. "make CC=gcc").
printutf8: printutf8.c
	$(CC) $(CFLAGS) -o $@ $<
50
# Select the kSimplifiedVariant records of Unihan.txt, drop lines containing
# "#", strip the field name and pass the remainder through ./printutf8.
unihan.t2s.t: Unihan.txt printutf8
	$(GREP) kSimplifiedVariant $< \
	  | $(SED) -e '/#/d' -e 's/kSimplifiedVariant//' \
	  | ./printutf8 > $@
53
# Traditional -> Simplified table: the manual entries followed by every
# Unihan-derived line that does not start with the first 10 columns of a
# manual entry (i.e. manual entries override the generated ones).
# Scratch files are derived from $@ instead of the shared names tmp1/tmp2 so
# the rule is safe under "make -j"; the ".t" suffix keeps them covered by the
# *.t glob of the clean rule.
trad2simp.t: trad2simp.manual unihan.t2s.t
	cp unihan.t2s.t $@.tmp1.t
	for I in `colrm 11 < trad2simp.manual` ; do $(SED) "/^$$I/d" $@.tmp1.t > $@.tmp2.t; mv $@.tmp2.t $@.tmp1.t; done
	cat trad2simp.manual $@.tmp1.t > $@
	rm -f $@.tmp1.t
58
# Select the kTraditionalVariant records of Unihan.txt, drop lines containing
# "#", strip the field name and pass the remainder through ./printutf8.
unihan.s2t.t: Unihan.txt printutf8
	$(GREP) kTraditionalVariant $< \
	  | $(SED) -e '/#/d' -e 's/kTraditionalVariant//' \
	  | ./printutf8 > $@
61
# Simplified -> Traditional table: the manual entries followed by every
# Unihan-derived line that does not start with the first 10 columns of a
# manual entry (i.e. manual entries override the generated ones).
# Scratch files are derived from $@ instead of the shared names tmp1/tmp2 so
# the rule is safe under "make -j"; the ".t" suffix keeps them covered by the
# *.t glob of the clean rule.
simp2trad.t: unihan.s2t.t simp2trad.manual
	cp unihan.s2t.t $@.tmp1.t
	for I in `colrm 11 < simp2trad.manual` ; do $(SED) "/^$$I/d" $@.tmp1.t > $@.tmp2.t; mv $@.tmp2.t $@.tmp1.t; done
	cat simp2trad.manual $@.tmp1.t > $@
	rm -f $@.tmp1.t
66
# One-to-many traditional -> simplified entries: lines of trad2simp.t with at
# least 19 characters, rewritten from the U+.....|U+.....|...| layout into
# PHP array syntax ("key"=>"values",).
t2s_1tomany.t: trad2simp.t
	$(GREP) -s ".\{19,\}" $< \
	  | $(SED) -e 's/U+...../"/' \
	           -e 's/|U+...../"=>"/' \
	           -e 's/|U+.....//g' \
	           -e 's/|/",/' \
	  > $@
69
# One-to-one traditional -> simplified entries in PHP array syntax.
#   1. Lines of trad2simp.t with fewer than three "|" are converted directly.
#   2. For every one-to-many simplified -> traditional entry with 2, 3 or 4
#      targets, each target except the first is added as a reverse mapping
#      back to the simplified key.
#   3. The result is sorted and de-duplicated.
# The scratch file is derived from $@ instead of the shared name "t" so the
# rule is safe under "make -j"; its ".t" suffix keeps it covered by the *.t
# glob of the clean rule.  The target is written once, by the final step.
t2s_1to1.t: trad2simp.t s2t_1tomany.t
	$(SED) "/.*|.*|.*|.*/d" trad2simp.t | $(SED) 's/U+[0-9a-z][0-9a-z]*/"/' | $(SED) 's/|U+[0-9a-z][0-9a-z]*/"=>"/' | $(SED) 's/|/",/' > $@.tmp.t
	$(GREP) '"."=>"..",' s2t_1tomany.t | $(SED) 's/\("."\)=>".\(.\)",/"\2"=>\1,/' >> $@.tmp.t
	$(GREP) '"."=>"...",' s2t_1tomany.t | $(SED) 's/\("."\)=>".\(.\).",/"\2"=>\1,/' >> $@.tmp.t
	$(GREP) '"."=>"...",' s2t_1tomany.t | $(SED) 's/\("."\)=>"..\(.\)",/"\2"=>\1,/' >> $@.tmp.t
	$(GREP) '"."=>"....",' s2t_1tomany.t | $(SED) 's/\("."\)=>".\(.\)..",/"\2"=>\1,/' >> $@.tmp.t
	$(GREP) '"."=>"....",' s2t_1tomany.t | $(SED) 's/\("."\)=>"..\(.\).",/"\2"=>\1,/' >> $@.tmp.t
	$(GREP) '"."=>"....",' s2t_1tomany.t | $(SED) 's/\("."\)=>"...\(.\)",/"\2"=>\1,/' >> $@.tmp.t
	sort $@.tmp.t | uniq > $@
	rm -f $@.tmp.t
80
81
# One-to-many simplified -> traditional entries: lines of simp2trad.t with at
# least 19 characters, rewritten from the U+.....|U+.....|...| layout into
# PHP array syntax ("key"=>"values",).
s2t_1tomany.t: simp2trad.t
	$(GREP) -s ".\{19,\}" $< \
	  | $(SED) -e 's/U+...../"/' \
	           -e 's/|U+...../"=>"/' \
	           -e 's/|U+.....//g' \
	           -e 's/|/",/' \
	  > $@
84
# One-to-one simplified -> traditional entries in PHP array syntax.
#   1. Lines of simp2trad.t with fewer than three "|" are converted directly.
#   2. For every one-to-many traditional -> simplified entry with 2, 3 or 4
#      targets, each target except the first is added as a reverse mapping
#      back to the traditional key.
#   3. The result is sorted and de-duplicated.
# The scratch file is derived from $@ instead of the shared name "t" so the
# rule is safe under "make -j"; its ".t" suffix keeps it covered by the *.t
# glob of the clean rule.  The target is written once, by the final step.
s2t_1to1.t: simp2trad.t t2s_1tomany.t
	$(SED) "/.*|.*|.*|.*/d" simp2trad.t | $(SED) 's/U+[0-9a-z][0-9a-z]*/"/' | $(SED) 's/|U+[0-9a-z][0-9a-z]*/"=>"/' | $(SED) 's/|/",/' > $@.tmp.t
	$(GREP) '"."=>"..",' t2s_1tomany.t | $(SED) 's/\("."\)=>".\(.\)",/"\2"=>\1,/' >> $@.tmp.t
	$(GREP) '"."=>"...",' t2s_1tomany.t | $(SED) 's/\("."\)=>".\(.\).",/"\2"=>\1,/' >> $@.tmp.t
	$(GREP) '"."=>"...",' t2s_1tomany.t | $(SED) 's/\("."\)=>"..\(.\)",/"\2"=>\1,/' >> $@.tmp.t
	$(GREP) '"."=>"....",' t2s_1tomany.t | $(SED) 's/\("."\)=>".\(.\)..",/"\2"=>\1,/' >> $@.tmp.t
	$(GREP) '"."=>"....",' t2s_1tomany.t | $(SED) 's/\("."\)=>"..\(.\).",/"\2"=>\1,/' >> $@.tmp.t
	$(GREP) '"."=>"....",' t2s_1tomany.t | $(SED) 's/\("."\)=>"...\(.\)",/"\2"=>\1,/' >> $@.tmp.t
	sort $@.tmp.t | uniq > $@
	rm -f $@.tmp.t
95
# Candidate traditional phrases gathered from EZ.txt.in (lines that, after
# dropping the first 8 columns and all tabs, start with 2-4 characters
# followed by a digit) and from tsi.src (converted from Big5, lines starting
# with at least 2 characters), sorted and de-duplicated.
# The scratch file is derived from $@ instead of the shared name "t" so the
# rule is safe under "make -j"; its ".t" suffix keeps it covered by the *.t
# glob of the clean rule.
tphrase.t: EZ.txt.in tsi.src
	colrm 1 8 < EZ.txt.in | $(SED) 's/\t//g' | $(GREP) "^.\{2,4\}[0-9]" | $(SED) 's/[0-9]//g' > $@.tmp.t
	iconv -c -f big5 -t utf8 tsi.src | $(SED) 's/ [0-9].*//g' | $(SED) 's/[# ]//g'| $(GREP) "^.\{2,4\}" >> $@.tmp.t
	sort $@.tmp.t | uniq > $@
	rm -f $@.tmp.t
100
# Collect every phrase of tphrase.t that contains one of the characters found
# on the right-hand side of s2t_1tomany.t (after its first character).
# grep failures are ignored on purpose ("|| true").
alltradphrases.t: tphrase.t s2t_1tomany.t
	for i in $$($(SED) 's/.*=>".//' s2t_1tomany.t | $(SED) 's/"//g' | $(SED) 's/,/\n/' | $(SED) 's/\(.\)/\1\n/g' | sort | uniq); do \
	  $(GREP) -s $$i $< ; \
	done > $@ || true
103
104
# Two-character phrases of alltradphrases.t, sorted and de-duplicated.
tradphrases_2.t: alltradphrases.t
	$(GREP) "^..$$" $< | sort | uniq > $@
107
# Three-character phrases of alltradphrases.t, minus those that contain one
# of the two-character phrases of tradphrases_2.t.
# tradphrases_2.t is read by the recipe and is therefore declared as a
# prerequisite.  Scratch files are derived from $@ instead of the shared
# names "t"/"t3" so the rule is safe under "make -j"; their ".t" suffix keeps
# them covered by the *.t glob of the clean rule.
tradphrases_3.t: alltradphrases.t tradphrases_2.t
	$(GREP) "^...$$" alltradphrases.t | sort | uniq > $@.tmp1.t
	for i in `cat tradphrases_2.t`; do $(GREP) $$i $@.tmp1.t ; done | sort | uniq > $@.tmp2.t || true
	$(DIFF) $@.tmp2.t $@.tmp1.t | $(GREP) ">" | $(SED) 's/> //' > $@
	rm -f $@.tmp1.t $@.tmp2.t
113
114
# Four-character phrases of alltradphrases.t, minus those that contain one of
# the two-character phrases (tradphrases_2.t) and then minus those that
# contain one of the three-character phrases (tradphrases_3.t).
# Both files are read by the recipe and are therefore declared as
# prerequisites.  Scratch files are derived from $@ instead of the shared
# names "t"/"t3" so the rule is safe under "make -j"; their ".t" suffix keeps
# them covered by the *.t glob of the clean rule.
tradphrases_4.t: alltradphrases.t tradphrases_2.t tradphrases_3.t
	$(GREP) "^....$$" alltradphrases.t | sort | uniq > $@.tmp1.t
	for i in `cat tradphrases_2.t`; do $(GREP) $$i $@.tmp1.t ; done | sort | uniq > $@.tmp2.t || true
	$(DIFF) $@.tmp2.t $@.tmp1.t | $(GREP) ">" | $(SED) 's/> //' > $@.tmp3.t
	for i in `cat tradphrases_3.t`; do $(GREP) $$i $@.tmp3.t ; done | sort | uniq > $@.tmp2.t || true
	$(DIFF) $@.tmp2.t $@.tmp3.t | $(GREP) ">" | $(SED) 's/> //' > $@
	rm -f $@.tmp1.t $@.tmp2.t $@.tmp3.t
123
# Final traditional phrase list: the manual phrases plus the 2-, 3- and
# 4-character lists (sorted, de-duplicated), reduced to the lines that diff
# reports as present only in that full list when it is compared with the
# phrases matching a key character of t2s_1tomany.t.
# The scratch file is derived from $@ instead of the shared name "t" so the
# rule is safe under "make -j"; its ".t" suffix keeps it covered by the *.t
# glob of the clean rule.
tradphrases.t: tradphrases.manual tradphrases_2.t tradphrases_3.t tradphrases_4.t t2s_1tomany.t
	cat tradphrases.manual tradphrases_2.t tradphrases_3.t tradphrases_4.t |sort | uniq > $@.tmp.t
	for i in `$(SED) 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do $(GREP) $$i $@.tmp.t ; done | $(DIFF) $@.tmp.t - | $(GREP) '<' | $(SED) 's/< //' > $@
	rm -f $@.tmp.t
128
# Traditional phrases needing manual review: the lines that diff reports as
# present only in the set of phrases matching a key character of
# t2s_1tomany.t, when compared with the full 2-/3-/4-character list.
# The scratch file is derived from $@ instead of the shared name "t" (which
# was also left behind and is not removed by the clean rule); its ".t" suffix
# keeps it covered by the *.t glob of the clean rule.
tradphrases.notsure: tradphrases_2.t tradphrases_3.t tradphrases_4.t t2s_1tomany.t
	cat tradphrases_2.t tradphrases_3.t tradphrases_4.t |sort | uniq > $@.tmp.t
	for i in `$(SED) 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do $(GREP) $$i $@.tmp.t; done | $(DIFF) $@.tmp.t - | $(GREP) '>' | $(SED) 's/> //' > $@
	rm -f $@.tmp.t
132
133
# Phrases of phrase_lib.txt that are 2-4 characters long once tabs, digits
# and ASCII letters have been stripped.
ph.t: phrase_lib.txt
	$(SED) 's/[\t0-9a-zA-Z]//g' $< | $(GREP) "^.\{2,4\}$$" > $@
136
# Collect every phrase of ph.t that contains one of the characters found on
# the right-hand side of t2s_1tomany.t (after its first character).
# t2s_1tomany.t is read by the recipe and is therefore declared as a
# prerequisite.  The loop output is redirected as a whole and grep's exit
# status is ignored, exactly as in the alltradphrases.t rule; previously the
# recipe failed whenever the last character happened to have no match.
allsimpphrases.t: ph.t t2s_1tomany.t
	for i in `cat t2s_1tomany.t | $(SED) 's/.*=>".//' | $(SED) 's/"//g' | $(SED) 's/,/\n/' | $(SED) 's/\(.\)/\1\n/g' | sort | uniq `; do $(GREP) $$i ph.t ; done > $@ || true
140
# Two-character phrases of allsimpphrases.t, sorted and de-duplicated.
simpphrases_2.t: allsimpphrases.t
	$(GREP) "^..$$" $< | sort | uniq > $@
143
# Three-character phrases of allsimpphrases.t, minus those that contain one
# of the two-character phrases of simpphrases_2.t.
# simpphrases_2.t is read by the recipe and is therefore declared as a
# prerequisite.  Scratch files are derived from $@ instead of the shared
# names "t"/"t3" so the rule is safe under "make -j"; their ".t" suffix keeps
# them covered by the *.t glob of the clean rule.
simpphrases_3.t: allsimpphrases.t simpphrases_2.t
	$(GREP) "^...$$" allsimpphrases.t | sort | uniq > $@.tmp1.t
	for i in `cat simpphrases_2.t`; do $(GREP) $$i $@.tmp1.t ; done | sort | uniq > $@.tmp2.t || true
	$(DIFF) $@.tmp2.t $@.tmp1.t | $(GREP) ">" | $(SED) 's/> //' > $@
	rm -f $@.tmp1.t $@.tmp2.t
149
# Four-character phrases of allsimpphrases.t, minus those that contain one of
# the two-character phrases (simpphrases_2.t) and then minus those that
# contain one of the three-character phrases (simpphrases_3.t).
# Both files are read by the recipe and are therefore declared as
# prerequisites.  Scratch files are derived from $@ instead of the shared
# names "t"/"t3" so the rule is safe under "make -j"; their ".t" suffix keeps
# them covered by the *.t glob of the clean rule.
simpphrases_4.t: allsimpphrases.t simpphrases_2.t simpphrases_3.t
	$(GREP) "^....$$" allsimpphrases.t | sort | uniq > $@.tmp1.t
	for i in `cat simpphrases_2.t`; do $(GREP) $$i $@.tmp1.t ; done | sort | uniq > $@.tmp2.t || true
	$(DIFF) $@.tmp2.t $@.tmp1.t | $(GREP) ">" | $(SED) 's/> //' > $@.tmp3.t
	for i in `cat simpphrases_3.t`; do $(GREP) $$i $@.tmp3.t ; done | sort | uniq > $@.tmp2.t || true
	$(DIFF) $@.tmp2.t $@.tmp3.t | $(GREP) ">" | $(SED) 's/> //' > $@
	rm -f $@.tmp1.t $@.tmp2.t $@.tmp3.t
160
# Final simplified phrase list: the concatenated 2-, 3- and 4-character
# lists, reduced to the lines that diff reports as present only in that full
# list when it is compared with the phrases matching a key character of
# t2s_1tomany.t.
# The scratch file is derived from $@ instead of the shared name "t" so the
# rule is safe under "make -j"; its ".t" suffix keeps it covered by the *.t
# glob of the clean rule.
simpphrases.t:simpphrases_2.t simpphrases_3.t simpphrases_4.t t2s_1tomany.t
	cat simpphrases_2.t simpphrases_3.t simpphrases_4.t > $@.tmp.t
	for i in `$(SED) 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do $(GREP) $$i $@.tmp.t ; done | $(DIFF) $@.tmp.t - | $(GREP) '<' | $(SED) 's/< //' > $@
	rm -f $@.tmp.t
165
166
# Simplified phrases needing manual review: the lines that diff reports as
# present only in the set of phrases matching a key character of
# t2s_1tomany.t, when compared with the concatenated 2-/3-/4-character list.
# The scratch file is derived from $@ instead of the shared name "t" (which
# was also left behind and is not removed by the clean rule); its ".t" suffix
# keeps it covered by the *.t glob of the clean rule.
simpphrases.notsure:simpphrases_2.t simpphrases_3.t simpphrases_4.t t2s_1tomany.t
	cat simpphrases_2.t simpphrases_3.t simpphrases_4.t > $@.tmp.t
	for i in `$(SED) 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do $(GREP) $$i $@.tmp.t ; done | $(DIFF) $@.tmp.t - | $(GREP) '>' | $(SED) 's/> //' > $@
	rm -f $@.tmp.t
170
# Complete traditional -> simplified character table: each one-to-many entry
# cut down to its first 7 characters (closed with ",), followed by all
# one-to-one entries.
trad2simp1to1.t: t2s_1tomany.t t2s_1to1.t
	{ $(SED) 's/\(.......\).*/\1",/' $< && cat $(word 2,$^) ; } > $@
174
# Complete simplified -> traditional character table: each one-to-many entry
# cut down to its first 7 characters (closed with ",), followed by all
# one-to-one entries.
simp2trad1to1.t: s2t_1tomany.t s2t_1to1.t
	{ $(SED) 's/\(.......\).*/\1",/' $< && cat $(word 2,$^) ; } > $@
178
# Generate a throw-away PHP script that loads the traditional -> simplified
# character table into $trad2simp, applies it with strtr() to the whole
# traditional phrase list and echoes the result.
trad2simp.php: trad2simp1to1.t tradphrases.t
	{ printf '<?php\n$$trad2simp=array(' && \
	  cat trad2simp1to1.t && \
	  printf ');\n$$str=\n"' && \
	  cat tradphrases.t && \
	  printf '";\n$$t=strtr($$str, $$trad2simp);\necho $$t;\n?>' ; \
	} > $@
185
# Generate a throw-away PHP script that loads the simplified -> traditional
# character table into $simp2trad, applies it with strtr() to the whole
# simplified phrase list and echoes the result.
simp2trad.php: simp2trad1to1.t simpphrases.t
	{ printf '<?php\n$$simp2trad=array(' && \
	  cat simp2trad1to1.t && \
	  printf ');\n$$str=\n"' && \
	  cat simpphrases.t && \
	  printf '";\n$$t=strtr($$str, $$simp2trad);\necho $$t;\n?>' ; \
	} > $@
192
# Simplified -> traditional phrase table in PHP array syntax.
# trad2simp.php prints the traditional phrases converted character by
# character; each converted line becomes the key and is pasted (tab-joined)
# next to the original traditional phrase as its value.  The tab-separated
# pairs of toTW.manual are appended as additional entries.
# Scratch files are derived from $@ instead of the shared names tmp1/tmp2
# (also used by the trad2simp.phrases.t rule) so the rule is safe under
# "make -j"; their ".t" suffix keeps them covered by the *.t glob of the
# clean rule.
simp2trad.phrases.t: trad2simp.php tradphrases.t toTW.manual
	php -f trad2simp.php | $(SED) 's/\(.*\)/"\1" => /' > $@.tmp1.t
	$(SED) 's/\(.*\)/"\1",/' tradphrases.t > $@.tmp2.t
	paste $@.tmp1.t $@.tmp2.t > $@
	$(SED) 's/\(.*\)\t\(.*\)/"\1"=>"\2",/' toTW.manual >> $@
	rm -f $@.tmp1.t $@.tmp2.t
198
# Traditional -> simplified phrase table in PHP array syntax.
# simp2trad.php prints the simplified phrases converted character by
# character; each converted line becomes the key and is pasted (tab-joined)
# next to the original simplified phrase as its value.  The tab-separated
# pairs of toCN.manual are appended as additional entries.
# Scratch files are derived from $@ instead of the shared names tmp1/tmp2
# (also used by the simp2trad.phrases.t rule) so the rule is safe under
# "make -j"; their ".t" suffix keeps them covered by the *.t glob of the
# clean rule.
trad2simp.phrases.t: simp2trad.php simpphrases.t toCN.manual
	php -f simp2trad.php | $(SED) 's/\(.*\)/"\1" => /' > $@.tmp1.t
	$(SED) 's/\(.*\)/"\1",/' simpphrases.t > $@.tmp2.t
	paste $@.tmp1.t $@.tmp2.t > $@
	$(SED) 's/\(.*\)\t\(.*\)/"\1"=>"\2",/' toCN.manual >> $@
	rm -f $@.tmp1.t $@.tmp2.t
204
# zh-CN dictionary: the character table followed by the phrase table, with
# commas, spaces and tabs removed and the first "=>" of each line turned into
# a tab.
toCN.dict: trad2simp1to1.t trad2simp.phrases.t
	$(SED) -e 's/[, \t]//g' -e 's/=>/\t/' $< > $@
	$(SED) -e 's/[, \t]//g' -e 's/=>/\t/' $(word 2,$^) >> $@
208
# zh-TW dictionary: the character table followed by the phrase table, with
# commas, spaces and tabs removed and the first "=>" of each line turned into
# a tab.
toTW.dict: simp2trad1to1.t simp2trad.phrases.t
	$(SED) -e 's/[, \t]//g' -e 's/=>/\t/' $< > $@
	$(SED) -e 's/[, \t]//g' -e 's/=>/\t/' $(word 2,$^) >> $@
212
# zh-HK dictionary: spaces are stripped from toHK.manual, then the first two
# whitespace-separated fields of each line are quoted and joined by a tab.
toHK.dict: toHK.manual
	$(SED) -e 's/[ ]//g' -e 's/\(^[^ \t]*\)[ \t][ \t]*\([^ \t]*\)/"\1"\t"\2"/' $< > $@
215
# zh-SG dictionary: spaces are stripped from toSG.manual, then the first two
# whitespace-separated fields of each line are quoted and joined by a tab.
toSG.dict: toSG.manual
	$(SED) -e 's/[ ]//g' -e 's/\(^[^ \t]*\)[ \t][ \t]*\([^ \t]*\)/"\1"\t"\2"/' $< > $@
218
219
220
# Assemble the final PHP file: a doc-comment header followed by four arrays.
#   $zh2TW - simplified -> traditional character and phrase tables
#   $zh2CN - traditional -> simplified character and phrase tables
#   $zh2HK - tab-separated pairs of toHK.manual
#   $zh2SG - tab-separated pairs of toSG.manual
# The file is built in a scratch file and renamed into place as the very last
# step, so an interrupted or failed run never leaves a truncated
# ZhConversion.php that looks up to date.  The scratch name ends in ".t" so
# that it is covered by the *.t glob of the clean rule.
# The emitted bytes are unchanged, including the single blank line after
# $zh2CN/$zh2HK versus two after $zh2TW and the missing newline after "?>".
ZhConversion.php: simp2trad1to1.t simp2trad.phrases.t trad2simp1to1.t trad2simp.phrases.t toHK.manual toSG.manual
	printf '<?php\n/**\n * Simplified/Traditional Chinese conversion tables\n' > $@.tmp.t
	printf ' *\n * Automatically generated using code and data in includes/zhtable/\n' >> $@.tmp.t
	printf ' * Do not modify directly! \n *\n * @package MediaWiki\n*/\n\n' >> $@.tmp.t
	{ printf '$$zh2TW=array(\n' && cat simp2trad1to1.t && echo && \
	  cat simp2trad.phrases.t && echo && echo ');' && echo && echo ; } >> $@.tmp.t
	{ printf '$$zh2CN=array(\n' && cat trad2simp1to1.t && echo && \
	  cat trad2simp.phrases.t && echo && printf ');' && echo && echo ; } >> $@.tmp.t
	{ printf '$$zh2HK=array(\n' && \
	  $(SED) 's/\(.*\)\t\(.*\)/"\1" => "\2",/' toHK.manual && \
	  echo && printf ');' && echo && echo ; } >> $@.tmp.t
	{ printf '$$zh2SG=array(\n' && \
	  $(SED) 's/\(.*\)\t\(.*\)/"\1" => "\2",/' toSG.manual && \
	  echo && printf ');' && echo ; } >> $@.tmp.t
	printf '?>' >> $@.tmp.t
	mv -f $@.tmp.t $@
253
254
# Remove generated files: the PHP outputs, scratch files, every *.t table,
# the dictionaries, the printutf8 helper and editor backups.
# Declared phony so that a stray file named "clean" cannot disable it.
# NOTE(review): wordlist and the *.notsure lists are built by "all" but are
# not removed here, and neither are the downloaded sources - confirm whether
# that is intentional.
.PHONY: clean
clean:
	rm -f ZhConversion.php tmp1 tmp2 tmp3 t3 *.t trad2simp.php simp2trad.php *.dict printutf8 *~
257