Update the Chinese conversion tables
[lhc/web/wiklou.git] / includes / zhtable / Makefile
#
# Creates the file ZhConversion.php used for Simplified/Traditional
# Chinese conversion. It gets the basic conversion table from the Unihan
# database, and constructs the phrase tables using phrase libraries in
# the SCIM packages and the libtabe package. There are also special
# tables used for adjustment.
#
8
# Text tools are invoked with LANG=zh_CN.UTF8 placed in their environment.
GREP := LANG=zh_CN.UTF8 grep
SED := LANG=zh_CN.UTF8 sed
DIFF := LANG=zh_CN.UTF8 diff
# C compiler used to build the printutf8 helper.
# NOTE(review): GNU Make predefines CC, so this ?= presumably only takes
# effect when built-in variables are disabled (make -R) — confirm.
CC ?= gcc
13
# SourceForge mirror host prefix and the upstream release versions; these
# are spliced into the download URLs and archive file names used by the
# rules in this file.
SF_MIRROR := easynews
SCIM_TABLES_VER := 0.5.8
SCIM_PINYIN_VER := 0.5.91
LIBTABE_VER := 0.2.3
18
# Installation directory.
# NOTE(review): no rule visible in this file references INSTDIR — confirm
# whether it is still needed.
INSTDIR := /usr/local/share/zhdaemons/
21
# Default goal: build the PHP conversion table, the "not sure" phrase
# lists, the word list and every .dict file.
# Declared phony so that a stray file named `all` cannot mask the goal.
.PHONY: all
all: ZhConversion.php tradphrases.notsure simpphrases.notsure wordlist toHans.dict toHant.dict toCN.dict toTW.dict toHK.dict toSG.dict
23
24 # Download Unihan database and Traditional Chinese / Simplified Chinese phrases files
# Fetch the Unihan archive; the target name doubles as the remote file name.
# -nc makes wget skip the download when the file already exists locally.
Unihan.zip:
	wget -nc ftp://ftp.unicode.org/Public/UNIDATA/$@
27
# Fetch the scim-tables release tarball from the configured SourceForge
# mirror; the target name doubles as the remote file name.
scim-tables-$(SCIM_TABLES_VER).tar.gz:
	wget -nc http://$(SF_MIRROR).dl.sourceforge.net/sourceforge/scim/$@
30
# Fetch the scim-pinyin release tarball from the configured SourceForge
# mirror; the target name doubles as the remote file name.
scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
	wget -nc http://$(SF_MIRROR).dl.sourceforge.net/sourceforge/scim/$@
33
# Fetch the libtabe release tarball from the configured SourceForge
# mirror; the target name doubles as the remote file name.
libtabe-$(LIBTABE_VER).tgz:
	wget -nc http://$(SF_MIRROR).dl.sourceforge.net/sourceforge/libtabe/$@
36
# Extract the files from the compressed archives
# Unpack Unihan.zip into the current directory (-o overwrite, -q quiet).
# unzip restores the modification times stored in the archive, which can
# leave Unihan.txt looking older than Unihan.zip and make this rule (and
# everything downstream) re-run on every invocation. Refresh the mtime
# afterwards; -c keeps touch from creating the file if unzip did not.
Unihan.txt: Unihan.zip
	unzip -oq $<
	touch -c $@
40
# Stream the EZ-Big table out of the scim-tables tarball (-O writes the
# member to stdout) and store it as EZ.txt.in.
EZ.txt.in: scim-tables-$(SCIM_TABLES_VER).tar.gz
	tar -xzf $< -O scim-tables-$(SCIM_TABLES_VER)/tables/zh/EZ-Big.txt.in > $@
43
# Stream the Wubi table out of the scim-tables tarball; the archive member
# has the same base name as the target.
Wubi.txt.in: scim-tables-$(SCIM_TABLES_VER).tar.gz
	tar -xzf $< -O scim-tables-$(SCIM_TABLES_VER)/tables/zh/$@ > $@
46
# Stream the Ziranma table out of the scim-tables tarball; the archive
# member has the same base name as the target.
Ziranma.txt.in: scim-tables-$(SCIM_TABLES_VER).tar.gz
	tar -xzf $< -O scim-tables-$(SCIM_TABLES_VER)/tables/zh/$@ > $@
49
50
# Stream data/phrase_lib.txt out of the scim-pinyin tarball.
phrase_lib.txt: scim-pinyin-$(SCIM_PINYIN_VER).tar.gz
	tar -xzf $< -O scim-pinyin-$(SCIM_PINYIN_VER)/data/$@ > $@
53
# Stream tsi-src/tsi.src out of the libtabe tarball.
tsi.src: libtabe-$(LIBTABE_VER).tgz
	tar -xzf $< -O libtabe/tsi-src/$@ > $@
56
57 # Make a word list
# Collect words from three sources into one sorted, de-duplicated list:
#   1. tsi.src, converted from Big5 to UTF-8, with "# " markers and the
#      trailing numeric columns stripped;
#   2. phrase_lib.txt, minus its first five lines and the tab-separated
#      numeric columns;
#   3. the table body of EZ.txt.in (everything after BEGIN_TABLE), with
#      the first 8 columns and anything after a tab removed, keeping only
#      entries of two or more characters.
# Spaces are removed from the final list.
# The list is assembled in scratch files private to this rule and renamed
# into place at the end, so a failed or parallel (-j) run never leaves a
# half-written `wordlist` or collides with another rule's scratch file.
# Scratch names end in .t so the `*.t` glob in cleantmp sweeps leftovers.
wordlist: phrase_lib.txt EZ.txt.in tsi.src
	iconv -c -f big5 -t utf8 tsi.src | $(SED) 's/# //g' | $(SED) 's/[ ][0-9].*//' > $@.raw.t
	$(SED) 's/\(.*\)\t[0-9][0-9]*.*/\1/' phrase_lib.txt | $(SED) '1,5d' >> $@.raw.t
	$(SED) '1,/BEGIN_TABLE/d' EZ.txt.in | colrm 1 8 | $(SED) 's/\t.*//' | $(GREP) "^...*" >> $@.raw.t
	sort $@.raw.t | uniq | $(SED) 's/ //g' > $@.new.t
	rm -f $@.raw.t
	mv -f $@.new.t $@
64
# Compile the printutf8 helper from its single C source file.
printutf8: printutf8.c
	$(CC) -o $@ $<
67
# Pull the kSimplifiedVariant records out of Unihan.txt, drop lines that
# contain '#', strip the field name, and pipe the rest through printutf8.
unihan.t2s.t: Unihan.txt printutf8
	$(GREP) kSimplifiedVariant $< | $(SED) -e '/#/d' -e 's/kSimplifiedVariant//' | ./printutf8 > $@
70
# Merge the manual overrides with the Unihan-derived table: for every key
# taken from trad2simp.manual (the first 10 columns of each line, via
# `colrm 11`), delete the lines of the Unihan table that start with that
# key, then emit the manual entries followed by the surviving lines.
# The working copy lives in scratch files private to this rule (the
# original shared tmp1/tmp2 with simp2trad.t, which races under make -j).
# Scratch names end in .t so the `*.t` glob in cleantmp sweeps leftovers.
trad2simp.t: trad2simp.manual unihan.t2s.t
	cp unihan.t2s.t $@.cur.t
	for I in `colrm 11 < trad2simp.manual` ; do $(SED) "/^$$I/d" $@.cur.t > $@.nxt.t && mv -f $@.nxt.t $@.cur.t; done
	cat trad2simp.manual $@.cur.t > $@
	rm -f $@.cur.t
75
# Pull the kTraditionalVariant records out of Unihan.txt, drop lines that
# contain '#', strip the field name, and pipe the rest through printutf8.
unihan.s2t.t: Unihan.txt printutf8
	$(GREP) kTraditionalVariant $< | $(SED) -e '/#/d' -e 's/kTraditionalVariant//' | ./printutf8 > $@
78
# Merge the manual overrides with the Unihan-derived table: for every key
# taken from simp2trad.manual (the first 10 columns of each line, via
# `colrm 11`), delete the lines of the Unihan table that start with that
# key, then emit the manual entries followed by the surviving lines.
# The working copy lives in scratch files private to this rule (the
# original shared tmp1/tmp2 with trad2simp.t, which races under make -j).
# Scratch names end in .t so the `*.t` glob in cleantmp sweeps leftovers.
simp2trad.t: unihan.s2t.t simp2trad.manual
	cp unihan.s2t.t $@.cur.t
	for I in `colrm 11 < simp2trad.manual` ; do $(SED) "/^$$I/d" $@.cur.t > $@.nxt.t && mv -f $@.nxt.t $@.cur.t; done
	cat simp2trad.manual $@.cur.t > $@
	rm -f $@.cur.t
83
# Keep only lines of trad2simp.t that are at least 19 characters long,
# then rewrite each one in a single sed pass: the first U+xxxxx field
# becomes an opening quote, the next |U+xxxxx becomes "=>", any remaining
# |U+xxxxx fields are dropped, and the first | left over becomes ",
t2s_1tomany.t: trad2simp.t
	$(GREP) -s ".\{19,\}" $< | $(SED) -e 's/U+...../"/' -e 's/|U+...../"=>"/' -e 's/|U+.....//g' -e 's/|/",/' > $@
86
# Build the one-to-one traditional->simplified table.
#   * Lines of trad2simp.t containing three or more '|' are discarded; on
#     the rest the U+ code-point fields are rewritten into "x"=>"y", form.
#   * Entries of s2t_1tomany.t whose right-hand side has 2, 3 or 4
#     characters are inverted: every character after the first becomes a
#     key that maps back to the left-hand character.
# The result is sorted and de-duplicated.
# Everything is assembled in scratch files private to this rule and
# renamed into place at the end; the original appended directly to the
# target and sorted through the shared file `t`, which races under
# make -j and can leave a half-built target behind.
# Scratch names end in .t so the `*.t` glob in cleantmp sweeps leftovers.
t2s_1to1.t: trad2simp.t s2t_1tomany.t
	$(SED) "/.*|.*|.*|.*/d" trad2simp.t | $(SED) 's/U+[0-9a-z][0-9a-z]*/"/' | $(SED) 's/|U+[0-9a-z][0-9a-z]*/"=>"/' | $(SED) 's/|/",/' > $@.raw.t
	$(GREP) '"."=>"..",' s2t_1tomany.t | $(SED) 's/\("."\)=>".\(.\)",/"\2"=>\1,/' >> $@.raw.t
	$(GREP) '"."=>"...",' s2t_1tomany.t | $(SED) 's/\("."\)=>".\(.\).",/"\2"=>\1,/' >> $@.raw.t
	$(GREP) '"."=>"...",' s2t_1tomany.t | $(SED) 's/\("."\)=>"..\(.\)",/"\2"=>\1,/' >> $@.raw.t
	$(GREP) '"."=>"....",' s2t_1tomany.t | $(SED) 's/\("."\)=>".\(.\)..",/"\2"=>\1,/' >> $@.raw.t
	$(GREP) '"."=>"....",' s2t_1tomany.t | $(SED) 's/\("."\)=>"..\(.\).",/"\2"=>\1,/' >> $@.raw.t
	$(GREP) '"."=>"....",' s2t_1tomany.t | $(SED) 's/\("."\)=>"...\(.\)",/"\2"=>\1,/' >> $@.raw.t
	sort $@.raw.t | uniq > $@.new.t
	rm -f $@.raw.t
	mv -f $@.new.t $@
97
98
# Keep only lines of simp2trad.t that are at least 19 characters long,
# then rewrite each one in a single sed pass: the first U+xxxxx field
# becomes an opening quote, the next |U+xxxxx becomes "=>", any remaining
# |U+xxxxx fields are dropped, and the first | left over becomes ",
s2t_1tomany.t: simp2trad.t
	$(GREP) -s ".\{19,\}" $< | $(SED) -e 's/U+...../"/' -e 's/|U+...../"=>"/' -e 's/|U+.....//g' -e 's/|/",/' > $@
101
# Build the one-to-one simplified->traditional table.
#   * Lines of simp2trad.t containing three or more '|' are discarded; on
#     the rest the U+ code-point fields are rewritten into "x"=>"y", form.
#   * Entries of t2s_1tomany.t whose right-hand side has 2, 3 or 4
#     characters are inverted: every character after the first becomes a
#     key that maps back to the left-hand character.
# The result is sorted and de-duplicated.
# Everything is assembled in scratch files private to this rule and
# renamed into place at the end; the original appended directly to the
# target and sorted through the shared file `t`, which races under
# make -j and can leave a half-built target behind.
# Scratch names end in .t so the `*.t` glob in cleantmp sweeps leftovers.
s2t_1to1.t: simp2trad.t t2s_1tomany.t
	$(SED) "/.*|.*|.*|.*/d" simp2trad.t | $(SED) 's/U+[0-9a-z][0-9a-z]*/"/' | $(SED) 's/|U+[0-9a-z][0-9a-z]*/"=>"/' | $(SED) 's/|/",/' > $@.raw.t
	$(GREP) '"."=>"..",' t2s_1tomany.t | $(SED) 's/\("."\)=>".\(.\)",/"\2"=>\1,/' >> $@.raw.t
	$(GREP) '"."=>"...",' t2s_1tomany.t | $(SED) 's/\("."\)=>".\(.\).",/"\2"=>\1,/' >> $@.raw.t
	$(GREP) '"."=>"...",' t2s_1tomany.t | $(SED) 's/\("."\)=>"..\(.\)",/"\2"=>\1,/' >> $@.raw.t
	$(GREP) '"."=>"....",' t2s_1tomany.t | $(SED) 's/\("."\)=>".\(.\)..",/"\2"=>\1,/' >> $@.raw.t
	$(GREP) '"."=>"....",' t2s_1tomany.t | $(SED) 's/\("."\)=>"..\(.\).",/"\2"=>\1,/' >> $@.raw.t
	$(GREP) '"."=>"....",' t2s_1tomany.t | $(SED) 's/\("."\)=>"...\(.\)",/"\2"=>\1,/' >> $@.raw.t
	sort $@.raw.t | uniq > $@.new.t
	rm -f $@.raw.t
	mv -f $@.new.t $@
112
# Gather candidate phrases from two libraries:
#   * EZ.txt.in: drop the first 8 columns and all tabs, keep lines that
#     start with 2-4 characters followed by a digit, then remove digits;
#   * tsi.src (Big5 -> UTF-8): cut everything from " <digit>" onwards,
#     remove '#' and spaces, keep lines of at least two characters.
# The union is sorted and de-duplicated into the target.
# Uses a scratch file private to this rule instead of the shared `t`, so
# parallel (-j) builds cannot clobber it. The scratch name ends in .t so
# the `*.t` glob in cleantmp sweeps a leftover.
tphrase.t: EZ.txt.in tsi.src
	colrm 1 8 < EZ.txt.in | $(SED) 's/\t//g' | $(GREP) "^.\{2,4\}[0-9]" | $(SED) 's/[0-9]//g' > $@.raw.t
	iconv -c -f big5 -t utf8 tsi.src | $(SED) 's/ [0-9].*//g' | $(SED) 's/[# ]//g'| $(GREP) "^.\{2,4\}" >> $@.raw.t
	sort $@.raw.t | uniq > $@.new.t
	rm -f $@.raw.t
	mv -f $@.new.t $@
117
# For every distinct character found on the right-hand side of
# s2t_1tomany.t (skipping the first character after =>"), collect the
# lines of tphrase.t that contain it.
# `|| true` keeps a final grep without matches from failing the rule.
alltradphrases.t: tphrase.t s2t_1tomany.t
	for i in `$(SED) 's/.*=>".//' s2t_1tomany.t | $(SED) 's/"//g' |$(SED) 's/,/\n/' | $(SED) 's/\(.\)/\1\n/g' |sort | uniq`; do \
		$(GREP) -s $$i tphrase.t ; \
	done > $@ || true
120
121
# Two-character phrases from alltradphrases.t, sorted and de-duplicated.
tradphrases_2.t: alltradphrases.t
	$(GREP) "^..$$" $< | sort | uniq > $@
124
# Three-character phrases from alltradphrases.t, minus those that contain
# any two-character phrase from tradphrases_2.t (the matches are collected
# and the diff '>' lines, i.e. the unmatched phrases, are kept).
# tradphrases_2.t is read by the recipe, so it is declared as a
# prerequisite; without it a parallel or stand-alone build could run
# before that file exists.
# Work is done in scratch files private to this rule rather than the
# shared `t`/`t3`. Scratch names end in .t so the `*.t` glob in cleantmp
# sweeps leftovers.
tradphrases_3.t: alltradphrases.t tradphrases_2.t
	$(GREP) "^...$$" alltradphrases.t | sort | uniq > $@.all.t
	for i in `cat tradphrases_2.t`; do $(GREP) $$i $@.all.t ; done | sort | uniq > $@.hit.t || true
	$(DIFF) $@.hit.t $@.all.t | $(GREP) ">" | $(SED) 's/> //' > $@.new.t
	rm -f $@.all.t $@.hit.t
	mv -f $@.new.t $@
130
131
# Four-character phrases from alltradphrases.t, filtered in two passes:
# first drop those containing a phrase from tradphrases_2.t, then drop
# those containing a phrase from tradphrases_3.t (each pass keeps the
# diff '>' lines, i.e. the phrases that were not matched).
# tradphrases_2.t and tradphrases_3.t are read by the recipe, so they are
# declared as prerequisites; without them a parallel or stand-alone build
# could run before those files exist.
# Work is done in scratch files private to this rule rather than the
# shared `t`/`t3`. Scratch names end in .t so the `*.t` glob in cleantmp
# sweeps leftovers.
tradphrases_4.t: alltradphrases.t tradphrases_2.t tradphrases_3.t
	$(GREP) "^....$$" alltradphrases.t | sort | uniq > $@.all.t
	for i in `cat tradphrases_2.t`; do $(GREP) $$i $@.all.t ; done | sort | uniq > $@.hit.t || true
	$(DIFF) $@.hit.t $@.all.t | $(GREP) ">" | $(SED) 's/> //' > $@.new.t
	mv -f $@.new.t $@.all.t
	for i in `cat tradphrases_3.t`; do $(GREP) $$i $@.all.t ; done | sort | uniq > $@.hit.t || true
	$(DIFF) $@.hit.t $@.all.t | $(GREP) ">" | $(SED) 's/> //' > $@.new.t
	rm -f $@.all.t $@.hit.t
	mv -f $@.new.t $@
140
# Combine the manual phrase list with the 2-, 3- and 4-character phrase
# lists (sorted, de-duplicated), then grep that list for the leading
# character of every t2s_1tomany.t entry and keep the diff '<' lines,
# i.e. lines of the combined list that diff does not pair with grep output.
# The combined list is held in a scratch file private to this rule and the
# result is renamed into place, instead of rewriting the target through
# the shared file `t` (which races under make -j).
# Scratch names end in .t so the `*.t` glob in cleantmp sweeps leftovers.
tradphrases.t: tradphrases.manual tradphrases_2.t tradphrases_3.t tradphrases_4.t t2s_1tomany.t
	cat tradphrases.manual tradphrases_2.t tradphrases_3.t tradphrases_4.t |sort | uniq > $@.all.t
	for i in `$(SED) 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do $(GREP) $$i $@.all.t ; done | $(DIFF) $@.all.t - | $(GREP) '<' | $(SED) 's/< //' > $@.new.t
	rm -f $@.all.t
	mv -f $@.new.t $@
145
# Combine the 2-, 3- and 4-character phrase lists (sorted, de-duplicated),
# grep that list for the leading character of every t2s_1tomany.t entry,
# and keep the diff '>' lines of (combined list vs. grep output).
# The combined list goes to a scratch file private to this rule and is
# removed afterwards; the original wrote it to the shared file `t`, which
# races under make -j and is left behind (cleantmp's `*.t` glob does not
# match a file named just `t`). The scratch name ends in .t so that glob
# sweeps a leftover.
tradphrases.notsure: tradphrases_2.t tradphrases_3.t tradphrases_4.t t2s_1tomany.t
	cat tradphrases_2.t tradphrases_3.t tradphrases_4.t |sort | uniq > $@.all.t
	for i in `$(SED) 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do $(GREP) $$i $@.all.t; done | $(DIFF) $@.all.t - | $(GREP) '>' | $(SED) 's/> //' > $@
	rm -f $@.all.t
149
150
# Strip tabs, digits and ASCII letters from phrase_lib.txt and keep the
# lines that are exactly 2 to 4 characters long.
ph.t: phrase_lib.txt
	$(SED) 's/[\t0-9a-zA-Z]//g' $< | $(GREP) "^.\{2,4\}$$" > $@
153
# Take the table body of Wubi.txt.in (everything after BEGIN_TABLE), drop
# the first 8 columns and anything from the first tab onwards, and keep
# entries of two or more characters.
Wubi.t: Wubi.txt.in
	$(SED) '1,/BEGIN_TABLE/d' $< | colrm 1 8 | $(SED) 's/\t.*//' | $(GREP) "^...*" > $@
156
# Take the table body of Ziranma.txt.in (everything after BEGIN_TABLE),
# drop the first 8 columns and anything from the first tab onwards, and
# keep entries of two or more characters.
Ziranma.t: Ziranma.txt.in
	$(SED) '1,/BEGIN_TABLE/d' $< | colrm 1 8 | $(SED) 's/\t.*//' | $(GREP) "^...*" > $@
159
160
# For every distinct character found on the right-hand side of
# t2s_1tomany.t (skipping the first character after =>"), collect the
# lines containing it from Wubi.t, then Ziranma.t, then ph.t.
# The character list is computed once and reused for all three libraries.
# `|| true` mirrors alltradphrases.t: a shell `for` loop returns the
# status of its last command, so without it a final grep that finds no
# match would abort the build. Redirecting the whole loop to $@ always
# (re)creates the target, which replaces the up-front `rm -f`.
allsimpphrases.t: t2s_1tomany.t ph.t Wubi.t Ziranma.t
	chars=`$(SED) 's/.*=>".//' t2s_1tomany.t | $(SED) 's/"//g' | $(SED) 's/,/\n/' | $(SED) 's/\(.\)/\1\n/g' | sort | uniq`; \
	for lib in Wubi.t Ziranma.t ph.t; do \
		for i in $$chars; do $(GREP) $$i $$lib; done; \
	done > $@ || true
166
# Two-character phrases from allsimpphrases.t, sorted and de-duplicated.
simpphrases_2.t: allsimpphrases.t
	$(GREP) "^..$$" $< | sort | uniq > $@
169
# Three-character phrases from allsimpphrases.t, minus those that contain
# any two-character phrase from simpphrases_2.t (the matches are collected
# and the diff '>' lines, i.e. the unmatched phrases, are kept).
# simpphrases_2.t is read by the recipe, so it is declared as a
# prerequisite; without it a parallel or stand-alone build could run
# before that file exists.
# Work is done in scratch files private to this rule rather than the
# shared `t`/`t3`. Scratch names end in .t so the `*.t` glob in cleantmp
# sweeps leftovers.
simpphrases_3.t: allsimpphrases.t simpphrases_2.t
	$(GREP) "^...$$" allsimpphrases.t | sort | uniq > $@.all.t
	for i in `cat simpphrases_2.t`; do $(GREP) $$i $@.all.t ; done | sort | uniq > $@.hit.t || true
	$(DIFF) $@.hit.t $@.all.t | $(GREP) ">" | $(SED) 's/> //' > $@.new.t
	rm -f $@.all.t $@.hit.t
	mv -f $@.new.t $@
175
# Four-character phrases from allsimpphrases.t, filtered in two passes:
# first drop those containing a phrase from simpphrases_2.t, then drop
# those containing a phrase from simpphrases_3.t (each pass keeps the
# diff '>' lines, i.e. the phrases that were not matched).
# simpphrases_2.t and simpphrases_3.t are read by the recipe, so they are
# declared as prerequisites; without them a parallel or stand-alone build
# could run before those files exist.
# Work is done in scratch files private to this rule rather than the
# shared `t`/`t3`. Scratch names end in .t so the `*.t` glob in cleantmp
# sweeps leftovers.
simpphrases_4.t: allsimpphrases.t simpphrases_2.t simpphrases_3.t
	$(GREP) "^....$$" allsimpphrases.t | sort | uniq > $@.all.t
	for i in `cat simpphrases_2.t`; do $(GREP) $$i $@.all.t; done | sort | uniq > $@.hit.t || true
	$(DIFF) $@.hit.t $@.all.t | $(GREP) ">" | $(SED) 's/> //' > $@.new.t
	mv -f $@.new.t $@.all.t
	for i in `cat simpphrases_3.t`; do $(GREP) $$i $@.all.t; done | sort | uniq > $@.hit.t || true
	$(DIFF) $@.hit.t $@.all.t | $(GREP) ">" | $(SED) 's/> //' > $@.new.t
	rm -f $@.all.t $@.hit.t
	mv -f $@.new.t $@
186
# Concatenate the 2-, 3- and 4-character simplified phrase lists (in that
# order, not re-sorted), then grep that list for the leading character of
# every t2s_1tomany.t entry and keep the diff '<' lines, i.e. lines of
# the concatenated list that diff does not pair with grep output.
# The concatenated list is held in a scratch file private to this rule and
# the result is renamed into place, instead of rewriting the target
# through the shared file `t` (which races under make -j).
# Scratch names end in .t so the `*.t` glob in cleantmp sweeps leftovers.
simpphrases.t: simpphrases_2.t simpphrases_3.t simpphrases_4.t t2s_1tomany.t
	cat simpphrases_2.t simpphrases_3.t simpphrases_4.t > $@.all.t
	for i in `$(SED) 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do $(GREP) $$i $@.all.t ; done | $(DIFF) $@.all.t - | $(GREP) '<' | $(SED) 's/< //' > $@.new.t
	rm -f $@.all.t
	mv -f $@.new.t $@
191
192
# Concatenate the 2-, 3- and 4-character simplified phrase lists, grep
# that list for the leading character of every t2s_1tomany.t entry, and
# keep the diff '>' lines of (concatenated list vs. grep output).
# The concatenated list goes to a scratch file private to this rule and is
# removed afterwards; the original wrote it to the shared file `t`, which
# races under make -j and is left behind (cleantmp's `*.t` glob does not
# match a file named just `t`). The scratch name ends in .t so that glob
# sweeps a leftover.
simpphrases.notsure: simpphrases_2.t simpphrases_3.t simpphrases_4.t t2s_1tomany.t
	cat simpphrases_2.t simpphrases_3.t simpphrases_4.t > $@.all.t
	for i in `$(SED) 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do $(GREP) $$i $@.all.t ; done | $(DIFF) $@.all.t - | $(GREP) '>' | $(SED) 's/> //' > $@
	rm -f $@.all.t
196
# Truncate every t2s_1tomany.t entry to its first seven characters plus a
# closing ", and follow those with the one-to-one table.
trad2simp1to1.t: t2s_1tomany.t t2s_1to1.t
	{ $(SED) 's/\(.......\).*/\1",/' t2s_1tomany.t && cat t2s_1to1.t; } > $@
200
# Truncate every s2t_1tomany.t entry to its first seven characters plus a
# closing ", and follow those with the one-to-one table.
simp2trad1to1.t: s2t_1tomany.t s2t_1to1.t
	{ $(SED) 's/\(.......\).*/\1",/' s2t_1tomany.t && cat s2t_1to1.t; } > $@
204
# Generate a throw-away PHP script: it defines $trad2simp from the
# one-to-one table, puts the whole traditional phrase list in $str, and
# echoes strtr($str, $trad2simp).
# ($$ is Make's escape for a literal $ in the emitted PHP.)
trad2simp.php: trad2simp1to1.t tradphrases.t
	{ \
	  printf '<?php\n$$trad2simp=array(' && \
	  cat trad2simp1to1.t && \
	  printf ');\n$$str=\n"' && \
	  cat tradphrases.t && \
	  printf '";\n$$t=strtr($$str, $$trad2simp);\necho $$t;\n?>' ; \
	} > $@
211
# Generate a throw-away PHP script: it defines $simp2trad from the
# one-to-one table, puts the whole simplified phrase list in $str, and
# echoes strtr($str, $simp2trad).
# ($$ is Make's escape for a literal $ in the emitted PHP.)
simp2trad.php: simp2trad1to1.t simpphrases.t
	{ \
	  printf '<?php\n$$simp2trad=array(' && \
	  cat simp2trad1to1.t && \
	  printf ');\n$$str=\n"' && \
	  cat simpphrases.t && \
	  printf '";\n$$t=strtr($$str, $$simp2trad);\necho $$t;\n?>' ; \
	} > $@
218
# Pair each converted phrase with its original: the left column is the
# output of trad2simp.php wrapped as `"..." => `, the right column is the
# matching line of tradphrases.t wrapped as `"...",`; paste joins them
# line by line.
# The two columns are staged in scratch files private to this rule; the
# original used tmp1/tmp2, which trad2simp.phrases.t also writes, so the
# two rules could corrupt each other under make -j.
# Scratch names end in .t so the `*.t` glob in cleantmp sweeps leftovers.
simp2trad.phrases.t: trad2simp.php tradphrases.t
	php -f trad2simp.php | $(SED) 's/\(.*\)/"\1" => /' > $@.lhs.t
	$(SED) 's/\(.*\)/"\1",/' tradphrases.t > $@.rhs.t
	paste $@.lhs.t $@.rhs.t > $@
	rm -f $@.lhs.t $@.rhs.t
223
# Pair each converted phrase with its original: the left column is the
# output of simp2trad.php wrapped as `"..." => `, the right column is the
# matching line of simpphrases.t wrapped as `"...",`; paste joins them
# line by line.
# The two columns are staged in scratch files private to this rule; the
# original used tmp1/tmp2, which simp2trad.phrases.t also writes, so the
# two rules could corrupt each other under make -j.
# Scratch names end in .t so the `*.t` glob in cleantmp sweeps leftovers.
trad2simp.phrases.t: simp2trad.php simpphrases.t
	php -f simp2trad.php | $(SED) 's/\(.*\)/"\1" => /' > $@.lhs.t
	$(SED) 's/\(.*\)/"\1",/' simpphrases.t > $@.rhs.t
	paste $@.lhs.t $@.rhs.t > $@
	rm -f $@.lhs.t $@.rhs.t
228
# Dictionary form of the to-Simplified tables: commas, spaces and tabs are
# removed and the first => on each line becomes a tab. The one-to-one
# table comes first, the phrase table is appended after it.
toHans.dict: trad2simp1to1.t trad2simp.phrases.t
	$(SED) 's/[, \t]//g' trad2simp1to1.t | $(SED) 's/=>/\t/' > $@
	$(SED) 's/[, \t]//g' trad2simp.phrases.t | $(SED) 's/=>/\t/' >> $@
232
# Dictionary form of the to-Traditional tables: commas, spaces and tabs
# are removed and the first => on each line becomes a tab. The one-to-one
# table comes first, the phrase table is appended after it.
toHant.dict: simp2trad1to1.t simp2trad.phrases.t
	$(SED) 's/[, \t]//g' simp2trad1to1.t | $(SED) 's/=>/\t/' > $@
	$(SED) 's/[, \t]//g' simp2trad.phrases.t | $(SED) 's/=>/\t/' >> $@
236
# Regional dictionaries: each toXX.dict is produced from toXX.manual by
# removing spaces and quoting both sides of the tab-separated pair.
# One static pattern rule covers the four regions.
toTW.dict toHK.dict toCN.dict toSG.dict: %.dict: %.manual
	$(SED) 's/ //g' $< | $(SED) 's/\(^.*\)\t\(.*\)/"\1"\t"\2"/' > $@
248
# Assemble ZhConversion.php: a generated-file header followed by six PHP
# arrays. $zh2Hant and $zh2Hans are filled from the one-to-one and phrase
# tables; $zh2TW, $zh2HK, $zh2CN and $zh2SG are filled by turning each
# tab-separated pair of the matching .manual file into `"a" => "b",`.
# All pieces are written through one grouped redirect; the && chain stops
# at the first failing command. ($$ is Make's escape for a literal $.)
ZhConversion.php: simp2trad1to1.t simp2trad.phrases.t trad2simp1to1.t trad2simp.phrases.t toCN.manual toHK.manual toSG.manual toTW.manual
	{ \
	  printf '<?php\n/**\n * Simplified / Traditional Chinese conversion tables\n' && \
	  printf ' *\n * Automatically generated using code and data in includes/zhtable/\n' && \
	  printf ' * Do not modify directly!\n */\n\n' && \
	  printf '$$zh2Hant = array(\n' && \
	  cat simp2trad1to1.t && \
	  echo && \
	  cat simp2trad.phrases.t && \
	  echo ');' && \
	  echo && \
	  printf '$$zh2Hans = array(\n' && \
	  cat trad2simp1to1.t && \
	  echo && \
	  cat trad2simp.phrases.t && \
	  echo ');' && \
	  echo && \
	  printf '$$zh2TW = array(\n' && \
	  $(SED) 's/\(.*\)\t\(.*\)/"\1" => "\2",/' toTW.manual && \
	  echo ');' && \
	  echo && \
	  printf '$$zh2HK = array(\n' && \
	  $(SED) 's/\(.*\)\t\(.*\)/"\1" => "\2",/' toHK.manual && \
	  echo ');' && \
	  echo && \
	  printf '$$zh2CN = array(\n' && \
	  $(SED) 's/\(.*\)\t\(.*\)/"\1" => "\2",/' toCN.manual && \
	  echo ');' && \
	  echo && \
	  printf '$$zh2SG = array(\n' && \
	  $(SED) 's/\(.*\)\t\(.*\)/"\1" => "\2",/' toSG.manual && \
	  echo && \
	  printf ');' ; \
	} > $@
281
# Remove everything this Makefile generates or downloads.
# Declared phony so a stray file named `clean` cannot disable the target.
.PHONY: clean
clean: cleantmp cleandl
283
# Remove unpacked inputs, generated outputs and scratch files, leaving the
# downloaded archives in place.
# Declared phony so a stray file named `cleantmp` cannot disable the target.
.PHONY: cleantmp
cleantmp:
# Stuff unpacked from the files fetched by wget
	rm -f \
		Unihan.txt \
		EZ.txt.in \
		Wubi.txt.in \
		Ziranma.txt.in \
		phrase_lib.txt \
		tsi.src
# Temporary files and other trash. The bare scratch file `t` is listed
# explicitly because the `*.t` glob does not match it.
	rm -f ZhConversion.php t tmp1 tmp2 tmp3 t3 *.t trad2simp.php simp2trad.php *.dict printutf8 *~ \
		simpphrases.notsure tradphrases.notsure wordlist
296
# Remove the downloaded archives.
# Declared phony so a stray file named `cleandl` cannot disable the target.
.PHONY: cleandl
cleandl:
	rm -f \
		Unihan.zip \
		scim-tables-$(SCIM_TABLES_VER).tar.gz \
		scim-pinyin-$(SCIM_PINYIN_VER).tar.gz \
		libtabe-$(LIBTABE_VER).tgz
303