8418269192c9ac17c4008f8779b098362c368a03
[lhc/web/wiklou.git] / includes / zhtable / Makefile
#
# Creates the file ZhConversion.php used for Simplified/Traditional
# Chinese conversion. It gets the basic conversion table from the Unihan
# database, and constructs the phrase tables using phrase libraries in
# the SCIM packages. There are also special tables used for adjustment.
# Some data in the file simp2trad.manual was taken from the following
# paper:
#   (the citation is not present in this copy of the file)
# Requirement: you need to set your locale to zh_CN.UTF-8 (or any
# other UTF-8 locale, I suppose)
#
#
# Default goal: build the generated conversion table.
# `all` names a command, not a file, so it is declared phony; otherwise a
# stray file called "all" in this directory would turn the default build
# into a silent no-op.
.PHONY: all
all: ZhConversion.php
13
# Fetch the Unihan database archive and unpack it in the current directory.
# -O pins the download name: without it a leftover Unihan.zip makes wget save
# the new copy as Unihan.zip.1, and unzip would then unpack the stale archive.
# -o stops unzip from prompting when files from an earlier unpack still exist.
# NOTE(review): assumes the archive contains a file named Unihan.txt; confirm
# against the current layout of the UNIDATA directory.
Unihan.txt:
	wget -O Unihan.zip ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip
	unzip -o Unihan.zip
17
# Download the scim-tables 0.4.3 tarball, copy zh/EZ.txt.in out of it, then
# delete the tarball and the unpacked tree.
EZ.txt.in:
	wget http://freedesktop.org/~suzhe/sources/scim-tables-0.4.3.tar.gz
	tar -xzvf scim-tables-0.4.3.tar.gz > /dev/null
	cp scim-tables-0.4.3/zh/$@ .
	rm -rf scim-tables-0.4.3*
23
# Download the scim-chinese 0.4.2 tarball, copy data/phrase_lib.txt out of it,
# then delete the tarball and the unpacked tree.
phrase_lib.txt:
	wget http://freedesktop.org/~suzhe/scim-chinese/scim-chinese-0.4.2.tar.gz
	tar -xzvf scim-chinese-0.4.2.tar.gz > /dev/null
	cp scim-chinese-0.4.2/data/$@ .
	rm -rf scim-chinese-0.4.2*
29
# Compile the printutf8 helper that the unihan.*.t rules pipe their data
# through. Uses make's conventional CC/CFLAGS variables instead of a
# hard-coded gcc, so the compiler and flags can be chosen on the command
# line (for example `make CC=clang`).
printutf8: printutf8.c
	$(CC) $(CFLAGS) -o $@ $<
32
# Pull the kSimplifiedVariant records out of Unihan.txt: drop every line that
# contains '#', strip the field name, and pass the rest through ./printutf8.
unihan.t2s.t: Unihan.txt printutf8
	grep kSimplifiedVariant $< \
	| sed -e '/#/d' -e 's/kSimplifiedVariant//' \
	| ./printutf8 > $@
35
# Merge the hand-maintained overrides with the Unihan-derived table.
# For every whitespace-separated word in the first 10 columns of
# trad2simp.manual, the lines of unihan.t2s.t that start with that word are
# dropped; the manual file is then placed in front of what is left.
# Scratch files are named after the target: the generic tmp1/tmp2 names are
# also used by other rules in this file, which corrupts the output under
# `make -j`. The names end in .t so the `*.t` glob in `clean` still sweeps
# them if the recipe is interrupted.
trad2simp.t: trad2simp.manual unihan.t2s.t
	cp unihan.t2s.t tmp1.$@
	for I in `colrm 11 < trad2simp.manual` ; do sed "/^$$I/d" tmp1.$@ > tmp2.$@; mv tmp2.$@ tmp1.$@; done
	cat trad2simp.manual tmp1.$@ > $@
	rm -f tmp1.$@
40
# Pull the kTraditionalVariant records out of Unihan.txt: drop every line that
# contains '#', strip the field name, and pass the rest through ./printutf8.
unihan.s2t.t: Unihan.txt printutf8
	grep kTraditionalVariant $< \
	| sed -e '/#/d' -e 's/kTraditionalVariant//' \
	| ./printutf8 > $@
43
# Merge the hand-maintained overrides with the Unihan-derived table.
# For every whitespace-separated word in the first 10 columns of
# simp2trad.manual, the lines of unihan.s2t.t that start with that word are
# dropped; the manual file is then placed in front of what is left.
# Scratch files are named after the target: the generic tmp1/tmp2 names are
# also used by other rules in this file, which corrupts the output under
# `make -j`. The names end in .t so the `*.t` glob in `clean` still sweeps
# them if the recipe is interrupted.
simp2trad.t: unihan.s2t.t simp2trad.manual
	cp unihan.s2t.t tmp1.$@
	for I in `colrm 11 < simp2trad.manual` ; do sed "/^$$I/d" tmp1.$@ > tmp2.$@; mv tmp2.$@ tmp1.$@; done
	cat simp2trad.manual tmp1.$@ > $@
	rm -f tmp1.$@
48
# Keep the lines of trad2simp.t that are at least 19 characters long and
# rewrite them: the first U+xxxxx field becomes an opening quote, the second
# becomes "=>", any further |U+xxxxx fields are removed, and the first
# remaining '|' becomes the closing quote and comma.
t2s_1tomany.t: trad2simp.t
	grep -s ".\{19,\}" $< \
	| sed -e 's/U+...../"/' \
	      -e 's/|U+...../"=>"/' \
	      -e 's/|U+.....//g' \
	      -e 's/|/",/' > $@
51
# Build the one-to-one traditional->simplified pairs.
#  * Lines of trad2simp.t containing three or more '|' are dropped; the rest
#    are rewritten with the U+... fields turned into quote / "=>" markers.
#  * For s2t_1tomany.t entries whose value holds 2, 3 or 4 characters, a
#    reversed pair "value-char"=>"key" is emitted for every value character
#    except the first one.
# Everything is streamed through one sort | uniq straight into the target.
# The original appended step by step and de-duplicated through a scratch
# file named `t`, which several other rules also use (a race under
# `make -j`).
t2s_1to1.t: trad2simp.t s2t_1tomany.t
	{ sed "/.*|.*|.*|.*/d" trad2simp.t | sed 's/U+[0-9a-z][0-9a-z]*/"/' | sed 's/|U+[0-9a-z][0-9a-z]*/"=>"/' | sed 's/|/",/' ; \
	  grep '"."=>"..",' s2t_1tomany.t | sed 's/\("."\)=>".\(.\)",/"\2"=>\1,/' ; \
	  grep '"."=>"...",' s2t_1tomany.t | sed 's/\("."\)=>".\(.\).",/"\2"=>\1,/' ; \
	  grep '"."=>"...",' s2t_1tomany.t | sed 's/\("."\)=>"..\(.\)",/"\2"=>\1,/' ; \
	  grep '"."=>"....",' s2t_1tomany.t | sed 's/\("."\)=>".\(.\)..",/"\2"=>\1,/' ; \
	  grep '"."=>"....",' s2t_1tomany.t | sed 's/\("."\)=>"..\(.\).",/"\2"=>\1,/' ; \
	  grep '"."=>"....",' s2t_1tomany.t | sed 's/\("."\)=>"...\(.\)",/"\2"=>\1,/' ; \
	} | sort | uniq > $@
62
63
# Keep the lines of simp2trad.t that are at least 19 characters long and
# rewrite them: the first U+xxxxx field becomes an opening quote, the second
# becomes "=>", any further |U+xxxxx fields are removed, and the first
# remaining '|' becomes the closing quote and comma.
s2t_1tomany.t: simp2trad.t
	grep -s ".\{19,\}" $< \
	| sed -e 's/U+...../"/' \
	      -e 's/|U+...../"=>"/' \
	      -e 's/|U+.....//g' \
	      -e 's/|/",/' > $@
66
# Build the one-to-one simplified->traditional pairs.
#  * Lines of simp2trad.t containing three or more '|' are dropped; the rest
#    are rewritten with the U+... fields turned into quote / "=>" markers.
#  * For t2s_1tomany.t entries whose value holds 2, 3 or 4 characters, a
#    reversed pair "value-char"=>"key" is emitted for every value character
#    except the first one.
# Everything is streamed through one sort | uniq straight into the target.
# The original appended step by step and de-duplicated through a scratch
# file named `t`, which several other rules also use (a race under
# `make -j`).
s2t_1to1.t: simp2trad.t t2s_1tomany.t
	{ sed "/.*|.*|.*|.*/d" simp2trad.t | sed 's/U+[0-9a-z][0-9a-z]*/"/' | sed 's/|U+[0-9a-z][0-9a-z]*/"=>"/' | sed 's/|/",/' ; \
	  grep '"."=>"..",' t2s_1tomany.t | sed 's/\("."\)=>".\(.\)",/"\2"=>\1,/' ; \
	  grep '"."=>"...",' t2s_1tomany.t | sed 's/\("."\)=>".\(.\).",/"\2"=>\1,/' ; \
	  grep '"."=>"...",' t2s_1tomany.t | sed 's/\("."\)=>"..\(.\)",/"\2"=>\1,/' ; \
	  grep '"."=>"....",' t2s_1tomany.t | sed 's/\("."\)=>".\(.\)..",/"\2"=>\1,/' ; \
	  grep '"."=>"....",' t2s_1tomany.t | sed 's/\("."\)=>"..\(.\).",/"\2"=>\1,/' ; \
	  grep '"."=>"....",' t2s_1tomany.t | sed 's/\("."\)=>"...\(.\)",/"\2"=>\1,/' ; \
	} | sort | uniq > $@
77
# Reduce EZ.txt.in to bare phrases: cut away the first 8 columns, delete
# tabs, keep lines where 2 to 4 characters are followed by a digit, then
# delete all digits.
ez.t: EZ.txt.in
	colrm 1 8 < $< \
	| sed 's/\t//g' \
	| grep "^.\{2,4\}[0-9]" \
	| sed 's/[0-9]//g' > $@
80
# Collect every ez.t line that contains one of the value characters of
# s2t_1tomany.t (each entry's first value character is skipped). The inner
# pipeline splits the values into one character per line and de-duplicates
# them. The trailing `|| true` keeps the recipe from failing when the final
# grep finds nothing.
alltradphrases.t: ez.t s2t_1tomany.t
	for i in $$(sed -e 's/.*=>".//' -e 's/"//g' -e 's/,/\n/' s2t_1tomany.t \
	            | sed 's/\(.\)/\1\n/g' | sort | uniq); do \
		grep -s $$i ez.t ; \
	done > $@ || true
83
84
# The sorted, de-duplicated two-character lines of alltradphrases.t.
tradphrases_2.t: alltradphrases.t
	grep "^..$$" $< | sort | uniq > $@
87
# Three-character lines of alltradphrases.t that match none of the entries
# in tradphrases_2.t.
#  cand.* : sorted, unique three-character candidates
#  hit.*  : candidates matched by at least one tradphrases_2.t entry
# The target is the ">" side of diff(hit, cand), i.e. candidates not in hit.
# tradphrases_2.t is read by the recipe, so it is now a declared prerequisite
# (it was missing, leaving the rule unordered under `make -j` and not rebuilt
# when the two-character list changes). Scratch files are per-target instead
# of the shared names `t`/`t3`, and end in .t so `clean` sweeps leftovers.
tradphrases_3.t: alltradphrases.t tradphrases_2.t
	grep "^...$$" alltradphrases.t | sort | uniq > cand.$@
	for i in `cat tradphrases_2.t`; do grep $$i cand.$@ ; done | sort | uniq > hit.$@ || true
	diff hit.$@ cand.$@ | grep ">" | sed 's/> //' > $@
	rm -f cand.$@ hit.$@
93
94
# Four-character lines of alltradphrases.t that match none of the entries in
# tradphrases_2.t and, after that first pass, none of the entries in
# tradphrases_3.t.
#  cand.* : sorted, unique four-character candidates
#  hit.*  : lines matched in the current pass
#  kept.* : survivors of the tradphrases_2.t pass
# Both phrase lists are read by the recipe and are now declared
# prerequisites (they were missing). Scratch files are per-target instead of
# the shared names `t`/`t3`, and end in .t so `clean` sweeps leftovers.
tradphrases_4.t: alltradphrases.t tradphrases_2.t tradphrases_3.t
	grep "^....$$" alltradphrases.t | sort | uniq > cand.$@
	for i in `cat tradphrases_2.t`; do grep $$i cand.$@ ; done | sort | uniq > hit.$@ || true
	diff hit.$@ cand.$@ | grep ">" | sed 's/> //' > kept.$@
	for i in `cat tradphrases_3.t`; do grep $$i kept.$@ ; done | sort | uniq > hit.$@ || true
	diff hit.$@ kept.$@ | grep ">" | sed 's/> //' > $@
	rm -f cand.$@ hit.$@ kept.$@
103
# Final traditional phrase list.
# all.* is the sorted, unique union of the manual list and the 2/3/4
# character lists. Each key character of t2s_1tomany.t (the character after
# the opening quote) is then grepped in that union, and the target receives
# the "<" side of a diff between the union and the grep output.
# The union goes to a per-target scratch file instead of the shared name `t`
# (a race under `make -j`); its name ends in .t so `clean` sweeps leftovers.
# NOTE(review): the grep output given to diff is neither sorted nor
# de-duplicated, unlike in the tradphrases_3.t / tradphrases_4.t rules, so
# some lines that matched a key may still be reported as "<" and kept.
# Confirm whether that is intended before changing it.
tradphrases.t: tradphrases.manual tradphrases_2.t tradphrases_3.t tradphrases_4.t t2s_1tomany.t
	cat tradphrases.manual tradphrases_2.t tradphrases_3.t tradphrases_4.t |sort | uniq > all.$@
	for i in `sed 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do grep $$i all.$@ ; done | diff all.$@ - | grep '<' | sed 's/< //' > $@
	rm -f all.$@
108
# Reduce phrase_lib.txt to bare phrases: delete tabs, digits and ASCII
# letters, then keep only lines that are 2 to 4 characters long.
ph.t: phrase_lib.txt
	sed 's/[\t0-9a-zA-Z]//g' $< \
	| grep "^.\{2,4\}$$" > $@
111
# Collect every ph.t line that contains one of the value characters of
# t2s_1tomany.t (each entry's first value character is skipped).
# Fixes relative to the original rule:
#  * t2s_1tomany.t is read by the recipe, so it is now a declared
#    prerequisite (it was missing: the rule could run before that file
#    existed under `make -j` and was not rebuilt when it changed).
#  * the loop's exit status is that of the last grep, so a final character
#    with no match made the recipe fail; `|| true` matches what the
#    alltradphrases.t rule already does.
#  * the loop output is redirected once, replacing the rm -f + append pair,
#    so the target is created even when nothing matches.
allsimpphrases.t: ph.t t2s_1tomany.t
	for i in `cat t2s_1tomany.t | sed 's/.*=>".//' | sed 's/"//g' | sed 's/,/\n/' | sed 's/\(.\)/\1\n/g' | sort | uniq `; do grep $$i ph.t; done > $@ || true
115
# The sorted, de-duplicated two-character lines of allsimpphrases.t.
simpphrases_2.t: allsimpphrases.t
	grep "^..$$" $< | sort | uniq > $@
118
# Three-character lines of allsimpphrases.t that match none of the entries
# in simpphrases_2.t.
#  cand.* : sorted, unique three-character candidates
#  hit.*  : candidates matched by at least one simpphrases_2.t entry
# The target is the ">" side of diff(hit, cand), i.e. candidates not in hit.
# simpphrases_2.t is read by the recipe, so it is now a declared prerequisite
# (it was missing). Scratch files are per-target instead of the shared names
# `t`/`t3`, and end in .t so `clean` sweeps leftovers.
simpphrases_3.t: allsimpphrases.t simpphrases_2.t
	grep "^...$$" allsimpphrases.t | sort | uniq > cand.$@
	for i in `cat simpphrases_2.t`; do grep $$i cand.$@ ; done | sort | uniq > hit.$@ || true
	diff hit.$@ cand.$@ | grep ">" | sed 's/> //' > $@
	rm -f cand.$@ hit.$@
124
# Four-character lines of allsimpphrases.t that match none of the entries in
# simpphrases_2.t and, after that first pass, none of the entries in
# simpphrases_3.t.
#  cand.* : sorted, unique four-character candidates
#  hit.*  : lines matched in the current pass
#  kept.* : survivors of the simpphrases_2.t pass
# Both phrase lists are read by the recipe and are now declared
# prerequisites (they were missing). Scratch files are per-target instead of
# the shared names `t`/`t3`, and end in .t so `clean` sweeps leftovers.
simpphrases_4.t: allsimpphrases.t simpphrases_2.t simpphrases_3.t
	grep "^....$$" allsimpphrases.t | sort | uniq > cand.$@
	for i in `cat simpphrases_2.t`; do grep $$i cand.$@ ; done | sort | uniq > hit.$@ || true
	diff hit.$@ cand.$@ | grep ">" | sed 's/> //' > kept.$@
	for i in `cat simpphrases_3.t`; do grep $$i kept.$@ ; done | sort | uniq > hit.$@ || true
	diff hit.$@ kept.$@ | grep ">" | sed 's/> //' > $@
	rm -f cand.$@ hit.$@ kept.$@
135
# Final simplified phrase list.
# all.* is the plain concatenation of the 2/3/4 character lists. Each key
# character of t2s_1tomany.t (the character after the opening quote) is then
# grepped in that file, and the target receives the "<" side of a diff
# between the concatenation and the grep output.
# The concatenation goes to a per-target scratch file instead of the shared
# name `t` (a race under `make -j`); its name ends in .t so `clean` sweeps
# leftovers.
# NOTE(review): neither input of the diff is sorted or de-duplicated, so some
# lines that matched a key may still be reported as "<" and kept. Confirm
# whether that is intended before changing it.
simpphrases.t: simpphrases_2.t simpphrases_3.t simpphrases_4.t t2s_1tomany.t
	cat simpphrases_2.t simpphrases_3.t simpphrases_4.t > all.$@
	for i in `sed 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do grep $$i all.$@ ; done | diff all.$@ - | grep '<' | sed 's/< //' > $@
	rm -f all.$@
140
141
# Single-character traditional->simplified table: every t2s_1tomany.t line
# cut down to its first 7 characters plus a closing quote and comma, followed
# by the contents of t2s_1to1.t.
trad2simp1to1.t: t2s_1tomany.t t2s_1to1.t
	{ sed 's/\(.......\).*/\1",/' t2s_1tomany.t && cat t2s_1to1.t ; } > $@
145
# Single-character simplified->traditional table: every s2t_1tomany.t line
# cut down to its first 7 characters plus a closing quote and comma, followed
# by the contents of s2t_1to1.t.
simp2trad1to1.t: s2t_1tomany.t s2t_1to1.t
	{ sed 's/\(.......\).*/\1",/' s2t_1tomany.t && cat s2t_1to1.t ; } > $@
149
# Generate a throw-away PHP script that holds trad2simp1to1.t as the array
# $trad2simp and tradphrases.t as the string $str, and echoes
# strtr($str, $trad2simp). `$$` is make's escape for a literal `$`.
trad2simp.php: trad2simp1to1.t tradphrases.t
	{ printf '<?php\n$$trad2simp=array(' && \
	  cat trad2simp1to1.t && \
	  printf ');\n$$str=\n"' && \
	  cat tradphrases.t && \
	  printf '";\n$$t=strtr($$str, $$trad2simp);\necho $$t;\n?>' ; \
	} > $@
156
# Generate a throw-away PHP script that holds simp2trad1to1.t as the array
# $simp2trad and simpphrases.t as the string $str, and echoes
# strtr($str, $simp2trad). `$$` is make's escape for a literal `$`.
simp2trad.php: simp2trad1to1.t simpphrases.t
	{ printf '<?php\n$$simp2trad=array(' && \
	  cat simp2trad1to1.t && \
	  printf ');\n$$str=\n"' && \
	  cat simpphrases.t && \
	  printf '";\n$$t=strtr($$str, $$simp2trad);\necho $$t;\n?>' ; \
	} > $@
163
# Simplified->traditional phrase table.
# Each output line of trad2simp.php is wrapped as  "..." =>  and pasted
# (tab-joined) with the matching line of tradphrases.t wrapped as  "...", .
# The tab-separated pairs of simp2tradPhrases.manual are then appended as
# "a"=>"b", lines.
# The key column is piped straight into paste and the value column goes to a
# per-target scratch file. The original used the shared names tmp1/tmp2,
# which other rules also write (a race under `make -j`). The scratch name
# ends in .t so `clean` sweeps it if the recipe is interrupted.
simp2trad.phrases.t: trad2simp.php tradphrases.t simp2tradPhrases.manual
	sed 's/\(.*\)/"\1",/' tradphrases.t > vals.$@
	php -f trad2simp.php | sed 's/\(.*\)/"\1" => /' | paste - vals.$@ > $@
	sed 's/\(.*\)\t\(.*\)/"\1"=>"\2",/' simp2tradPhrases.manual >> $@
	rm -f vals.$@
169
# Traditional->simplified phrase table.
# Each output line of simp2trad.php is wrapped as  "..." =>  and pasted
# (tab-joined) with the matching line of simpphrases.t wrapped as  "...", .
# The tab-separated pairs of trad2simpPhrases.manual are then appended as
# "a"=>"b", lines.
# The key column is piped straight into paste and the value column goes to a
# per-target scratch file. The original used the shared names tmp1/tmp2,
# which other rules also write (a race under `make -j`). The scratch name
# ends in .t so `clean` sweeps it if the recipe is interrupted.
trad2simp.phrases.t: simp2trad.php simpphrases.t trad2simpPhrases.manual
	sed 's/\(.*\)/"\1",/' simpphrases.t > vals.$@
	php -f simp2trad.php | sed 's/\(.*\)/"\1" => /' | paste - vals.$@ > $@
	sed 's/\(.*\)\t\(.*\)/"\1"=>"\2",/' trad2simpPhrases.manual >> $@
	rm -f vals.$@
175
# Assemble the final PHP file: the array $zhSimp2Trad (single-character table,
# a blank line, then the phrase table) followed by the array $zhTrad2Simp
# built the same way. `$$` is make's escape for a literal `$`.
ZhConversion.php: simp2trad1to1.t simp2trad.phrases.t trad2simp1to1.t trad2simp.phrases.t
	{ printf '<?php\n$$zhSimp2Trad=array(\n' && \
	  cat simp2trad1to1.t && \
	  echo && \
	  cat simp2trad.phrases.t && \
	  echo ');' && \
	  echo && \
	  printf '$$zhTrad2Simp=array(\n' && \
	  cat trad2simp1to1.t && \
	  echo && \
	  cat trad2simp.phrases.t && \
	  printf ');\n?>' ; \
	} > $@
188
# Remove generated files. Downloaded inputs and the printutf8 binary are left
# in place, as before.
# `clean` is a command, not a file, so it is declared phony; otherwise a file
# named "clean" would stop this rule from ever running.
# The scratch file `t` is now removed too: several recipes write it, and the
# `*.t` glob does not match a file named just "t".
.PHONY: clean
clean:
	rm -f ZhConversion.php tmp1 tmp2 tmp3 t t3 *.t trad2simp.php simp2trad.php