#
# Creates the file ZhConversion.php used for Simplified/Traditional
# Chinese conversion. It gets the basic conversion table from the Unihan
# database, and constructs the phrase tables using phrase libraries in
# the SCIM packages. There are also special tables used for adjustment.
# Some data in the file simp2trad.manual was taken from a paper; see the
# README in this directory for the citation.
#
# Requirement: you need to set your locale to zh_CN.UTF-8 (or any
# other utf-8 locales, I suppose)
#
# The recipes share the scratch files tmp1, tmp2, t and t3, so targets
# must never be built concurrently.
#
.NOTPARALLEL:
.PHONY: all clean

all: ZhConversion.php

# --- downloaded inputs ------------------------------------------------

Unihan.txt:
	wget ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip
	unzip Unihan.zip

EZ.txt.in:
	wget http://freedesktop.org/~suzhe/sources/scim-tables-0.4.3.tar.gz
	tar zxvf scim-tables-0.4.3.tar.gz > /dev/null
	cp scim-tables-0.4.3/zh/EZ.txt.in .
	rm -rf scim-tables-0.4.3*

phrase_lib.txt:
	wget http://freedesktop.org/~suzhe/scim-chinese/scim-chinese-0.4.2.tar.gz
	tar zxvf scim-chinese-0.4.2.tar.gz > /dev/null
	cp scim-chinese-0.4.2/data/phrase_lib.txt .
	rm -rf scim-chinese-0.4.2*

# Filter used below to post-process the Unihan variant lines.
printutf8: printutf8.c
	gcc -o printutf8 printutf8.c

# --- character tables -------------------------------------------------
# Unihan variant lines are piped through printutf8; any line whose first
# 10 columns match an entry of the .manual file is dropped, and the
# .manual entries are put in front instead.

trad2simp.t: Unihan.txt trad2simp.manual printutf8
	grep kSimplifiedVariant Unihan.txt | sed '/#/d' | sed 's/kSimplifiedVariant//' | ./printutf8 > tmp1
	for I in `colrm 11 < trad2simp.manual` ; do sed "/^$$I/d" tmp1 > tmp2; mv tmp2 tmp1; done
	cat trad2simp.manual tmp1 > trad2simp.t

simp2trad.t: Unihan.txt simp2trad.manual printutf8
	grep kTraditionalVariant Unihan.txt | sed '/#/d' | sed 's/kTraditionalVariant//' | ./printutf8 > tmp1
	for I in `colrm 11 < simp2trad.manual` ; do sed "/^$$I/d" tmp1 > tmp2; mv tmp2 tmp1; done
	cat simp2trad.manual tmp1 > simp2trad.t

t2s_1tomany.t: trad2simp.t
	grep -s ".\{19,\}" trad2simp.t | sed 's/U+...../"/' | sed 's/|U+...../"=>"/' | sed 's/|U+.....//g' | sed 's/|/",/' > t2s_1tomany.t

# The final de-duplication goes through the scratch file t: redirecting
# "sort FILE | uniq" straight back into FILE truncates FILE before sort
# has read it.
t2s_1to1.t: trad2simp.t s2t_1tomany.t
	sed "/.*|.*|.*|.*/d" trad2simp.t | sed 's/U+[0-9a-z][0-9a-z]*/"/' | sed 's/|U+[0-9a-z][0-9a-z]*/"=>"/' | sed 's/|/",/' > t2s_1to1.t
	grep '"."=>"..",' s2t_1tomany.t | sed 's/\("."\)=>".\(.\)",/"\2"=>\1,/' >> t2s_1to1.t
	grep '"."=>"...",' s2t_1tomany.t | sed 's/\("."\)=>".\(.\).",/"\2"=>\1,/' >> t2s_1to1.t
	grep '"."=>"...",' s2t_1tomany.t | sed 's/\("."\)=>"..\(.\)",/"\2"=>\1,/' >> t2s_1to1.t
	grep '"."=>"....",' s2t_1tomany.t | sed 's/\("."\)=>".\(.\)..",/"\2"=>\1,/' >> t2s_1to1.t
	grep '"."=>"....",' s2t_1tomany.t | sed 's/\("."\)=>"..\(.\).",/"\2"=>\1,/' >> t2s_1to1.t
	grep '"."=>"....",' s2t_1tomany.t | sed 's/\("."\)=>"...\(.\)",/"\2"=>\1,/' >> t2s_1to1.t
	sort t2s_1to1.t | uniq > t
	mv t t2s_1to1.t

s2t_1tomany.t: simp2trad.t
	grep -s ".\{19,\}" simp2trad.t | sed 's/U+...../"/' | sed 's/|U+...../"=>"/' | sed 's/|U+.....//g' | sed 's/|/",/' > s2t_1tomany.t

s2t_1to1.t: simp2trad.t t2s_1tomany.t
	sed "/.*|.*|.*|.*/d" simp2trad.t | sed 's/U+[0-9a-z][0-9a-z]*/"/' | sed 's/|U+[0-9a-z][0-9a-z]*/"=>"/' | sed 's/|/",/' > s2t_1to1.t
	grep '"."=>"..",' t2s_1tomany.t | sed 's/\("."\)=>".\(.\)",/"\2"=>\1,/' >> s2t_1to1.t
	grep '"."=>"...",' t2s_1tomany.t | sed 's/\("."\)=>".\(.\).",/"\2"=>\1,/' >> s2t_1to1.t
	grep '"."=>"...",' t2s_1tomany.t | sed 's/\("."\)=>"..\(.\)",/"\2"=>\1,/' >> s2t_1to1.t
	grep '"."=>"....",' t2s_1tomany.t | sed 's/\("."\)=>".\(.\)..",/"\2"=>\1,/' >> s2t_1to1.t
	grep '"."=>"....",' t2s_1tomany.t | sed 's/\("."\)=>"..\(.\).",/"\2"=>\1,/' >> s2t_1to1.t
	grep '"."=>"....",' t2s_1tomany.t | sed 's/\("."\)=>"...\(.\)",/"\2"=>\1,/' >> s2t_1to1.t
	sort s2t_1to1.t | uniq > t
	mv t s2t_1to1.t

# --- traditional phrase lists -----------------------------------------

ez.t: EZ.txt.in
	colrm 1 8 < EZ.txt.in | sed 's/\t//g' | grep "^.\{2,4\}[0-9]" | sed 's/[0-9]//g' > ez.t

alltradphrases.t: ez.t s2t_1tomany.t
	for i in `cat s2t_1tomany.t | sed 's/.......//' | sed 's/",/\n/' | sed 's/\(.\)/\1\n/g' |sort | uniq`; do grep -s $$i ez.t ; done > alltradphrases.t || true

tradphrases_2.t: alltradphrases.t
	cat alltradphrases.t | grep "^..$$" | sort | uniq > tradphrases_2.t

# Reads tradphrases_2.t, so it is listed as a prerequisite. The filtered
# result is written to t first so that diff does not read a file the
# shell has already truncated.
tradphrases_3.t: alltradphrases.t tradphrases_2.t
	cat alltradphrases.t | grep "^...$$" | sort | uniq > tradphrases_3.t
	for i in `cat tradphrases_2.t`; do grep $$i tradphrases_3.t ; done | sort | uniq > t3 || true
	diff t3 tradphrases_3.t | grep ">" | sed 's/> //' > t
	mv t tradphrases_3.t

tradphrases_4.t: alltradphrases.t tradphrases_2.t tradphrases_3.t
	cat alltradphrases.t | grep "^....$$" | sort | uniq > tradphrases_4.t
	for i in `cat tradphrases_2.t`; do grep $$i tradphrases_4.t ; done | sort | uniq > t3 || true
	diff t3 tradphrases_4.t | grep ">" | sed 's/> //' > t
	mv t tradphrases_4.t
	for i in `cat tradphrases_3.t`; do grep $$i tradphrases_4.t ; done | sort | uniq > t3 || true
	diff t3 tradphrases_4.t | grep ">" | sed 's/> //' > t
	mv t tradphrases_4.t

tradphrases.t: tradphrases.manual tradphrases_2.t tradphrases_3.t tradphrases_4.t t2s_1tomany.t
	cat tradphrases.manual tradphrases_2.t tradphrases_3.t tradphrases_4.t |sort | uniq > tradphrases.t
	for i in `sed 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do grep $$i tradphrases.t ; done | diff tradphrases.t - | grep '<' | sed 's/< //' > t
	mv t tradphrases.t

# --- simplified phrase lists ------------------------------------------

ph.t: phrase_lib.txt
	sed 's/[\t0-9a-zA-Z]//g' phrase_lib.txt | grep "^.\{2,4\}$$" > ph.t

# The loop reads t2s_1tomany.t, so it is listed as a prerequisite.
allsimpphrases.t: ph.t t2s_1tomany.t
	rm -f allsimpphrases.t
	for i in `cat t2s_1tomany.t | sed 's/.......//' | sed 's/",/\n/' | sed 's/\(.\)/\1\n/g' | sort | uniq `; do grep $$i ph.t >> allsimpphrases.t; done

simpphrases_2.t: allsimpphrases.t
	cat allsimpphrases.t | grep "^..$$" | sort | uniq > simpphrases_2.t

simpphrases_3.t: allsimpphrases.t simpphrases_2.t
	cat allsimpphrases.t | grep "^...$$" | sort | uniq > simpphrases_3.t
	for i in `cat simpphrases_2.t`; do grep $$i simpphrases_3.t ; done | sort | uniq > t3 || true
	diff t3 simpphrases_3.t | grep ">" | sed 's/> //' > t
	mv t simpphrases_3.t

simpphrases_4.t: allsimpphrases.t simpphrases_2.t simpphrases_3.t
	cat allsimpphrases.t | grep "^....$$" | sort | uniq > simpphrases_4.t
	rm -f t
	for i in `cat simpphrases_2.t`; do grep $$i simpphrases_4.t >> t; done || true
	sort t | uniq > t3
	diff t3 simpphrases_4.t | grep ">" | sed 's/> //' > t
	mv t simpphrases_4.t
	for i in `cat simpphrases_3.t`; do grep $$i simpphrases_4.t; done | sort | uniq > t3 || true
	diff t3 simpphrases_4.t | grep ">" | sed 's/> //' > t
	mv t simpphrases_4.t

simpphrases.t:simpphrases_2.t simpphrases_3.t simpphrases_4.t t2s_1tomany.t
	cat simpphrases_2.t simpphrases_3.t simpphrases_4.t > simpphrases.t
	for i in `sed 's/"\(.\).*/\1/' t2s_1tomany.t ` ; do grep $$i simpphrases.t ; done | diff simpphrases.t - | grep '<' | sed 's/< //' > t
	mv t simpphrases.t

# --- combined one-to-one tables ---------------------------------------

trad2simp1to1.t: t2s_1tomany.t t2s_1to1.t
	sed 's/\(.......\).*/\1",/' t2s_1tomany.t > trad2simp1to1.t
	cat t2s_1to1.t >> trad2simp1to1.t

simp2trad1to1.t: s2t_1tomany.t s2t_1to1.t
	sed 's/\(.......\).*/\1",/' s2t_1tomany.t > simp2trad1to1.t
	cat s2t_1to1.t >> simp2trad1to1.t

# --- helper PHP scripts that convert the phrase lists -----------------
# NOTE(review): the opening printf of the next two recipes was damaged
# in the copy this file was restored from. It is reconstructed from the
# closing ');' and the strtr($$str, $$trad2simp) / strtr($$str,
# $$simp2trad) calls further down -- confirm against the upstream file.

trad2simp.php: trad2simp1to1.t tradphrases.t
	printf '<?php\n$$trad2simp=array(\n' > trad2simp.php
	cat trad2simp1to1.t >> trad2simp.php
	printf ');\n$$str=\n"' >> trad2simp.php
	cat tradphrases.t >> trad2simp.php
	printf '";\n$$t=strtr($$str, $$trad2simp);\necho $$t;\n?>' >> trad2simp.php

simp2trad.php: simp2trad1to1.t simpphrases.t
	printf '<?php\n$$simp2trad=array(\n' > simp2trad.php
	cat simp2trad1to1.t >> simp2trad.php
	printf ');\n$$str=\n"' >> simp2trad.php
	cat simpphrases.t >> simp2trad.php
	printf '";\n$$t=strtr($$str, $$simp2trad);\necho $$t;\n?>' >> simp2trad.php

# --- phrase tables ----------------------------------------------------

simp2trad.phrases.t: trad2simp.php tradphrases.t simp2tradPhrases.manual
	php -f trad2simp.php | sed 's/\(.*\)/"\1" => /' > tmp1
	cat tradphrases.t | sed 's/\(.*\)/"\1",/' > tmp2
	paste tmp1 tmp2 > simp2trad.phrases.t
	sed 's/\(.*\)\t\(.*\)/"\1"=>"\2",/' simp2tradPhrases.manual >> simp2trad.phrases.t

trad2simp.phrases.t: simp2trad.php simpphrases.t trad2simpPhrases.manual
	php -f simp2trad.php | sed 's/\(.*\)/"\1" => /' > tmp1
	cat simpphrases.t | sed 's/\(.*\)/"\1",/' > tmp2
	paste tmp1 tmp2 > trad2simp.phrases.t
	sed 's/\(.*\)\t\(.*\)/"\1"=>"\2",/' trad2simpPhrases.manual >> trad2simp.phrases.t

# --- final output -----------------------------------------------------
# NOTE(review): the first printf was damaged like the two above. The
# array name $$zhSimp2Trad is assumed by analogy with the $$zhTrad2Simp
# line in this same recipe -- confirm against the upstream file.

ZhConversion.php: simp2trad1to1.t simp2trad.phrases.t trad2simp1to1.t trad2simp.phrases.t
	printf '<?php\n$$zhSimp2Trad=array(\n' > ZhConversion.php
	cat simp2trad1to1.t >> ZhConversion.php
	echo >> ZhConversion.php
	cat simp2trad.phrases.t >> ZhConversion.php
	echo ');' >> ZhConversion.php
	echo >> ZhConversion.php
	printf '$$zhTrad2Simp=array(\n' >> ZhConversion.php
	cat trad2simp1to1.t >> ZhConversion.php
	echo >> ZhConversion.php
	cat trad2simp.phrases.t >> ZhConversion.php
	printf ');\n?>' >> ZhConversion.php

clean:
	rm -f ZhConversion.php tmp1 tmp2 tmp3 t t3 *.t trad2simp.php simp2trad.php
#include <stdio.h>

/*
 * printUTF8 - write the UTF-8 byte sequence for code point `u` to stdout.
 *
 * Byte layout by range (x = payload bit taken from `u`):
 *
 *   0x00000000 - 0x0000007F: 0xxxxxxx
 *   0x00000080 - 0x000007FF: 110xxxxx 10xxxxxx
 *   0x00000800 - 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 *   0x00010000 - 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   0x00200000 - 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   0x04000000 - 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Hex digit <-> bit pattern, for reading the masks:
 *   0 0000   4 0100   8 1000   C 1100
 *   1 0001   5 0101   9 1001   D 1101
 *   2 0010   6 0110   A 1010   E 1110
 *   3 0011   7 0111   B 1011   F 1111
 *
 * No range validation is performed:
 *   - any value below 0x80, negative ones included, is written as the
 *     single byte (unsigned char)u;
 *   - any value of 0x04000000 or more uses the 6-byte form, and bits
 *     above bit 30 are dropped.
 *
 * Write errors on stdout are not reported.
 */
void printUTF8(long long u) {
	/* Indexed by sequence length; entries 0 and 1 are unused. */
	static const unsigned char lead_tag[7]  = {0, 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
	static const unsigned char lead_bits[7] = {0, 0, 0x1f, 0x0f, 0x07, 0x03, 0x01};
	unsigned char seq[6];
	long long rest = u;
	int len;

	if (u < 0x80) {
		putchar((unsigned char)u);
		return;
	}

	if (u < 0x800) {
		len = 2;
	} else if (u < 0x10000) {
		len = 3;
	} else if (u < 0x200000) {
		len = 4;
	} else if (u < 0x4000000) {
		len = 5;
	} else {
		len = 6;
	}

	/*
	 * Fill the continuation bytes from the last one backwards, six
	 * payload bits each.  Deriving every byte from the same shift keeps
	 * the masks consistent across all lengths (the 5-byte form used to
	 * pick the wrong bits for its fourth byte).
	 */
	for (int k = len - 1; k > 0; k--) {
		seq[k] = (unsigned char)(0x80 | (rest & 0x3f));
		rest >>= 6;
	}
	/* Whatever is left belongs in the lead byte, under its length tag. */
	seq[0] = (unsigned char)(lead_tag[len] | (rest & lead_bits[len]));

	for (int k = 0; k < len; k++) {
		putchar(seq[k]);
	}
}