From: Zheng Zhu
Date: Tue, 19 Oct 2004 16:54:07 +0000 (+0000)
Subject: Added code to generate a wordlist from the various sources.
X-Git-Tag: 1.5.0alpha1~1511
X-Git-Url: http://git.cyclocoop.org/%40spipnet%40?a=commitdiff_plain;h=b604c6725fa2b5d5fdd6c3e367e2410f13b0a985;p=lhc%2Fweb%2Fwiklou.git

Added code to generate a wordlist from the various sources.
---
Review note: the wordlist recipe below strips spaces before sort/uniq
(the committed version stripped them afterwards, which could leave
duplicate lines), and a comment block was added above the rule. The
"index" post-image hash is the one from the original commit and no
longer matches this revised content.

diff --git a/includes/zhtable/Makefile b/includes/zhtable/Makefile
index a871590031..40db8165af 100644
--- a/includes/zhtable/Makefile
+++ b/includes/zhtable/Makefile
@@ -9,7 +9,7 @@
 # other utf-8 locales, I suppose)
 #
 #
-all: ZhConversion.php tradphrases.notsure simpphrases.notsure
+all: ZhConversion.php tradphrases.notsure simpphrases.notsure wordlist
 
 Unihan.txt:
 	wget ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip
@@ -33,6 +33,17 @@ tsi.src:
 	cp libtabe/tsi-src/tsi.src .
 	rm -rf libtabe*
 
+# Build "wordlist": a sorted, de-duplicated UTF-8 list assembled from
+# tsi.src (converted from Big5), phrase_lib.txt and EZ.txt.in.
+# Spaces are stripped *before* sort/uniq so that entries differing only
+# by embedded spaces collapse into a single line.
+wordlist: phrase_lib.txt EZ.txt.in tsi.src
+	iconv -c -f big5 -t utf8 tsi.src | sed 's/# //g' | sed 's/[ ][0-9].*//' > wordlist
+	sed 's/\(.*\)\t[0-9][0-9]*.*/\1/' phrase_lib.txt | sed '1,5d' >>wordlist
+	sed '1,/BEGIN_TABLE/d' EZ.txt.in | colrm 1 8 | sed 's/\t.*//' | grep "^...*" >> wordlist
+	sed 's/ //g' wordlist | sort | uniq > t
+	mv t wordlist
+
 printutf8: printutf8.c
 	gcc -o printutf8 printutf8.c
 