Added code to generate a wordlist from the various sources.
author: Zheng Zhu <zhengzhu@users.mediawiki.org>
Tue, 19 Oct 2004 16:54:07 +0000 (16:54 +0000)
committer: Zheng Zhu <zhengzhu@users.mediawiki.org>
Tue, 19 Oct 2004 16:54:07 +0000 (16:54 +0000)
includes/zhtable/Makefile

index a871590..40db816 100644 (file)
@@ -9,7 +9,7 @@
 # other utf-8 locales, I suppose)
 #
 #
-all: ZhConversion.php tradphrases.notsure simpphrases.notsure
+all: ZhConversion.php tradphrases.notsure simpphrases.notsure wordlist
 
 Unihan.txt:
        wget ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip
@@ -33,6 +33,26 @@ tsi.src:
        cp libtabe/tsi-src/tsi.src .
        rm -rf libtabe*
 
+# wordlist: merge the entries of tsi.src, phrase_lib.txt and EZ.txt.in into
+# a single sorted, duplicate-free file with every space removed.
+#
+#  1. tsi.src: convert Big5 to UTF-8 (iconv -c discards characters that
+#     cannot be converted), delete every "# " marker, then cut each line
+#     at the first space that is followed by a digit.
+#  2. phrase_lib.txt: cut each line at the last tab that is followed by a
+#     digit, then drop the first five lines of the result.
+#  3. EZ.txt.in: delete everything up to and including the BEGIN_TABLE
+#     line, remove columns 1-8, cut at the first tab, and keep only lines
+#     that are at least two characters long.
+#  4. Remove spaces BEFORE sort | uniq; stripping them afterwards would let
+#     entries that differ only in spacing survive as duplicate lines.
+wordlist: phrase_lib.txt EZ.txt.in tsi.src
+       iconv -c -f big5 -t utf8 tsi.src | sed 's/# //g' | sed 's/[ ][0-9].*//' > wordlist
+       sed 's/\(.*\)\t[0-9][0-9]*.*/\1/' phrase_lib.txt | sed '1,5d' >>wordlist
+       sed '1,/BEGIN_TABLE/d' EZ.txt.in | colrm 1 8 | sed 's/\t.*//' | grep "^...*" >> wordlist
+       sed 's/ //g' wordlist | sort | uniq > t
+       mv t wordlist
+
 printutf8: printutf8.c
        gcc -o printutf8 printutf8.c