3 * Unicode normalization routines: Unicode tables generator
5 * Copyright (C) 2004 Ludovic ARNAUD <ludovic.arnaud@gmail.com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write to the Free Software Foundation, Inc.,
19 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20 * http://www.gnu.org/copyleft/gpl.html
22 * @author Ludovic ARNAUD <ludovic.arnaud@gmail.com>
23 * @license http://www.gnu.org/licenses/gpl.txt
// This generator is a command-line tool; refuse to run under any other SAPI.
if( php_sapi_name() != 'cli' ) {
	die( "This program must be run from the command line.\n" );
}

// NOTE(review): presumably provides the UNICODE_* / UTF8_* constants used
// below; none of them are defined in this file -- confirm.
require_once( 'UtfNormal.php' );

// Generated tables: $file_contents[<output file>][<variable name>][<key>] = <value>.
$file_contents = array();
/**
 * Generate the Hangul/Jamo lookup tables.
 *
 * For each leading (L), vowel (V) and trailing (T) jamo, utfJamoType records
 * the jamo class and utfJamoIndex records a weighted index:
 *   L: index * VCOUNT * TCOUNT + SBASE
 *   V: index * TCOUNT
 *   T: index
 * NOTE(review): presumably the weights are chosen so that the three values of
 * an L+V+T sequence can simply be added to obtain the precomposed syllable
 * code point -- confirm against the consumer in UtfNormal.php.
 */
echo "\nGenerating Hangul and Jamo tables\n";

// Leading consonants.
for ( $i = 0; $i < UNICODE_HANGUL_LCOUNT; ++$i ) {
	$utf_char = cp_to_utf( UNICODE_HANGUL_LBASE + $i );
	$file_contents['UtfNormalData.inc']['utfJamoIndex'][$utf_char] =
		$i * UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT + UNICODE_HANGUL_SBASE;
	$file_contents['UtfNormalData.inc']['utfJamoType'][$utf_char] = UNICODE_JAMO_L;
}

// Vowels.
for ( $i = 0; $i < UNICODE_HANGUL_VCOUNT; ++$i ) {
	$utf_char = cp_to_utf( UNICODE_HANGUL_VBASE + $i );
	$file_contents['UtfNormalData.inc']['utfJamoIndex'][$utf_char] = $i * UNICODE_HANGUL_TCOUNT;
	$file_contents['UtfNormalData.inc']['utfJamoType'][$utf_char] = UNICODE_JAMO_V;
}

// Trailing consonants.
for ( $i = 0; $i < UNICODE_HANGUL_TCOUNT; ++$i ) {
	$utf_char = cp_to_utf( UNICODE_HANGUL_TBASE + $i );
	$file_contents['UtfNormalData.inc']['utfJamoIndex'][$utf_char] = $i;
	$file_contents['UtfNormalData.inc']['utfJamoType'][$utf_char] = UNICODE_JAMO_T;
}
/**
 * Load the CompositionExclusions table.
 *
 * Fills $exclude with one entry per excluded code point (integer key => 1).
 * Data lines start with a hexadecimal code point or an inclusive range
 * written as "XXXX..YYYY"; everything else (comments, blank lines) is skipped.
 */
echo "Loading CompositionExclusion\n";
if( !$fp = fopen( 'CompositionExclusions.txt', 'rt' ) ) {
	print "\nCan't open CompositionExclusions.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt\n";
	exit( 1 );
}

$exclude = array();
// Loop on the fgets() result rather than feof(): feof() only turns true after
// a read has already failed, which would hand `false` to the parser below.
while ( ( $line = fgets( $fp, 1024 ) ) !== false ) {
	if( !ctype_xdigit( $line[0] ) ) {
		continue;
	}

	// First field only; also cut at a tab or '#' so that a line without a
	// trailing comment does not leave a newline glued to the code point.
	$cp = trim( strtok( $line, " \t#" ) );

	if( $pos = strpos( $cp, '..' ) ) {
		$start = hexdec( substr( $cp, 0, $pos ) );
		$end = hexdec( substr( $cp, $pos + 2 ) );

		// Ranges are inclusive, so the last code point must be excluded too.
		for ( $i = $start; $i <= $end; ++$i ) {
			$exclude[$i] = 1;
		}
	} else {
		$exclude[hexdec( $cp )] = 1;
	}
}
fclose( $fp );
/**
 * Load the QuickCheck tables.
 *
 * Reads the NFC_QC and NFKC_QC properties from DerivedNormalizationProps.txt
 * and stores, per UTF-8 encoded character, UNICODE_QC_MAYBE or UNICODE_QC_NO
 * in utfCheckNFC (UtfNormalData.inc) or utfCheckNFKC (UtfNormalDataK.inc).
 */
echo "Generating QuickCheck tables\n";
if( !$fp = fopen( 'DerivedNormalizationProps.txt', 'rt' ) ) {
	print "\nCan't open DerivedNormalizationProps.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt\n";
	exit( 1 );
}

// Loop on the fgets() result rather than feof(): feof() only turns true after
// a read has already failed, which would hand `false` to the parser below.
while ( ( $line = fgets( $fp, 1024 ) ) !== false ) {
	// Data lines start with a hexadecimal code point.
	if( !ctype_xdigit( $line[0] ) ) {
		continue;
	}

	// Drop the trailing comment, then split "<code points>; <property>; <value>".
	$p = array_map( 'trim', explode( ';', strtok( $line, '#' ) ) );

	/**
	 * Capture only NFC_QC, NFKC_QC
	 */
	if( !isset( $p[1] ) || !preg_match( '#^NFK?C_QC$#', $p[1] ) ) {
		continue;
	}

	if( $pos = strpos( $p[0], '..' ) ) {
		$start = hexdec( substr( $p[0], 0, $pos ) );
		$end = hexdec( substr( $p[0], $pos + 2 ) );
	} else {
		$start = $end = hexdec( $p[0] );
	}

	// NOTE(review): $start and $end are integer code points. Confirm that
	// UTF8_HANGUL_FIRST / UTF8_HANGUL_LAST are numeric as well; if they are
	// UTF-8 encoded strings this comparison cannot work as intended.
	if( $start >= UTF8_HANGUL_FIRST && $end <= UTF8_HANGUL_LAST ) {
		/**
		 * We do not store Hangul syllables in the array
		 */
		continue;
	}

	// NOTE(review): the line selecting between the two values was lost; 'M'
	// ("maybe") in the value field is the presumed condition -- confirm.
	if( isset( $p[2] ) && $p[2] == 'M' ) {
		$val = UNICODE_QC_MAYBE;
	} else {
		$val = UNICODE_QC_NO;
	}

	if( $p[1] == 'NFKC_QC' ) {
		$file = 'UtfNormalDataK.inc';
	} else {
		$file = 'UtfNormalData.inc';
	}

	// "NFC_QC" => "utfCheckNFC", "NFKC_QC" => "utfCheckNFKC".
	$var = 'utfCheck' . substr( $p[1], 0, -3 );
	for ( $i = $start; $i <= $end; ++$i ) {
		$file_contents[$file][$var][cp_to_utf( $i )] = $val;
	}
}
fclose( $fp );
/**
 * Load the combining classes and the decomposition mappings from
 * UnicodeData.txt.
 *
 * - Field 3 (combining class) is stored in utfCombiningClass when non-zero.
 * - Field 5 (decomposition) is stored in $map as a space separated list of
 *   decimal code points. A mapping carrying a "<tag>" is recorded for NFKD
 *   only; an untagged mapping is recorded for both NFD and NFKD.
 */
echo "Loading Unicode decomposition mappings\n";
if( !$fp = fopen( 'UnicodeData.txt', 'rt' ) ) {
	print "\nCan't open UnicodeData.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
	exit( 1 );
}

$map = array( 'NFD' => array(), 'NFKD' => array() );
// Loop on the fgets() result rather than feof(): feof() only turns true after
// a read has already failed, and that failed read would be parsed as U+0000.
while ( ( $line = fgets( $fp, 1024 ) ) !== false ) {
	// Same data-line guard as the other loaders: skip anything that does not
	// start with a hexadecimal code point.
	if( !ctype_xdigit( $line[0] ) ) {
		continue;
	}

	$p = explode( ';', $line );
	$cp = hexdec( $p[0] );

	if( !empty( $p[3] ) ) {
		/**
		 * Store combining class > 0
		 */
		$file_contents['UtfNormalData.inc']['utfCombiningClass'][cp_to_utf( $cp )] = ( int ) $p[3];
	}

	// No decomposition field, or no code point in it: nothing to map.
	if( !isset( $p[5] ) || !preg_match_all( '#[0-9A-F]+#', strip_tags( $p[5] ), $m ) ) {
		continue;
	}

	$decomp_seq = implode( ' ', array_map( 'hexdec', $m[0] ) );

	// A '>' closes a "<tag>", so it can never sit at offset 0.
	if( strpos( $p[5], '>' ) ) {
		$map['NFKD'][$cp] = $decomp_seq;
	} else {
		$map['NFD'][$cp] = $map['NFKD'][$cp] = $decomp_seq;
	}
}
fclose( $fp );
/**
 * Build the canonical composition table.
 *
 * Maps each UTF-8 encoded canonical decomposition sequence back to its
 * UTF-8 encoded composite. When several code points share a sequence, the
 * first one encountered wins.
 */
echo "Generating the Canonical Composition table\n";
foreach ( $map['NFD'] as $cp => $decomp_seq ) {
	if( !strpos( $decomp_seq, ' ' ) || isset( $exclude[$cp] ) ) {
		/**
		 * Singletons are excluded from canonical composition
		 * (a sequence without a space has a single code point), and so
		 * is every code point listed in $exclude.
		 */
		continue;
	}

	$utf_seq = implode( '', array_map( 'cp_to_utf', explode( ' ', $decomp_seq ) ) );

	if( !isset( $file_contents['UtfNormalData.inc']['utfCanonicalComp'][$utf_seq] ) ) {
		$file_contents['UtfNormalData.inc']['utfCanonicalComp'][$utf_seq] = cp_to_utf( $cp );
	}
}
/**
 * Decompose the NF[K]D mappings recursively and prepare the file contents.
 *
 * NFKD mappings go to utfCompatibilityDecomp in UtfNormalDataK.inc, the
 * others to utfCanonicalDecomp in UtfNormalData.inc. Keys and values are
 * UTF-8 encoded.
 */
echo "Generating the Canonical and Compatibility Decomposition tables\n\n";
foreach ( $map as $type => $decomp_map ) {
	// Expand every sequence until none of its code points has a mapping left.
	foreach ( $decomp_map as $cp => $decomp_seq ) {
		$decomp_map[$cp] = decompose( $decomp_map, $decomp_seq );
	}
	unset( $decomp_seq );

	if( $type == 'NFKD' ) {
		$file = 'UtfNormalDataK.inc';
		$var = 'utfCompatibilityDecomp';
	} else {
		$file = 'UtfNormalData.inc';
		$var = 'utfCanonicalDecomp';
	}

	/**
	 * Generate the corresponding file
	 */
	foreach ( $decomp_map as $cp => $decomp_seq ) {
		$file_contents[$file][$var][cp_to_utf( $cp )] = implode( '', array_map( 'cp_to_utf', explode( ' ', $decomp_seq ) ) );
	}
}
/**
 * Generate and/or alter the files.
 *
 * Each entry of $file_contents becomes one PHP file made of
 * `$GLOBALS['<name>']=<exported value>;` statements, one per table.
 */
foreach ( $file_contents as $file => $contents ) {
	$php = '';
	foreach ( $contents as $var => $val ) {
		$php .= '$GLOBALS[' . my_var_export( $var ) . ']=' . my_var_export( $val ) . ";\n";
	}

	/**
	 * Generate a new file (overwrite if applicable)
	 */
	echo "Generating $file\n";

	if( !$fp = fopen( $file, 'wb' ) ) {
		trigger_error( 'Cannot open ' . $file . ' for write' );
		// trigger_error() defaults to a notice and returns; stop here instead
		// of writing to an invalid handle and reporting success.
		exit( 1 );
	}

	// NOTE(review): the opening of this call and the delimiters of the
	// generated header were lost; the literal below keeps the two surviving
	// header lines verbatim -- confirm the exact framing around them.
	// The closing tag is split so it does not end this script.
	fwrite( $fp, '<?php
/**
 * This file was automatically generated -- do not edit!
 * Run UtfNormalGenerate.php to create this file again ( make clean && make )
 */

' . $php . '?' . '>' );
	fclose( $fp );
}

die( "All done!\n" );
264 ////////////////////////////////////////////////////////////////////////////////
265 // Internal functions //
266 ////////////////////////////////////////////////////////////////////////////////
/**
 * Decompose a sequence recursively
 *
 * Every code point of the sequence that has an entry in $decomp_map is
 * replaced by the (recursively decomposed) mapped sequence; code points
 * without an entry are kept as they are.
 *
 * @param array $decomp_map Decomposition mapping, passed by reference
 * @param string $decomp_seq Decomposition sequence as decimal codepoints separated with a space
 * @return string Decomposition sequence, fully decomposed
 */
function decompose( &$decomp_map, $decomp_seq ) {
	$ret = array();
	foreach ( explode( ' ', $decomp_seq ) as $cp ) {
		if( isset( $decomp_map[$cp] ) ) {
			$ret[] = decompose( $decomp_map, $decomp_map[$cp] );
		} else {
			$ret[] = $cp;
		}
	}

	return implode( ' ', $ret );
}
/**
 * Convert a codepoint to a UTF char
 *
 * Emits 1, 2, 3 or 4 bytes depending on the magnitude of the code point.
 * The value is not validated.
 *
 * @param integer $cp Unicode codepoint
 * @return string UTF string
 */
function cp_to_utf( $cp ) {
	if( $cp > 0xFFFF ) {
		// 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		return chr( 0xF0 | ( $cp >> 18 ) )
			. chr( 0x80 | ( ( $cp >> 12 ) & 0x3F ) )
			. chr( 0x80 | ( ( $cp >> 6 ) & 0x3F ) )
			. chr( 0x80 | ( $cp & 0x3F ) );
	} elseif( $cp > 0x7FF ) {
		// 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
		return chr( 0xE0 | ( $cp >> 12 ) )
			. chr( 0x80 | ( ( $cp >> 6 ) & 0x3F ) )
			. chr( 0x80 | ( $cp & 0x3F ) );
	} elseif( $cp > 0x7F ) {
		// 2 bytes: 110xxxxx 10xxxxxx
		return chr( 0xC0 | ( $cp >> 6 ) )
			. chr( 0x80 | ( $cp & 0x3F ) );
	} else {
		// Single byte
		return chr( $cp );
	}
}
309 * Return a parsable string representation of a variable
311 * This function is limited to array/strings/integers
313 * @param mixed $var Variable
314 * @return string PHP code representing the variable
316 function my_var_export( $var ) {
317 if( is_array( $var ) ) {
320 foreach ( $var as $k => $v )
322 $lines[] = my_var_export( $k ) . '=>' . my_var_export( $v );
325 return 'array(' . implode( ',', $lines ) . ')';
326 } elseif( is_string( $var ) ) {
327 return "'" . str_replace( array( '\\', "'" ), array( '\\\\', "\\'" ), $var ) . "'";