(bug 5303) Merge UtfNormal rewrite. Patch by Ludovic Arnaud (YuviPanda). This is...
[lhc/web/wiklou.git] / includes / normal / UtfNormalGenerate.php
1 <?php
2 /**
3 * Unicode normalization routines: Unicode tables generator
4 *
5 * Copyright ( C) 2004 Ludovic ARNAUD <ludovic.arnaud@gmail.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * ( at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write to the Free Software Foundation, Inc.,
19 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20 * http://www.gnu.org/copyleft/gpl.html
21 *
22 * @author Ludovic ARNAUD <ludovic.arnaud@gmail.com>
23 * @license http://www.gnu.org/licenses/gpl.txt
24 * @package UtfNormal
25 * @access private
26 */
27
// This generator is a maintenance script: refuse to run under any non-CLI SAPI.
if ( PHP_SAPI != 'cli' ) {
	exit( "This program must be run from the command line.\n" );
}

// UtfNormal.php presumably supplies the UNICODE_* / UTF8_* constants used
// below (they are not defined in this file) -- confirm there.
require_once 'UtfNormal.php';

// Output accumulator, shaped as: file name => variable name => table.
$file_contents = array();
34
/**
 * Build the Hangul Jamo lookup tables
 *
 * Every leading (L), vowel (V) and trailing (T) jamo gets one entry in
 * utfJamoIndex and a type marker in utfJamoType. The L and V indexes are
 * stored pre-multiplied (and, for L, offset by UNICODE_HANGUL_SBASE) instead
 * of as plain positions -- presumably so that the normalizer can simply add
 * the entries together to get a syllable codepoint; confirm in UtfNormal.php.
 */
echo "\nGenerating Hangul and Jamo tables\n";

// Leading consonants: each step spans a full block of V x T syllables
$l_stride = UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT;
for ( $n = 0; $n < UNICODE_HANGUL_LCOUNT; $n++ ) {
	$jamo = cp_to_utf( UNICODE_HANGUL_LBASE + $n );
	$file_contents['UtfNormalData.inc']['utfJamoIndex'][$jamo] = $n * $l_stride + UNICODE_HANGUL_SBASE;
	$file_contents['UtfNormalData.inc']['utfJamoType'][$jamo] = UNICODE_JAMO_L;
}

// Vowels: each step spans one block of T syllables
for ( $n = 0; $n < UNICODE_HANGUL_VCOUNT; $n++ ) {
	$jamo = cp_to_utf( UNICODE_HANGUL_VBASE + $n );
	$file_contents['UtfNormalData.inc']['utfJamoIndex'][$jamo] = $n * UNICODE_HANGUL_TCOUNT;
	$file_contents['UtfNormalData.inc']['utfJamoType'][$jamo] = UNICODE_JAMO_V;
}

// Trailing consonants: the index is the plain position
for ( $n = 0; $n < UNICODE_HANGUL_TCOUNT; $n++ ) {
	$jamo = cp_to_utf( UNICODE_HANGUL_TBASE + $n );
	$file_contents['UtfNormalData.inc']['utfJamoIndex'][$jamo] = $n;
	$file_contents['UtfNormalData.inc']['utfJamoType'][$jamo] = UNICODE_JAMO_T;
}
58
/**
 * Load the CompositionExclusions table
 *
 * Fills $exclude with one entry per excluded codepoint (key = integer
 * codepoint, value = 1). Only lines that start with a hexadecimal codepoint,
 * optionally followed by "..END" for a range, are used; comments and blank
 * lines are skipped.
 */
echo "Loading CompositionExclusion\n";
if( !$fp = fopen( 'CompositionExclusions.txt', 'rt' ) ) {
	print "\nCan't open CompositionExclusions.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt\n";
	exit( -1 );
}

$exclude = array();
// Looping on fgets() itself ends cleanly at EOF (or on a read error) and
// never hands a boolean false to the parsing code below.
while ( ( $line = fgets( $fp, 1024 ) ) !== false ) {
	if( !preg_match( '#^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?#', $line, $m ) ) {
		continue;
	}

	$start = hexdec( $m[1] );
	// A single codepoint is handled as a one-element range
	$end = isset( $m[2] ) ? hexdec( $m[2] ) : $start;

	// Ranges in the Unicode data files are inclusive, so the end codepoint
	// itself must be excluded as well.
	for ( $i = $start; $i <= $end; ++$i ) {
		$exclude[$i] = 1;
	}
}
fclose( $fp );
93
/**
 * Load QuickCheck tables
 *
 * Reads the NFC_QC and NFKC_QC properties from DerivedNormalizationProps.txt
 * and stores, for every listed codepoint, UNICODE_QC_MAYBE (value "M") or
 * UNICODE_QC_NO (anything else) in utfCheckNFC (UtfNormalData.inc) or
 * utfCheckNFKC (UtfNormalDataK.inc).
 */
echo "Generating QuickCheck tables\n";
if( !$fp = fopen( 'DerivedNormalizationProps.txt', 'rt' ) ) {
	print "\nCan't open DerivedNormalizationProps.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt\n";
	exit( -1 );
}

// Looping on fgets() itself ends cleanly at EOF (or on a read error) and
// never hands a boolean false to the parsing code below.
while ( ( $line = fgets( $fp, 1024 ) ) !== false ) {
	// Data lines start with a hexadecimal codepoint; skip everything else
	if( $line === '' || !ctype_xdigit( $line[0] ) ) {
		continue;
	}

	// Drop the trailing "# comment" and split "range ; property ; value"
	$p = array_map( 'trim', explode( ';', strtok( $line, '#' ) ) );

	/**
	 * Capture only NFC_QC, NFKC_QC
	 */
	if( !isset( $p[1] ) || !preg_match( '#^NFK?C_QC$#', $p[1] ) ) {
		continue;
	}

	if( $pos = strpos( $p[0], '..' ) ) {
		$start = hexdec( substr( $p[0], 0, $pos ) );
		$end = hexdec( substr( $p[0], $pos + 2 ) );
	} else {
		$start = $end = hexdec( $p[0] );
	}

	// NOTE(review): $start/$end are integer codepoints here. Confirm that
	// UTF8_HANGUL_FIRST/UTF8_HANGUL_LAST (defined outside this file) are
	// integer codepoints too, and not UTF-8 encoded strings.
	if( $start >= UTF8_HANGUL_FIRST && $end <= UTF8_HANGUL_LAST ) {
		/**
		 * We do not store Hangul syllables in the array
		 */
		continue;
	}

	// A missing value field is treated like any non-"M" value
	if( isset( $p[2] ) && $p[2] == 'M' ) {
		$val = UNICODE_QC_MAYBE;
	} else {
		$val = UNICODE_QC_NO;
	}

	if( $p[1] == 'NFKC_QC' ) {
		$file = 'UtfNormalDataK.inc';
	} else {
		$file = 'UtfNormalData.inc';
	}

	// "NFC_QC" => "utfCheckNFC", "NFKC_QC" => "utfCheckNFKC"
	$var = 'utfCheck' . substr( $p[1], 0, -3 );

	for ( $i = $start; $i <= $end; ++$i ) {
		$file_contents[$file][$var][cp_to_utf( $i )] = $val;
	}
}
fclose( $fp );
152
/**
 * Read UnicodeData.txt
 *
 * Collects two things from each record:
 *  - a non-empty, non-zero field 3 is stored as the codepoint's combining class;
 *  - the hex codepoints in field 5 are stored as a space-separated list of
 *    decimal codepoints in $map['NFKD'] and, when the field carries no
 *    "<tag>", in $map['NFD'] as well.
 */
echo "Loading Unicode decomposition mappings\n";
$fp = fopen( 'UnicodeData.txt', 'rt' );
if ( !$fp ) {
	print "\nCan't open UnicodeData.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
	exit( -1 );
}

$map = array();
while ( !feof( $fp ) ) {
	$fields = explode( ';', fgets( $fp, 1024 ) );
	$cp = hexdec( $fields[0] );

	// Store combining class > 0
	if ( !empty( $fields[3] ) ) {
		$file_contents['UtfNormalData.inc']['utfCombiningClass'][cp_to_utf( $cp )] = (int)$fields[3];
	}

	// Nothing more to do for records without a decomposition field ...
	if ( !isset( $fields[5] ) ) {
		continue;
	}
	// ... or whose decomposition field holds no codepoints once tags are stripped
	if ( !preg_match_all( '#[0-9A-F]+#', strip_tags( $fields[5] ), $matches ) ) {
		continue;
	}

	$decimal_seq = implode( ' ', array_map( 'hexdec', $matches[0] ) );

	// Every mapping takes part in compatibility decomposition; only the
	// untagged ones are canonical too.
	$map['NFKD'][$cp] = $decimal_seq;
	if ( !strpos( $fields[5], '>' ) ) {
		$map['NFD'][$cp] = $decimal_seq;
	}
}
fclose( $fp );
187
/**
 * Build the canonical composition table
 *
 * Maps each multi-codepoint canonical decomposition (as a UTF string) back
 * to its composed character. When several characters share a decomposition,
 * the first one encountered wins.
 */
echo "Generating the Canonical Composition table\n";
foreach ( $map['NFD'] as $composed_cp => $sequence ) {
	// Singletons are excluded from canonical composition
	$is_singleton = !strpos( $sequence, ' ' );
	if ( $is_singleton || isset( $exclude[$composed_cp] ) ) {
		continue;
	}

	// Encode the decomposition, codepoint by codepoint
	$utf_seq = '';
	foreach ( explode( ' ', $sequence ) as $part ) {
		$utf_seq .= cp_to_utf( $part );
	}

	if ( isset( $file_contents['UtfNormalData.inc']['utfCanonicalComp'][$utf_seq] ) ) {
		continue;
	}
	$file_contents['UtfNormalData.inc']['utfCanonicalComp'][$utf_seq] = cp_to_utf( $composed_cp );
}
206
/**
 * Fully expand the NFD and NFKD mappings and queue them for output
 *
 * NFKD mappings go to utfCompatibilityDecomp in UtfNormalDataK.inc, all
 * others to utfCanonicalDecomp in UtfNormalData.inc.
 */
echo "Generating the Canonical and Compatibility Decomposition tables\n\n";
foreach ( $map as $type => $decomp_map ) {
	// Replace each mapping by its recursive expansion, so that no codepoint
	// in a stored sequence has a mapping of its own any more.
	foreach ( $decomp_map as $cp => $decomp_seq ) {
		$decomp_map[$cp] = decompose( $decomp_map, $decomp_seq );
	}
	unset( $decomp_seq );

	// Pick the output file and variable for this normalization form
	$is_compat = ( $type == 'NFKD' );
	$file = $is_compat ? 'UtfNormalDataK.inc' : 'UtfNormalData.inc';
	$var = $is_compat ? 'utfCompatibilityDecomp' : 'utfCanonicalDecomp';

	// Store the table with both key and value as UTF strings
	foreach ( $decomp_map as $source_cp => $full_seq ) {
		$utf_seq = '';
		foreach ( explode( ' ', $full_seq ) as $part ) {
			$utf_seq .= cp_to_utf( $part );
		}
		$file_contents[$file][$var][cp_to_utf( $source_cp )] = $utf_seq;
	}
}
232
/**
 * Generate and/or alter the files
 *
 * Each entry of $file_contents becomes one PHP file that assigns every
 * collected table to $GLOBALS[<variable name>]. The script stops with a
 * non-zero exit status as soon as a file cannot be opened or written,
 * instead of carrying on with an invalid file handle.
 */
foreach ( $file_contents as $file => $contents ) {
	$php = '';
	foreach ( $contents as $var => $val ) {
		$php .= '$GLOBALS[' . my_var_export( $var ) . ']=' . my_var_export( $val ) . ";\n";
	}

	/**
	 * Generate a new file (overwrite if applicable)
	 */
	echo "Generating $file\n";

	if( !$fp = fopen( $file, 'wb' ) ) {
		trigger_error( 'Cannot open ' . $file . ' for write' );
		exit( -1 );
	}

	// The closing tag is split in two so it cannot end this script's own PHP block
	$output = '<?php
/**
 * This file was automatically generated -- do not edit!
 * Run UtfNormalGenerate.php to create this file again ( make clean && make )
 * @package MediaWiki
 */
' . $php . '?' . '>';

	if( fwrite( $fp, $output ) === false ) {
		fclose( $fp );
		trigger_error( 'Cannot write to ' . $file );
		exit( -1 );
	}
	fclose( $fp );
}

die( "All done!\n" );
262
263
264 ////////////////////////////////////////////////////////////////////////////////
265 // Internal functions //
266 ////////////////////////////////////////////////////////////////////////////////
267
/**
 * Decompose a sequence recursively
 *
 * Every codepoint of the sequence that has its own entry in the mapping is
 * replaced by the (recursively expanded) sequence it maps to; codepoints
 * without an entry are kept as they are.
 *
 * @param array $decomp_map Decomposition mapping (codepoint => sequence), passed by reference
 * @param string $decomp_seq Decomposition sequence as decimal codepoints separated with a space
 * @return string Decomposition sequence, fully decomposed
 */
function decompose( &$decomp_map, $decomp_seq ) {
	$expanded = array();

	foreach ( explode( ' ', $decomp_seq ) as $codepoint ) {
		$expanded[] = isset( $decomp_map[$codepoint] )
			? decompose( $decomp_map, $decomp_map[$codepoint] )
			: $codepoint;
	}

	return implode( ' ', $expanded );
}
289
/**
 * Convert a codepoint to a UTF char
 *
 * Produces one byte for codepoints up to 0x7F, two up to 0x7FF, three up to
 * 0xFFFF and four above that. No range validation is performed.
 *
 * @param integer $cp Unicode codepoint
 * @return string UTF string
 */
function cp_to_utf( $cp ) {
	// Pick the lead byte and the 6-bit payloads of the continuation bytes
	if ( $cp > 0xFFFF ) {
		$lead = 0xF0 | ( $cp >> 18 );
		$payloads = array( ( $cp >> 12 ) & 0x3F, ( $cp >> 6 ) & 0x3F, $cp & 0x3F );
	} elseif ( $cp > 0x7FF ) {
		$lead = 0xE0 | ( $cp >> 12 );
		$payloads = array( ( $cp >> 6 ) & 0x3F, $cp & 0x3F );
	} elseif ( $cp > 0x7F ) {
		$lead = 0xC0 | ( $cp >> 6 );
		$payloads = array( $cp & 0x3F );
	} else {
		$lead = $cp;
		$payloads = array();
	}

	$utf = chr( $lead );
	foreach ( $payloads as $bits ) {
		// Continuation bytes all have the form 10xxxxxx
		$utf .= chr( 0x80 | $bits );
	}

	return $utf;
}
307
/**
 * Return a parsable string representation of a variable
 *
 * This function is limited to arrays/strings/integers. Strings come back
 * single-quoted with backslashes and single quotes escaped, arrays as a
 * compact array(key=>value,...) expression; any other value is returned
 * unchanged.
 *
 * @param mixed $var Variable
 * @return string PHP code representing the variable
 */
function my_var_export( $var ) {
	if ( is_string( $var ) ) {
		$escaped = strtr( $var, array( '\\' => '\\\\', "'" => "\\'" ) );
		return "'" . $escaped . "'";
	}

	if ( !is_array( $var ) ) {
		return $var;
	}

	$pairs = array();
	foreach ( $var as $key => $value ) {
		$pairs[] = my_var_export( $key ) . '=>' . my_var_export( $value );
	}

	return 'array(' . implode( ',', $pairs ) . ')';
}
332
333 ?>