* Standardised file description headers
[lhc/web/wiklou.git] / includes / normal / UtfNormalGenerate.php
1 <?php
2 /**
 * This script generates UtfNormalData.inc and UtfNormalDataK.inc from the
 * Unicode Character Database and supplementary files.
5 *
6 * Copyright (C) 2004 Brion Vibber <brion@pobox.com>
7 * http://www.mediawiki.org/
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 * http://www.gnu.org/copyleft/gpl.html
23 *
24 * @file
25 * @ingroup UtfNormal
26 */
27
# This generator is only meant to be run from a shell; bail out under any
# other SAPI.
if( PHP_SAPI !== 'cli' ) {
	exit( "Run me from the command line please.\n" );
}
31
32 require_once 'UtfNormalUtil.php';
33
# Build the NFC quick-check table from DerivedNormalizationProps.txt.
# Only lines of the form "XXXX[..YYYY] ; NFC_QC ; M|N" are used; every code
# point in the (inclusive) range is mapped to its M/N value, keyed by the
# UTF-8 encoding of the character.
$in = fopen( "DerivedNormalizationProps.txt", "rt" );
if( !$in ) {
	print "Can't open DerivedNormalizationProps.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt\n";
	exit(-1);
}
print "Initializing normalization quick check tables...\n";
$checkNFC = array();
while( false !== ( $line = fgets( $in ) ) ) {
	$matches = array();
	# The range separator is a literal "..": the dots must be escaped,
	# otherwise they match any two characters.
	if( !preg_match( '/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(NFC_QC)\s*;\s*([MN])/', $line, $matches ) ) {
		continue;
	}
	$value = $matches[4];
	# Convert the bounds once instead of re-parsing the hex string on every
	# loop iteration. A missing upper bound means a single code point.
	$firstCode = hexdec( $matches[1] );
	$lastCode = ( $matches[2] === '' ) ? $firstCode : hexdec( $matches[2] );
	for( $i = $firstCode; $i <= $lastCode; $i++ ) {
		$checkNFC[codepointToUtf8( $i )] = $value;
	}
}
fclose( $in );
56
# Collect the characters listed in CompositionExclusions.txt. Any line that
# starts with a hexadecimal code point contributes one entry; the set is keyed
# by the UTF-8 encoding of the character.
$in = fopen( "CompositionExclusions.txt", "rt" );
if( $in === false ) {
	print "Can't open CompositionExclusions.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt\n";
	exit(-1);
}
$exclude = array();
for( $line = fgets( $in ); $line !== false; $line = fgets( $in ) ) {
	if( !preg_match( '/^([0-9A-F]+)/i', $line, $matches ) ) {
		# Comment, blank line, or anything else not starting with a code point
		continue;
	}
	$excludedChar = codepointToUtf8( hexdec( $matches[1] ) );
	$exclude[$excludedChar] = true;
}
fclose( $in );
73
# Parse UnicodeData.txt, one semicolon-separated record per line.
# Field 0 is the code point in hex, field 3 the canonical combining class and
# field 5 the decomposition mapping. The results are collected in:
#   $combiningClass      - char => non-zero combining class
#   $canonicalDecomp     - char => canonical decomposition
#   $compatibilityDecomp - char => decomposition (canonical and compatibility)
#   $canonicalComp       - canonical decomposition => char, unless excluded
$in = fopen( "UnicodeData.txt", "rt" );
if( !$in ) {
	print "Can't open UnicodeData.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
	exit(-1);
}

$compatibilityDecomp = array();
$canonicalDecomp = array();
$canonicalComp = array();
$combiningClass = array();
$total = 0;
$compat = 0;
$canon = 0;

print "Reading character definitions...\n";
while( false !== ( $line = fgets( $in ) ) ) {
	$columns = explode( ';', $line );
	if( count( $columns ) < 6 ) {
		# Blank or truncated record: the fields read below do not exist.
		continue;
	}
	$codepoint = $columns[0];
	$decompositionMapping = $columns[5];

	$source = codepointToUtf8( hexdec( $codepoint ) );

	# Compare as an integer: a loose string-to-number comparison treats
	# empty or non-numeric fields differently across PHP versions and could
	# store a combining class of 0.
	$canonicalCombiningClass = intval( $columns[3] );
	if( $canonicalCombiningClass !== 0 ) {
		$combiningClass[$source] = $canonicalCombiningClass;
	}

	if( $decompositionMapping === '' ) continue;
	if( preg_match( '/^<(.+)> (.*)$/', $decompositionMapping, $matches ) ) {
		# Compatibility decomposition: strip the leading "<tag> " and keep
		# only the code point sequence.
		$canonical = false;
		$decompositionMapping = $matches[2];
		$compat++;
	} else {
		$canonical = true;
		$canon++;
	}
	$total++;
	$dest = hexSequenceToUtf8( $decompositionMapping );

	# Every mapping goes into the compatibility table; only untagged
	# mappings also go into the canonical tables.
	$compatibilityDecomp[$source] = $dest;
	if( $canonical ) {
		$canonicalDecomp[$source] = $dest;
		if( empty( $exclude[$source] ) ) {
			$canonicalComp[$dest] = $source;
		}
	}
}
fclose( $in );
127
# Expand the canonical table to a fixed point: each pass replaces every
# multi-byte sequence (a byte 0xC0-0xFF followed by one or more bytes
# 0x80-0xBF) in a mapping's target with that sequence's own mapping, and the
# passes repeat until one of them changes nothing.
print "Recursively expanding canonical mappings...\n";
$pass = 1;
do {
	print "pass $pass\n";
	$changed = 0;
	foreach( $canonicalDecomp as $source => $dest ) {
		$expanded = preg_replace_callback(
			'/([\xc0-\xff][\x80-\xbf]+)/',
			'callbackCanonical',
			$dest );
		if( $expanded !== $dest ) {
			$canonicalDecomp[$source] = $expanded;
			$changed++;
		}
	}
	$pass++;
} while( $changed > 0 );
145
# Same fixed-point expansion for the compatibility table: keep substituting
# mapped multi-byte sequences inside each target until a whole pass leaves
# every entry untouched.
print "Recursively expanding compatibility mappings...\n";
$pass = 1;
do {
	print "pass $pass\n";
	$changed = 0;
	foreach( $compatibilityDecomp as $source => $dest ) {
		$expanded = preg_replace_callback(
			'/([\xc0-\xff][\x80-\xbf]+)/',
			'callbackCompat',
			$dest );
		if( $expanded !== $dest ) {
			$compatibilityDecomp[$source] = $expanded;
			$changed++;
		}
	}
	$pass++;
} while( $changed > 0 );

# Summary of the counters gathered while reading the character definitions.
print "$total decomposition mappings ($canon canonical, $compat compatibility)\n";
165
# Write UtfNormalData.inc: a PHP file that assigns the serialized combining
# class, canonical composition, canonical decomposition and NFC quick-check
# tables to the corresponding UtfNormal static properties.
$out = fopen( "UtfNormalData.inc", "wt" );
if( !$out ) {
	print "Can't create file UtfNormalData.inc\n";
	exit(-1);
}
$serCombining = escapeSingleString( serialize( $combiningClass ) );
$serComp = escapeSingleString( serialize( $canonicalComp ) );
$serCanon = escapeSingleString( serialize( $canonicalDecomp ) );
$serCheckNFC = escapeSingleString( serialize( $checkNFC ) );
# The opening tag is split in two so this literal never contains a PHP open
# tag itself.
$outdata = "<" . "?php
/**
 * This file was automatically generated -- do not edit!
 * Run UtfNormalGenerate.php to create this file again (make clean && make)
 *
 * @file
 */

UtfNormal::\$utfCombiningClass = unserialize( '$serCombining' );
UtfNormal::\$utfCanonicalComp = unserialize( '$serComp' );
UtfNormal::\$utfCanonicalDecomp = unserialize( '$serCanon' );
UtfNormal::\$utfCheckNFC = unserialize( '$serCheckNFC' );
\n";
# Do not report success if the write itself failed (e.g. disk full).
$written = fputs( $out, $outdata );
fclose( $out );
if( $written === false ) {
	print "Can't write to file UtfNormalData.inc\n";
	exit(-1);
}
print "Wrote out UtfNormalData.inc\n";
192
193
# Write UtfNormalDataK.inc: a PHP file that assigns the serialized
# compatibility decomposition table to UtfNormal::$utfCompatibilityDecomp.
# This is the last step; the script exits with status 0 once it succeeds.
$out = fopen( "UtfNormalDataK.inc", "wt" );
if( !$out ) {
	print "Can't create file UtfNormalDataK.inc\n";
	exit(-1);
}
$serCompat = escapeSingleString( serialize( $compatibilityDecomp ) );
# The opening tag is split in two so this literal never contains a PHP open
# tag itself.
$outdata = "<" . "?php
/**
 * This file was automatically generated -- do not edit!
 * Run UtfNormalGenerate.php to create this file again (make clean && make)
 *
 * @file
 */

UtfNormal::\$utfCompatibilityDecomp = unserialize( '$serCompat' );
\n";
# Do not report success (or exit 0) if the write itself failed.
$written = fputs( $out, $outdata );
fclose( $out );
if( $written === false ) {
	print "Can't write to file UtfNormalDataK.inc\n";
	exit(-1);
}
print "Wrote out UtfNormalDataK.inc\n";
exit(0);
215
216 # ---------------
217
/**
 * preg_replace_callback() handler for the canonical expansion pass.
 *
 * Looks the captured sequence up in the global $canonicalDecomp table.
 *
 * @param $matches Array: regex match; index 1 holds the captured sequence
 * @return String: the table entry for the sequence, or the sequence itself
 *   when the table has no entry for it
 */
function callbackCanonical( $matches ) {
	global $canonicalDecomp;
	$sequence = $matches[1];
	return isset( $canonicalDecomp[$sequence] )
		? $canonicalDecomp[$sequence]
		: $sequence;
}
225
/**
 * preg_replace_callback() handler for the compatibility expansion pass.
 *
 * Looks the captured sequence up in the global $compatibilityDecomp table.
 *
 * @param $matches Array: regex match; index 1 holds the captured sequence
 * @return String: the table entry for the sequence, or the sequence itself
 *   when the table has no entry for it
 */
function callbackCompat( $matches ) {
	global $compatibilityDecomp;
	$sequence = $matches[1];
	return isset( $compatibilityDecomp[$sequence] )
		? $compatibilityDecomp[$sequence]
		: $sequence;
}