Added an explanation for the way the code is set up.
[lhc/web/wiklou.git] / includes / normal / UtfNormalTest2.php
1 #!/usr/bin/php
2 <?php
3
4 if( php_sapi_name() != 'cli' ) {
5 die( "Run me from the command line please.\n" );
6 }
7
8 // From http://unicode.org/Public/UNIDATA/NormalizationTest.txt
9 $file = "NormalizationTest.txt";
10 $sep = ';';
11 $comment = "#";
12 $f = fopen($file, "r");
13
14 /**
15 * The following section will be used for testing different normalization methods.
16 * - Pure PHP
17 ~ no assertion errors
18 ~ 6.25 minutes
19
20 * - php_utfnormal.so or intl extension: both are wrappers around
21 libicu so we list the version of libicu when making the
22 comparison
23
24 * - libicu Ubuntu 3.8.1-3ubuntu1.1 php 5.2.6-3ubuntu4.5
25 ~ 2200 assertion errors
26 ~ 5 seconds
27 ~ output: http://paste2.org/p/921566
28
29 * - libicu Ubuntu 4.2.1-3 php 5.3.2-1ubuntu4.2
30 ~ 1384 assertion errors
31 ~ 15 seconds
32 ~ output: http://paste2.org/p/921435
33
34 * - libicu Debian 4.4.1-5 php 5.3.2-1ubuntu4.2
35 ~ no assertion errors
36 ~ 13 seconds
37
38 * - Tests comparing pure PHP output with libicu output were added
39 later and slow down the runtime.
40 */
41
42 require_once("./UtfNormal.php");
43 function normalize_form_c($c) { return UtfNormal::toNFC($c); }
44 function normalize_form_d($c) { return UtfNormal::toNFD($c); }
45 function normalize_form_kc($c) { return UtfNormal::toNFKC($c); }
46 function normalize_form_kd($c) { return UtfNormal::toNFKD($c); }
47
48 /**
49 * This set of functions is only useful if youve added a param to the
50 * following functions to force pure PHP usage. I decided not to
51 * commit that code since might produce a slowdown in the UTF
52 * normalization code just for the sake of these tests. -- hexmode
53 */
54 function normalize_form_c_php($c) { return UtfNormal::toNFC($c, "php"); }
55 function normalize_form_d_php($c) { return UtfNormal::toNFD($c, "php"); }
56 function normalize_form_kc_php($c) { return UtfNormal::toNFKC($c, "php"); }
57 function normalize_form_kd_php($c) { return UtfNormal::toNFKD($c, "php"); }
58
59 assert_options(ASSERT_ACTIVE, 1);
60 assert_options(ASSERT_WARNING, 0);
61 assert_options(ASSERT_QUIET_EVAL, 1);
62 assert_options(ASSERT_CALLBACK, 'my_assert');
63
64 function my_assert( $file, $line, $code ) {
65 global $col, $count, $lineNo;
66 echo "Assertion that '$code' failed on line $lineNo ($col[5])\n";
67 }
68
69 $count = 0;
70 $lineNo = 0;
71 if( $f !== false ) {
72 while( ( $col = getRow( $f ) ) !== false ) {
73 $lineNo++;
74
75 if(count($col) == 6) {
76 $count++;
77 if( $count % 100 === 0 ) echo "Count: $count\n";
78 } else {
79 continue;
80 }
81
82 # verify that the pure PHP version is correct
83 $NFCc1 = normalize_form_c($col[0]);
84 $NFCc1p = normalize_form_c_php($col[0]);
85 assert('$NFCc1 === $NFCc1p');
86 $NFCc2 = normalize_form_c($col[1]);
87 $NFCc2p = normalize_form_c_php($col[1]);
88 assert('$NFCc2 === $NFCc2p');
89 $NFCc3 = normalize_form_c($col[2]);
90 $NFCc3p = normalize_form_c_php($col[2]);
91 assert('$NFCc3 === $NFCc3p');
92 $NFCc4 = normalize_form_c($col[3]);
93 $NFCc4p = normalize_form_c_php($col[3]);
94 assert('$NFCc4 === $NFCc4p');
95 $NFCc5 = normalize_form_c($col[4]);
96 $NFCc5p = normalize_form_c_php($col[4]);
97 assert('$NFCc5 === $NFCc5p');
98
99 $NFDc1 = normalize_form_d($col[0]);
100 $NFDc1p = normalize_form_d_php($col[0]);
101 assert('$NFDc1 === $NFDc1p');
102 $NFDc2 = normalize_form_d($col[1]);
103 $NFDc2p = normalize_form_d_php($col[1]);
104 assert('$NFDc2 === $NFDc2p');
105 $NFDc3 = normalize_form_d($col[2]);
106 $NFDc3p = normalize_form_d_php($col[2]);
107 assert('$NFDc3 === $NFDc3p');
108 $NFDc4 = normalize_form_d($col[3]);
109 $NFDc4p = normalize_form_d_php($col[3]);
110 assert('$NFDc4 === $NFDc4p');
111 $NFDc5 = normalize_form_d($col[4]);
112 $NFDc5p = normalize_form_d_php($col[4]);
113 assert('$NFDc5 === $NFDc5p');
114
115 $NFKDc1 = normalize_form_kd($col[0]);
116 $NFKDc1p = normalize_form_kd_php($col[0]);
117 assert('$NFKDc1 === $NFKDc1p');
118 $NFKDc2 = normalize_form_kd($col[1]);
119 $NFKDc2p = normalize_form_kd_php($col[1]);
120 assert('$NFKDc2 === $NFKDc2p');
121 $NFKDc3 = normalize_form_kd($col[2]);
122 $NFKDc3p = normalize_form_kd_php($col[2]);
123 assert('$NFKDc3 === $NFKDc3p');
124 $NFKDc4 = normalize_form_kd($col[3]);
125 $NFKDc4p = normalize_form_kd_php($col[3]);
126 assert('$NFKDc4 === $NFKDc4p');
127 $NFKDc5 = normalize_form_kd($col[4]);
128 $NFKDc5p = normalize_form_kd_php($col[4]);
129 assert('$NFKDc5 === $NFKDc5p');
130
131 $NFKCc1 = normalize_form_kc($col[0]);
132 $NFKCc1p = normalize_form_kc_php($col[0]);
133 assert('$NFKCc1 === $NFKCc1p');
134 $NFKCc2 = normalize_form_kc($col[1]);
135 $NFKCc2p = normalize_form_kc_php($col[1]);
136 assert('$NFKCc2 === $NFKCc2p');
137 $NFKCc3 = normalize_form_kc($col[2]);
138 $NFKCc3p = normalize_form_kc_php($col[2]);
139 assert('$NFKCc3 === $NFKCc3p');
140 $NFKCc4 = normalize_form_kc($col[3]);
141 $NFKCc4p = normalize_form_kc_php($col[3]);
142 assert('$NFKCc4 === $NFKCc4p');
143 $NFKCc5 = normalize_form_kc($col[4]);
144 $NFKCc5p = normalize_form_kc_php($col[4]);
145 assert('$NFKCc5 === $NFKCc5p');
146
147 # c2 == NFC(c1) == NFC(c2) == NFC(c3)
148 assert('$col[1] === $NFCc1');
149 assert('$col[1] === $NFCc2');
150 assert('$col[1] === $NFCc3');
151
152 # c4 == NFC(c4) == NFC(c5)
153 assert('$col[3] === $NFCc4');
154 assert('$col[3] === $NFCc5');
155
156 # c3 == NFD(c1) == NFD(c2) == NFD(c3)
157 assert('$col[2] === $NFDc1');
158 assert('$col[2] === $NFDc2');
159 assert('$col[2] === $NFDc3');
160
161 # c5 == NFD(c4) == NFD(c5)
162 assert('$col[4] === $NFDc4');
163 assert('$col[4] === $NFDc5');
164
165 # c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
166 assert('$col[3] === $NFKCc1');
167 assert('$col[3] === $NFKCc2');
168 assert('$col[3] === $NFKCc3');
169 assert('$col[3] === $NFKCc4');
170 assert('$col[3] === $NFKCc5');
171
172 # c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
173 assert('$col[4] === $NFKDc1');
174 assert('$col[4] === $NFKDc2');
175 assert('$col[4] === $NFKDc3');
176 assert('$col[4] === $NFKDc4');
177 assert('$col[4] === $NFKDc5');
178 }
179 }
180 echo "done.\n";
181
182 // Compare against http://en.wikipedia.org/wiki/UTF-8#Description
183 function unichr($c) {
184 if ($c <= 0x7F) {
185 return chr($c);
186 } else if ($c <= 0x7FF) {
187 return chr(0xC0 | $c >> 6) . chr(0x80 | $c & 0x3F);
188 } else if ($c <= 0xFFFF) {
189 return chr(0xE0 | $c >> 12) . chr(0x80 | $c >> 6 & 0x3F)
190 . chr(0x80 | $c & 0x3F);
191 } else if ($c <= 0x10FFFF) {
192 return chr(0xF0 | $c >> 18) . chr(0x80 | $c >> 12 & 0x3F)
193 . chr(0x80 | $c >> 6 & 0x3F)
194 . chr(0x80 | $c & 0x3F);
195 } else {
196 return false;
197 }
198 }
199
200 function unistr($c) {
201 return implode("", array_map("unichr", array_map("hexdec", explode(" ", $c))));
202 }
203
204 function getRow( $f ) {
205 global $comment, $sep;
206
207 $row = fgets( $f );
208 if( $row === false ) return false;
209 $row = rtrim($row);
210 $pos = strpos( $row, $comment );
211 $pos2 = strpos( $row, ")" );
212 if( $pos === 0 ) return array($row);
213 $c = "";
214
215 if( $pos ) {
216 if($pos2) $c = substr( $row, $pos2 + 2 );
217 else $c = substr( $row, $pos );
218 $row = substr( $row, 0, $pos );
219 }
220
221 $ret = array();
222 foreach(explode( $sep, $row ) as $ent) {
223 if(trim($ent) !== "") {
224 $ret[] = unistr($ent);
225 }
226 }
227 $ret[] = $c;
228
229 return $ret;
230 }