d5ad18d85693f53a825967ee840e0bef4d95b9f3
[lhc/web/wiklou.git] / tests / phpunit / includes / normal / CleanUpTest.php
1 <?php
2 /**
3 * Tests for UtfNormal::cleanUp() function.
4 *
5 * Copyright © 2004 Brion Vibber <brion@pobox.com>
6 * http://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 */
25
26 /**
27 * Additional tests for UtfNormal::cleanUp() function, including
28 * regression checks for known problems.
29 * Requires PHPUnit.
30 *
31 * @ingroup UtfNormal
32 */
33 class CleanUpTest extends MediaWikiTestCase {
/**
 * Plain ASCII text contains nothing to clean, so UtfNormal::cleanUp()
 * is expected to hand it back unchanged.
 */
function testAscii() {
    $plain = 'This is plain ASCII text.';
    $expected = $plain;
    $actual = UtfNormal::cleanUp( $plain );

    $this->assertEquals( $expected, $actual );
}
39
/**
 * A NUL byte embedded in otherwise valid text is expected to be swapped
 * for the replacement character U+FFFD (bytes ef bf bd).
 *
 * Both sides are hex-encoded so that a failure report stays readable.
 */
function testNull() {
    $input = "a \x00 null";
    $expectedHex = bin2hex( "a \xef\xbf\xbd null" );
    $actualHex = bin2hex( UtfNormal::cleanUp( $input ) );

    $this->assertEquals( $expectedHex, $actualHex );
}
48
/**
 * Text using the precomposed letter U+00E9 (bytes c3 a9) is expected to
 * pass through UtfNormal::cleanUp() untouched.
 */
function testLatin() {
    $precomposed = "L'\xc3\xa9cole";
    $expected = $precomposed;
    $actual = UtfNormal::cleanUp( $precomposed );

    $this->assertEquals( $expected, $actual );
}
54
/**
 * An "e" followed by the combining acute accent U+0301 (bytes cc 81) is
 * expected to come back as the single precomposed letter U+00E9.
 */
function testLatinNormal() {
    $decomposed = "L'e\xcc\x81cole";
    $composed = "L'\xc3\xa9cole";
    $actual = UtfNormal::cleanUp( $decomposed );

    $this->assertEquals( $composed, $actual );
}
61
/**
 * Exhaustively checks UtfNormal::cleanUp() against every code point from
 * U+0000 up to and including UNICODE_MAX.
 *
 * This test is *very* expensive! Its name deliberately does not start
 * with "test", so PHPUnit's test discovery does not run it; rename it to
 * testAllChars to execute it by hand.
 *
 * Expectation per code point:
 *  - tab, LF, CR and everything above U+001F that is neither a surrogate
 *    nor U+FFFE/U+FFFF: equal to UtfNormal::NFC() of the character when it
 *    has an entry in the canonical composition or decomposition tables,
 *    otherwise byte-for-byte unchanged;
 *  - anything else: replaced by UTF8_REPLACEMENT.
 */
function XtestAllChars() {
    $rep = UTF8_REPLACEMENT;
    # Inclusive upper bound: the range check below explicitly allows for
    # $i == UNICODE_MAX, so the last code point has to be visited as well.
    for ( $i = 0x0; $i <= UNICODE_MAX; $i++ ) {
        $char = codepointToUtf8( $i );
        $clean = UtfNormal::cleanUp( $char );
        $x = sprintf( "%04X", $i );
        if ( $i % 0x1000 == 0 ) {
            # Progress marker, one line per 0x1000 code points
            echo "U+$x\n";
        }

        $isAllowed = $i == 0x0009 ||
            $i == 0x000a ||
            $i == 0x000d ||
            ( $i > 0x001f && $i < UNICODE_SURROGATE_FIRST ) ||
            ( $i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
            ( $i > 0xffff && $i <= UNICODE_MAX );

        if ( !$isAllowed ) {
            # assertSame: the hex strings must match exactly, never merely
            # compare equal as numbers.
            $this->assertSame( bin2hex( $rep ), bin2hex( $clean ), $x );
            continue;
        }

        if ( isset( UtfNormal::$utfCanonicalComp[$char] ) ||
            isset( UtfNormal::$utfCanonicalDecomp[$char] )
        ) {
            $comp = UtfNormal::NFC( $char );
            $this->assertSame(
                bin2hex( $comp ),
                bin2hex( $clean ),
                "U+$x should be normalized to NFC" );
        } else {
            $this->assertSame(
                bin2hex( $char ),
                bin2hex( $clean ),
                "U+$x should be intact" );
        }
    }
}
96
/**
 * Drives doTestBytes() with every combination of an empty or 'x' prefix
 * and an empty or 'x' suffix around the byte under test.
 */
function testAllBytes() {
    $contexts = array(
        array( '', '' ),
        array( 'x', '' ),
        array( '', 'x' ),
        array( 'x', 'x' ),
    );
    foreach ( $contexts as $context ) {
        list( $head, $tail ) = $context;
        $this->doTestBytes( $head, $tail );
    }
}
104
/**
 * Feeds every single byte value (0x00-0xff), wrapped in $head and $tail,
 * through UtfNormal::cleanUp().
 *
 * Tab, LF, CR and the bytes 0x20-0x7f are expected to come back unchanged;
 * every other byte is expected to be replaced by UTF8_REPLACEMENT.
 *
 * A failing assertion throws and ends the test, so no explicit early
 * return is needed after the assertions.
 *
 * @param string $head Text placed in front of the byte under test
 * @param string $tail Text placed behind the byte under test
 */
function doTestBytes( $head, $tail ) {
    for ( $i = 0x0; $i < 256; $i++ ) {
        $char = $head . chr( $i ) . $tail;
        $clean = UtfNormal::cleanUp( $char );
        $x = sprintf( "%02X", $i );

        $isAllowed = $i == 0x0009 ||
            $i == 0x000a ||
            $i == 0x000d ||
            ( $i > 0x001f && $i < 0x80 );

        if ( $isAllowed ) {
            $norm = $char;
            $message = "ASCII byte $x should be intact";
        } else {
            $norm = $head . UTF8_REPLACEMENT . $tail;
            $message = "Forbidden byte $x should be rejected";
        }

        # assertSame: hex strings such as "30e3" look numeric, so they must
        # be compared as exact strings rather than loosely.
        $this->assertSame( bin2hex( $norm ), bin2hex( $clean ), $message );
    }
}
130
/**
 * Drives doTestDoubleBytes() with every combination of an empty or 'x'
 * prefix and an empty or 'x' suffix around the byte pair under test.
 */
function testDoubleBytes() {
    $contexts = array(
        array( '', '' ),
        array( 'x', '' ),
        array( '', 'x' ),
        array( 'x', 'x' ),
    );
    foreach ( $contexts as $context ) {
        list( $head, $tail ) = $context;
        $this->doTestDoubleBytes( $head, $tail );
    }
}
138
/**
 * Feeds sampled two-byte sequences, wrapped in $head and $tail, through
 * UtfNormal::cleanUp().
 *
 * The first byte steps through 0xc0-0xfe and the second through 0x80-0xfe,
 * both in increments of two.
 *
 * A failing assertion throws and ends the test, so no explicit early
 * return is needed after the assertions.
 *
 * @param string $head Text placed in front of the pair under test
 * @param string $tail Text placed behind the pair under test
 */
function doTestDoubleBytes( $head, $tail ) {
    for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
        for ( $second = 0x80; $second < 0x100; $second += 2 ) {
            $char = $head . chr( $first ) . chr( $second ) . $tail;
            $clean = UtfNormal::cleanUp( $char );
            $x = sprintf( "%02X,%02X", $first, $second );

            if ( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
                # Two-byte head (c2-df) followed by a tail byte (80-bf):
                # only normalization may change it.
                $norm = UtfNormal::NFC( $char );
                $message = "Pair $x should be intact";
            } elseif ( $first > 0xfd || $second > 0xbf ) {
                # fe and ff are not legal head bytes, and a second byte
                # above bf is not a tail byte either -- neither byte
                # belongs to the other, so expect two replacement chars
                $norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
                $message = "Forbidden pair $x should be rejected";
            } else {
                # Remaining pairs collapse into a single replacement char
                $norm = $head . UTF8_REPLACEMENT . $tail;
                $message = "Forbidden pair $x should be rejected";
            }

            # assertSame: hex strings can look numeric, so they must be
            # compared as exact strings rather than loosely.
            $this->assertSame( bin2hex( $norm ), bin2hex( $clean ), $message );
        }
    }
}
176
/**
 * Drives doTestTripleBytes() with every combination of an empty or 'x'
 * prefix and an empty or 'x' suffix around the byte triplet under test.
 */
function testTripleBytes() {
    $contexts = array(
        array( '', '' ),
        array( 'x', '' ),
        array( '', 'x' ),
        array( 'x', 'x' ),
    );
    foreach ( $contexts as $context ) {
        list( $head, $tail ) = $context;
        $this->doTestTripleBytes( $head, $tail );
    }
}
184
/**
 * Feeds sampled three-byte sequences, wrapped in $head and $tail, through
 * UtfNormal::cleanUp() and checks what comes back.
 *
 * The first byte steps through 0xc0-0xfe and the second through 0x80-0xfe,
 * both in increments of two. The third byte is pinned to 0x80; raising the
 * innermost loop bound to 0x100 would sweep all third bytes.
 *
 * @param string $head Text placed in front of the triplet under test
 * @param string $tail Text placed behind the triplet under test
 */
function doTestTripleBytes( $head, $tail ) {
    # Expected results for one, two or three rejected units
    $oneReplacement = $head . UTF8_REPLACEMENT . $tail;
    $twoReplacements = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
    $threeReplacements = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;

    for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
        for ( $second = 0x80; $second < 0x100; $second += 2 ) {
            for ( $third = 0x80; $third < 0x81; $third++ ) {
                $b1 = chr( $first );
                $b2 = chr( $second );
                $b3 = chr( $third );
                $char = $head . $b1 . $b2 . $b3 . $tail;
                $clean = UtfNormal::cleanUp( $char );
                $x = sprintf( "%02X,%02X,%02X", $first, $second, $third );

                # Bytes below c0 are in the tail-byte range (80-bf here)
                $secondIsTail = $second < 0xc0;
                $thirdIsTail = $third < 0xc0;

                if ( $first >= 0xe0 && $first < 0xf0 && $secondIsTail && $thirdIsTail ) {
                    # Structurally a three-byte sequence
                    if ( $first == 0xe0 && $second < 0xa0 ) {
                        $expected = $oneReplacement;
                        $message = "Overlong triplet $x should be rejected";
                    } elseif ( $first == 0xed && ( $b1 . $b2 . $b3 ) >= UTF8_SURROGATE_FIRST ) {
                        $expected = $oneReplacement;
                        $message = "Surrogate triplet $x should be rejected";
                    } else {
                        $expected = UtfNormal::NFC( $char );
                        $message = "Triplet $x should be intact";
                    }
                } elseif ( $first > 0xc1 && $first < 0xe0 && $secondIsTail ) {
                    $expected = UtfNormal::NFC( $head . $b1 . $b2 ) . UTF8_REPLACEMENT . $tail;
                    $message = "Valid 2-byte $x + broken tail";
                } elseif ( $second > 0xc1 && $second < 0xe0 && $thirdIsTail ) {
                    $expected = $head . UTF8_REPLACEMENT . UtfNormal::NFC( $b2 . $b3 . $tail );
                    $message = "Broken head + valid 2-byte $x";
                } elseif ( ( $first > 0xfd || $second > 0xfd ) &&
                    ( ( !$secondIsTail && !$thirdIsTail ) ||
                        ( $secondIsTail && $thirdIsTail ) ||
                        ( $second > 0xfd ) ||
                        ( $third > 0xfd ) )
                ) {
                    # fe and ff are not legal head bytes -- expect three replacement chars
                    $expected = $threeReplacements;
                    $message = "Forbidden triplet $x should be rejected";
                } elseif ( $first > 0xc2 && $secondIsTail && $thirdIsTail ) {
                    $expected = $oneReplacement;
                    $message = "Forbidden triplet $x should be rejected";
                } else {
                    $expected = $twoReplacements;
                    $message = "Forbidden triplet $x should be rejected";
                }

                $this->assertEquals( bin2hex( $expected ), bin2hex( $clean ), $message );
            }
        }
    }
}
250
/**
 * Regression check against a chunking bug: stray bytes scattered between
 * valid characters are each expected to become one U+FFFD while the valid
 * characters around them survive.
 */
function testChunkRegression() {
    $fffd = "\xef\xbf\xbd";

    # Each row: raw input bytes, then what cleanUp() should make of them
    $chunks = array(
        array( "\x46\x55\xb8", "\x46\x55" . $fffd ),
        array( "\xdc\x96", "\xdc\x96" ),
        array( "\xee", $fffd ),
        array( "\xe7", $fffd ),
        array( "\x44", "\x44" ),
        array( "\xaa", $fffd ),
        array( "\x2f\x25", "\x2f\x25" ),
    );

    $text = '';
    $expect = '';
    foreach ( $chunks as $chunk ) {
        $text .= $chunk[0];
        $expect .= $chunk[1];
    }

    $expectedHex = bin2hex( $expect );
    $actualHex = bin2hex( UtfNormal::cleanUp( $text ) );
    $this->assertEquals( $expectedHex, $actualHex );
}
273
/**
 * Regression check: bad tail and head bytes interposed between ASCII
 * characters are each expected to become one U+FFFD, and the valid
 * two-byte character at the end is expected to survive.
 */
function testInterposeRegression() {
    $fffd = "\xef\xbf\xbd";

    # Each row: raw input bytes, then what cleanUp() should make of them
    $chunks = array(
        array( "\x4e\x30", "\x4e\x30" ),
        array( "\xb1", $fffd ), # bad tail
        array( "\x3a", "\x3a" ),
        array( "\x92", $fffd ), # bad tail
        array( "\x62\x3a", "\x62\x3a" ),
        array( "\x84", $fffd ), # bad tail
        array( "\x43", "\x43" ),
        array( "\xc6", $fffd ), # bad head
        array( "\x3f", "\x3f" ),
        array( "\x92", $fffd ), # bad tail
        array( "\xad", $fffd ), # bad tail
        array( "\x7d", "\x7d" ),
        array( "\xd9\x95", "\xd9\x95" ),
    );

    $text = '';
    $expect = '';
    foreach ( $chunks as $chunk ) {
        $text .= $chunk[0];
        $expect .= $chunk[1];
    }

    $expectedHex = bin2hex( $expect );
    $actualHex = bin2hex( UtfNormal::cleanUp( $text ) );
    $this->assertEquals( $expectedHex, $actualHex );
}
308
/**
 * Regression check mixing forbidden ASCII control bytes, stray head and
 * tail bytes and an overlong two-byte sequence. Each of them, the overlong
 * pair counted as a single unit, is expected to become one U+FFFD.
 */
function testOverlongRegression() {
    $text = "\x67";
    $text .= "\x1a"; # forbidden ascii
    $text .= "\xea"; # bad head
    $text .= "\xc1\xa6"; # overlong sequence
    $text .= "\xad"; # bad tail
    $text .= "\x1c"; # forbidden ascii
    $text .= "\xb0"; # bad tail
    $text .= "\x3c";
    $text .= "\x9e"; # bad tail

    $fffd = "\xef\xbf\xbd";
    # Six rejected units between the "g" and the "<", one more at the end
    $expect = "\x67" . str_repeat( $fffd, 6 ) . "\x3c" . $fffd;

    $expectedHex = bin2hex( $expect );
    $actualHex = bin2hex( UtfNormal::cleanUp( $text ) );
    $this->assertEquals( $expectedHex, $actualHex );
}
333
/**
 * Regression check: an encoded surrogate followed by three stray tail
 * bytes is expected to yield four U+FFFD characters, one for the
 * surrogate triplet and one per stray byte.
 */
function testSurrogateRegression() {
    $text = "\xed\xb4\x96"; # surrogate 0xDD16
    $text .= "\x83"; # bad tail
    $text .= "\xb4"; # bad tail
    $text .= "\xac"; # bad tail (ac is in the tail-byte range 80-bf)

    $expect = str_repeat( "\xef\xbf\xbd", 4 );

    $expectedHex = bin2hex( $expect );
    $actualHex = bin2hex( UtfNormal::cleanUp( $text ) );
    $this->assertEquals( $expectedHex, $actualHex );
}
348
/**
 * Regression check: U+FFFE, a stray tail byte and a head byte with no
 * tail are each expected to become one U+FFFD; the trailing "Y" is
 * expected to survive.
 */
function testBomRegression() {
    $text = "\xef\xbf\xbe"; # U+FFFE, illegal char
    $text .= "\xb2"; # bad tail
    $text .= "\xef"; # bad head
    $text .= "\x59";

    $expect = str_repeat( "\xef\xbf\xbd", 3 ) . "\x59";

    $expectedHex = bin2hex( $expect );
    $actualHex = bin2hex( UtfNormal::cleanUp( $text ) );
    $this->assertEquals( $expectedHex, $actualHex );
}
363
/**
 * Regression check: the illegal character U+FFFF (bytes ef bf bf) is
 * expected to be replaced by U+FFFD (bytes ef bf bd).
 */
function testForbiddenRegression() {
    $illegal = "\xef\xbf\xbf"; # U+FFFF, illegal char
    $expectedHex = bin2hex( "\xef\xbf\xbd" );
    $actualHex = bin2hex( UtfNormal::cleanUp( $illegal ) );

    $this->assertEquals( $expectedHex, $actualHex );
}
372
/**
 * Regression check: a Hangul character followed by another final jamo
 * is expected to be left exactly as it is.
 */
function testHangulRegression() {
    $syllable = "\xed\x9c\xaf"; # Hangul char
    $finalJamo = "\xe1\x87\x81"; # followed by another final jamo
    $text = $syllable . $finalJamo;

    # Should *not* change.
    $expectedHex = bin2hex( $text );
    $actualHex = bin2hex( UtfNormal::cleanUp( $text ) );
    $this->assertEquals( $expectedHex, $actualHex );
}
382 }