af83767e57599f5be269e00562ec60b746577ac0
[lhc/web/wiklou.git] / tests / phpunit / includes / normal / CleanUpTest.php
1 <?php
2 /**
3 * Tests for UtfNormal::cleanUp() function.
4 *
5 * Copyright © 2004 Brion Vibber <brion@pobox.com>
6 * https://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 */
25
26 /**
27 * Additional tests for the UtfNormal::cleanUp() function, including
28 * regression checks for known problems.
29 * Requires PHPUnit.
30 *
31 * @ingroup UtfNormal
32 * @group Large
33 *
34 * @todo covers tags, will be UtfNormal::cleanUp once the below is resolved
35 * @todo split me into test methods and providers per the below comment
36 * @todo Document individual tests
37 *
38 * We ignore code coverage for this test suite until they are rewritten
39 * to use data providers (bug 46561).
40 * @codeCoverageIgnore
41 */
42 class CleanUpTest extends MediaWikiTestCase {
/**
 * Plain ASCII input is expected to come back from cleanUp() unchanged.
 */
public function testAscii() {
	$plain = 'This is plain ASCII text.';
	$cleaned = UtfNormal::cleanUp( $plain );

	$this->assertEquals( $plain, $cleaned );
}
47
/**
 * A NUL byte in the input is expected to be swapped for the bytes
 * EF BF BD (U+FFFD), leaving the surrounding text alone.
 */
public function testNull() {
	$wanted = "a \xef\xbf\xbd null";
	$actual = UtfNormal::cleanUp( "a \x00 null" );

	// Compare hex dumps so that a failure message stays readable.
	$this->assertEquals( bin2hex( $wanted ), bin2hex( $actual ) );
}
55
/**
 * Text containing the two-byte sequence C3 A9 (precomposed e-acute)
 * is expected to pass through cleanUp() untouched.
 */
public function testLatin() {
	$precomposed = "L'\xc3\xa9cole";
	$cleaned = UtfNormal::cleanUp( $precomposed );

	$this->assertEquals( $precomposed, $cleaned );
}
60
/**
 * "e" followed by CC 81 (combining acute accent) is expected to be
 * composed into the single precomposed character C3 A9.
 */
public function testLatinNormal() {
	$composed = "L'\xc3\xa9cole";
	$cleaned = UtfNormal::cleanUp( "L'e\xcc\x81cole" );

	$this->assertEquals( $composed, $cleaned );
}
66
/**
 * Runs UtfNormal::cleanUp() on every single code point from 0 up to and
 * including UNICODE_MAX and checks the result.
 *
 * This test is *very* expensive! Its name does not start with "test",
 * so it only runs when invoked on purpose.
 *
 * Expectations encoded below:
 *  - tab, LF, CR and every code point above 0x1f that is neither a
 *    surrogate nor 0xfffe/0xffff must survive; if the character has an
 *    entry in one of the canonical (de)composition tables the result must
 *    equal UtfNormal::NFC() of it, otherwise it must be byte-identical;
 *  - everything else must be turned into UTF8_REPLACEMENT.
 */
public function XtestAllChars() {
	$rep = UTF8_REPLACEMENT;
	// The bound is inclusive: UNICODE_MAX itself is a code point that the
	// "$i <= UNICODE_MAX" clause below is written to accept.
	for ( $i = 0x0; $i <= UNICODE_MAX; $i++ ) {
		$char = codepointToUtf8( $i );
		$clean = UtfNormal::cleanUp( $char );
		$x = sprintf( "%04X", $i );

		// Progress marker every 0x1000 code points.
		if ( $i % 0x1000 == 0 ) {
			echo "U+$x\n";
		}

		if ( $i == 0x0009 ||
			$i == 0x000a ||
			$i == 0x000d ||
			( $i > 0x001f && $i < UNICODE_SURROGATE_FIRST ) ||
			( $i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
			( $i > 0xffff && $i <= UNICODE_MAX )
		) {
			if ( isset( UtfNormal::$utfCanonicalComp[$char] )
				|| isset( UtfNormal::$utfCanonicalDecomp[$char] )
			) {
				// The character takes part in canonical (de)composition,
				// so the reference value is its NFC form.
				$comp = UtfNormal::NFC( $char );
				// assertSame: both sides are hex strings; a strict check
				// rules out PHP's numeric-string juggling.
				$this->assertSame(
					bin2hex( $comp ),
					bin2hex( $clean ),
					"U+$x should be NFC-normalized" );
			} else {
				$this->assertSame(
					bin2hex( $char ),
					bin2hex( $clean ),
					"U+$x should be intact" );
			}
		} else {
			$this->assertSame( bin2hex( $rep ), bin2hex( $clean ), $x );
		}
	}
}
107
/**
 * Data provider: every combination of an empty or one-character ("x")
 * prefix and suffix, as array( $head, $tail ) rows.
 *
 * @return array
 */
public static function provideAllBytes() {
	$paddings = array( '', 'x' );
	$cases = array();

	// The head varies fastest, giving ('',''), ('x',''), ('','x'), ('x','x').
	foreach ( $paddings as $tail ) {
		foreach ( $paddings as $head ) {
			$cases[] = array( $head, $tail );
		}
	}

	return $cases;
}
116
/**
 * Feeds every possible single byte (0x00-0xff) through cleanUp(),
 * wrapped in the given prefix and suffix.
 *
 * Tab, LF, CR and the bytes 0x20-0x7f must come back unchanged; any
 * other byte must be replaced by UTF8_REPLACEMENT while $head and $tail
 * stay in place.
 *
 * @dataProvider provideAllBytes
 * @param string $head Text placed before the byte under test
 * @param string $tail Text placed after the byte under test
 */
public function testBytes( $head, $tail ) {
	for ( $i = 0x0; $i < 256; $i++ ) {
		$char = $head . chr( $i ) . $tail;
		$clean = UtfNormal::cleanUp( $char );
		$x = sprintf( "%02X", $i );

		// A failed assertion throws, which already stops the loop at the
		// first bad byte; no explicit early return is needed.
		// assertSame: both sides are hex strings; a strict check rules out
		// PHP's numeric-string juggling.
		if ( $i == 0x0009 ||
			$i == 0x000a ||
			$i == 0x000d ||
			( $i > 0x001f && $i < 0x80 )
		) {
			$this->assertSame(
				bin2hex( $char ),
				bin2hex( $clean ),
				"ASCII byte $x should be intact" );
		} else {
			$norm = $head . UTF8_REPLACEMENT . $tail;
			$this->assertSame(
				bin2hex( $norm ),
				bin2hex( $clean ),
				"Forbidden byte $x should be rejected" );
		}
	}
}
150
/**
 * Feeds two-byte combinations through cleanUp(), wrapped in the given
 * prefix and suffix.
 *
 * The first byte runs over 0xc0-0xfe and the second over 0x80-0xfe, both
 * in steps of two, so only even byte values are sampled.
 *
 * @dataProvider provideAllBytes
 * @param string $head Text placed before the byte pair
 * @param string $tail Text placed after the byte pair
 */
public function testDoubleBytes( $head, $tail ) {
	for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
		for ( $second = 0x80; $second < 0x100; $second += 2 ) {
			$char = $head . chr( $first ) . chr( $second ) . $tail;
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%02X,%02X", $first, $second );

			// A failed assertion throws, which already stops the loops at
			// the first bad pair; no explicit early return is needed.
			// assertSame: both sides are hex strings; a strict check rules
			// out PHP's numeric-string juggling.
			if ( $first > 0xc1 &&
				$first < 0xe0 &&
				$second < 0xc0
			) {
				// Two-byte lead (0xc2-0xdf) plus continuation byte: a valid
				// character, which only has to match its NFC form.
				$norm = UtfNormal::NFC( $char );
				$this->assertSame(
					bin2hex( $norm ),
					bin2hex( $clean ),
					"Pair $x should be intact" );
			} elseif ( $first > 0xfd || $second > 0xbf ) {
				# fe and ff are not legal head bytes, and a second byte above
				# 0xbf is not a continuation byte -- expect two replacement chars
				$norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
				$this->assertSame(
					bin2hex( $norm ),
					bin2hex( $clean ),
					"Forbidden pair $x should be rejected" );
			} else {
				# Remaining cases: first byte 0xc0, or 0xe0 and above, followed
				# by a continuation byte -- expect a single replacement char
				$norm = $head . UTF8_REPLACEMENT . $tail;
				$this->assertSame(
					bin2hex( $norm ),
					bin2hex( $clean ),
					"Forbidden pair $x should be rejected" );
			}
		}
	}
}
195
/**
 * Feeds three-byte combinations through cleanUp(), wrapped in the given
 * prefix and suffix.
 *
 * The first byte runs over 0xc0-0xfe and the second over 0x80-0xfe, both
 * in steps of two. The third byte is pinned to 0x80; the loop over it is
 * kept so the range can be widened (up to 0xff) when a slower, more
 * thorough run is wanted.
 *
 * @dataProvider provideAllBytes
 */
function testTripleBytes( $head, $tail ) {
	$rep = UTF8_REPLACEMENT;

	for ( $lead = 0xc0; $lead < 0x100; $lead += 2 ) {
		for ( $mid = 0x80; $mid < 0x100; $mid += 2 ) {
			for ( $last = 0x80; $last < 0x81; $last++ ) {
				$triplet = chr( $lead ) . chr( $mid ) . chr( $last );
				$input = $head . $triplet . $tail;
				$actual = UtfNormal::cleanUp( $input );
				$label = sprintf( "%02X,%02X,%02X", $lead, $mid, $last );

				// Bytes below 0xc0 are continuation ("tail") bytes here,
				// since both loops start at 0x80.
				$midIsTail = $mid < 0xc0;
				$lastIsTail = $last < 0xc0;

				if ( $lead >= 0xe0 && $lead < 0xf0 && $midIsTail && $lastIsTail ) {
					// Shaped like a three-byte character.
					if ( $lead == 0xe0 && $mid < 0xa0 ) {
						$expected = $head . $rep . $tail;
						$message = "Overlong triplet $label should be rejected";
					} elseif ( $lead == 0xed && $triplet >= UTF8_SURROGATE_FIRST ) {
						// Note: $lead only takes even values (0xc0, 0xc2, ...),
						// so 0xed is never generated and this branch is not
						// exercised by the loops above.
						$expected = $head . $rep . $tail;
						$message = "Surrogate triplet $label should be rejected";
					} else {
						$expected = UtfNormal::NFC( $input );
						$message = "Triplet $label should be intact";
					}
				} elseif ( $lead > 0xc1 && $lead < 0xe0 && $midIsTail ) {
					// Bytes one and two form a two-byte character; the third
					// byte is left over.
					$expected = UtfNormal::NFC( $head . chr( $lead ) . chr( $mid ) )
						. $rep . $tail;
					$message = "Valid 2-byte $label + broken tail";
				} elseif ( $mid > 0xc1 && $mid < 0xe0 && $lastIsTail ) {
					// The first byte is stray; bytes two and three form a
					// two-byte character.
					$expected = $head . $rep
						. UtfNormal::NFC( chr( $mid ) . chr( $last ) . $tail );
					$message = "Broken head + valid 2-byte $label";
				} elseif ( ( $lead > 0xfd || $mid > 0xfd ) &&
					( ( !$midIsTail && !$lastIsTail ) ||
						( $midIsTail && $lastIsTail ) ||
						$mid > 0xfd ||
						$last > 0xfd )
				) {
					# fe and ff are not legal head bytes -- expect three replacement chars
					$expected = $head . $rep . $rep . $rep . $tail;
					$message = "Forbidden triplet $label should be rejected";
				} elseif ( $lead > 0xc2 && $midIsTail && $lastIsTail ) {
					// Lead byte followed by two tail bytes that do not make
					// a valid character: one replacement char for the lot.
					$expected = $head . $rep . $tail;
					$message = "Forbidden triplet $label should be rejected";
				} else {
					// Everything else is expected to yield two replacement chars.
					$expected = $head . $rep . $rep . $tail;
					$message = "Forbidden triplet $label should be rejected";
				}

				$this->assertEquals(
					bin2hex( $expected ),
					bin2hex( $actual ),
					$message );
			}
		}
	}
}
269
/**
 * Regression check for a chunking bug: a mix of ASCII, one well-formed
 * two-byte sequence and several stray bytes.
 */
public function testChunkRegression() {
	$replacement = "\xef\xbf\xbd";

	// Each row: input bytes, then what cleanUp() is expected to emit for them.
	$pieces = array(
		array( "\x46\x55", "\x46\x55" ),
		array( "\xb8", $replacement ), # stray continuation byte
		array( "\xdc\x96", "\xdc\x96" ), # well-formed two-byte sequence
		array( "\xee", $replacement ), # lead byte without its tail
		array( "\xe7", $replacement ), # lead byte without its tail
		array( "\x44", "\x44" ),
		array( "\xaa", $replacement ), # stray continuation byte
		array( "\x2f\x25", "\x2f\x25" ),
	);

	$text = '';
	$expect = '';
	foreach ( $pieces as $piece ) {
		$text .= $piece[0];
		$expect .= $piece[1];
	}

	$this->assertEquals(
		bin2hex( $expect ),
		bin2hex( UtfNormal::cleanUp( $text ) ) );
}
291
/**
 * Regression check: stray bytes interleaved with valid text must each be
 * replaced individually, leaving the valid pieces where they were.
 */
public function testInterposeRegression() {
	// Each row: a chunk of input and whether it is expected to survive.
	// Chunks flagged false are expected to become one replacement char.
	$chunks = array(
		array( "\x4e\x30", true ),
		array( "\xb1", false ), # bad tail
		array( "\x3a", true ),
		array( "\x92", false ), # bad tail
		array( "\x62\x3a", true ),
		array( "\x84", false ), # bad tail
		array( "\x43", true ),
		array( "\xc6", false ), # bad head
		array( "\x3f", true ),
		array( "\x92", false ), # bad tail
		array( "\xad", false ), # bad tail
		array( "\x7d", true ),
		array( "\xd9\x95", true ),
	);

	$text = '';
	$expect = '';
	foreach ( $chunks as $chunk ) {
		list( $bytes, $survives ) = $chunk;
		$text .= $bytes;
		$expect .= $survives ? $bytes : "\xef\xbf\xbd";
	}

	$this->assertEquals(
		bin2hex( $expect ),
		bin2hex( UtfNormal::cleanUp( $text ) ) );
}
325
/**
 * Regression check around an overlong two-byte sequence surrounded by
 * forbidden control characters and stray bytes.
 */
public function testOverlongRegression() {
	$replacement = "\xef\xbf\xbd";

	$text = implode( '', array(
		"\x67",
		"\x1a", # forbidden ascii
		"\xea", # bad head
		"\xc1\xa6", # overlong sequence
		"\xad", # bad tail
		"\x1c", # forbidden ascii
		"\xb0", # bad tail
		"\x3c",
		"\x9e", # bad tail
	) );

	// The six bad items between "\x67" and "\x3c" each yield one
	// replacement char; the overlong pair counts as a single item.
	$expect = "\x67" . str_repeat( $replacement, 6 ) . "\x3c" . $replacement;

	$this->assertEquals(
		bin2hex( $expect ),
		bin2hex( UtfNormal::cleanUp( $text ) ) );
}
349
/**
 * Regression check: an encoded surrogate followed by three stray
 * continuation bytes is expected to become four replacement chars.
 */
public function testSurrogateRegression() {
	$text = implode( '', array(
		"\xed\xb4\x96", # surrogate 0xDD16
		"\x83", # bad tail
		"\xb4", # bad tail
		"\xac", # stray continuation byte
	) );

	// One replacement char per item above.
	$expect = str_repeat( "\xef\xbf\xbd", 4 );

	$this->assertEquals(
		bin2hex( $expect ),
		bin2hex( UtfNormal::cleanUp( $text ) ) );
}
363
/**
 * Regression check: U+FFFE followed by a stray tail byte and a lone
 * lead byte; only the trailing ASCII character is expected to survive.
 */
public function testBomRegression() {
	$text = implode( '', array(
		"\xef\xbf\xbe", # U+FFFE, illegal char
		"\xb2", # bad tail
		"\xef", # bad head
		"\x59",
	) );

	// One replacement char for each of the three bad items.
	$expect = str_repeat( "\xef\xbf\xbd", 3 ) . "\x59";

	$this->assertEquals(
		bin2hex( $expect ),
		bin2hex( UtfNormal::cleanUp( $text ) ) );
}
377
/**
 * Regression check: the illegal character U+FFFF (EF BF BF) is expected
 * to be replaced by U+FFFD (EF BF BD).
 */
public function testForbiddenRegression() {
	$cleaned = UtfNormal::cleanUp( "\xef\xbf\xbf" ); # U+FFFF, illegal char
	$wanted = "\xef\xbf\xbd";

	$this->assertEquals( bin2hex( $wanted ), bin2hex( $cleaned ) );
}
385
/**
 * Regression check: a Hangul character followed by another final jamo
 * must *not* be altered by cleanUp().
 */
public function testHangulRegression() {
	$hangul = "\xed\x9c\xaf"; # Hangul char
	$jamo = "\xe1\x87\x81"; # followed by another final jamo
	$text = $hangul . $jamo;

	// The expected output is the input itself.
	$this->assertEquals(
		bin2hex( $text ),
		bin2hex( UtfNormal::cleanUp( $text ) ) );
}
394 }