Fix UTF-8 validation regression where a bad head byte is followed by ASCII, then...
[lhc/web/wiklou.git] / includes / normal / CleanUpTest.php
index d165731..eed9c3f 100644 (file)
@@ -1,4 +1,8 @@
 <?php
+/** Load the php_utfnormal.so extension when '--icu' is passed on the command line. */
+if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
+       dl( 'php_utfnormal.so' );
+}
 
 #ini_set( 'memory_limit', '40M' );
 
@@ -53,8 +57,7 @@ class CleanUpTest extends PHPUnit_TestCase {
                            $i == 0x000a ||
                            $i == 0x000d ||
                            ($i > 0x001f && $i < UNICODE_SURROGATE_FIRST) ||
-                           ($i > UNICODE_SURROGATE_LAST && $i < 0xfdd0 ) ||
-                           ($i > 0xfdef && $i < 0xfffe ) ||
+                           ($i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
                            ($i > 0xffff && $i <= UNICODE_MAX ) ) {
                                if( isset( $utfCanonicalComp[$char] ) || isset( $utfCanonicalDecomp[$char] ) ) {
                                    $comp = UtfNormal::NFC( $char );
@@ -94,11 +97,14 @@ class CleanUpTest extends PHPUnit_TestCase {
                                        bin2hex( $char ), 
                                        bin2hex( $clean ),
                                        "ASCII byte $x should be intact" );
+                               if( $char != $clean ) return;
                        } else {
+                               $norm = $head . UTF8_REPLACEMENT . $tail;
                                $this->assertEquals(
-                                       bin2hex( $head . UTF8_REPLACEMENT . $tail ),
+                                       bin2hex( $norm ),
                                        bin2hex( $clean ),
                                        "Forbidden byte $x should be rejected" );
+                               if( $norm != $clean ) return;
                        }
                }
        }
@@ -119,21 +125,27 @@ class CleanUpTest extends PHPUnit_TestCase {
                                if( $first > 0xc1 &&
                                    $first < 0xe0 &&
                                    $second < 0xc0 ) {
+                                   $norm = UtfNormal::NFC( $char );
                                        $this->assertEquals(
-                                               bin2hex( UtfNormal::NFC( $char ) ), 
+                                               bin2hex( $norm ), 
                                                bin2hex( $clean ),
                                                "Pair $x should be intact" );
+                                   if( $norm != $clean ) return;
                                } elseif( $first > 0xfd || $second > 0xbf ) {
                                        # fe and ff are not legal head bytes -- expect two replacement chars
+                                       $norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
                                        $this->assertEquals(
-                                               bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
+                                               bin2hex( $norm ),
                                                bin2hex( $clean ),
                                                "Forbidden pair $x should be rejected" );
+                                       if( $norm != $clean ) return;
                                } else {
+                                       $norm = $head . UTF8_REPLACEMENT . $tail;
                                        $this->assertEquals(
-                                               bin2hex( $head . UTF8_REPLACEMENT . $tail ),
+                                               bin2hex( $norm ),
                                                bin2hex( $clean ),
                                                "Forbidden pair $x should be rejected" );
+                                       if( $norm != $clean ) return;
                                }
                        }
                }
@@ -141,9 +153,9 @@ class CleanUpTest extends PHPUnit_TestCase {
 
        function testTripleBytes() {
                $this->doTestTripleBytes( '', '' );
-               #$this->doTestTripleBytes( 'x', '' );
-               #$this->doTestTripleBytes( '', 'x' );
-               #$this->doTestTripleBytes( 'x', 'x' );
+               $this->doTestTripleBytes( 'x', '' );    # head 'x', empty tail
+               $this->doTestTripleBytes( '', 'x' );    # empty head, tail 'x'
+               $this->doTestTripleBytes( 'x', 'x' );   # 'x' as both head and tail
        }
        
        function doTestTripleBytes( $head, $tail ) {
@@ -160,13 +172,13 @@ class CleanUpTest extends PHPUnit_TestCase {
                                                $third < 0xc0 ) {
                                                if( $first == 0xe0 && $second < 0xa0 ) {
                                                        $this->assertEquals(
-                                                               bin2hex( UTF8_REPLACEMENT ), 
+                                                               bin2hex( $head . UTF8_REPLACEMENT . $tail ), 
                                                                bin2hex( $clean ),
                                                                "Overlong triplet $x should be rejected" );
                                                } elseif( $first == 0xed && 
                                                        ( chr( $first ) . chr( $second ) . chr( $third ))  >= UTF8_SURROGATE_FIRST ) {
                                                        $this->assertEquals(
-                                                               bin2hex( UTF8_REPLACEMENT ), 
+                                                               bin2hex( $head . UTF8_REPLACEMENT . $tail ), 
                                                                bin2hex( $clean ),
                                                                "Surrogate triplet $x should be rejected" );
                                                } else {
@@ -177,15 +189,19 @@ class CleanUpTest extends PHPUnit_TestCase {
                                                }
                                        } elseif( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
                                                $this->assertEquals(
-                                                       bin2hex( $head . UtfNormal::NFC( chr( $first ) . chr( $second ) ) . UTF8_REPLACEMENT . $tail ),
+                                                       bin2hex( UtfNormal::NFC( $head . chr( $first ) . chr( $second ) ) . UTF8_REPLACEMENT . $tail ),
                                                        bin2hex( $clean ),
                                                        "Valid 2-byte $x + broken tail" );
                                        } elseif( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
                                                $this->assertEquals(
-                                                       bin2hex( $head . UTF8_REPLACEMENT . UtfNormal::NFC( chr( $second ) . chr( $third ) ) . $tail ),
+                                                       bin2hex( $head . UTF8_REPLACEMENT . UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail ) ),
                                                        bin2hex( $clean ),
                                                        "Broken head + valid 2-byte $x" );
-                                       } elseif( $first > 0xfd && ( ( $second > 0xbf && $third > 0xbf ) || ($second < 0xc0 && $third < 0xc0 ) || ($second > 0xfd ) || ($third > 0xfd) ) ) {
+                                       } elseif( ( $first > 0xfd || $second > 0xfd ) &&
+                                                   ( ( $second > 0xbf && $third > 0xbf ) ||
+                                                     ( $second < 0xc0 && $third < 0xc0 ) ||
+                                                     ( $second > 0xfd ) ||
+                                                     ( $third > 0xfd ) ) ) {
                                                # fe and ff are not legal head bytes -- expect three replacement chars
                                                $this->assertEquals(
                                                        bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
@@ -206,7 +222,62 @@ class CleanUpTest extends PHPUnit_TestCase {
                        }
                }
        }
+       
+       function testChunkRegression() {
+               # Check for regression against a chunking bug: invalid bytes mixed with valid text
+               $text   = "\x46\x55\xb8" .      # ASCII "FU", then a stray tail byte
+                         "\xdc\x96" .          # well-formed two-byte sequence
+                         "\xee" .              # head byte with no tail after it
+                         "\xe7" .              # head byte followed by ASCII
+                         "\x44" .
+                         "\xaa" .              # stray tail byte
+                         "\x2f\x25";
+               $expect = "\x46\x55\xef\xbf\xbd" .      # ef bf bd (U+FFFD) stands in for \xb8
+                         "\xdc\x96" .          # two-byte sequence kept as-is
+                         "\xef\xbf\xbd" .      # replaces \xee
+                         "\xef\xbf\xbd" .      # replaces \xe7
+                         "\x44" .
+                         "\xef\xbf\xbd" .      # replaces \xaa
+                         "\x2f\x25";

+               $this->assertEquals(
+                       bin2hex( $expect ),
+                       bin2hex( UtfNormal::cleanUp( $text ) ) );
+       }
 
+       function testInterposeRegression() {    # bad head/tail bytes interleaved with ASCII
+               $text   = "\x4e\x30" .
+                         "\xb1" .              # bad tail (no head byte before it)
+                         "\x3a" .
+                         "\x92" .              # bad tail (no head byte before it)
+                         "\x62\x3a" .
+                         "\x84" .              # bad tail (no head byte before it)
+                         "\x43" .
+                         "\xc6" .              # bad head (followed by ASCII, not a tail byte)
+                         "\x3f" .
+                         "\x92" .              # bad tail (no head byte before it)
+                         "\xad" .              # bad tail (no head byte before it)
+                         "\x7d" .
+                         "\xd9\x95";           # well-formed two-byte sequence
+       # Expected output: every byte marked bad above becomes ef bf bd (U+FFFD); the rest is unchanged
+               $expect = "\x4e\x30" .
+                         "\xef\xbf\xbd" .
+                         "\x3a" .
+                         "\xef\xbf\xbd" .
+                         "\x62\x3a" .
+                         "\xef\xbf\xbd" .
+                         "\x43" .
+                         "\xef\xbf\xbd" .
+                         "\x3f" .
+                         "\xef\xbf\xbd" .
+                         "\xef\xbf\xbd" .
+                         "\x7d" .
+                         "\xd9\x95";
+               # Compare hex dumps of the expected and cleaned-up strings
+               $this->assertEquals(
+                       bin2hex( $expect ),
+                       bin2hex( UtfNormal::cleanUp( $text ) ) );
+       }
 }