Fix UTF-8 validation regression: well-formed but forbidden UTF-8 sequence followed...
author: Brion Vibber <brion@users.mediawiki.org>
Sun, 14 Nov 2004 04:07:28 +0000 (04:07 +0000)
committer: Brion Vibber <brion@users.mediawiki.org>
Sun, 14 Nov 2004 04:07:28 +0000 (04:07 +0000)
includes/normal/CleanUpTest.php
includes/normal/UtfNormal.php

index eed9c3f..7aa83c4 100644 (file)
@@ -207,7 +207,7 @@ class CleanUpTest extends PHPUnit_TestCase {
                                                        bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
                                                        bin2hex( $clean ),
                                                        "Forbidden triplet $x should be rejected" );
-                                       } elseif( $second < 0xc0 && $second < 0xc0 ) {
+                                       } elseif( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
                                                $this->assertEquals(
                                                        bin2hex( $head . UTF8_REPLACEMENT . $tail ),
                                                        bin2hex( $clean ),
@@ -278,6 +278,31 @@ class CleanUpTest extends PHPUnit_TestCase {
                        bin2hex( $expect ),
                        bin2hex( UtfNormal::cleanUp( $text ) ) );
        }
+       
+       function testOverlongRegression() {
+               $text   = "\x67" .
+                         "\x1a" . # forbidden ascii
+                         "\xea" . # bad head
+                         "\xc1\xa6" . # overlong sequence
+                         "\xad" . # bad tail
+                         "\x1c" . # forbidden ascii
+                         "\xb0" . # bad tail
+                         "\x3c" .
+                         "\x9e";  # bad tail
+               $expect = "\x67" .
+                         "\xef\xbf\xbd" .
+                         "\xef\xbf\xbd" .
+                         "\xef\xbf\xbd" .
+                         "\xef\xbf\xbd" .
+                         "\xef\xbf\xbd" .
+                         "\xef\xbf\xbd" .
+                         "\x3c" .
+                         "\xef\xbf\xbd";
+               $this->assertEquals(
+                       bin2hex( $expect ),
+                       bin2hex( UtfNormal::cleanUp( $text ) ) );
+       }
+       
 }
 
 
index 3c01b07..1cbbd01 100644 (file)
@@ -408,6 +408,7 @@ class UtfNormal {
                                                                $replace[] = array( UTF8_REPLACEMENT,
                                                                                    $base + $i + 1 - strlen( $sequence ), 
                                                                                    strlen( $sequence ) );
+                                                               $head = '';
                                                                continue;
                                                        }
                                                }