Fix regression: surrogate half followed by extra tail bytes
author Brion Vibber <brion@users.mediawiki.org>
Sun, 14 Nov 2004 04:27:03 +0000 (04:27 +0000)
committer Brion Vibber <brion@users.mediawiki.org>
Sun, 14 Nov 2004 04:27:03 +0000 (04:27 +0000)
includes/normal/CleanUpTest.php
includes/normal/UtfNormal.php

index 7aa83c4..0eb4058 100644 (file)
@@ -303,6 +303,19 @@ class CleanUpTest extends PHPUnit_TestCase {
                        bin2hex( UtfNormal::cleanUp( $text ) ) );
        }
        
+       function testSurrogateRegression() {
+               $text   = "\xed\xb4\x96" . # surrogate 0xDD16
+                         "\x83" . # bad tail
+                         "\xb4" . # bad tail
+                         "\xac";  # bad head
+               $expect = "\xef\xbf\xbd" .
+                         "\xef\xbf\xbd" .
+                         "\xef\xbf\xbd" .
+                         "\xef\xbf\xbd";
+               $this->assertEquals(
+                       bin2hex( $expect ),
+                       bin2hex( UtfNormal::cleanUp( $text ) ) );
+       }
 }
 
 
index 1cbbd01..254e9c0 100644 (file)
@@ -380,6 +380,7 @@ class UtfNormal {
                                                                $replace[] = array( UTF8_REPLACEMENT,
                                                                             $base + $i + 1 - strlen( $sequence ),
                                                                             strlen( $sequence ) );
+                                                               $head = '';
                                                                continue;
                                                        }
                                                } else {