From 4a4f248655de7ed513ae45058cbc9f3c3103426b Mon Sep 17 00:00:00 2001
From: Brion Vibber
Date: Sun, 14 Nov 2004 04:27:03 +0000
Subject: [PATCH] Fix regression: surrogate half followed by extra tail bytes

---
Notes (ignored by git-am):
  * CleanUpTest.php gains testSurrogateRegression(), which passes the
    3-byte surrogate encoding ED B4 96 (0xDD16) followed by the stray
    bytes 83, B4 and AC to UtfNormal::cleanUp() and expects four
    U+FFFD replacement characters (EF BF BD) back.
  * UtfNormal.php clears $head right after the replacement entry for
    $sequence is recorded, before the `continue`.
  * NOTE(review): the line structure of this patch was restored from a
    copy whose newlines had been collapsed. Leading indentation inside
    the hunks is reconstructed -- confirm with `git apply --check`
    (or apply with --ignore-whitespace).

 includes/normal/CleanUpTest.php | 13 +++++++++++++
 includes/normal/UtfNormal.php   |  1 +
 2 files changed, 14 insertions(+)

diff --git a/includes/normal/CleanUpTest.php b/includes/normal/CleanUpTest.php
index 7aa83c433b..0eb4058819 100644
--- a/includes/normal/CleanUpTest.php
+++ b/includes/normal/CleanUpTest.php
@@ -303,6 +303,19 @@ class CleanUpTest extends PHPUnit_TestCase {
 			bin2hex( UtfNormal::cleanUp( $text ) ) );
 	}
 
+	function testSurrogateRegression() {
+		$text   = "\xed\xb4\x96" . # surrogate 0xDD16
+		          "\x83" . # bad tail
+		          "\xb4" . # bad tail
+		          "\xac"; # bad head
+		$expect = "\xef\xbf\xbd" .
+		          "\xef\xbf\xbd" .
+		          "\xef\xbf\xbd" .
+		          "\xef\xbf\xbd";
+		$this->assertEquals(
+			bin2hex( $expect ),
+			bin2hex( UtfNormal::cleanUp( $text ) ) );
+	}
 }
 
 
diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php
index 1cbbd01b43..254e9c0472 100644
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -380,6 +380,7 @@ class UtfNormal {
 							$replace[] = array( UTF8_REPLACEMENT,
 							                    $base + $i + 1 - strlen( $sequence ),
 							                    strlen( $sequence ) );
+							$head = '';
 							continue;
 						}
 					} else {
-- 
2.20.1