From 9535fc035b57fa548a94c08a8c67860fd87e8f53 Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Sun, 14 Nov 2004 04:07:28 +0000 Subject: [PATCH] Fix UTF-8 validation regression: well-formed but forbidden UTF-8 sequence followed by bogus tail bytes --- includes/normal/CleanUpTest.php | 27 ++++++++++++++++++++++++++- includes/normal/UtfNormal.php | 1 + 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/includes/normal/CleanUpTest.php b/includes/normal/CleanUpTest.php index eed9c3fb86..7aa83c433b 100644 --- a/includes/normal/CleanUpTest.php +++ b/includes/normal/CleanUpTest.php @@ -207,7 +207,7 @@ class CleanUpTest extends PHPUnit_TestCase { bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ), bin2hex( $clean ), "Forbidden triplet $x should be rejected" ); - } elseif( $second < 0xc0 && $second < 0xc0 ) { + } elseif( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) { $this->assertEquals( bin2hex( $head . UTF8_REPLACEMENT . $tail ), bin2hex( $clean ), @@ -278,6 +278,31 @@ class CleanUpTest extends PHPUnit_TestCase { bin2hex( $expect ), bin2hex( UtfNormal::cleanUp( $text ) ) ); } + + function testOverlongRegression() { + $text = "\x67" . + "\x1a" . # forbidden ascii + "\xea" . # bad head + "\xc1\xa6" . # overlong sequence + "\xad" . # bad tail + "\x1c" . # forbidden ascii + "\xb0" . # bad tail + "\x3c" . + "\x9e"; # bad tail + $expect = "\x67" . + "\xef\xbf\xbd" . + "\xef\xbf\xbd" . + "\xef\xbf\xbd" . + "\xef\xbf\xbd" . + "\xef\xbf\xbd" . + "\xef\xbf\xbd" . + "\x3c" . + "\xef\xbf\xbd"; + $this->assertEquals( + bin2hex( $expect ), + bin2hex( UtfNormal::cleanUp( $text ) ) ); + } + } diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php index 3c01b07341..1cbbd01b43 100644 --- a/includes/normal/UtfNormal.php +++ b/includes/normal/UtfNormal.php @@ -408,6 +408,7 @@ class UtfNormal { $replace[] = array( UTF8_REPLACEMENT, $base + $i + 1 - strlen( $sequence ), strlen( $sequence ) ); + $head = ''; continue; } } -- 2.20.1