From dd69eb14f5e5edb10bdca5fb5310b3162059b4d8 Mon Sep 17 00:00:00 2001
From: Brion Vibber
Date: Sun, 14 Nov 2004 03:48:49 +0000
Subject: [PATCH] Fix UTF-8 validation regression where a bad head byte is followed by ascii, then bad tail byte.

---
 includes/normal/CleanUpTest.php | 39 ++++++++++++++++++++++++++++++++-
 includes/normal/UtfNormal.php | 2 ++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/includes/normal/CleanUpTest.php b/includes/normal/CleanUpTest.php
index 2dd8d5616f..eed9c3fb86 100644
--- a/includes/normal/CleanUpTest.php
+++ b/includes/normal/CleanUpTest.php
@@ -197,7 +197,11 @@ class CleanUpTest extends PHPUnit_TestCase {
 bin2hex( $head . UTF8_REPLACEMENT . UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail ) ),
 bin2hex( $clean ),
 "Broken head + valid 2-byte $x" );
- } elseif( $first > 0xfd && ( ( $second > 0xbf && $third > 0xbf ) || ($second < 0xc0 && $third < 0xc0 ) || ($second > 0xfd ) || ($third > 0xfd) ) ) {
+ } elseif( ( $first > 0xfd || $second > 0xfd ) &&
+ ( ( $second > 0xbf && $third > 0xbf ) ||
+ ( $second < 0xc0 && $third < 0xc0 ) ||
+ ( $second > 0xfd ) ||
+ ( $third > 0xfd ) ) ) {
 # fe and ff are not legal head bytes -- expect three replacement chars
 $this->assertEquals(
 bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
@@ -241,6 +245,39 @@ class CleanUpTest extends PHPUnit_TestCase {
 bin2hex( UtfNormal::cleanUp( $text ) ) );
 }
 
+ function testInterposeRegression() {
+ $text = "\x4e\x30" .
+ "\xb1" . # bad tail
+ "\x3a" .
+ "\x92" . # bad tail
+ "\x62\x3a" .
+ "\x84" . # bad tail
+ "\x43" .
+ "\xc6" . # bad head
+ "\x3f" .
+ "\x92" . # bad tail
+ "\xad" . # bad tail
+ "\x7d" .
+ "\xd9\x95";
+
+ $expect = "\x4e\x30" .
+ "\xef\xbf\xbd" .
+ "\x3a" .
+ "\xef\xbf\xbd" .
+ "\x62\x3a" .
+ "\xef\xbf\xbd" .
+ "\x43" .
+ "\xef\xbf\xbd" .
+ "\x3f" .
+ "\xef\xbf\xbd" .
+ "\xef\xbf\xbd" .
+ "\x7d" .
+ "\xd9\x95";
+
+ $this->assertEquals(
+ bin2hex( $expect ),
+ bin2hex( UtfNormal::cleanUp( $text ) ) );
+ }
 }
 
 
diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php
index 7dd9072185..3c01b07341 100644
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -424,6 +424,7 @@ class UtfNormal {
 $head = '';
 } elseif( $c < "\x80" ) {
 # ASCII byte.
+ $head = '';
 } elseif( $c < "\xc0" ) {
 # Illegal tail bytes
 if( $head == '' ) {
@@ -438,6 +439,7 @@ class UtfNormal {
 } else {
 # Miscellaneous freaks.
 $replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
+ $head = '';
 }
 }
 $base += $chunk;
-- 
2.20.1