From 7447669e83910865a061bbae34dad861bf3396d9 Mon Sep 17 00:00:00 2001 From: Kevin Israel Date: Sun, 15 Sep 2013 23:29:04 -0400 Subject: [PATCH] Adapt StringUtils::isUtf8 to the top of Unicode at U+10FFFF RFC 3629 defines the legal range of characters as U+0000..U+10FFFF and forbids overlong forms (encodings of a character that use more bytes than necessary). Let's make StringUtils::isUtf8() match the specification. * Changed the maximum value in the pure PHP code path and added a check for overlong forms. * Added another check, specific to PHP 5.3's mbstring extension, for values above U+10FFFF. * Fixed the mbstring test errors in PHP 5.4 using changes to StringUtilsTest by Platonides . * Uncommented some other tests that could fail because of the missing check for overlong forms. * Added additional tests for extra continuation bytes, overlong sequences/forms, and values in the UTF-16 surrogate range. The changes to the function were so extensive that I might as well say I rewrote it. Bug: 43679 Change-Id: I56ae496d17ffc3747550e06a72dacab3ac55da61 --- includes/StringUtils.php | 70 +++++++++++++++++----- tests/phpunit/includes/StringUtilsTest.php | 58 +++++++++++------- 2 files changed, 91 insertions(+), 37 deletions(-) diff --git a/includes/StringUtils.php b/includes/StringUtils.php index 48cde0eb19..fc3cfd55d7 100644 --- a/includes/StringUtils.php +++ b/includes/StringUtils.php @@ -38,6 +38,9 @@ class StringUtils { * unit testing our internal implementation. * * @since 1.21 + * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation. + * In particular, the pure PHP code path did not in fact check for overlong forms. + * Beware of this when backporting code to that version of MediaWiki. * * @param string $value String to check * @param boolean $disableMbstring Whether to use the pure PHP @@ -47,26 +50,63 @@ class StringUtils { * @return boolean Whether the given $value is a valid UTF-8 encoded string */ static function isUtf8( $value, $disableMbstring = false ) { - - if ( preg_match( '/[\x80-\xff]/', $value ) === 0 ) { - # no high bit set, this is pure ASCII which is de facto - # valid UTF-8 + $value = (string)$value; + if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) { + // String contains only ASCII characters, has to be valid return true; } + // If the mbstring extension is loaded, use it. However, before PHP 5.4, values above + // U+10FFFF are incorrectly allowed, so we have to check for them separately. if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) { - return mb_check_encoding( $value, 'UTF-8' ); - } else { - $hasUtf8 = preg_match( '/^(?> - [\x00-\x7f] - | [\xc0-\xdf][\x80-\xbf] - | [\xe0-\xef][\x80-\xbf]{2} - | [\xf0-\xf7][\x80-\xbf]{3} - | [\xf8-\xfb][\x80-\xbf]{4} - | \xfc[\x84-\xbf][\x80-\xbf]{4} - )+$/x', $value ); - return ( $hasUtf8 > 0 ); + static $newPHP; + if ( $newPHP === null ) { + $newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' ); + } + + return mb_check_encoding( $value, 'UTF-8' ) && + ( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 ); + } + + // PCRE implements repetition using recursion; to avoid a stack overflow (and segfault) + // for large input, we check for invalid sequences (<= 5 bytes) rather than valid + // sequences, which can be as long as the input string is. Multiple short regexes are + // used rather than a single long regex for performance. + static $regexes; + if ( $regexes === null ) { + $cont = "[\x80-\xbf]"; + $after = "(?!$cont)"; // "(?:[^\x80-\xbf]|$)" would work here + $regexes = array( + // Continuation byte at the start + "/^$cont/", + + // ASCII byte followed by a continuation byte + "/[\\x00-\x7f]$cont/S", + + // Illegal byte + "/[\xc0\xc1\xf5-\xff]/S", + + // Invalid 2-byte sequence, or valid one then an extra continuation byte + "/[\xc2-\xdf](?!$cont$after)/S", + + // Invalid 3-byte sequence, or valid one then an extra continuation byte + "/\xe0(?![\xa0-\xbf]$cont$after)/", + "/[\xe1-\xec\xee\xef](?!$cont{2}$after)/S", + "/\xed(?![\x80-\x9f]$cont$after)/", + + // Invalid 4-byte sequence, or valid one then an extra continuation byte + "/\xf0(?![\x90-\xbf]$cont{2}$after)/", + "/[\xf1-\xf3](?!$cont{3}$after)/S", + "/\xf4(?![\x80-\x8f]$cont{2}$after)/", + ); + } + + foreach ( $regexes as $regex ) { + if ( preg_match( $regex, $value ) !== 0 ) { + return false; + } } + return true; } /** diff --git a/tests/phpunit/includes/StringUtilsTest.php b/tests/phpunit/includes/StringUtilsTest.php index 842e2fc431..94ba3a714e 100644 --- a/tests/phpunit/includes/StringUtilsTest.php +++ b/tests/phpunit/includes/StringUtilsTest.php @@ -67,32 +67,35 @@ class StringUtilsTest extends MediaWikiTestCase { array( $PASS, 'Some ASCII' ), array( $PASS, "Euro sign €" ), - # First possible sequences + // First possible sequences array( $PASS, "\x00" ), array( $PASS, "\xc2\x80" ), array( $PASS, "\xe0\xa0\x80" ), array( $PASS, "\xf0\x90\x80\x80" ), - array( $PASS, "\xf8\x88\x80\x80\x80" ), - array( $PASS, "\xfc\x84\x80\x80\x80\x80" ), + array( $FAIL, "\xf8\x88\x80\x80\x80" ), + array( $FAIL, "\xfc\x84\x80\x80\x80\x80" ), - # Last possible sequence + // Last possible sequence array( $PASS, "\x7f" ), array( $PASS, "\xdf\xbf" ), array( $PASS, "\xef\xbf\xbf" ), - array( $PASS, "\xf7\xbf\xbf\xbf" ), - array( $PASS, "\xfb\xbf\xbf\xbf\xbf" ), + array( $FAIL, "\xf7\xbf\xbf\xbf" ), // U+1FFFFF + array( $FAIL, "\xfb\xbf\xbf\xbf\xbf" ), array( $FAIL, "\xfd\xbf\xbf\xbf\xbf\xbf" ), - # boundaries: + // Boundaries array( $PASS, "\xed\x9f\xbf" ), array( $PASS, "\xee\x80\x80" ), array( $PASS, "\xef\xbf\xbd" ), - array( $PASS, "\xf4\x8f\xbf\xbf" ), - array( $PASS, "\xf4\x90\x80\x80" ), + array( $PASS, "\xf2\x80\x80\x80" ), + array( $PASS, "\xf3\xbf\xbf\xbf" ), // U+FFFFF + array( $PASS, "\xf4\x80\x80\x80" ), // U+100000 + array( $PASS, "\xf4\x8f\xbf\xbf" ), // U+10FFFF + array( $FAIL, "\xf4\x90\x80\x80" ), // U+110000 - # Malformed + // Malformed array( $FAIL, "\x80" ), - array( $FAIL, "\xBF" ), + array( $FAIL, "\xbf" ), array( $FAIL, "\x80\xbf" ), array( $FAIL, "\x80\xbf\x80" ), array( $FAIL, "\x80\xbf\x80\xbf" ), @@ -100,7 +103,7 @@ class StringUtilsTest extends MediaWikiTestCase { array( $FAIL, "\x80\xbf\x80\xbf\x80\xbf" ), array( $FAIL, "\x80\xbf\x80\xbf\x80\xbf\x80" ), - # last byte missing + // Last byte missing array( $FAIL, "\xc0" ), array( $FAIL, "\xe0\x80" ), array( $FAIL, "\xf0\x80\x80" ), @@ -112,31 +115,42 @@ class StringUtilsTest extends MediaWikiTestCase { array( $FAIL, "\xfb\xbf\xbf\xbf" ), array( $FAIL, "\xfd\xbf\xbf\xbf\xbf" ), - # impossible bytes + // Extra continuation byte + array( $FAIL, "e\xaf" ), + array( $FAIL, "\xc3\x89\xaf" ), + array( $FAIL, "\xef\xbc\xa5\xaf" ), + array( $FAIL, "\xf0\x9d\x99\xb4\xaf" ), + + // Impossible bytes array( $FAIL, "\xfe" ), array( $FAIL, "\xff" ), array( $FAIL, "\xfe\xfe\xff\xff" ), - /* - # The PHP implementation does not handle characters - # being represented in a form which is too long :( - - # overlong sequences + // Overlong sequences array( $FAIL, "\xc0\xaf" ), + array( $FAIL, "\xc1\xaf" ), array( $FAIL, "\xe0\x80\xaf" ), array( $FAIL, "\xf0\x80\x80\xaf" ), array( $FAIL, "\xf8\x80\x80\x80\xaf" ), array( $FAIL, "\xfc\x80\x80\x80\x80\xaf" ), - # Maximum overlong sequences + // Maximum overlong sequences array( $FAIL, "\xc1\xbf" ), array( $FAIL, "\xe0\x9f\xbf" ), - array( $FAIL, "\xf0\x8F\xbf\xbf" ), + array( $FAIL, "\xf0\x8f\xbf\xbf" ), array( $FAIL, "\xf8\x87\xbf\xbf" ), array( $FAIL, "\xfc\x83\xbf\xbf\xbf\xbf" ), - */ - # non characters + // Surrogates + array( $PASS, "\xed\x9f\xbf" ), // U+D799 + array( $PASS, "\xee\x80\x80" ), // U+E000 + array( $FAIL, "\xed\xa0\x80" ), // U+D800 + array( $FAIL, "\xed\xaf\xbf" ), // U+DBFF + array( $FAIL, "\xed\xb0\x80" ), // U+DC00 + array( $FAIL, "\xed\xbf\xbf" ), // U+DFFF + array( $FAIL, "\xed\xa0\x80\xed\xb0\x80" ), // U+D800 U+DC00 + + // Noncharacters array( $PASS, "\xef\xbf\xbe" ), array( $PASS, "\xef\xbf\xbf" ), ); -- 2.20.1