From 750db30d9bbd28cab8b9417efe56e9cc30340b5e Mon Sep 17 00:00:00 2001 From: Antoine Musso Date: Fri, 16 Nov 2012 13:47:10 +0100 Subject: [PATCH] abstract utf8 validation fallback The Language class had a code snippet to verify whether a text is valid UTF-8, but it could not be reused from anywhere else. The snippet uses mb_check_encoding() and falls back to a regex whenever mbstring is not available. * Introduce StringUtils::isUtf8(), which is mostly code moved out of the Language class. * Enhance regex readability by using an expanded regex (//x) * Make the regex recognize longer (5- and 6-byte) sequences * Add some unit tests for both the mbstring and the pure PHP implementations * An optional second parameter can be passed to isUtf8() to force the use of our PHP implementation. This is used for unit testing. Change-Id: I4cf4dfe2eb02f046db1726f4654ba649e01419f2 --- includes/StringUtils.php | 45 +++++++ languages/Language.php | 14 +- tests/phpunit/includes/StringUtilsTest.php | 142 +++++++++++++++++++++ 3 files changed, 188 insertions(+), 13 deletions(-) create mode 100644 tests/phpunit/includes/StringUtilsTest.php diff --git a/includes/StringUtils.php b/includes/StringUtils.php index fba31ea976..54a85dc0ba 100644 --- a/includes/StringUtils.php +++ b/includes/StringUtils.php @@ -24,6 +24,51 @@ * A collection of static methods to play with strings. */ class StringUtils { + + /** + * Test whether a string is valid UTF-8. + * + * The function checks for invalid byte sequences but not for different + * normalisations; the pure PHP fallback does not detect overlong encodings. + * + * This relies internally on the mbstring function mb_check_encoding() + * hardcoded to check against UTF-8. Whenever that function is not available + * we fall back to a pure PHP implementation. Setting $disableMbstring to + * true skips the use of mb_check_encoding(); this is mostly intended for + * unit testing our internal implementation.
+ * + * @since 1.21 + * + * @param string $value String to check + * @param boolean $disableMbstring Whether to use the pure PHP + * implementation instead of trying mb_check_encoding(). Intended for unit + * testing. Default: false + * + * @return boolean Whether the given $value is a valid UTF-8 encoded string + */ + static function isUtf8( $value, $disableMbstring = false ) { + + if ( preg_match( '/[\x80-\xff]/', $value ) === 0 ) { + # No byte has its high bit set: this is pure ASCII, which is de facto + # valid UTF-8. + return true; + } + + if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) { + return mb_check_encoding( $value, 'UTF-8' ); + } else { # Pure PHP fallback: one alternative per sequence length (1 to 6 bytes); overlong forms are not rejected + $hasUtf8 = preg_match( '/^(?> + [\x00-\x7f] + | [\xc0-\xdf][\x80-\xbf] + | [\xe0-\xef][\x80-\xbf]{2} + | [\xf0-\xf7][\x80-\xbf]{3} + | [\xf8-\xfb][\x80-\xbf]{4} + | \xfc[\x84-\xbf][\x80-\xbf]{4} + )+$/x', $value ); + return ($hasUtf8 > 0 ); + } + } + /** * Perform an operation equivalent to * diff --git a/languages/Language.php b/languages/Language.php index 68d7d86576..21ba0bdfb1 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -2425,19 +2425,7 @@ class Language { if ( is_array( $s ) ) { wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' ); } - # Check for non-UTF-8 URLs - $ishigh = preg_match( '/[\x80-\xff]/', $s ); - if ( !$ishigh ) { - return $s; - } - - if ( function_exists( 'mb_check_encoding' ) ) { - $isutf8 = mb_check_encoding( $s, 'UTF-8' ); - } else { - $isutf8 = preg_match( '/^(?>[\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . 
- '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s ); - } - if ( $isutf8 ) { + if ( StringUtils::isUtf8( $s ) ) { return $s; } diff --git a/tests/phpunit/includes/StringUtilsTest.php b/tests/phpunit/includes/StringUtilsTest.php new file mode 100644 index 0000000000..401b322db3 --- /dev/null +++ b/tests/phpunit/includes/StringUtilsTest.php @@ -0,0 +1,142 @@ +markTestSkipped( 'Test requires the mbstring PHP extension' ); + } + $this->assertEquals( $expected, + StringUtils::isUtf8( $string ), + 'Testing string "' . $this->escaped( $string ) . '" with mb_check_encoding' + ); + } + + /** + * Tests StringUtils::isUtf8() while forcing the pure PHP + * implementation, which is the fallback used when mb_check_encoding() is + * not available. + * + * @cover StringUtils::isUtf8 + * @dataProvider provideStringsForIsUtf8Check + */ + function testIsUtf8WithPhpFallbackImplementation($expected, $string ) { + $this->assertEquals( $expected, + StringUtils::isUtf8( $string, /** disable mbstring: */ true ), + 'Testing string "' . $this->escaped( $string ) . '" with pure PHP implementation' + ); + } + + /** + * Return the string with high-range (non-ASCII) bytes printed as \x-prefixed hexadecimal + */ + function escaped( $string ) { + $escaped = ''; + for($i=0; $i 127 ) { + $escaped .='\x' . 
dechex($val); + } else { + $escaped .= $char; + } + } + return $escaped; + } + + /** + * Data provider of ( expected result, string ) pairs. See also the "UTF-8 decoder capability and stress test" by + * Markus Kuhn: + * http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt + */ + function provideStringsForIsUtf8Check() { + // Expected return values for StringUtils::isUtf8() + $PASS = true; + $FAIL = false; + + return array( + array( $PASS, 'Some ASCII' ), + array( $PASS, "Euro sign €" ), + + # First possible sequence of each length (1 to 6 bytes) + array( $PASS, "\x00" ), + array( $PASS, "\xc2\x80" ), + array( $PASS, "\xe0\xa0\x80" ), + array( $PASS, "\xf0\x90\x80\x80" ), + array( $PASS, "\xf8\x88\x80\x80\x80" ), + array( $PASS, "\xfc\x84\x80\x80\x80\x80" ), + + # Last possible sequence of each length; the 6-byte one (lead byte 0xfd) is expected to be rejected + array( $PASS, "\x7f" ), + array( $PASS, "\xdf\xbf" ), + array( $PASS, "\xef\xbf\xbf" ), + array( $PASS, "\xf7\xbf\xbf\xbf" ), + array( $PASS, "\xfb\xbf\xbf\xbf\xbf" ), + array( $FAIL, "\xfd\xbf\xbf\xbf\xbf\xbf" ), + + # Boundary code points: U+D7FF, U+E000, U+FFFD, U+10FFFF, U+110000 + array( $PASS, "\xed\x9f\xbf" ), + array( $PASS, "\xee\x80\x80" ), + array( $PASS, "\xef\xbf\xbd" ), + array( $PASS, "\xf4\x8f\xbf\xbf" ), + array( $PASS, "\xf4\x90\x80\x80" ), + + # Malformed: continuation bytes without a lead byte + array( $FAIL, "\x80" ), + array( $FAIL, "\xBF" ), + array( $FAIL, "\x80\xbf" ), + array( $FAIL, "\x80\xbf\x80" ), + array( $FAIL, "\x80\xbf\x80\xbf" ), + array( $FAIL, "\x80\xbf\x80\xbf\x80" ), + array( $FAIL, "\x80\xbf\x80\xbf\x80\xbf" ), + array( $FAIL, "\x80\xbf\x80\xbf\x80\xbf\x80" ), + + # Incomplete sequences: last continuation byte missing + array( $FAIL, "\xc0" ), + array( $FAIL, "\xe0\x80" ), + array( $FAIL, "\xf0\x80\x80" ), + array( $FAIL, "\xf8\x80\x80\x80" ), + array( $FAIL, "\xfc\x80\x80\x80\x80" ), + array( $FAIL, "\xdf" ), + array( $FAIL, "\xef\xbf" ), + array( $FAIL, "\xf7\xbf\xbf" ), + array( $FAIL, "\xfb\xbf\xbf\xbf" ), + array( $FAIL, "\xfd\xbf\xbf\xbf\xbf" ), + + # Bytes that can never appear in UTF-8 (0xfe, 0xff) + array( $FAIL, "\xfe" ), + array( $FAIL, "\xff" ), + array( $FAIL, "\xfe\xfe\xff\xff" ), + + /** + # Disabled: the pure PHP implementation does not reject characters + # that are represented 
in an overlong form (more bytes than needed) :( + + # Overlong sequences (all encode U+002F) + array( $FAIL, "\xc0\xaf" ), + array( $FAIL, "\xe0\x80\xaf" ), + array( $FAIL, "\xf0\x80\x80\xaf" ), + array( $FAIL, "\xf8\x80\x80\x80\xaf" ), + array( $FAIL, "\xfc\x80\x80\x80\x80\xaf" ), + + # Maximum overlong sequences + array( $FAIL, "\xc1\xbf" ), + array( $FAIL, "\xe0\x9f\xbf" ), + array( $FAIL, "\xf0\x8F\xbf\xbf" ), + array( $FAIL, "\xf8\x87\xbf\xbf\xbf" ), + array( $FAIL, "\xfc\x83\xbf\xbf\xbf\xbf" ), + **/ + + # Noncharacters U+FFFE and U+FFFF: still well-formed byte sequences + array( $PASS, "\xef\xbf\xbe" ), + array( $PASS, "\xef\xbf\xbf" ), + ); + } +} -- 2.20.1