abstract utf8 validation fallback
authorAntoine Musso <hashar@free.fr>
Fri, 16 Nov 2012 12:47:10 +0000 (13:47 +0100)
committerGerrit Code Review <gerrit@wikimedia.org>
Wed, 12 Dec 2012 11:24:38 +0000 (11:24 +0000)
Language class had a code snippet to verify whether a text is valid
UTF-8 though that could not be used from another place. The snippet use
mb_check_encoding() and fallback to some regex whenever mbstring is not
available.

* introduce StringUtils::isUtf8() which is mostly code moved out of the
  language class.
* Enhance regex readability by using an expanded regex (//x)
* Made the regex to recognize longer sequences
* Add some unit tests to the mbstring and the PHP native implementation
* An optional second parameter can be passed to isUtf8() to force the
  use of our PHP implementation. This is used for unit testing.

Change-Id: I4cf4dfe2eb02f046db1726f4654ba649e01419f2

includes/StringUtils.php
languages/Language.php
tests/phpunit/includes/StringUtilsTest.php [new file with mode: 0644]

index fba31ea..54a85dc 100644 (file)
  * A collection of static methods to play with strings.
  */
 class StringUtils {
+
+       /**
+        * Test whether a string is valid UTF-8.
+        *
+        * The function check for invalid byte sequences, overlong encoding but
+        * not for different normalisations.
+        *
+        * This relies internally on the mbstring function mb_check_encoding()
+        * hardcoded to check against UTF-8. Whenever the function is not available
+        * we fallback to a pure PHP implementation. Setting $disableMbstring to
+        * true will skip the use of mb_check_encoding, this is mostly intended for
+        * unit testing our internal implementation.
+        *
+        * @since 1.21
+        *
+        * @param string $value String to check
+        * @param boolean $disableMbstring Whether to use the pure PHP
+        * implementation instead of trying mb_check_encoding. Intended for unit
+        * testing. Default: false
+        *
+        * @return boolean Whether the given $value is a valid UTF-8 encoded string
+        */
+       static function isUtf8( $value, $disableMbstring = false ) {
+
+               if ( preg_match( '/[\x80-\xff]/', $value ) === 0 ) {
+                       # no high bit set, this is pure ASCII which is defacto
+                       # valid UTF-8
+                       return true;
+               }
+
+               if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) {
+                       return mb_check_encoding( $value, 'UTF-8' );
+               } else {
+                       $hasUtf8 = preg_match( '/^(?>
+                                 [\x00-\x7f]
+                               | [\xc0-\xdf][\x80-\xbf]
+                               | [\xe0-\xef][\x80-\xbf]{2}
+                               | [\xf0-\xf7][\x80-\xbf]{3}
+                               | [\xf8-\xfb][\x80-\xbf]{4}
+                               | \xfc[\x84-\xbf][\x80-\xbf]{4}
+                       )+$/x', $value );
+                       return ($hasUtf8 > 0 );
+               }
+       }
+
        /**
         * Perform an operation equivalent to
         *
index 68d7d86..21ba0bd 100644 (file)
@@ -2425,19 +2425,7 @@ class Language {
                if ( is_array( $s ) ) {
                        wfDebugDieBacktrace( 'Given array to checkTitleEncoding.' );
                }
-               # Check for non-UTF-8 URLs
-               $ishigh = preg_match( '/[\x80-\xff]/', $s );
-               if ( !$ishigh ) {
-                       return $s;
-               }
-
-               if ( function_exists( 'mb_check_encoding' ) ) {
-                       $isutf8 = mb_check_encoding( $s, 'UTF-8' );
-               } else {
-                       $isutf8 = preg_match( '/^(?>[\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
-                                       '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
-               }
-               if ( $isutf8 ) {
+               if ( StringUtils::isUtf8( $s ) ) {
                        return $s;
                }
 
diff --git a/tests/phpunit/includes/StringUtilsTest.php b/tests/phpunit/includes/StringUtilsTest.php
new file mode 100644 (file)
index 0000000..401b322
--- /dev/null
@@ -0,0 +1,142 @@
+<?php
+
+class StringUtilsTest extends MediaWikiTestCase {
+
+       /**
+        * This test StringUtils::isUtf8 whenever we have mbstring extension
+        * loaded.
+        *
+        * @cover StringUtils::isUtf8
+        * @dataProvider provideStringsForIsUtf8Check
+        */
+       function testIsUtf8WithMbstring($expected, $string ) {
+               if( !function_exists( 'mb_check_encoding' ) ) {
+                       $this->markTestSkipped( 'Test requires the mbstring PHP extension' );
+               }
+               $this->assertEquals( $expected,
+                       StringUtils::isUtf8( $string ),
+                       'Testing string "' . $this->escaped( $string ) . '" with mb_check_encoding'
+               );
+       }
+
+       /**
+        * This test StringUtils::isUtf8 making sure we use the pure PHP
+        * implementation used as a fallback when mb_check_encoding() is
+        * not available.
+        *
+        * @cover StringUtils::isUtf8
+        * @dataProvider provideStringsForIsUtf8Check
+        */
+       function testIsUtf8WithPhpFallbackImplementation($expected, $string ) {
+               $this->assertEquals( $expected,
+                       StringUtils::isUtf8( $string, /** disable mbstring: */ true ),
+                       'Testing string "' . $this->escaped( $string ) . '" with pure PHP implementation'
+               );
+       }
+
+       /**
+        * Print high range characters as an hexadecimal
+        */
+       function escaped( $string ) {
+               $escaped = '';
+               for($i=0; $i<strlen($string);$i++) {
+                       $char = $string[$i];
+                       $val = ord($char);
+                       if( $val > 127 ) {
+                               $escaped .='\x' . dechex($val);
+                       } else {
+                               $escaped .= $char;
+                       }
+               }
+               return $escaped;
+       }
+
+       /**
+        * See also "UTF-8 decoder capability and stress test" by
+        * Markus Kuhn:
+        * http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
+        */
+       function provideStringsForIsUtf8Check() {
+               // Expected return values for StringUtils::isUtf8()
+               $PASS = true;
+               $FAIL = false;
+
+               return array(
+                       array( $PASS, 'Some ASCII' ),
+                       array( $PASS, "Euro sign €" ),
+
+                       # First possible sequences
+                       array( $PASS, "\x00" ),
+                       array( $PASS, "\xc2\x80" ),
+                       array( $PASS, "\xe0\xa0\x80" ),
+                       array( $PASS, "\xf0\x90\x80\x80" ),
+                       array( $PASS, "\xf8\x88\x80\x80\x80" ),
+                       array( $PASS, "\xfc\x84\x80\x80\x80\x80" ),
+
+                       # Last possible sequence
+                       array( $PASS, "\x7f" ),
+                       array( $PASS, "\xdf\xbf" ),
+                       array( $PASS, "\xef\xbf\xbf" ),
+                       array( $PASS, "\xf7\xbf\xbf\xbf" ),
+                       array( $PASS, "\xfb\xbf\xbf\xbf\xbf" ),
+                       array( $FAIL, "\xfd\xbf\xbf\xbf\xbf\xbf" ),
+
+                       # boundaries:
+                       array( $PASS, "\xed\x9f\xbf" ),
+                       array( $PASS, "\xee\x80\x80" ),
+                       array( $PASS, "\xef\xbf\xbd" ),
+                       array( $PASS, "\xf4\x8f\xbf\xbf" ),
+                       array( $PASS, "\xf4\x90\x80\x80" ),
+
+                       # Malformed
+                       array( $FAIL, "\x80" ),
+                       array( $FAIL, "\xBF" ),
+                       array( $FAIL, "\x80\xbf" ),
+                       array( $FAIL, "\x80\xbf\x80" ),
+                       array( $FAIL, "\x80\xbf\x80\xbf" ),
+                       array( $FAIL, "\x80\xbf\x80\xbf\x80" ),
+                       array( $FAIL, "\x80\xbf\x80\xbf\x80\xbf" ),
+                       array( $FAIL, "\x80\xbf\x80\xbf\x80\xbf\x80" ),
+
+                       # last byte missing
+                       array( $FAIL, "\xc0" ),
+                       array( $FAIL, "\xe0\x80" ),
+                       array( $FAIL, "\xf0\x80\x80" ),
+                       array( $FAIL, "\xf8\x80\x80\x80" ),
+                       array( $FAIL, "\xfc\x80\x80\x80\x80" ),
+                       array( $FAIL, "\xdf" ),
+                       array( $FAIL, "\xef\xbf" ),
+                       array( $FAIL, "\xf7\xbf\xbf" ),
+                       array( $FAIL, "\xfb\xbf\xbf\xbf" ),
+                       array( $FAIL, "\xfd\xbf\xbf\xbf\xbf" ),
+
+                       # impossible bytes
+                       array( $FAIL, "\xfe" ),
+                       array( $FAIL, "\xff" ),
+                       array( $FAIL, "\xfe\xfe\xff\xff" ),
+
+                       /**
+                       # The PHP implementation does not handle characters
+                       # being represented in a form which is too long :(
+
+                       # overlong sequences
+                       array( $FAIL, "\xc0\xaf" ),
+                       array( $FAIL, "\xe0\x80\xaf" ),
+                       array( $FAIL, "\xf0\x80\x80\xaf" ),
+                       array( $FAIL, "\xf8\x80\x80\x80\xaf" ),
+                       array( $FAIL, "\xfc\x80\x80\x80\x80\xaf" ),
+
+                       # Maximum overlong sequences
+                       array( $FAIL, "\xc1\xbf" ),
+                       array( $FAIL, "\xe0\x9f\xbf" ),
+                       array( $FAIL, "\xf0\x8F\xbf\xbf" ),
+                       array( $FAIL, "\xf8\x87\xbf\xbf" ),
+                       array( $FAIL, "\xfc\x83\xbf\xbf\xbf\xbf" ),
+                       **/
+
+                       # non characters
+                       array( $PASS, "\xef\xbf\xbe" ),
+                       array( $PASS, "\xef\xbf\xbf" ),
+               );
+       }
+}