From 7447669e83910865a061bbae34dad861bf3396d9 Mon Sep 17 00:00:00 2001
From: Kevin Israel <pleasestand@live.com>
Date: Sun, 15 Sep 2013 23:29:04 -0400
Subject: [PATCH] Adapt StringUtils::isUtf8 to the top of Unicode at U+10FFFF

RFC 3629 defines the legal range of characters as U+0000..U+10FFFF
and forbids overlong forms (encodings of a character that use more
bytes than necessary). Let's make StringUtils::isUtf8() match the
specification.

* Changed the maximum value in the pure PHP code path and added a
  check for overlong forms.
* Added another check, specific to PHP 5.3's mbstring extension,
  for values above U+10FFFF.
* Fixed the mbstring test errors in PHP 5.4 using changes to
  StringUtilsTest by Platonides <platonides@gmail.com>.
* Uncommented some other tests that could fail because of the
  missing check for overlong forms.
* Added additional tests for extra continuation bytes, overlong
  sequences/forms, and values in the UTF-16 surrogate range.

The changes to the function were so extensive that I might as
well say I rewrote it.

Bug: 43679
Change-Id: I56ae496d17ffc3747550e06a72dacab3ac55da61
---
 includes/StringUtils.php                   | 70 +++++++++++++++++-----
 tests/phpunit/includes/StringUtilsTest.php | 58 +++++++++++-------
 2 files changed, 91 insertions(+), 37 deletions(-)

diff --git a/includes/StringUtils.php b/includes/StringUtils.php
index 48cde0eb19..fc3cfd55d7 100644
--- a/includes/StringUtils.php
+++ b/includes/StringUtils.php
@@ -38,6 +38,9 @@ class StringUtils {
 	 * unit testing our internal implementation.
 	 *
 	 * @since 1.21
+	 * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
+	 * In particular, the pure PHP code path did not in fact check for overlong forms.
+	 * Beware of this when backporting code to that version of MediaWiki.
 	 *
 	 * @param string $value String to check
 	 * @param boolean $disableMbstring Whether to use the pure PHP
@@ -47,26 +50,63 @@ class StringUtils {
 	 * @return boolean Whether the given $value is a valid UTF-8 encoded string
 	 */
 	static function isUtf8( $value, $disableMbstring = false ) {
-
-		if ( preg_match( '/[\x80-\xff]/', $value ) === 0 ) {
-			# no high bit set, this is pure ASCII which is de facto
-			# valid UTF-8
+		$value = (string)$value;
+		if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) {
+			// String contains only ASCII characters, has to be valid
 			return true;
 		}
 
+		// If the mbstring extension is loaded, use it. However, before PHP 5.4, values above
+		// U+10FFFF are incorrectly allowed, so we have to check for them separately.
 		if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) {
-			return mb_check_encoding( $value, 'UTF-8' );
-		} else {
-			$hasUtf8 = preg_match( '/^(?>
-				  [\x00-\x7f]
-				| [\xc0-\xdf][\x80-\xbf]
-				| [\xe0-\xef][\x80-\xbf]{2}
-				| [\xf0-\xf7][\x80-\xbf]{3}
-				| [\xf8-\xfb][\x80-\xbf]{4}
-				| \xfc[\x84-\xbf][\x80-\xbf]{4}
-			)+$/x', $value );
-			return ( $hasUtf8 > 0 );
+			static $newPHP;
+			if ( $newPHP === null ) {
+				$newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
+			}
+
+			return mb_check_encoding( $value, 'UTF-8' ) &&
+				( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
+		}
+
+		// PCRE implements repetition using recursion; to avoid a stack overflow (and segfault)
+		// for large input, we check for invalid sequences (<= 5 bytes) rather than valid
+		// sequences, which can be as long as the input string is. Multiple short regexes are
+		// used rather than a single long regex for performance.
+		static $regexes;
+		if ( $regexes === null ) {
+			$cont = "[\x80-\xbf]";
+			$after = "(?!$cont)"; // "(?:[^\x80-\xbf]|$)" would work here
+			$regexes = array(
+				// Continuation byte at the start
+				"/^$cont/",
+
+				// ASCII byte followed by a continuation byte
+				"/[\\x00-\x7f]$cont/S",
+
+				// Illegal byte
+				"/[\xc0\xc1\xf5-\xff]/S",
+
+				// Invalid 2-byte sequence, or valid one then an extra continuation byte
+				"/[\xc2-\xdf](?!$cont$after)/S",
+
+				// Invalid 3-byte sequence, or valid one then an extra continuation byte
+				"/\xe0(?![\xa0-\xbf]$cont$after)/",
+				"/[\xe1-\xec\xee\xef](?!$cont{2}$after)/S",
+				"/\xed(?![\x80-\x9f]$cont$after)/",
+
+				// Invalid 4-byte sequence, or valid one then an extra continuation byte
+				"/\xf0(?![\x90-\xbf]$cont{2}$after)/",
+				"/[\xf1-\xf3](?!$cont{3}$after)/S",
+				"/\xf4(?![\x80-\x8f]$cont{2}$after)/",
+			);
+		}
+
+		foreach ( $regexes as $regex ) {
+			if ( preg_match( $regex, $value ) !== 0 ) {
+				return false;
+			}
 		}
+		return true;
 	}
 
 	/**
diff --git a/tests/phpunit/includes/StringUtilsTest.php b/tests/phpunit/includes/StringUtilsTest.php
index 842e2fc431..94ba3a714e 100644
--- a/tests/phpunit/includes/StringUtilsTest.php
+++ b/tests/phpunit/includes/StringUtilsTest.php
@@ -67,32 +67,35 @@ class StringUtilsTest extends MediaWikiTestCase {
 			array( $PASS, 'Some ASCII' ),
 			array( $PASS, "Euro sign €" ),
 
-			# First possible sequences
+			// First possible sequences
 			array( $PASS, "\x00" ),
 			array( $PASS, "\xc2\x80" ),
 			array( $PASS, "\xe0\xa0\x80" ),
 			array( $PASS, "\xf0\x90\x80\x80" ),
-			array( $PASS, "\xf8\x88\x80\x80\x80" ),
-			array( $PASS, "\xfc\x84\x80\x80\x80\x80" ),
+			array( $FAIL, "\xf8\x88\x80\x80\x80" ),
+			array( $FAIL, "\xfc\x84\x80\x80\x80\x80" ),
 
-			# Last possible sequence
+			// Last possible sequence
 			array( $PASS, "\x7f" ),
 			array( $PASS, "\xdf\xbf" ),
 			array( $PASS, "\xef\xbf\xbf" ),
-			array( $PASS, "\xf7\xbf\xbf\xbf" ),
-			array( $PASS, "\xfb\xbf\xbf\xbf\xbf" ),
+			array( $FAIL, "\xf7\xbf\xbf\xbf" ), // U+1FFFFF
+			array( $FAIL, "\xfb\xbf\xbf\xbf\xbf" ),
 			array( $FAIL, "\xfd\xbf\xbf\xbf\xbf\xbf" ),
 
-			# boundaries:
+			// Boundaries
 			array( $PASS, "\xed\x9f\xbf" ),
 			array( $PASS, "\xee\x80\x80" ),
 			array( $PASS, "\xef\xbf\xbd" ),
-			array( $PASS, "\xf4\x8f\xbf\xbf" ),
-			array( $PASS, "\xf4\x90\x80\x80" ),
+			array( $PASS, "\xf2\x80\x80\x80" ),
+			array( $PASS, "\xf3\xbf\xbf\xbf" ), // U+FFFFF
+			array( $PASS, "\xf4\x80\x80\x80" ), // U+100000
+			array( $PASS, "\xf4\x8f\xbf\xbf" ), // U+10FFFF
+			array( $FAIL, "\xf4\x90\x80\x80" ), // U+110000
 
-			# Malformed
+			// Malformed
 			array( $FAIL, "\x80" ),
-			array( $FAIL, "\xBF" ),
+			array( $FAIL, "\xbf" ),
 			array( $FAIL, "\x80\xbf" ),
 			array( $FAIL, "\x80\xbf\x80" ),
 			array( $FAIL, "\x80\xbf\x80\xbf" ),
@@ -100,7 +103,7 @@ class StringUtilsTest extends MediaWikiTestCase {
 			array( $FAIL, "\x80\xbf\x80\xbf\x80\xbf" ),
 			array( $FAIL, "\x80\xbf\x80\xbf\x80\xbf\x80" ),
 
-			# last byte missing
+			// Last byte missing
 			array( $FAIL, "\xc0" ),
 			array( $FAIL, "\xe0\x80" ),
 			array( $FAIL, "\xf0\x80\x80" ),
@@ -112,31 +115,42 @@ class StringUtilsTest extends MediaWikiTestCase {
 			array( $FAIL, "\xfb\xbf\xbf\xbf" ),
 			array( $FAIL, "\xfd\xbf\xbf\xbf\xbf" ),
 
-			# impossible bytes
+			// Extra continuation byte
+			array( $FAIL, "e\xaf" ),
+			array( $FAIL, "\xc3\x89\xaf" ),
+			array( $FAIL, "\xef\xbc\xa5\xaf" ),
+			array( $FAIL, "\xf0\x9d\x99\xb4\xaf" ),
+
+			// Impossible bytes
 			array( $FAIL, "\xfe" ),
 			array( $FAIL, "\xff" ),
 			array( $FAIL, "\xfe\xfe\xff\xff" ),
 
-			/*
-			# The PHP implementation does not handle characters
-			# being represented in a form which is too long :(
-
-			# overlong sequences
+			// Overlong sequences
 			array( $FAIL, "\xc0\xaf" ),
+			array( $FAIL, "\xc1\xaf" ),
 			array( $FAIL, "\xe0\x80\xaf" ),
 			array( $FAIL, "\xf0\x80\x80\xaf" ),
 			array( $FAIL, "\xf8\x80\x80\x80\xaf" ),
 			array( $FAIL, "\xfc\x80\x80\x80\x80\xaf" ),
 
-			# Maximum overlong sequences
+			// Maximum overlong sequences
 			array( $FAIL, "\xc1\xbf" ),
 			array( $FAIL, "\xe0\x9f\xbf" ),
-			array( $FAIL, "\xf0\x8F\xbf\xbf" ),
+			array( $FAIL, "\xf0\x8f\xbf\xbf" ),
 			array( $FAIL, "\xf8\x87\xbf\xbf" ),
 			array( $FAIL, "\xfc\x83\xbf\xbf\xbf\xbf" ),
-			*/
 
-			# non characters
+			// Surrogates
+			array( $PASS, "\xed\x9f\xbf" ), // U+D7FF
+			array( $PASS, "\xee\x80\x80" ), // U+E000
+			array( $FAIL, "\xed\xa0\x80" ), // U+D800
+			array( $FAIL, "\xed\xaf\xbf" ), // U+DBFF
+			array( $FAIL, "\xed\xb0\x80" ), // U+DC00
+			array( $FAIL, "\xed\xbf\xbf" ), // U+DFFF
+			array( $FAIL, "\xed\xa0\x80\xed\xb0\x80" ), // U+D800 U+DC00
+
+			// Noncharacters
 			array( $PASS, "\xef\xbf\xbe" ),
 			array( $PASS, "\xef\xbf\xbf" ),
 		);
-- 
2.20.1