X-Git-Url: http://git.cyclocoop.org/%7B%24admin_url%7Dmes_infos.php?a=blobdiff_plain;f=includes%2Flibs%2FStringUtils.php;h=6b10c0998b8b6572b59600531c59ef4c398eb072;hb=62d543af640479a1ecbd4de69b3f975e509ca1ab;hp=d2226b6daa2d053a1b3c691e3433bccff147e787;hpb=2a1fcd27c19913394c179cbe8be1a9e981d7f81b;p=lhc%2Fweb%2Fwiklou.git

diff --git a/includes/libs/StringUtils.php b/includes/libs/StringUtils.php
index d2226b6daa..6b10c0998b 100644
--- a/includes/libs/StringUtils.php
+++ b/includes/libs/StringUtils.php
@@ -30,83 +30,28 @@ class StringUtils {
 	 * The function check for invalid byte sequences, overlong encoding but
 	 * not for different normalisations.
 	 *
-	 * This relies internally on the mbstring function mb_check_encoding()
-	 * hardcoded to check against UTF-8. Whenever the function is not available
-	 * we fallback to a pure PHP implementation. Setting $disableMbstring to
-	 * true will skip the use of mb_check_encoding, this is mostly intended for
-	 * unit testing our internal implementation.
-	 *
 	 * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
 	 * In particular, the pure PHP code path did not in fact check for overlong forms.
 	 * Beware of this when backporting code to that version of MediaWiki.
 	 *
 	 * @since 1.21
 	 * @param string $value String to check
-	 * @param bool $disableMbstring Whether to use the pure PHP
-	 *  implementation instead of trying mb_check_encoding. Intended for unit
-	 *  testing. Default: false
 	 * @return bool Whether the given $value is a valid UTF-8 encoded string
 	 */
-	static function isUtf8( $value, $disableMbstring = false ) {
+	static function isUtf8( $value ) {
 		$value = (string)$value;
 
-		// If the mbstring extension is loaded, use it. However, before PHP 5.4, values above
-		// U+10FFFF are incorrectly allowed, so we have to check for them separately.
-		if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) {
-			static $newPHP;
-			if ( $newPHP === null ) {
-				$newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
-			}
-
-			return mb_check_encoding( $value, 'UTF-8' ) &&
-				( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
-		}
-
-		if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) {
-			// String contains only ASCII characters, has to be valid
-			return true;
+		// HHVM 3.4 and older come with an outdated version of libmbfl that
+		// incorrectly allows values above U+10FFFF, so we have to check
+		// for them separately. (This issue also exists in PHP 5.3 and
+		// older, which are no longer supported.)
+		static $newPHP;
+		if ( $newPHP === null ) {
+			$newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
 		}
 
-		// PCRE implements repetition using recursion; to avoid a stack overflow (and segfault)
-		// for large input, we check for invalid sequences (<= 5 bytes) rather than valid
-		// sequences, which can be as long as the input string is. Multiple short regexes are
-		// used rather than a single long regex for performance.
-		static $regexes;
-		if ( $regexes === null ) {
-			$cont = "[\x80-\xbf]";
-			$after = "(?!$cont)"; // "(?:[^\x80-\xbf]|$)" would work here
-			$regexes = [
-				// Continuation byte at the start
-				"/^$cont/",
-
-				// ASCII byte followed by a continuation byte
-				"/[\\x00-\x7f]$cont/S",
-
-				// Illegal byte
-				"/[\xc0\xc1\xf5-\xff]/S",
-
-				// Invalid 2-byte sequence, or valid one then an extra continuation byte
-				"/[\xc2-\xdf](?!$cont$after)/S",
-
-				// Invalid 3-byte sequence, or valid one then an extra continuation byte
-				"/\xe0(?![\xa0-\xbf]$cont$after)/",
-				"/[\xe1-\xec\xee\xef](?!$cont{2}$after)/S",
-				"/\xed(?![\x80-\x9f]$cont$after)/",
-
-				// Invalid 4-byte sequence, or valid one then an extra continuation byte
-				"/\xf0(?![\x90-\xbf]$cont{2}$after)/",
-				"/[\xf1-\xf3](?!$cont{3}$after)/S",
-				"/\xf4(?![\x80-\x8f]$cont{2}$after)/",
-			];
-		}
-
-		foreach ( $regexes as $regex ) {
-			if ( preg_match( $regex, $value ) !== 0 ) {
-				return false;
-			}
-		}
-
-		return true;
+		return mb_check_encoding( $value, 'UTF-8' ) &&
+			( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
 	}
 
 	/**
@@ -288,6 +233,31 @@ class StringUtils {
 		return $items;
 	}
 
+	/**
+	 * More or less "markup-safe" str_replace(): replaces $search with $replace in $text,
+	 * leaving instances of $search inside `<...>` untouched. NUL bytes in $text are removed.
+	 * @param string $search String to look for outside of `<...>`
+	 * @param string $replace Replacement for each instance of $search that is replaced
+	 * @param string $text Text to operate on
+	 * @return string $text with the replacements applied
+	 */
+	static function replaceMarkup( $search, $replace, $text ) {
+		$placeholder = "\x00";
+
+		// Strip NUL bytes from the input so the placeholder can only come from the step below
+		$text = str_replace( $placeholder, '', $text );
+
+		// Swap instances of $search inside HTML-like tags for the placeholder to shield them
+		$replacer = new DoubleReplacer( $search, $placeholder );
+		$cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
+
+		// Do the real replacement, then turn the placeholders back into $search
+		$cleaned = str_replace( $search, $replace, $cleaned );
+		$text = str_replace( $placeholder, $search, $cleaned );
+
+		return $text;
+	}
+
 	/**
 	 * Escape a string to make it suitable for inclusion in a preg_replace()
 	 * replacement parameter.