X-Git-Url: http://git.cyclocoop.org/%7B%24admin_url%7Dmes_infos.php?a=blobdiff_plain;f=includes%2Flibs%2FStringUtils.php;h=6b10c0998b8b6572b59600531c59ef4c398eb072;hb=62d543af640479a1ecbd4de69b3f975e509ca1ab;hp=d2226b6daa2d053a1b3c691e3433bccff147e787;hpb=2a1fcd27c19913394c179cbe8be1a9e981d7f81b;p=lhc%2Fweb%2Fwiklou.git diff --git a/includes/libs/StringUtils.php b/includes/libs/StringUtils.php index d2226b6daa..6b10c0998b 100644 --- a/includes/libs/StringUtils.php +++ b/includes/libs/StringUtils.php @@ -30,83 +30,28 @@ class StringUtils { * The function check for invalid byte sequences, overlong encoding but * not for different normalisations. * - * This relies internally on the mbstring function mb_check_encoding() - * hardcoded to check against UTF-8. Whenever the function is not available - * we fallback to a pure PHP implementation. Setting $disableMbstring to - * true will skip the use of mb_check_encoding, this is mostly intended for - * unit testing our internal implementation. - * * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation. * In particular, the pure PHP code path did not in fact check for overlong forms. * Beware of this when backporting code to that version of MediaWiki. * * @since 1.21 * @param string $value String to check - * @param bool $disableMbstring Whether to use the pure PHP - * implementation instead of trying mb_check_encoding. Intended for unit - * testing. Default: false * @return bool Whether the given $value is a valid UTF-8 encoded string */ - static function isUtf8( $value, $disableMbstring = false ) { + static function isUtf8( $value ) { $value = (string)$value; - // If the mbstring extension is loaded, use it. However, before PHP 5.4, values above - // U+10FFFF are incorrectly allowed, so we have to check for them separately. - if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) { - static $newPHP; - if ( $newPHP === null ) { - $newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' ); - } - - return mb_check_encoding( $value, 'UTF-8' ) && - ( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 ); - } - - if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) { - // String contains only ASCII characters, has to be valid - return true; + // HHVM 3.4 and older come with an outdated version of libmbfl that + // incorrectly allows values above U+10FFFF, so we have to check + // for them separately. (This issue also exists in PHP 5.3 and + // older, which are no longer supported.) + static $newPHP; + if ( $newPHP === null ) { + $newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' ); } - // PCRE implements repetition using recursion; to avoid a stack overflow (and segfault) - // for large input, we check for invalid sequences (<= 5 bytes) rather than valid - // sequences, which can be as long as the input string is. Multiple short regexes are - // used rather than a single long regex for performance. - static $regexes; - if ( $regexes === null ) { - $cont = "[\x80-\xbf]"; - $after = "(?!$cont)"; // "(?:[^\x80-\xbf]|$)" would work here - $regexes = [ - // Continuation byte at the start - "/^$cont/", - - // ASCII byte followed by a continuation byte - "/[\\x00-\x7f]$cont/S", - - // Illegal byte - "/[\xc0\xc1\xf5-\xff]/S", - - // Invalid 2-byte sequence, or valid one then an extra continuation byte - "/[\xc2-\xdf](?!$cont$after)/S", - - // Invalid 3-byte sequence, or valid one then an extra continuation byte - "/\xe0(?![\xa0-\xbf]$cont$after)/", - "/[\xe1-\xec\xee\xef](?!$cont{2}$after)/S", - "/\xed(?![\x80-\x9f]$cont$after)/", - - // Invalid 4-byte sequence, or valid one then an extra continuation byte - "/\xf0(?![\x90-\xbf]$cont{2}$after)/", - "/[\xf1-\xf3](?!$cont{3}$after)/S", - "/\xf4(?![\x80-\x8f]$cont{2}$after)/", - ]; - } - - foreach ( $regexes as $regex ) { - if ( preg_match( $regex, $value ) !== 0 ) { - return false; - } - } - - return true; + return mb_check_encoding( $value, 'UTF-8' ) && + ( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 ); } /** @@ -288,6 +233,31 @@ class StringUtils { return $items; } + /** + * More or less "markup-safe" str_replace() + * Ignores any instances of the separator inside `<...>` + * @param string $search + * @param string $replace + * @param string $text + * @return string + */ + static function replaceMarkup( $search, $replace, $text ) { + $placeholder = "\x00"; + + // Remove placeholder instances + $text = str_replace( $placeholder, '', $text ); + + // Replace instances of the separator inside HTML-like tags with the placeholder + $replacer = new DoubleReplacer( $search, $placeholder ); + $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text ); + + // Explode, then put the replaced separators back in + $cleaned = str_replace( $search, $replace, $cleaned ); + $text = str_replace( $placeholder, $search, $cleaned ); + + return $text; + } + /** * Escape a string to make it suitable for inclusion in a preg_replace() * replacement parameter.