From 7a344588d36cf825a98dcb6a72779e711a803dfa Mon Sep 17 00:00:00 2001 From: lwelling Date: Fri, 3 May 2013 16:29:03 -0400 Subject: [PATCH] Remove redundant regex from calls to StringUtils::isUtf8() I've cautiously moved the regex out of the most used code path. There is no string that will match that regex check that will not also be passed by mb_check_encoding. I think the regex was intended as a shortcut evaluation, but it is no faster than mb_check_encoding which will often need to be run anyway. I think it could just be deleted, but I have limited motivation to risk introducing a bug to improve performance on old PHP versions and unusual configurations, so I've moved it to the fallback code path. Change-Id: Ie9425cc23ba032e5aff42beeb44cbb1146050452 --- includes/StringUtils.php | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/includes/StringUtils.php b/includes/StringUtils.php index 9e21d035c3..c1545e6ef5 100644 --- a/includes/StringUtils.php +++ b/includes/StringUtils.php @@ -51,10 +51,6 @@ class StringUtils { */ static function isUtf8( $value, $disableMbstring = false ) { $value = (string)$value; - if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) { - // String contains only ASCII characters, has to be valid - return true; - } // If the mbstring extension is loaded, use it. However, before PHP 5.4, values above // U+10FFFF are incorrectly allowed, so we have to check for them separately. @@ -68,6 +64,11 @@ class StringUtils { ( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 ); } + if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) { + // String contains only ASCII characters, has to be valid + return true; + } + // PCRE implements repetition using recursion; to avoid a stack overflow (and segfault) // for large input, we check for invalid sequences (<= 5 bytes) rather than valid // sequences, which can be as long as the input string is. Multiple short regexes are -- 2.20.1