From: Max Semenik Date: Fri, 29 Jan 2016 19:42:44 +0000 (-0800) Subject: Kill mbstring fallbacks X-Git-Tag: 1.31.0-rc.0~7360 X-Git-Url: http://git.cyclocoop.org/%24href?a=commitdiff_plain;h=943563062f0a6995;p=lhc%2Fweb%2Fwiklou.git Kill mbstring fallbacks In the age when we require PHP 5.5, pretending that mbstring emulation is not slow and silly is silly. Bug: T129435 Change-Id: Ic8235c9da9a926df63ec7388900c44eab454eebe --- diff --git a/RELEASE-NOTES-1.27 b/RELEASE-NOTES-1.27 index b5dca9339b..a6dfeccd27 100644 --- a/RELEASE-NOTES-1.27 +++ b/RELEASE-NOTES-1.27 @@ -7,7 +7,12 @@ production. === PHP version requirement === As of 1.27, MediaWiki now requires PHP 5.5.9 or higher. This corresponds with -HHVM 3.1. +HHVM 3.1. Additionally, the following PHP extensions are required: +* ctype +* iconv +* json +* mbstring +* xml === Configuration changes in 1.27 === * $wgAllowMicrodataAttributes and $wgAllowRdfaAttributes were removed, diff --git a/autoload.php b/autoload.php index 388dd0dd49..7667035dcc 100644 --- a/autoload.php +++ b/autoload.php @@ -423,7 +423,6 @@ $wgAutoloadLocalClasses = [ 'FakeConverter' => __DIR__ . '/languages/FakeConverter.php', 'FakeMaintenance' => __DIR__ . '/maintenance/Maintenance.php', 'FakeResultWrapper' => __DIR__ . '/includes/db/DatabaseUtility.php', - 'Fallback' => __DIR__ . '/includes/Fallback.php', 'FatalError' => __DIR__ . '/includes/exception/FatalError.php', 'FauxRequest' => __DIR__ . '/includes/FauxRequest.php', 'FauxResponse' => __DIR__ . '/includes/WebResponse.php', diff --git a/composer.json b/composer.json index 702e8b57e2..23bff9ce84 100644 --- a/composer.json +++ b/composer.json @@ -19,6 +19,7 @@ "composer/semver": "1.4.0", "cssjanus/cssjanus": "1.1.2", "ext-iconv": "*", + "ext-mbstring": "*", "liuggio/statsd-php-client": "1.0.18", "mediawiki/at-ease": "1.1.0", "oojs/oojs-ui": "0.16.4", @@ -52,7 +53,6 @@ "ext-apc": "Local data and opcode cache", "ext-fileinfo": "Improved mime magic detection", "ext-intl": "ICU integration", - "ext-mbstring": "Multibyte string support", "ext-wikidiff2": "Diff accelerator", "monolog/monolog": "Flexible debug logging system", "nmred/kafka-php": "Send debug log events to kafka", diff --git a/includes/DefaultSettings.php b/includes/DefaultSettings.php index 65032ad6e9..d9d04fd4a5 100644 --- a/includes/DefaultSettings.php +++ b/includes/DefaultSettings.php @@ -585,10 +585,9 @@ $wgLockManagers = []; * Requires PHP's Exif extension: http://www.php.net/manual/en/ref.exif.php * * @note FOR WINDOWS USERS: - * To enable Exif functions, add the following lines to the "Windows + * To enable Exif functions, add the following line to the "Windows * extensions" section of php.ini: * @code{.ini} - * extension=extensions/php_mbstring.dll * extension=extensions/php_exif.dll * @endcode */ diff --git a/includes/Fallback.php b/includes/Fallback.php deleted file mode 100644 index 4c19dca267..0000000000 --- a/includes/Fallback.php +++ /dev/null @@ -1,172 +0,0 @@ - 0 ) { - if ( $splitPos > 256 ) { - // Optimize large string offsets by skipping ahead N bytes. - // This will cut out most of our slow time on Latin-based text, - // and 1/2 to 1/3 on East European and Asian scripts. - $bytePos = $splitPos; - while ( $bytePos < $byteLen && $str[$bytePos] >= "\x80" && $str[$bytePos] < "\xc0" ) { - ++$bytePos; - } - $charPos = mb_strlen( substr( $str, 0, $bytePos ) ); - } else { - $charPos = 0; - $bytePos = 0; - } - - while ( $charPos++ < $splitPos ) { - ++$bytePos; - // Move past any tail bytes - while ( $bytePos < $byteLen && $str[$bytePos] >= "\x80" && $str[$bytePos] < "\xc0" ) { - ++$bytePos; - } - } - } else { - $splitPosX = $splitPos + 1; - $charPos = 0; // relative to end of string; we don't care about the actual char position here - $bytePos = $byteLen; - while ( $bytePos > 0 && $charPos-- >= $splitPosX ) { - --$bytePos; - // Move past any tail bytes - while ( $bytePos > 0 && $str[$bytePos] >= "\x80" && $str[$bytePos] < "\xc0" ) { - --$bytePos; - } - } - } - - return $bytePos; - } - - /** - * Fallback implementation of mb_strlen, hardcoded to UTF-8. - * @param string $str - * @param string $enc Optional encoding; ignored - * @return int - */ - public static function mb_strlen( $str, $enc = '' ) { - $counts = count_chars( $str ); - $total = 0; - - // Count ASCII bytes - for ( $i = 0; $i < 0x80; $i++ ) { - $total += $counts[$i]; - } - - // Count multibyte sequence heads - for ( $i = 0xc0; $i < 0xff; $i++ ) { - $total += $counts[$i]; - } - return $total; - } - - /** - * Fallback implementation of mb_strpos, hardcoded to UTF-8. - * @param string $haystack - * @param string $needle - * @param string $offset Optional start position - * @param string $encoding Optional encoding; ignored - * @return int - */ - public static function mb_strpos( $haystack, $needle, $offset = 0, $encoding = '' ) { - $needle = preg_quote( $needle, '/' ); - - $ar = []; - preg_match( '/' . $needle . '/u', $haystack, $ar, PREG_OFFSET_CAPTURE, $offset ); - - if ( isset( $ar[0][1] ) ) { - return $ar[0][1]; - } else { - return false; - } - } - - /** - * Fallback implementation of mb_strrpos, hardcoded to UTF-8. - * @param string $haystack - * @param string $needle - * @param string $offset Optional start position - * @param string $encoding Optional encoding; ignored - * @return int - */ - public static function mb_strrpos( $haystack, $needle, $offset = 0, $encoding = '' ) { - $needle = preg_quote( $needle, '/' ); - - $ar = []; - preg_match_all( '/' . $needle . '/u', $haystack, $ar, PREG_OFFSET_CAPTURE, $offset ); - - if ( isset( $ar[0] ) && count( $ar[0] ) > 0 && - isset( $ar[0][count( $ar[0] ) - 1][1] ) ) { - return $ar[0][count( $ar[0] ) - 1][1]; - } else { - return false; - } - } -} diff --git a/includes/GlobalFunctions.php b/includes/GlobalFunctions.php index 3fa91fa700..f8f078ff6a 100644 --- a/includes/GlobalFunctions.php +++ b/includes/GlobalFunctions.php @@ -39,59 +39,6 @@ use MediaWiki\Session\SessionManager; * PHP extensions may be included here. */ -if ( !function_exists( 'mb_substr' ) ) { - /** - * @codeCoverageIgnore - * @see Fallback::mb_substr - * @return string - */ - function mb_substr( $str, $start, $count = 'end' ) { - return Fallback::mb_substr( $str, $start, $count ); - } - - /** - * @codeCoverageIgnore - * @see Fallback::mb_substr_split_unicode - * @return int - */ - function mb_substr_split_unicode( $str, $splitPos ) { - return Fallback::mb_substr_split_unicode( $str, $splitPos ); - } -} - -if ( !function_exists( 'mb_strlen' ) ) { - /** - * @codeCoverageIgnore - * @see Fallback::mb_strlen - * @return int - */ - function mb_strlen( $str, $enc = '' ) { - return Fallback::mb_strlen( $str, $enc ); - } -} - -if ( !function_exists( 'mb_strpos' ) ) { - /** - * @codeCoverageIgnore - * @see Fallback::mb_strpos - * @return int - */ - function mb_strpos( $haystack, $needle, $offset = 0, $encoding = '' ) { - return Fallback::mb_strpos( $haystack, $needle, $offset, $encoding ); - } -} - -if ( !function_exists( 'mb_strrpos' ) ) { - /** - * @codeCoverageIgnore - * @see Fallback::mb_strrpos - * @return int - */ - function mb_strrpos( $haystack, $needle, $offset = 0, $encoding = '' ) { - return Fallback::mb_strrpos( $haystack, $needle, $offset, $encoding ); - } -} - // hash_equals function only exists in PHP >= 5.6.0 // http://php.net/hash_equals if ( !function_exists( 'hash_equals' ) ) { diff --git a/includes/HtmlFormatter.php b/includes/HtmlFormatter.php index 5749775749..206f0f7309 100644 --- a/includes/HtmlFormatter.php +++ b/includes/HtmlFormatter.php @@ -63,15 +63,9 @@ class HtmlFormatter { */ public function getDoc() { if ( !$this->doc ) { - // DOMDocument::loadHTML apparently isn't very good with encodings, so + // DOMDocument::loadHTML isn't very good with encodings, so // convert input to ASCII by encoding everything above 128 as entities. - if ( function_exists( 'mb_convert_encoding' ) ) { - $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' ); - } else { - $html = preg_replace_callback( '/[\x{80}-\x{10ffff}]/u', function ( $m ) { - return '&#' . UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';'; - }, $this->html ); - } + $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' ); // Workaround for bug that caused spaces before references // to disappear during processing: https://phabricator.wikimedia.org/T55086 @@ -251,13 +245,10 @@ class HtmlFormatter { } $html = $replacements->replace( $html ); - if ( function_exists( 'mb_convert_encoding' ) ) { - // Just in case the conversion in getDoc() above used named - // entities that aren't known to html_entity_decode(). - $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' ); - } else { - $html = html_entity_decode( $html, ENT_COMPAT, 'utf-8' ); - } + // Just in case the conversion in getDoc() above used named + // entities that aren't known to html_entity_decode(). + $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' ); + return $html; } diff --git a/includes/db/DatabasePostgres.php b/includes/db/DatabasePostgres.php index d42871057c..87c977dce6 100644 --- a/includes/db/DatabasePostgres.php +++ b/includes/db/DatabasePostgres.php @@ -392,9 +392,7 @@ class DatabasePostgres extends Database { } public function doQuery( $sql ) { - if ( function_exists( 'mb_convert_encoding' ) ) { - $sql = mb_convert_encoding( $sql, 'UTF-8' ); - } + $sql = mb_convert_encoding( $sql, 'UTF-8' ); // Clear previously left over PQresult while ( $res = pg_get_result( $this->mConn ) ) { pg_free_result( $res ); diff --git a/includes/filebackend/SwiftFileBackend.php b/includes/filebackend/SwiftFileBackend.php index 1f2cb06181..0f7e4b569e 100644 --- a/includes/filebackend/SwiftFileBackend.php +++ b/includes/filebackend/SwiftFileBackend.php @@ -153,7 +153,7 @@ class SwiftFileBackend extends FileBackendStore { } protected function resolveContainerPath( $container, $relStoragePath ) { - if ( !mb_check_encoding( $relStoragePath, 'UTF-8' ) ) { // mb_string required by CF + if ( !mb_check_encoding( $relStoragePath, 'UTF-8' ) ) { return null; // not UTF-8, makes it hard to use CF and the swift HTTP API } elseif ( strlen( urlencode( $relStoragePath ) ) > 1024 ) { return null; // too long for Swift diff --git a/includes/installer/Installer.php b/includes/installer/Installer.php index 442baf76a0..3d1c8600bd 100644 --- a/includes/installer/Installer.php +++ b/includes/installer/Installer.php @@ -757,6 +757,12 @@ abstract class Installer { return false; } + if ( !function_exists( 'mb_substr' ) ) { + $this->showError( 'config-mbstring-absent' ); + + return false; + } + return true; } diff --git a/includes/installer/i18n/en.json b/includes/installer/i18n/en.json index 6fa59306b4..b97cc961db 100644 --- a/includes/installer/i18n/en.json +++ b/includes/installer/i18n/en.json @@ -60,6 +60,7 @@ "config-ctype": "Fatal: PHP must be compiled with support for the [http://www.php.net/manual/en/ctype.installation.php Ctype extension].", "config-iconv": "Fatal: PHP must be compiled with support for the [http://www.php.net/manual/en/iconv.installation.php iconv extension].", "config-json": "Fatal: PHP was compiled without JSON support.\nYou must install either the PHP JSON extension or the [http://pecl.php.net/package/jsonc PECL jsonc] extension before installing MediaWiki.\n* The PHP extension is included in Red Hat Enterprise Linux (CentOS) 5 and 6, though must be enabled in /etc/php.ini or /etc/php.d/json.ini.\n* Some Linux distributions released after May 2013 omit the PHP extension, instead packaging the PECL extension as php5-json or php-pecl-jsonc.", + "config-mbstring-absent": "Fatal: PHP must be compiled with support for the [http://www.php.net/manual/en/mbstring.setup.php mbstring extension].", "config-xcache": "[http://xcache.lighttpd.net/ XCache] is installed", "config-apc": "[http://www.php.net/apc APC] is installed", "config-wincache": "[http://www.iis.net/download/WinCacheForPhp WinCache] is installed", diff --git a/includes/installer/i18n/qqq.json b/includes/installer/i18n/qqq.json index e73480134d..7010be0383 100644 --- a/includes/installer/i18n/qqq.json +++ b/includes/installer/i18n/qqq.json @@ -78,6 +78,7 @@ "config-ctype": "Message if support for [http://www.php.net/manual/en/ctype.installation.php Ctype] is missing from PHP.\n{{Related|Config-fatal}}", "config-iconv": "Message if support for [http://www.php.net/manual/en/iconv.installation.php iconv] is missing from PHP.\n{{Related|Config-fatal}}", "config-json": "Message if support for [[wikipedia:JSON|JSON]] is missing from PHP.\n* \"[[wikipedia:Red Hat Enterprise Linux|Red Hat Enterprise Linux]]\" (RHEL) and \"[[wikipedia:CentOS|CentOS]]\" refer to two almost-identical Linux distributions. \"5 and 6\" refers to version 5 or 6 of either distribution. Because RHEL 7 likely will not include the PHP extension, do not translate as \"5 or newer\".\n* \"The [http://www.php.net/json PHP extension]\" is the JSON extension included with PHP 5.2 and newer.\n* \"The [http://pecl.php.net/package/jsonc PECL extension]\" is based on the PHP extension, though excludes code some distributions have found unacceptable (see [[phab:T49431]]).\n{{Related|Config-fatal}}", + "config-mbstring-absent": "Message if support for [http://www.php.net/manual/en/mbstring.installation.php mbstring] is missing from PHP.\n{{Related|Config-fatal}}", "config-xcache": "Message indicates if this program is available", "config-apc": "Message indicates if this program is available", "config-wincache": "Message indicates if this program is available", diff --git a/includes/libs/StringUtils.php b/includes/libs/StringUtils.php index 057495ee97..2af3868e44 100644 --- a/includes/libs/StringUtils.php +++ b/includes/libs/StringUtils.php @@ -30,83 +30,26 @@ class StringUtils { * The function check for invalid byte sequences, overlong encoding but * not for different normalisations. * - * This relies internally on the mbstring function mb_check_encoding() - * hardcoded to check against UTF-8. Whenever the function is not available - * we fallback to a pure PHP implementation. Setting $disableMbstring to - * true will skip the use of mb_check_encoding, this is mostly intended for - * unit testing our internal implementation. - * * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation. * In particular, the pure PHP code path did not in fact check for overlong forms. * Beware of this when backporting code to that version of MediaWiki. * * @since 1.21 * @param string $value String to check - * @param bool $disableMbstring Whether to use the pure PHP - * implementation instead of trying mb_check_encoding. Intended for unit - * testing. Default: false * @return bool Whether the given $value is a valid UTF-8 encoded string */ - static function isUtf8( $value, $disableMbstring = false ) { + static function isUtf8( $value ) { $value = (string)$value; - // If the mbstring extension is loaded, use it. However, before PHP 5.4, values above - // U+10FFFF are incorrectly allowed, so we have to check for them separately. - if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) { - static $newPHP; - if ( $newPHP === null ) { - $newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' ); - } - - return mb_check_encoding( $value, 'UTF-8' ) && - ( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 ); - } - - if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) { - // String contains only ASCII characters, has to be valid - return true; - } - - // PCRE implements repetition using recursion; to avoid a stack overflow (and segfault) - // for large input, we check for invalid sequences (<= 5 bytes) rather than valid - // sequences, which can be as long as the input string is. Multiple short regexes are - // used rather than a single long regex for performance. - static $regexes; - if ( $regexes === null ) { - $cont = "[\x80-\xbf]"; - $after = "(?!$cont)"; // "(?:[^\x80-\xbf]|$)" would work here - $regexes = [ - // Continuation byte at the start - "/^$cont/", - - // ASCII byte followed by a continuation byte - "/[\\x00-\x7f]$cont/S", - - // Illegal byte - "/[\xc0\xc1\xf5-\xff]/S", - - // Invalid 2-byte sequence, or valid one then an extra continuation byte - "/[\xc2-\xdf](?!$cont$after)/S", - - // Invalid 3-byte sequence, or valid one then an extra continuation byte - "/\xe0(?![\xa0-\xbf]$cont$after)/", - "/[\xe1-\xec\xee\xef](?!$cont{2}$after)/S", - "/\xed(?![\x80-\x9f]$cont$after)/", - - // Invalid 4-byte sequence, or valid one then an extra continuation byte - "/\xf0(?![\x90-\xbf]$cont{2}$after)/", - "/[\xf1-\xf3](?!$cont{3}$after)/S", - "/\xf4(?![\x80-\x8f]$cont{2}$after)/", - ]; - } - - foreach ( $regexes as $regex ) { - if ( preg_match( $regex, $value ) !== 0 ) { - return false; - } + // Before PHP 5.4, values above U+10FFFF are incorrectly allowed, so we have to + // check for them separately. + static $newPHP; + if ( $newPHP === null ) { + $newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' ); } - return true; + return mb_check_encoding( $value, 'UTF-8' ) && + ( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 ); } /** diff --git a/maintenance/update.php b/maintenance/update.php index 43b10c858b..e487449eef 100755 --- a/maintenance/update.php +++ b/maintenance/update.php @@ -83,6 +83,12 @@ class UpdateMediaWiki extends Maintenance { "ABORTING (see https://bugs.php.net/bug.php?id=45996).\n", true ); } + + if ( !function_exists( 'mb_strlen' ) ) { + $this->error( + "MediaWiki now requires the mbstring PHP extension, your system doesn't have it.\n" + . "ABORTING.\n" ); + } } function execute() { diff --git a/tests/phpunit/includes/FallbackTest.php b/tests/phpunit/includes/FallbackTest.php deleted file mode 100644 index 85732dd19f..0000000000 --- a/tests/phpunit/includes/FallbackTest.php +++ /dev/null @@ -1,72 +0,0 @@ -markTestSkipped( - "The mb_string functions must be installed to test the fallback functions" - ); - } - - $sampleUTF = "Östergötland_coat_of_arms.png"; - - // mb_substr - $substr_params = [ - [ 0, 0 ], - [ 5, -4 ], - [ 33 ], - [ 100, -5 ], - [ -8, 10 ], - [ 1, 1 ], - [ 2, -1 ] - ]; - - foreach ( $substr_params as $param_set ) { - $old_param_set = $param_set; - array_unshift( $param_set, $sampleUTF ); - - $this->assertEquals( - call_user_func_array( 'mb_substr', $param_set ), - call_user_func_array( 'Fallback::mb_substr', $param_set ), - 'Fallback mb_substr with params ' . implode( ', ', $old_param_set ) - ); - } - - // mb_strlen - $this->assertEquals( - mb_strlen( $sampleUTF ), - Fallback::mb_strlen( $sampleUTF ), - 'Fallback mb_strlen' - ); - - // mb_str(r?)pos - $strpos_params = [ - // array( 'ter' ), - // array( 'Ö' ), - // array( 'Ö', 3 ), - // array( 'oat_', 100 ), - // array( 'c', -10 ), - // Broken for now - ]; - - foreach ( $strpos_params as $param_set ) { - $old_param_set = $param_set; - array_unshift( $param_set, $sampleUTF ); - - $this->assertEquals( - call_user_func_array( 'mb_strpos', $param_set ), - call_user_func_array( 'Fallback::mb_strpos', $param_set ), - 'Fallback mb_strpos with params ' . implode( ', ', $old_param_set ) - ); - - $this->assertEquals( - call_user_func_array( 'mb_strrpos', $param_set ), - call_user_func_array( 'Fallback::mb_strrpos', $param_set ), - 'Fallback mb_strrpos with params ' . implode( ', ', $old_param_set ) - ); - } - } -}