From: Mark A. Hershberger Date: Thu, 29 Jul 2010 02:44:23 +0000 (+0000) Subject: Add detection for unicode normalization. Next step: use what we find! :) X-Git-Tag: 1.31.0-rc.0~35855 X-Git-Url: http://git.cyclocoop.org/%24href?a=commitdiff_plain;h=a21fb8651f20f1ef556985893c40201afe8d1e74;p=lhc%2Fweb%2Fwiklou.git Add detection for unicode normalization. Next step: use what we find! :) I think I want to point to an as-yet-to-be-created page on MediaWiki.org to help people understand what to do if they're stuck with pure PHP normalization, but any pointers here would help. --- diff --git a/includes/installer/Installer.i18n.php b/includes/installer/Installer.i18n.php index 343aff4e35..54b7b10212 100644 --- a/includes/installer/Installer.i18n.php +++ b/includes/installer/Installer.i18n.php @@ -79,6 +79,10 @@ You cannot install MediaWiki.', 'config-env-latest-old' => "'''Warning:''' You are installing an outdated version of Mediawiki.", 'config-env-latest-help' => 'You are installing version $1, but the latest version is $2. You are advised to use the latest release, which can be downloaded from [http://www.mediawiki.org/wiki/Download mediawiki.org]', + 'config-unicode-php' => "Using pure PHP to normalize Unicode characters.", + 'config-unicode-pure-php-warning' => "'''Warning''': Either the PECL Intl extension is not available, or it uses an older version of [http://site.icu-project.org/ the ICU project's] library for handling Unicode normalization. If you run a high-traffic site, you should read a little on [http://www.mediawiki.org/wiki/Unicode_normalization_considerations Unicode normalization].", + 'config-unicode-utf8' => "Using Brion Vibber's utf8_normalize.so for UTF", + 'config-unicode-intl' => "Using the [http://pecl.php.net/intl intl PECL extension] for UTF-8 normalization.", 'config-no-db' => 'Could not find a suitable database driver!', 'config-no-db-help' => 'You need to install a database driver for PHP. The following database types are supported: $1. 
diff --git a/includes/installer/Installer.php b/includes/installer/Installer.php index f8491747b3..3b48bdd3a1 100644 --- a/includes/installer/Installer.php +++ b/includes/installer/Installer.php @@ -88,6 +88,7 @@ abstract class Installer { 'envCheckExtension', 'envCheckShellLocale', 'envCheckUploadsDirectory', + 'envCheckLibicu' ); /** @@ -811,6 +812,69 @@ abstract class Installer { } } + /** + * Convert a hex string representing a Unicode code point to that code point. + * @param string $c + * @return string + */ + protected function unicodeChar( $c ) { + $c = hexdec($c); + if ($c <= 0x7F) { + return chr($c); + } else if ($c <= 0x7FF) { + return chr(0xC0 | $c >> 6) . chr(0x80 | $c & 0x3F); + } else if ($c <= 0xFFFF) { + return chr(0xE0 | $c >> 12) . chr(0x80 | $c >> 6 & 0x3F) + . chr(0x80 | $c & 0x3F); + } else if ($c <= 0x10FFFF) { + return chr(0xF0 | $c >> 18) . chr(0x80 | $c >> 12 & 0x3F) + . chr(0x80 | $c >> 6 & 0x3F) + . chr(0x80 | $c & 0x3F); + } else { + return false; + } + } + + + /** + * Check the libicu version + */ + public function envCheckLibicu() { + $utf8 = function_exists( 'utf8_normalize' ); + $intl = function_exists( 'normalizer_normalize' ); + + /** + * This needs to be updated to something that the latest libicu + * will properly normalize. This normalization was found at + * http://www.unicode.org/versions/Unicode5.2.0/#Character_Additions + * Note that we use the hex representation to create the code + * points in order to avoid any Unicode-destroying during transit. + */ + $not_normal_c = $this->unicodeChar("FA6C"); + $normal_c = $this->unicodeChar("242EE"); + + $useNormalizer = 'config-unicode-php'; + + /** + * We're going to prefer the pecl extension here unless + * utf8_normalize is more up to date. 
+ */ + if( $utf8 ) { + $utf8 = utf8_normalize( $not_normal_c, UNORM_NFC ); + $useNormalizer = 'config-unicode-utf8'; + } + if( $intl ) { + $intl = normalizer_normalize( $not_normal_c, Normalizer::FORM_C ); + $useNormalizer = 'config-unicode-intl'; + } + + $this->showMessage( $useNormalizer ); + if( $useNormalizer === 'config-unicode-php' ) { + $this->showMessage( 'config-unicode-pure-php-warning' ); + } + } + + /** * Search a path for any of the given executable names. Returns the * executable name if found. Also checks the version string returned