From: Brion Vibber Date: Thu, 7 Oct 2004 05:59:10 +0000 (+0000) Subject: Add support for using ICU to perform normalization, which is much much faster than... X-Git-Tag: 1.5.0alpha1~1638 X-Git-Url: http://git.cyclocoop.org/%28?a=commitdiff_plain;h=0824182956884c743a6b2edc6da591e9ec1d1c02;p=lhc%2Fweb%2Fwiklou.git Add support for using ICU to perform normalization, which is much much faster than the PHP code! Still need to add support for cleanup/verification. --- diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php index 3ea8ef705b..110793a999 100644 --- a/includes/normal/UtfNormal.php +++ b/includes/normal/UtfNormal.php @@ -92,6 +92,20 @@ define( 'UTF8_FFFF', codepointToUtf8( 0xffff ) ); define( 'UTF8_HEAD', false ); define( 'UTF8_TAIL', true ); + +/** + * For using the ICU wrapper + */ +define( 'UNORM_NONE', 1 ); +define( 'UNORM_NFD', 2 ); +define( 'UNORM_NFKD', 3 ); +define( 'UNORM_NFC', 4 ); +define( 'UNORM_DEFAULT', UNORM_NFC ); +define( 'UNORM_NFKC', 5 ); +define( 'UNORM_FCD', 6 ); + +define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) ); + /** * * @package MediaWiki @@ -123,7 +137,9 @@ class UtfNormal { * @return string a UTF-8 string in normal form C */ function toNFC( $string ) { - if( UtfNormal::quickIsNFC( $string ) ) + if( NORMALIZE_ICU ) + return utf8_normalize( $string, UNORM_NFC ); + elseif( UtfNormal::quickIsNFC( $string ) ) return $string; else return UtfNormal::NFC( $string ); @@ -137,7 +153,9 @@ class UtfNormal { * @return string a UTF-8 string in normal form D */ function toNFD( $string ) { - if( preg_match( '/[\x80-\xff]/', $string ) ) + if( NORMALIZE_ICU ) + return utf8_normalize( $string, UNORM_NFD ); + elseif( preg_match( '/[\x80-\xff]/', $string ) ) return UtfNormal::NFD( $string ); else return $string; @@ -152,7 +170,9 @@ class UtfNormal { * @return string a UTF-8 string in normal form KC */ function toNFKC( $string ) { - if( preg_match( '/[\x80-\xff]/', $string ) ) + if( NORMALIZE_ICU ) + return utf8_normalize( $string, UNORM_NFKC ); + elseif( preg_match( '/[\x80-\xff]/', $string ) ) return UtfNormal::NFKC( $string ); else return $string; @@ -167,7 +187,9 @@ class UtfNormal { * @return string a UTF-8 string in normal form KD */ function toNFKD( $string ) { - if( preg_match( '/[\x80-\xff]/', $string ) ) + if( NORMALIZE_ICU ) + return utf8_normalize( $string, UNORM_NFKD ); + elseif( preg_match( '/[\x80-\xff]/', $string ) ) return UtfNormal::NFKD( $string ); else return $string; diff --git a/includes/normal/UtfNormalBench.php b/includes/normal/UtfNormalBench.php index 2e1740c22b..d42d592099 100644 --- a/includes/normal/UtfNormalBench.php +++ b/includes/normal/UtfNormalBench.php @@ -23,6 +23,10 @@ */ /** */ +if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) { + dl( 'php_utfnormal.so' ); +} + require_once 'UtfNormalUtil.php'; require_once 'UtfNormal.php'; diff --git a/includes/normal/UtfNormalTest.php b/includes/normal/UtfNormalTest.php index 6360a7ca44..16992be9fb 100644 --- a/includes/normal/UtfNormalTest.php +++ b/includes/normal/UtfNormalTest.php @@ -44,6 +44,10 @@ if( defined( 'PRETTY_UTF8' ) ) { } } +if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) { + dl( 'php_utfnormal.so' ); +} + require_once 'UtfNormalUtil.php'; require_once 'UtfNormal.php'; @@ -106,7 +110,8 @@ while( false !== ($line = fgets( $in ) ) ) { $cols = explode( ';', $line ); $char = codepointToUtf8( hexdec( $cols[0] ) ); $desc = $cols[0] . ": " . $cols[1]; - if( $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST ) { + if( $char === "\x00" || $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST ) { + # Can't check NULL with the ICU plugin, as null bytes fail in C land. # Surrogates are illegal on their own or in UTF-8, ignore. continue; }