From f08147c313f538a45298771804cd8d718a1dd37e Mon Sep 17 00:00:00 2001
From: "Mark A. Hershberger"
Date: Tue, 20 Jul 2010 15:41:24 +0000
Subject: [PATCH] Prefer the intl PECL extension for ICU Unicode

Use normalizer_normalize() from the intl extension in toNFC(), toNFD(),
toNFKC() and toNFKD() when it is available, falling back to
utf8_normalize() and then to the pure-PHP routines.

cleanUp() accepts either extension. normalizer_normalize() reports
failure (false) for malformed UTF-8 instead of repairing it, so in that
case cleanUp() lets quickIsNFCVerify() repair the string before
normalizing it again; callers never receive false for dirty input.
---
Revised in review: cleanUp() now handles a failed normalizer_normalize()
call. The blob ids on the "index" line are those of the original patch,
so the post-image id is stale.

 includes/normal/UtfNormal.php | 37 +++++++++++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php
index c34641ba12..621ef9de4b 100644
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -45,6 +45,7 @@ define( 'UNORM_NFKC', 5 );
 define( 'UNORM_FCD', 6 );
 
 define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
+define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );
 
 /**
  * Unicode normalization routines for working with UTF-8 strings.
@@ -79,7 +80,7 @@ class UtfNormal {
 			return $ret;
 		}
 
-		if( NORMALIZE_ICU ) {
+		if( NORMALIZE_ICU || NORMALIZE_INTL ) {
 			# We exclude a few chars that ICU would not.
 			$string = preg_replace(
 				'/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
@@ -90,7 +91,23 @@ class UtfNormal {
 
 			# UnicodeString constructor fails if the string ends with a
 			# head byte. Add a junk char at the end, we'll strip it off.
-			return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" );
+			if ( NORMALIZE_ICU ) {
+				return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" );
+			}
+
+			# normalizer_normalize() reports failure (false, or null on some
+			# builds) for malformed UTF-8, which is the input cleanUp() must repair.
+			$norm = normalizer_normalize( $string, Normalizer::FORM_C );
+			if ( $norm !== false && $norm !== null ) {
+				return $norm;
+			}
+
+			# quickIsNFCVerify() cleans UTF-8 errors out of $string as a side
+			# effect; a true result means the cleaned text needs no further work.
+			if ( UtfNormal::quickIsNFCVerify( $string ) ) {
+				return $string;
+			}
+			return normalizer_normalize( $string, Normalizer::FORM_C );
 		} elseif( UtfNormal::quickIsNFCVerify( $string ) ) {
 			# Side effect -- $string has had UTF-8 errors cleaned up.
 			return $string;
@@ -108,7 +125,9 @@ class UtfNormal {
 	 * @return string a UTF-8 string in normal form C
 	 */
 	static function toNFC( $string ) {
-		if( NORMALIZE_ICU )
+		if( NORMALIZE_INTL )
+			return normalizer_normalize( $string, Normalizer::FORM_C );
+		elseif( NORMALIZE_ICU )
 			return utf8_normalize( $string, UNORM_NFC );
 		elseif( UtfNormal::quickIsNFC( $string ) )
 			return $string;
@@ -124,7 +143,9 @@ class UtfNormal {
 	 * @return string a UTF-8 string in normal form D
 	 */
 	static function toNFD( $string ) {
-		if( NORMALIZE_ICU )
+		if( NORMALIZE_INTL )
+			return normalizer_normalize( $string, Normalizer::FORM_D );
+		elseif( NORMALIZE_ICU )
 			return utf8_normalize( $string, UNORM_NFD );
 		elseif( preg_match( '/[\x80-\xff]/', $string ) )
 			return UtfNormal::NFD( $string );
@@ -141,7 +162,9 @@ class UtfNormal {
 	 * @return string a UTF-8 string in normal form KC
 	 */
 	static function toNFKC( $string ) {
-		if( NORMALIZE_ICU )
+		if( NORMALIZE_INTL )
+			return normalizer_normalize( $string, Normalizer::FORM_KC );
+		elseif( NORMALIZE_ICU )
 			return utf8_normalize( $string, UNORM_NFKC );
 		elseif( preg_match( '/[\x80-\xff]/', $string ) )
 			return UtfNormal::NFKC( $string );
@@ -158,7 +181,9 @@ class UtfNormal {
 	 * @return string a UTF-8 string in normal form KD
 	 */
 	static function toNFKD( $string ) {
-		if( NORMALIZE_ICU )
+		if( NORMALIZE_INTL )
+			return normalizer_normalize( $string, Normalizer::FORM_KD );
+		elseif( NORMALIZE_ICU )
 			return utf8_normalize( $string, UNORM_NFKD );
 		elseif( preg_match( '/[\x80-\xff]/', $string ) )
 			return UtfNormal::NFKD( $string );
-- 
2.20.1