Prefer the intl PECL extension for ICU Unicode
author Mark A. Hershberger <mah@users.mediawiki.org>
Tue, 20 Jul 2010 15:41:24 +0000 (15:41 +0000)
committer Mark A. Hershberger <mah@users.mediawiki.org>
Tue, 20 Jul 2010 15:41:24 +0000 (15:41 +0000)
includes/normal/UtfNormal.php

index c34641b..621ef9d 100644 (file)
@@ -45,6 +45,7 @@ define( 'UNORM_NFKC', 5 );
 define( 'UNORM_FCD',  6 );
 
 define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
+define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );
 
 /**
  * Unicode normalization routines for working with UTF-8 strings.
@@ -79,7 +80,7 @@ class UtfNormal {
                        return $ret;
                }
 
-               if( NORMALIZE_ICU ) {
+               if( NORMALIZE_ICU || NORMALIZE_INTL ) {
                        # We exclude a few chars that ICU would not.
                        $string = preg_replace(
                                '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
@@ -90,7 +91,8 @@ class UtfNormal {
 
                        # UnicodeString constructor fails if the string ends with a
                        # head byte. Add a junk char at the end, we'll strip it off.
-                       return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" );
+                       if ( NORMALIZE_ICU ) return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" );
+                       if ( NORMALIZE_INTL ) return normalizer_normalize( $string, Normalizer::FORM_C );
                } elseif( UtfNormal::quickIsNFCVerify( $string ) ) {
                        # Side effect -- $string has had UTF-8 errors cleaned up.
                        return $string;
@@ -108,7 +110,9 @@ class UtfNormal {
         * @return string a UTF-8 string in normal form C
         */
        static function toNFC( $string ) {
-               if( NORMALIZE_ICU )
+               if( NORMALIZE_INTL )
+                       return normalizer_normalize( $string, Normalizer::FORM_C );
+               elseif( NORMALIZE_ICU )
                        return utf8_normalize( $string, UNORM_NFC );
                elseif( UtfNormal::quickIsNFC( $string ) )
                        return $string;
@@ -124,7 +128,9 @@ class UtfNormal {
         * @return string a UTF-8 string in normal form D
         */
        static function toNFD( $string ) {
-               if( NORMALIZE_ICU )
+               if( NORMALIZE_INTL )
+                       return normalizer_normalize( $string, Normalizer::FORM_D );
+               elseif( NORMALIZE_ICU )
                        return utf8_normalize( $string, UNORM_NFD );
                elseif( preg_match( '/[\x80-\xff]/', $string ) )
                        return UtfNormal::NFD( $string );
@@ -141,7 +147,9 @@ class UtfNormal {
         * @return string a UTF-8 string in normal form KC
         */
        static function toNFKC( $string ) {
-               if( NORMALIZE_ICU )
+               if( NORMALIZE_INTL )
+                       return normalizer_normalize( $string, Normalizer::FORM_KC );
+               elseif( NORMALIZE_ICU )
                        return utf8_normalize( $string, UNORM_NFKC );
                elseif( preg_match( '/[\x80-\xff]/', $string ) )
                        return UtfNormal::NFKC( $string );
@@ -158,7 +166,9 @@ class UtfNormal {
         * @return string a UTF-8 string in normal form KD
         */
        static function toNFKD( $string ) {
-               if( NORMALIZE_ICU )
+               if( NORMALIZE_INTL )
+                       return normalizer_normalize( $string, Normalizer::FORM_KD );
+               elseif( NORMALIZE_ICU )
                        return utf8_normalize( $string, UNORM_NFKD );
                elseif( preg_match( '/[\x80-\xff]/', $string ) )
                        return UtfNormal::NFKD( $string );