(follow-up r86130) the normalizer_normalize function doesn't replace things like...

author Brian Wolff <bawolff@users.mediawiki.org>

Sat, 16 Apr 2011 15:32:19 +0000 (15:32 +0000)

committer Brian Wolff <bawolff@users.mediawiki.org>

Sat, 16 Apr 2011 15:32:19 +0000 (15:32 +0000)
author Brian Wolff <bawolff@users.mediawiki.org>
Sat, 16 Apr 2011 15:32:19 +0000 (15:32 +0000)
committer Brian Wolff <bawolff@users.mediawiki.org>
Sat, 16 Apr 2011 15:32:19 +0000 (15:32 +0000)
diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php

index 75e3a08..116fb8f 100644 (file)
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -80,18 +80,13 @@ class UtfNormal {
          */
         static function cleanUp( $string ) {
                 if( NORMALIZE_ICU ) {
-                       # We exclude a few chars that ICU would not.
-                       $string = preg_replace(
-                               '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
-                               UTF8_REPLACEMENT,
-                               $string );
-                       $string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
-                       $string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string );
+                       $string = self::replaceForNativeNormalize( $string );
  
                         # UnicodeString constructor fails if the string ends with a
                         # head byte. Add a junk char at the end, we'll strip it off.
                         return rtrim( utf8_normalize( $string . "\x01", UNORM_NFC ), "\x01" );
                 } elseif( NORMALIZE_INTL ) {
+                       $string = self::replaceForNativeNormalize( $string );
                         $norm = normalizer_normalize( $string, Normalizer::FORM_C );
                         if( $norm === null || $norm === false ) {
                                 # normalizer_normalize will either return false or null
@@ -764,4 +759,20 @@ class UtfNormal {
                 }
                 return $out;
         }
+       /**
+        * Replace characters we don't want but that most native normalize functions
+        * keep: C0 control bytes (except tab, LF, CR), UTF8_FFFE and UTF8_FFFF.
+        *
+        * @param $string String The string
+        * @return String String with those characters replaced by UTF8_REPLACEMENT.
+        */
+       private static function replaceForNativeNormalize( $string ) { 
+               $string = preg_replace(
+                       '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
+                       UTF8_REPLACEMENT,
+                       $string );
+               $string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
+               $string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string );
+               return $string;
+       }
  }
author	Brian Wolff <bawolff@users.mediawiki.org>
	Sat, 16 Apr 2011 15:32:19 +0000 (15:32 +0000)
committer	Brian Wolff <bawolff@users.mediawiki.org>
	Sat, 16 Apr 2011 15:32:19 +0000 (15:32 +0000)