Follow-up r84814: revert redundant summary message addition.

[lhc/web/wiklou.git] / includes / StringUtils.php
diff --git a/includes/StringUtils.php b/includes/StringUtils.php

index 1e34890..c1e617a 100644 (file)
--- a/includes/StringUtils.php
+++ b/includes/StringUtils.php
@@ -36,7 +36,11 @@ class StringUtils {
          * This implementation is slower than hungryDelimiterReplace but uses far less
          * memory. The delimiters are literal strings, not regular expressions.
          *
-        * @param string $flags Regular expression flags
+        * @param $startDelim String: start delimiter
+        * @param $endDelim String: end delimiter
+        * @param $callback Callback: function to call on each match
+        * @param $subject String
+        * @param $flags String: regular expression flags
          */
         # If the start delimiter ends with an initial substring of the end delimiter,
         # e.g. in the case of C-style comments, the behaviour differs from the model
@@ -77,16 +81,20 @@ class StringUtils {
                         }
  
                         if ( $tokenType == 'start' ) {
-                               $inputPos = $tokenOffset + $tokenLength;
                                 # Only move the start position if we haven't already found a start
                                 # This means that START START END matches outer pair
                                 if ( !$foundStart ) {
                                         # Found start
+                                       $inputPos = $tokenOffset + $tokenLength;
                                         # Write out the non-matching section
                                         $output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
                                         $outputPos = $tokenOffset;
                                         $contentPos = $inputPos;
                                         $foundStart = true;
+                               } else {
+                                       # Move the input position past the *first character* of START,
+                                       # to protect against missing END when it overlaps with START
+                                       $inputPos = $tokenOffset + 1;
                                 }
                         } elseif ( $tokenType == 'end' ) {
                                 if ( $foundStart ) {
@@ -111,17 +119,18 @@ class StringUtils {
                 return $output;
         }
  
-       /*
+       /**
          * Perform an operation equivalent to
          *
          *   preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject )
          *
-        * @param string $startDelim Start delimiter regular expression
-        * @param string $endDelim End delimiter regular expression
-        * @param string $replace Replacement string. May contain $1, which will be
-        *               replaced by the text between the delimiters
-        * @param string $subject String to search
-        * @return string The string with the matches replaced
+        * @param $startDelim String: start delimiter regular expression
+        * @param $endDelim String: end delimiter regular expression
+        * @param $replace String: replacement string. May contain $1, which will be
+        *                 replaced by the text between the delimiters
+        * @param $subject String: string to search
+        * @param $flags String: regular expression flags
+        * @return String: The string with the matches replaced
          */
         static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
                 $replacer = new RegexlikeReplacer( $replace );
@@ -132,8 +141,8 @@ class StringUtils {
         /**
          * More or less "markup-safe" explode()
          * Ignores any instances of the separator inside <...>
-        * @param string $separator
-        * @param string $text
+        * @param $separator String
+        * @param $text String
          * @return array
          */
         static function explodeMarkup( $separator, $text ) {
@@ -159,8 +168,8 @@ class StringUtils {
          * Escape a string to make it suitable for inclusion in a preg_replace()
          * replacement parameter.
          *
-        * @param string $string
-        * @return string
+        * @param $string String
+        * @return String
          */
         static function escapeRegexReplacement( $string ) {
                 $string = str_replace( '\\', '\\\\', $string );
@@ -179,86 +188,6 @@ class StringUtils {
                         return new ArrayIterator( explode( $separator, $subject ) );
                 }
         }
-
-       /**
-        * Clean characters that are invalid in the given character set 
-        * from a given string.
-        * 
-        * @param $string \type{$string} String to clean
-        * @param $charset \type{$string} Character set (if unspecified, assume $wgOutputEncoding)
-        * @return \type{$string} Cleaned string
-        */
-       public static function cleanForCharset( $string, $charset='' ) {
-               global $wgOutputEncoding;
-               switch ( $charset ? $charset : $wgOutputEncoding ) {
-                       # UTF-8 should be all we need to worry about. :)
-               case 'UTF-8':
-                       return self::cleanUtf8( $string );
-               default:
-                       return $string;
-               }
-       }
-
-       /**
-        * Clean invalid UTF-8 characters and sequences from a given string,
-        * replacing them with U+FFFD.
-        * Should be RFC 3629 compliant.
-        * 
-        * @param $string \type{$string} String to clean
-        * @return \type{$string} Cleaned string
-        */
-       private static function cleanUtf8( $str ) {
-               # HERE BE DRAGONS!
-               # ABANDON ALL HOPE, ALL YE WHO ENTER THE BITWISE HELLFIRE.
-
-               $illegal = array( 0xD800, 0xDB7F, 0xDB80, 0xDBFF,
-                                 0xDC00, 0xDF80, 0xDFFF, 0xFFFE, 0xFFFF );
-               $len = strlen( $str );
-               $left = $bytes = 0;
-               for ( $i = 0; $i < $len; $i++ ) {
-                       $ch = ord( $str[$i] );
-                       if ( !$left ) {
-                               if ( !($ch & 0x80 ) )
-                                       continue;
-                               $left = (( $ch & 0xFE ) == 0xFC ? 5 :
-                                       (( $ch & 0xFC ) == 0xF8 ? 4 :
-                                       (( $ch & 0xF8 ) == 0xF0 ? 3 :
-                                       (( $ch & 0xF0 ) == 0xE0 ? 2 :
-                                       (( $ch & 0xE0 ) == 0xC0 ? 1 :
-                                                                 0 )))));
-                               if ( $left ) {
-                                       $bytes = $left + 1;
-                                       $sum = $ch & ( 0xFF >> $bytes + 1 );
-                                       continue;
-                               } else if ( $ch & 0x80 ) {
-                                       $bytes = 1;
-                               }
-                       } else if ( ( $ch & 0xC0 ) == 0x80 ) {
-                               $sum <<= 6;
-                               $sum += $ch & 0x3F;
-                               if ( --$left ) continue;
-                               if ( ( $bytes == 2 && $sum < 0x80     ) ||
-                                    ( $bytes == 3 && $sum < 0x800    ) ||
-                                    ( $bytes == 4 && $sum < 0x10000  ) ||
-                                    ( $bytes >  4 || $sum > 0x10FFFF ) ||
-                                    in_array( $sum, $illegal ) ) {
-                               } else continue;
-                               
-                       } else {
-                               $bytes -= $left;
-                               $i--;
-                       }
-
-                       $str = ( substr( $str, 0, $i - $bytes + 1 ) .
-                                "\xEF\xBF\xBD" .
-                                substr( $str, $i + 1 ) );
-                       $i   += 3 - $bytes;
-                       $len += 3 - $bytes;
-                       $left = 0;
-               }
-
-               return $str;
-       }
  }
  
  /**