Merge "(bug 32348) Allow descending order for list=alllinks"

[lhc/web/wiklou.git] / includes / StringUtils.php
diff --git a/includes/StringUtils.php b/includes/StringUtils.php

index e56d4b6..3b500ae 100644 (file)
--- a/includes/StringUtils.php
+++ b/includes/StringUtils.php
@@ -1,16 +1,46 @@
  <?php
+/**
+ * Methods to play with strings.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ */
  
+/**
+ * A collection of static methods to play with strings.
+ */
  class StringUtils {
         /**
-        * Perform an operation equivalent to 
+        * Perform an operation equivalent to
          *
          *     preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
          *
          * except that it's worst-case O(N) instead of O(N^2)
          *
          * Compared to delimiterReplace(), this implementation is fast but memory-
-        * hungry and inflexible. The memory requirements are such that I don't 
+        * hungry and inflexible. The memory requirements are such that I don't
          * recommend using it on anything but guaranteed small chunks of text.
+        *
+        * @param $startDelim
+        * @param $endDelim
+        * @param $replace
+        * @param $subject
+        *
+        * @return string
          */
         static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
                 $segments = explode( $startDelim, $subject );
@@ -25,47 +55,91 @@ class StringUtils {
                 }
                 return $output;
         }
-       
+
         /**
-        * Perform an operation equivalent to 
+        * Perform an operation equivalent to
          *
          *   preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject )
          *
-        * This implementation is slower than staticDelimiterReplace but uses far less
-        * memory and allows regular expression delimiters.
+        * This implementation is slower than hungryDelimiterReplace but uses far less
+        * memory. The delimiters are literal strings, not regular expressions.
+        *
+        * If the start delimiter ends with an initial substring of the end delimiter,
+        * e.g. in the case of C-style comments, the behaviour differs from the model
+        * regex. In this implementation, the end must share no characters with the
+        * start, so e.g. /*\/ is not considered to be both the start and end of a
+        * comment. /*\/xy/*\/ is considered to be a single comment with contents /xy/.
          *
-        * @param string $flags Regular expression flags
+        * @param $startDelim String: start delimiter
+        * @param $endDelim String: end delimiter
+        * @param $callback Callback: function to call on each match
+        * @param $subject String
+        * @param $flags String: regular expression flags
+        * @throws MWException
+        * @return string
          */
         static function delimiterReplaceCallback( $startDelim, $endDelim, $callback, $subject, $flags = '' ) {
                 $inputPos = 0;
                 $outputPos = 0;
                 $output = '';
                 $foundStart = false;
+               $encStart = preg_quote( $startDelim, '!' );
+               $encEnd = preg_quote( $endDelim, '!' );
+               $strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
+               $endLength = strlen( $endDelim );
+               $m = array();
  
-               while ( $inputPos < strlen( $subject ) && 
-                 preg_match( "!($startDelim)|($endDelim)!$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos ) ) 
+               while ( $inputPos < strlen( $subject ) &&
+                 preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos ) )
                 {
+                       $tokenOffset = $m[0][1];
                         if ( $m[1][0] != '' ) {
-                               # Found start
-                               # Write out the non-matching section
-                               $output .= substr( $subject, $outputPos, $m[1][1] - $outputPos );
-                               $foundStart = true;
-                               $inputPos = $contentPos = $m[1][1] + strlen( $m[1][0] );
-                               $outputPos = $m[1][1];
+                               if ( $foundStart &&
+                                 $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0 )
+                               {
+                                       # An end match is present at the same location
+                                       $tokenType = 'end';
+                                       $tokenLength = $endLength;
+                               } else {
+                                       $tokenType = 'start';
+                                       $tokenLength = strlen( $m[0][0] );
+                               }
                         } elseif ( $m[2][0] != '' ) {
-                               # Found end
+                               $tokenType = 'end';
+                               $tokenLength = strlen( $m[0][0] );
+                       } else {
+                               throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
+                       }
+
+                       if ( $tokenType == 'start' ) {
+                               # Only move the start position if we haven't already found a start
+                               # This means that START START END matches outer pair
+                               if ( !$foundStart ) {
+                                       # Found start
+                                       $inputPos = $tokenOffset + $tokenLength;
+                                       # Write out the non-matching section
+                                       $output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
+                                       $outputPos = $tokenOffset;
+                                       $contentPos = $inputPos;
+                                       $foundStart = true;
+                               } else {
+                                       # Move the input position past the *first character* of START,
+                                       # to protect against missing END when it overlaps with START
+                                       $inputPos = $tokenOffset + 1;
+                               }
+                       } elseif ( $tokenType == 'end' ) {
                                 if ( $foundStart ) {
                                         # Found match
                                         $output .= call_user_func( $callback, array(
-                                               substr( $subject, $outputPos, $m[2][1] + strlen( $m[2][0] ) - $outputPos ),
-                                               substr( $subject, $contentPos, $m[2][1] - $contentPos )
+                                               substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
+                                               substr( $subject, $contentPos, $tokenOffset - $contentPos )
                                         ));
                                         $foundStart = false;
                                 } else {
                                         # Non-matching end, write it out
-                                       $output .= substr( $subject, $inputPos, $m[2][1] + strlen( $m[2][0] ) - $outputPos );
+                                       $output .= substr( $subject, $inputPos, $tokenOffset + $tokenLength - $outputPos );
                                 }
-                               $inputPos = $outputPos = $m[2][1] + strlen( $m[2][0] );
+                               $inputPos = $outputPos = $tokenOffset + $tokenLength;
                         } else {
                                 throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
                         }
@@ -76,17 +150,18 @@ class StringUtils {
                 return $output;
         }
  
-       /*
-        * Perform an operation equivalent to 
+       /**
+        * Perform an operation equivalent to
          *
          *   preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject )
          *
-        * @param string $startDelim Start delimiter regular expression
-        * @param string $endDelim End delimiter regular expression
-        * @param string $replace Replacement string. May contain $1, which will be 
-        *               replaced by the text between the delimiters
-        * @param string $subject String to search
-        * @return string The string with the matches replaced
+        * @param $startDelim String: start delimiter regular expression
+        * @param $endDelim String: end delimiter regular expression
+        * @param $replace String: replacement string. May contain $1, which will be
+        *                 replaced by the text between the delimiters
+        * @param $subject String to search
+        * @param $flags String: regular expression flags
+        * @return String: The string with the matches replaced
          */
         static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
                 $replacer = new RegexlikeReplacer( $replace );
@@ -97,16 +172,16 @@ class StringUtils {
         /**
          * More or less "markup-safe" explode()
          * Ignores any instances of the separator inside <...>
-        * @param string $separator
-        * @param string $text
+        * @param $separator String
+        * @param $text String
          * @return array
          */
         static function explodeMarkup( $separator, $text ) {
                 $placeholder = "\x00";
-               
+
                 // Remove placeholder instances
                 $text = str_replace( $placeholder, '', $text );
-               
+
                 // Replace instances of the separator inside HTML-like tags with the placeholder
                 $replacer = new DoubleReplacer( $separator, $placeholder );
                 $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
@@ -116,7 +191,7 @@ class StringUtils {
                 foreach( $items as $i => $str ) {
                         $items[$i] = str_replace( $placeholder, $separator, $str );
                 }
-               
+
                 return $items;
         }
  
@@ -124,21 +199,40 @@ class StringUtils {
          * Escape a string to make it suitable for inclusion in a preg_replace()
          * replacement parameter.
          *
-        * @param string $string
-        * @return string
+        * @param $string String
+        * @return String
          */
         static function escapeRegexReplacement( $string ) {
                 $string = str_replace( '\\', '\\\\', $string );
                 $string = str_replace( '$', '\\$', $string );
                 return $string;
         }
+
+       /**
+        * Workalike for explode() with limited memory usage.
+        * Returns an Iterator
+        * @param $separator
+        * @param $subject
+        * @return ArrayIterator|\ExplodeIterator
+        */
+       static function explode( $separator, $subject ) {
+               if ( substr_count( $subject, $separator ) > 1000 ) {
+                       return new ExplodeIterator( $separator, $subject );
+               } else {
+                       return new ArrayIterator( explode( $separator, $subject ) );
+               }
+       }
  }
  
  /**
- * Base class for "replacers", objects used in preg_replace_callback() and 
+ * Base class for "replacers", objects used in preg_replace_callback() and
   * StringUtils::delimiterReplaceCallback()
   */
  class Replacer {
+
+       /**
+        * @return array
+        */
         function cb() {
                 return array( &$this, 'replace' );
         }
@@ -149,10 +243,18 @@ class Replacer {
   */
  class RegexlikeReplacer extends Replacer {
         var $r;
+
+       /**
+        * @param $r string
+        */
         function __construct( $r ) {
                 $this->r = $r;
         }
  
+       /**
+        * @param $matches array
+        * @return string
+        */
         function replace( $matches ) {
                 $pairs = array();
                 foreach ( $matches as $i => $match ) {
@@ -167,12 +269,22 @@ class RegexlikeReplacer extends Replacer {
   * Class to perform secondary replacement within each replacement string
   */
  class DoubleReplacer extends Replacer {
+
+       /**
+        * @param $from
+        * @param $to
+        * @param $index int
+        */
         function __construct( $from, $to, $index = 0 ) {
                 $this->from = $from;
                 $this->to = $to;
                 $this->index = $index;
         }
-       
+
+       /**
+        * @param $matches array
+        * @return mixed
+        */
         function replace( $matches ) {
                 return str_replace( $this->from, $this->to, $matches[$this->index] );
         }
@@ -184,11 +296,19 @@ class DoubleReplacer extends Replacer {
  class HashtableReplacer extends Replacer {
         var $table, $index;
  
+       /**
+        * @param $table
+        * @param $index int
+        */
         function __construct( $table, $index = 0 ) {
                 $this->table = $table;
                 $this->index = $index;
         }
  
+       /**
+        * @param $matches array
+        * @return mixed
+        */
         function replace( $matches ) {
                 return $this->table[$matches[$this->index]];
         }
@@ -205,11 +325,15 @@ class ReplacementArray {
         /**
          * Create an object with the specified replacement array
          * The array should have the same form as the replacement array for strtr()
+        * @param array $data
          */
         function __construct( $data = array() ) {
                 $this->data = $data;
         }
  
+       /**
+        * @return array
+        */
         function __sleep() {
                 return array( 'data' );
         }
@@ -226,28 +350,61 @@ class ReplacementArray {
                 $this->fss = false;
         }
  
+       /**
+        * @return array|bool
+        */
         function getArray() {
                 return $this->data;
         }
  
         /**
          * Set an element of the replacement array
+        * @param $from string
+        * @param $to string
          */
         function setPair( $from, $to ) {
                 $this->data[$from] = $to;
                 $this->fss = false;
         }
  
+       /**
+        * @param $data array
+        */
         function mergeArray( $data ) {
                 $this->data = array_merge( $this->data, $data );
                 $this->fss = false;
         }
  
+       /**
+        * @param $other
+        */
         function merge( $other ) {
                 $this->data = array_merge( $this->data, $other->data );
                 $this->fss = false;
         }
  
+       /**
+        * @param $from string
+        */
+       function removePair( $from ) {
+               unset($this->data[$from]);
+               $this->fss = false;
+       }
+
+       /**
+        * @param $data array
+        */
+       function removeArray( $data ) {
+               foreach( $data as $from => $to ) {
+                       $this->removePair( $from );
+               }
+               $this->fss = false;
+       }
+
+       /**
+        * @param $subject string
+        * @return string
+        */
         function replace( $subject ) {
                 if ( function_exists( 'fss_prep_replace' ) ) {
                         wfProfileIn( __METHOD__.'-fss' );
@@ -265,4 +422,95 @@ class ReplacementArray {
         }
  }
  
-?>
+/**
+ * An iterator which works exactly like:
+ * 
+ * foreach ( explode( $delim, $s ) as $element ) {
+ *    ...
+ * }
+ *
+ * Except it doesn't use 193 bytes per element
+ */
+class ExplodeIterator implements Iterator {
+       // The subject string
+       var $subject, $subjectLength;
+
+       // The delimiter
+       var $delim, $delimLength;
+
+       // The position of the start of the current token, or false once iteration has ended
+       var $curPos;
+
+       // The position of the start of the next delimiter, or false if none remains
+       var $endPos;
+
+       // The current token
+       var $current;
+
+       /**
+        * Construct an ExplodeIterator
+        * @param $delim string
+        * @param $s string
+        */
+       function __construct( $delim, $s ) {
+               $this->subject = $s;
+               $this->delim = $delim;
+
+               // Micro-optimisation (theoretical)
+               $this->subjectLength = strlen( $s );
+               $this->delimLength = strlen( $delim );
+
+               $this->rewind();
+       }
+
+       function rewind() {
+               $this->curPos = 0;
+               $this->endPos = strpos( $this->subject, $this->delim );
+               $this->refreshCurrent();
+       }
+
+       function refreshCurrent() {
+               if ( $this->curPos === false ) {
+                       $this->current = false;
+               } elseif ( $this->curPos >= $this->subjectLength ) {
+                       $this->current = '';
+               } elseif ( $this->endPos === false ) {
+                       $this->current = substr( $this->subject, $this->curPos );
+               } else {
+                       $this->current = substr( $this->subject, $this->curPos, $this->endPos - $this->curPos );
+               }
+       }
+
+       function current() {
+               return $this->current;
+       }
+
+       function key() {
+               return $this->curPos;
+       }
+
+       /**
+        * @return string|bool The new current token, or false once past the last one
+        */
+       function next() {
+               if ( $this->endPos === false ) {
+                       $this->curPos = false;
+               } else {
+                       $this->curPos = $this->endPos + $this->delimLength;
+                       if ( $this->curPos >= $this->subjectLength ) {
+                               $this->endPos = false;
+                       } else {
+                               $this->endPos = strpos( $this->subject, $this->delim, $this->curPos );
+                       }
+               }
+               $this->refreshCurrent();
+               return $this->current;
+       }
+
+       /**
+        * @return bool
+        */
+       function valid() {
+               return $this->curPos !== false;
+       }
+}