includes/libs/StringUtils.php

   1 <?php
   2
   3 use Wikimedia\AtEase\AtEase;
   4
   5 /**
   6  * Methods to play with strings.
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @file
  24  */
  25
  26 /**
  27  * A collection of static methods to play with strings.
  28  */
  29 class StringUtils {
  30         /**
  31          * Test whether a string is valid UTF-8.
  32          *
  33          * The function check for invalid byte sequences, overlong encoding but
  34          * not for different normalisations.
  35          *
  36          * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
  37          * In particular, the pure PHP code path did not in fact check for overlong forms.
  38          * Beware of this when backporting code to that version of MediaWiki.
  39          *
  40          * @since 1.21
  41          * @param string $value String to check
  42          * @return bool Whether the given $value is a valid UTF-8 encoded string
  43          */
  44         static function isUtf8( $value ) {
  45                 return mb_check_encoding( (string)$value, 'UTF-8' );
  46         }
  47
  48         /**
  49          * Explode a string, but ignore any instances of the separator inside
  50          * the given start and end delimiters, which may optionally nest.
  51          * The delimiters are literal strings, not regular expressions.
  52          * @param string $startDelim Start delimiter
  53          * @param string $endDelim End delimiter
  54          * @param string $separator Separator string for the explode.
  55          * @param string $subject Subject string to explode.
  56          * @param bool $nested True iff the delimiters are allowed to nest.
  57          * @return ArrayIterator
  58          */
  59         static function delimiterExplode( $startDelim, $endDelim, $separator,
  60                 $subject, $nested = false ) {
  61                 $inputPos = 0;
  62                 $lastPos = 0;
  63                 $depth = 0;
  64                 $encStart = preg_quote( $startDelim, '!' );
  65                 $encEnd = preg_quote( $endDelim, '!' );
  66                 $encSep = preg_quote( $separator, '!' );
  67                 $len = strlen( $subject );
  68                 $m = [];
  69                 $exploded = [];
  70                 while (
  71                         $inputPos < $len &&
  72                         preg_match(
  73                                 "!$encStart|$encEnd|$encSep!S", $subject, $m,
  74                                 PREG_OFFSET_CAPTURE, $inputPos
  75                         )
  76                 ) {
  77                         $match = $m[0][0];
  78                         $matchPos = $m[0][1];
  79                         $inputPos = $matchPos + strlen( $match );
  80                         if ( $match === $separator ) {
  81                                 if ( $depth === 0 ) {
  82                                         $exploded[] = substr(
  83                                                 $subject, $lastPos, $matchPos - $lastPos
  84                                         );
  85                                         $lastPos = $inputPos;
  86                                 }
  87                         } elseif ( $match === $startDelim ) {
  88                                 if ( $depth === 0 || $nested ) {
  89                                         $depth++;
  90                                 }
  91                         } else {
  92                                 $depth--;
  93                         }
  94                 }
  95                 $exploded[] = substr( $subject, $lastPos );
  96                 // This method could be rewritten in the future to avoid creating an
  97                 // intermediate array, since the return type is just an iterator.
  98                 return new ArrayIterator( $exploded );
  99         }
 100
 101         /**
 102          * Perform an operation equivalent to `preg_replace()`
 103          *
 104          * Matches this code:
 105          *
 106          *     preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
 107          *
 108          * ..except that it's worst-case O(N) instead of O(N^2). Compared to delimiterReplace(), this
 109          * implementation is fast but memory-hungry and inflexible. The memory requirements are such
 110          * that I don't recommend using it on anything but guaranteed small chunks of text.
 111          *
 112          * @param string $startDelim
 113          * @param string $endDelim
 114          * @param string $replace
 115          * @param string $subject
 116          * @return string
 117          */
 118         static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
 119                 $segments = explode( $startDelim, $subject );
 120                 $output = array_shift( $segments );
 121                 foreach ( $segments as $s ) {
 122                         $endDelimPos = strpos( $s, $endDelim );
 123                         if ( $endDelimPos === false ) {
 124                                 $output .= $startDelim . $s;
 125                         } else {
 126                                 $output .= $replace . substr( $s, $endDelimPos + strlen( $endDelim ) );
 127                         }
 128                 }
 129
 130                 return $output;
 131         }
 132
 133         /**
 134          * Perform an operation equivalent to `preg_replace_callback()`
 135          *
 136          * Matches this code:
 137          *
 138          *     preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject );
 139          *
 140          * If the start delimiter ends with an initial substring of the end delimiter,
 141          * e.g. in the case of C-style comments, the behavior differs from the model
 142          * regex. In this implementation, the end must share no characters with the
 143          * start, so e.g. `/*\/` is not considered to be both the start and end of a
 144          * comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
 145          *
 146          * The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
 147          * but uses far less memory. The delimiters are literal strings, not regular expressions.
 148          *
 149          * @param string $startDelim Start delimiter
 150          * @param string $endDelim End delimiter
 151          * @param callable $callback Function to call on each match
 152          * @param string $subject
 153          * @param string $flags Regular expression flags
 154          * @throws InvalidArgumentException
 155          * @return string
 156          */
 157         static function delimiterReplaceCallback( $startDelim, $endDelim, $callback,
 158                 $subject, $flags = ''
 159         ) {
 160                 $inputPos = 0;
 161                 $outputPos = 0;
 162                 $contentPos = 0;
 163                 $output = '';
 164                 $foundStart = false;
 165                 $encStart = preg_quote( $startDelim, '!' );
 166                 $encEnd = preg_quote( $endDelim, '!' );
 167                 $strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
 168                 $endLength = strlen( $endDelim );
 169                 $m = [];
 170
 171                 while ( $inputPos < strlen( $subject ) &&
 172                         preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
 173                 ) {
 174                         $tokenOffset = $m[0][1];
 175                         if ( $m[1][0] != '' ) {
 176                                 if ( $foundStart &&
 177                                         $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0
 178                                 ) {
 179                                         # An end match is present at the same location
 180                                         $tokenType = 'end';
 181                                         $tokenLength = $endLength;
 182                                 } else {
 183                                         $tokenType = 'start';
 184                                         $tokenLength = strlen( $m[0][0] );
 185                                 }
 186                         } elseif ( $m[2][0] != '' ) {
 187                                 $tokenType = 'end';
 188                                 $tokenLength = strlen( $m[0][0] );
 189                         } else {
 190                                 throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
 191                         }
 192
 193                         if ( $tokenType == 'start' ) {
 194                                 # Only move the start position if we haven't already found a start
 195                                 # This means that START START END matches outer pair
 196                                 if ( !$foundStart ) {
 197                                         # Found start
 198                                         $inputPos = $tokenOffset + $tokenLength;
 199                                         # Write out the non-matching section
 200                                         $output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
 201                                         $outputPos = $tokenOffset;
 202                                         $contentPos = $inputPos;
 203                                         $foundStart = true;
 204                                 } else {
 205                                         # Move the input position past the *first character* of START,
 206                                         # to protect against missing END when it overlaps with START
 207                                         $inputPos = $tokenOffset + 1;
 208                                 }
 209                         } elseif ( $tokenType == 'end' ) {
 210                                 if ( $foundStart ) {
 211                                         # Found match
 212                                         $output .= $callback( [
 213                                                 substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
 214                                                 substr( $subject, $contentPos, $tokenOffset - $contentPos )
 215                                         ] );
 216                                         $foundStart = false;
 217                                 } else {
 218                                         # Non-matching end, write it out
 219                                         $output .= substr( $subject, $inputPos, $tokenOffset + $tokenLength - $outputPos );
 220                                 }
 221                                 $inputPos = $outputPos = $tokenOffset + $tokenLength;
 222                         } else {
 223                                 throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
 224                         }
 225                 }
 226                 if ( $outputPos < strlen( $subject ) ) {
 227                         $output .= substr( $subject, $outputPos );
 228                 }
 229
 230                 return $output;
 231         }
 232
 233         /**
 234          * Perform an operation equivalent to `preg_replace()` with flags.
 235          *
 236          * Matches this code:
 237          *
 238          *     preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject );
 239          *
 240          * @param string $startDelim Start delimiter regular expression
 241          * @param string $endDelim End delimiter regular expression
 242          * @param string $replace Replacement string. May contain $1, which will be
 243          *  replaced by the text between the delimiters
 244          * @param string $subject String to search
 245          * @param string $flags Regular expression flags
 246          * @return string The string with the matches replaced
 247          */
 248         static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
 249                 return self::delimiterReplaceCallback(
 250                         $startDelim, $endDelim,
 251                         function ( array $matches ) use ( $replace ) {
 252                                 return strtr( $replace, [ '$0' => $matches[0], '$1' => $matches[1] ] );
 253                         },
 254                         $subject, $flags
 255                 );
 256         }
 257
 258         /**
 259          * More or less "markup-safe" explode()
 260          * Ignores any instances of the separator inside `<...>`
 261          * @param string $separator
 262          * @param string $text
 263          * @return array
 264          */
 265         static function explodeMarkup( $separator, $text ) {
 266                 $placeholder = "\x00";
 267
 268                 // Remove placeholder instances
 269                 $text = str_replace( $placeholder, '', $text );
 270
 271                 // Replace instances of the separator inside HTML-like tags with the placeholder
 272                 $cleaned = self::delimiterReplaceCallback(
 273                         '<', '>',
 274                         function ( array $matches ) use ( $separator, $placeholder ) {
 275                                 return str_replace( $separator, $placeholder, $matches[0] );
 276                         },
 277                         $text
 278                 );
 279
 280                 // Explode, then put the replaced separators back in
 281                 $items = explode( $separator, $cleaned );
 282                 foreach ( $items as $i => $str ) {
 283                         $items[$i] = str_replace( $placeholder, $separator, $str );
 284                 }
 285
 286                 return $items;
 287         }
 288
 289         /**
 290          * More or less "markup-safe" str_replace()
 291          * Ignores any instances of the separator inside `<...>`
 292          * @param string $search
 293          * @param string $replace
 294          * @param string $text
 295          * @return string
 296          */
 297         static function replaceMarkup( $search, $replace, $text ) {
 298                 $placeholder = "\x00";
 299
 300                 // Remove placeholder instances
 301                 $text = str_replace( $placeholder, '', $text );
 302
 303                 // Replace instances of the separator inside HTML-like tags with the placeholder
 304                 $cleaned = self::delimiterReplaceCallback(
 305                         '<', '>',
 306                         function ( array $matches ) use ( $search, $placeholder ) {
 307                                 return str_replace( $search, $placeholder, $matches[0] );
 308                         },
 309                         $text
 310                 );
 311
 312                 // Explode, then put the replaced separators back in
 313                 $cleaned = str_replace( $search, $replace, $cleaned );
 314                 $text = str_replace( $placeholder, $search, $cleaned );
 315
 316                 return $text;
 317         }
 318
 319         /**
 320          * Escape a string to make it suitable for inclusion in a preg_replace()
 321          * replacement parameter.
 322          *
 323          * @param string $string
 324          * @return string
 325          */
 326         static function escapeRegexReplacement( $string ) {
 327                 $string = str_replace( '\\', '\\\\', $string );
 328                 $string = str_replace( '$', '\\$', $string );
 329                 return $string;
 330         }
 331
 332         /**
 333          * Workalike for explode() with limited memory usage.
 334          *
 335          * @param string $separator
 336          * @param string $subject
 337          * @return ArrayIterator|ExplodeIterator
 338          */
 339         static function explode( $separator, $subject ) {
 340                 if ( substr_count( $subject, $separator ) > 1000 ) {
 341                         return new ExplodeIterator( $separator, $subject );
 342                 } else {
 343                         return new ArrayIterator( explode( $separator, $subject ) );
 344                 }
 345         }
 346
 347         /**
 348          * Utility function to check if the given string is a valid regex. Avoids
 349          * manually calling suppressWarnings and restoreWarnings, and provides a
 350          * one-line solution without the need to use @.
 351          *
 352          * @since 1.34
 353          * @param string $string The string you want to check being a valid regex
 354          * @return bool
 355          */
 356         public static function isValidRegex( $string ) {
 357                 AtEase::suppressWarnings();
 358                 // @phan-suppress-next-line PhanParamSuspiciousOrder False positive
 359                 $isValid = preg_match( $string, '' );
 360                 AtEase::restoreWarnings();
 361                 return $isValid !== false;
 362         }
 363 }