includes/utils/StringUtils.php

   1 <?php
   2 /**
   3  * Methods to play with strings.
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License along
  16  * with this program; if not, write to the Free Software Foundation, Inc.,
  17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18  * http://www.gnu.org/copyleft/gpl.html
  19  *
  20  * @file
  21  */
  22
  23 /**
  24  * A collection of static methods to play with strings.
  25  */
  26 class StringUtils {
  27
  28         /**
  29          * Test whether a string is valid UTF-8.
  30          *
  31          * The function check for invalid byte sequences, overlong encoding but
  32          * not for different normalisations.
  33          *
  34          * This relies internally on the mbstring function mb_check_encoding()
  35          * hardcoded to check against UTF-8. Whenever the function is not available
  36          * we fallback to a pure PHP implementation. Setting $disableMbstring to
  37          * true will skip the use of mb_check_encoding, this is mostly intended for
  38          * unit testing our internal implementation.
  39          *
  40          * @since 1.21
  41          * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
  42          * In particular, the pure PHP code path did not in fact check for overlong forms.
  43          * Beware of this when backporting code to that version of MediaWiki.
  44          *
  45          * @param string $value String to check
  46          * @param boolean $disableMbstring Whether to use the pure PHP
  47          * implementation instead of trying mb_check_encoding. Intended for unit
  48          * testing. Default: false
  49          *
  50          * @return boolean Whether the given $value is a valid UTF-8 encoded string
  51          */
  52         static function isUtf8( $value, $disableMbstring = false ) {
  53                 $value = (string)$value;
  54
  55                 // If the mbstring extension is loaded, use it. However, before PHP 5.4, values above
  56                 // U+10FFFF are incorrectly allowed, so we have to check for them separately.
  57                 if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) {
  58                         static $newPHP;
  59                         if ( $newPHP === null ) {
  60                                 $newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
  61                         }
  62
  63                         return mb_check_encoding( $value, 'UTF-8' ) &&
  64                                 ( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
  65                 }
  66
  67                 if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) {
  68                         // String contains only ASCII characters, has to be valid
  69                         return true;
  70                 }
  71
  72                 // PCRE implements repetition using recursion; to avoid a stack overflow (and segfault)
  73                 // for large input, we check for invalid sequences (<= 5 bytes) rather than valid
  74                 // sequences, which can be as long as the input string is. Multiple short regexes are
  75                 // used rather than a single long regex for performance.
  76                 static $regexes;
  77                 if ( $regexes === null ) {
  78                         $cont = "[\x80-\xbf]";
  79                         $after = "(?!$cont)"; // "(?:[^\x80-\xbf]|$)" would work here
  80                         $regexes = array(
  81                                 // Continuation byte at the start
  82                                 "/^$cont/",
  83
  84                                 // ASCII byte followed by a continuation byte
  85                                 "/[\\x00-\x7f]$cont/S",
  86
  87                                 // Illegal byte
  88                                 "/[\xc0\xc1\xf5-\xff]/S",
  89
  90                                 // Invalid 2-byte sequence, or valid one then an extra continuation byte
  91                                 "/[\xc2-\xdf](?!$cont$after)/S",
  92
  93                                 // Invalid 3-byte sequence, or valid one then an extra continuation byte
  94                                 "/\xe0(?![\xa0-\xbf]$cont$after)/",
  95                                 "/[\xe1-\xec\xee\xef](?!$cont{2}$after)/S",
  96                                 "/\xed(?![\x80-\x9f]$cont$after)/",
  97
  98                                 // Invalid 4-byte sequence, or valid one then an extra continuation byte
  99                                 "/\xf0(?![\x90-\xbf]$cont{2}$after)/",
 100                                 "/[\xf1-\xf3](?!$cont{3}$after)/S",
 101                                 "/\xf4(?![\x80-\x8f]$cont{2}$after)/",
 102                         );
 103                 }
 104
 105                 foreach ( $regexes as $regex ) {
 106                         if ( preg_match( $regex, $value ) !== 0 ) {
 107                                 return false;
 108                         }
 109                 }
 110
 111                 return true;
 112         }
 113
 114         /**
 115          * Perform an operation equivalent to
 116          *
 117          *     preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
 118          *
 119          * except that it's worst-case O(N) instead of O(N^2)
 120          *
 121          * Compared to delimiterReplace(), this implementation is fast but memory-
 122          * hungry and inflexible. The memory requirements are such that I don't
 123          * recommend using it on anything but guaranteed small chunks of text.
 124          *
 125          * @param $startDelim
 126          * @param $endDelim
 127          * @param $replace
 128          * @param $subject
 129          *
 130          * @return string
 131          */
 132         static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
 133                 $segments = explode( $startDelim, $subject );
 134                 $output = array_shift( $segments );
 135                 foreach ( $segments as $s ) {
 136                         $endDelimPos = strpos( $s, $endDelim );
 137                         if ( $endDelimPos === false ) {
 138                                 $output .= $startDelim . $s;
 139                         } else {
 140                                 $output .= $replace . substr( $s, $endDelimPos + strlen( $endDelim ) );
 141                         }
 142                 }
 143
 144                 return $output;
 145         }
 146
 147         /**
 148          * Perform an operation equivalent to
 149          *
 150          *   preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject )
 151          *
 152          * This implementation is slower than hungryDelimiterReplace but uses far less
 153          * memory. The delimiters are literal strings, not regular expressions.
 154          *
 155          * If the start delimiter ends with an initial substring of the end delimiter,
 156          * e.g. in the case of C-style comments, the behavior differs from the model
 157          * regex. In this implementation, the end must share no characters with the
 158          * start, so e.g. /*\/ is not considered to be both the start and end of a
 159          * comment. /*\/xy/*\/ is considered to be a single comment with contents /xy/.
 160          *
 161          * @param string $startDelim start delimiter
 162          * @param string $endDelim end delimiter
 163          * @param $callback Callback: function to call on each match
 164          * @param $subject String
 165          * @param string $flags regular expression flags
 166          * @throws MWException
 167          * @return string
 168          */
 169         static function delimiterReplaceCallback( $startDelim, $endDelim, $callback, $subject, $flags = '' ) {
 170                 $inputPos = 0;
 171                 $outputPos = 0;
 172                 $output = '';
 173                 $foundStart = false;
 174                 $encStart = preg_quote( $startDelim, '!' );
 175                 $encEnd = preg_quote( $endDelim, '!' );
 176                 $strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
 177                 $endLength = strlen( $endDelim );
 178                 $m = array();
 179
 180                 while ( $inputPos < strlen( $subject ) &&
 181                         preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
 182                 ) {
 183                         $tokenOffset = $m[0][1];
 184                         if ( $m[1][0] != '' ) {
 185                                 if ( $foundStart &&
 186                                         $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0
 187                                 ) {
 188                                         # An end match is present at the same location
 189                                         $tokenType = 'end';
 190                                         $tokenLength = $endLength;
 191                                 } else {
 192                                         $tokenType = 'start';
 193                                         $tokenLength = strlen( $m[0][0] );
 194                                 }
 195                         } elseif ( $m[2][0] != '' ) {
 196                                 $tokenType = 'end';
 197                                 $tokenLength = strlen( $m[0][0] );
 198                         } else {
 199                                 throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
 200                         }
 201
 202                         if ( $tokenType == 'start' ) {
 203                                 # Only move the start position if we haven't already found a start
 204                                 # This means that START START END matches outer pair
 205                                 if ( !$foundStart ) {
 206                                         # Found start
 207                                         $inputPos = $tokenOffset + $tokenLength;
 208                                         # Write out the non-matching section
 209                                         $output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
 210                                         $outputPos = $tokenOffset;
 211                                         $contentPos = $inputPos;
 212                                         $foundStart = true;
 213                                 } else {
 214                                         # Move the input position past the *first character* of START,
 215                                         # to protect against missing END when it overlaps with START
 216                                         $inputPos = $tokenOffset + 1;
 217                                 }
 218                         } elseif ( $tokenType == 'end' ) {
 219                                 if ( $foundStart ) {
 220                                         # Found match
 221                                         $output .= call_user_func( $callback, array(
 222                                                 substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
 223                                                 substr( $subject, $contentPos, $tokenOffset - $contentPos )
 224                                         ) );
 225                                         $foundStart = false;
 226                                 } else {
 227                                         # Non-matching end, write it out
 228                                         $output .= substr( $subject, $inputPos, $tokenOffset + $tokenLength - $outputPos );
 229                                 }
 230                                 $inputPos = $outputPos = $tokenOffset + $tokenLength;
 231                         } else {
 232                                 throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
 233                         }
 234                 }
 235                 if ( $outputPos < strlen( $subject ) ) {
 236                         $output .= substr( $subject, $outputPos );
 237                 }
 238
 239                 return $output;
 240         }
 241
 242         /**
 243          * Perform an operation equivalent to
 244          *
 245          *   preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject )
 246          *
 247          * @param string $startDelim start delimiter regular expression
 248          * @param string $endDelim end delimiter regular expression
 249          * @param string $replace replacement string. May contain $1, which will be
 250          *                 replaced by the text between the delimiters
 251          * @param string $subject to search
 252          * @param string $flags regular expression flags
 253          * @return String: The string with the matches replaced
 254          */
 255         static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
 256                 $replacer = new RegexlikeReplacer( $replace );
 257
 258                 return self::delimiterReplaceCallback( $startDelim, $endDelim,
 259                         $replacer->cb(), $subject, $flags );
 260         }
 261
 262         /**
 263          * More or less "markup-safe" explode()
 264          * Ignores any instances of the separator inside <...>
 265          * @param string $separator
 266          * @param string $text
 267          * @return array
 268          */
 269         static function explodeMarkup( $separator, $text ) {
 270                 $placeholder = "\x00";
 271
 272                 // Remove placeholder instances
 273                 $text = str_replace( $placeholder, '', $text );
 274
 275                 // Replace instances of the separator inside HTML-like tags with the placeholder
 276                 $replacer = new DoubleReplacer( $separator, $placeholder );
 277                 $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
 278
 279                 // Explode, then put the replaced separators back in
 280                 $items = explode( $separator, $cleaned );
 281                 foreach ( $items as $i => $str ) {
 282                         $items[$i] = str_replace( $placeholder, $separator, $str );
 283                 }
 284
 285                 return $items;
 286         }
 287
 288         /**
 289          * Escape a string to make it suitable for inclusion in a preg_replace()
 290          * replacement parameter.
 291          *
 292          * @param string $string
 293          * @return string
 294          */
 295         static function escapeRegexReplacement( $string ) {
 296                 $string = str_replace( '\\', '\\\\', $string );
 297                 $string = str_replace( '$', '\\$', $string );
 298
 299                 return $string;
 300         }
 301
 302         /**
 303          * Workalike for explode() with limited memory usage.
 304          * Returns an Iterator
 305          * @param string $separator
 306          * @param string $subject
 307          * @return ArrayIterator|ExplodeIterator
 308          */
 309         static function explode( $separator, $subject ) {
 310                 if ( substr_count( $subject, $separator ) > 1000 ) {
 311                         return new ExplodeIterator( $separator, $subject );
 312                 } else {
 313                         return new ArrayIterator( explode( $separator, $subject ) );
 314                 }
 315         }
 316 }
 317
 318 /**
 319  * Base class for "replacers", objects used in preg_replace_callback() and
 320  * StringUtils::delimiterReplaceCallback()
 321  */
 322 class Replacer {
 323
 324         /**
 325          * @return array
 326          */
 327         function cb() {
 328                 return array( &$this, 'replace' );
 329         }
 330 }
 331
 332 /**
 333  * Class to replace regex matches with a string similar to that used in preg_replace()
 334  */
 335 class RegexlikeReplacer extends Replacer {
 336         var $r;
 337
 338         /**
 339          * @param string $r
 340          */
 341         function __construct( $r ) {
 342                 $this->r = $r;
 343         }
 344
 345         /**
 346          * @param array $matches
 347          * @return string
 348          */
 349         function replace( $matches ) {
 350                 $pairs = array();
 351                 foreach ( $matches as $i => $match ) {
 352                         $pairs["\$$i"] = $match;
 353                 }
 354
 355                 return strtr( $this->r, $pairs );
 356         }
 357 }
 358
 359 /**
 360  * Class to perform secondary replacement within each replacement string
 361  */
 362 class DoubleReplacer extends Replacer {
 363
 364         /**
 365          * @param $from
 366          * @param $to
 367          * @param int $index
 368          */
 369         function __construct( $from, $to, $index = 0 ) {
 370                 $this->from = $from;
 371                 $this->to = $to;
 372                 $this->index = $index;
 373         }
 374
 375         /**
 376          * @param array $matches
 377          * @return mixed
 378          */
 379         function replace( $matches ) {
 380                 return str_replace( $this->from, $this->to, $matches[$this->index] );
 381         }
 382 }
 383
 384 /**
 385  * Class to perform replacement based on a simple hashtable lookup
 386  */
 387 class HashtableReplacer extends Replacer {
 388         var $table, $index;
 389
 390         /**
 391          * @param $table
 392          * @param int $index
 393          */
 394         function __construct( $table, $index = 0 ) {
 395                 $this->table = $table;
 396                 $this->index = $index;
 397         }
 398
 399         /**
 400          * @param array $matches
 401          * @return mixed
 402          */
 403         function replace( $matches ) {
 404                 return $this->table[$matches[$this->index]];
 405         }
 406 }
 407
 408 /**
 409  * Replacement array for FSS with fallback to strtr()
 410  * Supports lazy initialisation of FSS resource
 411  */
 412 class ReplacementArray {
 413         /*mostly private*/ var $data = false;
 414         /*mostly private*/ var $fss = false;
 415
 416         /**
 417          * Create an object with the specified replacement array
 418          * The array should have the same form as the replacement array for strtr()
 419          * @param array $data
 420          */
 421         function __construct( $data = array() ) {
 422                 $this->data = $data;
 423         }
 424
 425         /**
 426          * @return array
 427          */
 428         function __sleep() {
 429                 return array( 'data' );
 430         }
 431
 432         function __wakeup() {
 433                 $this->fss = false;
 434         }
 435
 436         /**
 437          * Set the whole replacement array at once
 438          * @param array $data
 439          */
 440         function setArray( $data ) {
 441                 $this->data = $data;
 442                 $this->fss = false;
 443         }
 444
 445         /**
 446          * @return array|bool
 447          */
 448         function getArray() {
 449                 return $this->data;
 450         }
 451
 452         /**
 453          * Set an element of the replacement array
 454          * @param string $from
 455          * @param string $to
 456          */
 457         function setPair( $from, $to ) {
 458                 $this->data[$from] = $to;
 459                 $this->fss = false;
 460         }
 461
 462         /**
 463          * @param array $data
 464          */
 465         function mergeArray( $data ) {
 466                 $this->data = array_merge( $this->data, $data );
 467                 $this->fss = false;
 468         }
 469
 470         /**
 471          * @param ReplacementArray $other
 472          */
 473         function merge( $other ) {
 474                 $this->data = array_merge( $this->data, $other->data );
 475                 $this->fss = false;
 476         }
 477
 478         /**
 479          * @param string $from
 480          */
 481         function removePair( $from ) {
 482                 unset( $this->data[$from] );
 483                 $this->fss = false;
 484         }
 485
 486         /**
 487          * @param array $data
 488          */
 489         function removeArray( $data ) {
 490                 foreach ( $data as $from => $to ) {
 491                         $this->removePair( $from );
 492                 }
 493                 $this->fss = false;
 494         }
 495
 496         /**
 497          * @param string $subject
 498          * @return string
 499          */
 500         function replace( $subject ) {
 501                 if ( function_exists( 'fss_prep_replace' ) ) {
 502                         wfProfileIn( __METHOD__ . '-fss' );
 503                         if ( $this->fss === false ) {
 504                                 $this->fss = fss_prep_replace( $this->data );
 505                         }
 506                         $result = fss_exec_replace( $this->fss, $subject );
 507                         wfProfileOut( __METHOD__ . '-fss' );
 508                 } else {
 509                         wfProfileIn( __METHOD__ . '-strtr' );
 510                         $result = strtr( $subject, $this->data );
 511                         wfProfileOut( __METHOD__ . '-strtr' );
 512                 }
 513
 514                 return $result;
 515         }
 516 }
 517
 518 /**
 519  * An iterator which works exactly like:
 520  *
 521  * foreach ( explode( $delim, $s ) as $element ) {
 522  *    ...
 523  * }
 524  *
 525  * Except it doesn't use 193 byte per element
 526  */
 527 class ExplodeIterator implements Iterator {
 528         // The subject string
 529         var $subject, $subjectLength;
 530
 531         // The delimiter
 532         var $delim, $delimLength;
 533
 534         // The position of the start of the line
 535         var $curPos;
 536
 537         // The position after the end of the next delimiter
 538         var $endPos;
 539
 540         // The current token
 541         var $current;
 542
 543         /**
 544          * Construct a DelimIterator
 545          * @param string $delim
 546          * @param string $subject
 547          */
 548         function __construct( $delim, $subject ) {
 549                 $this->subject = $subject;
 550                 $this->delim = $delim;
 551
 552                 // Micro-optimisation (theoretical)
 553                 $this->subjectLength = strlen( $subject );
 554                 $this->delimLength = strlen( $delim );
 555
 556                 $this->rewind();
 557         }
 558
 559         function rewind() {
 560                 $this->curPos = 0;
 561                 $this->endPos = strpos( $this->subject, $this->delim );
 562                 $this->refreshCurrent();
 563         }
 564
 565         function refreshCurrent() {
 566                 if ( $this->curPos === false ) {
 567                         $this->current = false;
 568                 } elseif ( $this->curPos >= $this->subjectLength ) {
 569                         $this->current = '';
 570                 } elseif ( $this->endPos === false ) {
 571                         $this->current = substr( $this->subject, $this->curPos );
 572                 } else {
 573                         $this->current = substr( $this->subject, $this->curPos, $this->endPos - $this->curPos );
 574                 }
 575         }
 576
 577         function current() {
 578                 return $this->current;
 579         }
 580
 581         /**
 582          * @return int|bool Current position or boolean false if invalid
 583          */
 584         function key() {
 585                 return $this->curPos;
 586         }
 587
 588         /**
 589          * @return string
 590          */
 591         function next() {
 592                 if ( $this->endPos === false ) {
 593                         $this->curPos = false;
 594                 } else {
 595                         $this->curPos = $this->endPos + $this->delimLength;
 596                         if ( $this->curPos >= $this->subjectLength ) {
 597                                 $this->endPos = false;
 598                         } else {
 599                                 $this->endPos = strpos( $this->subject, $this->delim, $this->curPos );
 600                         }
 601                 }
 602                 $this->refreshCurrent();
 603
 604                 return $this->current;
 605         }
 606
 607         /**
 608          * @return bool
 609          */
 610         function valid() {
 611                 return $this->curPos !== false;
 612         }
 613 }