3 use Wikimedia\AtEase\AtEase
;
6 * Methods to play with strings.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
27 * A collection of static methods to play with strings.
/**
 * Test whether a string is valid UTF-8.
 *
 * The function checks for invalid byte sequences and overlong encodings, but
 * not for different normalisations.
 *
 * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
 * In particular, the pure PHP code path did not in fact check for overlong forms.
 * Beware of this when backporting code to that version of MediaWiki.
 *
 * @param string $value String to check
 * @return bool Whether the given $value is a valid UTF-8 encoded string
 */
public static function isUtf8( $value ) {
	// The cast means a non-string argument is validated by its string form.
	return mb_check_encoding( (string)$value, 'UTF-8' );
}
/**
 * Explode a string, but ignore any instances of the separator inside
 * the given start and end delimiters, which may optionally nest.
 * The delimiters are literal strings, not regular expressions.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param string $separator Separator string for the explode.
 * @param string $subject Subject string to explode.
 * @param bool $nested True iff the delimiters are allowed to nest.
 * @throws InvalidArgumentException If a delimiter or the separator is empty
 * @return ArrayIterator
 */
public static function delimiterExplode( $startDelim, $endDelim, $separator,
	$subject, $nested = false
) {
	// An empty token makes the pattern below match the empty string, so the
	// scan position would never advance and the loop would never terminate.
	if ( (string)$startDelim === '' || (string)$endDelim === '' || (string)$separator === '' ) {
		throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
	}

	$inputPos = 0;
	$lastPos = 0;
	$depth = 0;
	$encStart = preg_quote( $startDelim, '!' );
	$encEnd = preg_quote( $endDelim, '!' );
	$encSep = preg_quote( $separator, '!' );
	$len = strlen( $subject );
	$m = [];
	$exploded = [];
	while (
		$inputPos < $len &&
		preg_match(
			"!$encStart|$encEnd|$encSep!S", $subject, $m,
			PREG_OFFSET_CAPTURE, $inputPos
		)
	) {
		$match = $m[0][0];
		$matchPos = $m[0][1];
		// Resume scanning immediately after the token that was just found.
		$inputPos = $matchPos + strlen( $match );
		if ( $match === $separator ) {
			// Only separators outside every delimiter pair split the string.
			if ( $depth === 0 ) {
				$exploded[] = substr(
					$subject, $lastPos, $matchPos - $lastPos
				);
				$lastPos = $inputPos;
			}
		} elseif ( $match === $startDelim ) {
			// Without nesting, a start delimiter inside an open pair is plain text.
			if ( $depth === 0 || $nested ) {
				$depth++;
			}
		} elseif ( $depth > 0 ) {
			// End delimiter. An unmatched one is ignored so that the depth
			// cannot go negative and hide every later separator.
			$depth--;
		}
	}
	// Whatever follows the last split point is the final piece.
	$exploded[] = substr( $subject, $lastPos );
	// This method could be rewritten in the future to avoid creating an
	// intermediate array, since the return type is just an iterator.
	return new ArrayIterator( $exploded );
}
/**
 * Perform an operation equivalent to `preg_replace()`
 *
 * Matches this code:
 *
 *     preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
 *
 * ..except that it's worst-case O(N) instead of O(N^2). Compared to delimiterReplace(), this
 * implementation is fast but memory-hungry and inflexible. The memory requirements are such
 * that I don't recommend using it on anything but guaranteed small chunks of text.
 *
 * The delimiters are matched as literal strings, and $replace is inserted
 * verbatim: references such as $1 are not expanded.
 *
 * @param string $startDelim
 * @param string $endDelim
 * @param string $replace
 * @param string $subject
 * @return string
 */
public static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
	// Every segment after the first was preceded by a start delimiter.
	$segments = explode( $startDelim, $subject );
	$output = array_shift( $segments );
	$endLength = strlen( $endDelim );
	foreach ( $segments as $s ) {
		$endDelimPos = strpos( $s, $endDelim );
		if ( $endDelimPos === false ) {
			// No closing delimiter in this segment: restore the text unchanged.
			$output .= $startDelim . $s;
		} else {
			// Drop everything up to and including the end delimiter.
			$output .= $replace . substr( $s, $endDelimPos + $endLength );
		}
	}

	return $output;
}
/**
 * Perform an operation equivalent to `preg_replace_callback()`
 *
 * Matches this code:
 *
 *     preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject );
 *
 * If the start delimiter ends with an initial substring of the end delimiter,
 * e.g. in the case of C-style comments, the behavior differs from the model
 * regex. In this implementation, the end must share no characters with the
 * start, so e.g. `/*\/` is not considered to be both the start and end of a
 * comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
 *
 * The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
 * but uses far less memory. The delimiters are literal strings, not regular expressions.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param callable $callback Function to call on each match. It receives an array
 *   holding the whole match including both delimiters at index 0 and the text
 *   between the delimiters at index 1; its return value replaces the match.
 * @param string $subject
 * @param string $flags Regular expression flags
 * @throws InvalidArgumentException If a match captured neither a start nor an
 *   end delimiter, e.g. because a delimiter is empty
 * @return string The string with the matches replaced
 */
public static function delimiterReplaceCallback( $startDelim, $endDelim, $callback,
	$subject, $flags = ''
) {
	$inputPos = 0;
	$outputPos = 0;
	$contentPos = 0;
	$output = '';
	$foundStart = false;
	$encStart = preg_quote( $startDelim, '!' );
	$encEnd = preg_quote( $endDelim, '!' );
	$pattern = "!($encStart)|($encEnd)!S$flags";
	// Compare the same way the regex does when the 'i' flag is given.
	$strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
	$endLength = strlen( $endDelim );
	$subjectLength = strlen( $subject );
	$m = [];

	while ( $inputPos < $subjectLength &&
		preg_match( $pattern, $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
	) {
		$tokenOffset = $m[0][1];
		if ( $m[1][0] !== '' ) {
			if ( $foundStart &&
				$strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0
			) {
				# An end match is present at the same location
				$tokenType = 'end';
				$tokenLength = $endLength;
			} else {
				$tokenType = 'start';
				$tokenLength = strlen( $m[0][0] );
			}
		} elseif ( isset( $m[2] ) && $m[2][0] !== '' ) {
			$tokenType = 'end';
			$tokenLength = strlen( $m[0][0] );
		} else {
			# Neither group captured any text
			throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
		}

		if ( $tokenType == 'start' ) {
			# Only move the start position if we haven't already found a start
			# This means that START START END matches outer pair
			if ( !$foundStart ) {
				# Found start
				$inputPos = $tokenOffset + $tokenLength;
				# Write out the non-matching section
				$output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
				$outputPos = $tokenOffset;
				$contentPos = $inputPos;
				$foundStart = true;
			} else {
				# Move the input position past the *first character* of START,
				# to protect against missing END when it overlaps with START
				$inputPos = $tokenOffset + 1;
			}
		} elseif ( $tokenType == 'end' ) {
			if ( $foundStart ) {
				# Found match
				$output .= $callback( [
					substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
					substr( $subject, $contentPos, $tokenOffset - $contentPos )
				] );
				$foundStart = false;
			} else {
				# Non-matching end, write it out together with the text before it.
				# No start is pending here, so $inputPos equals $outputPos.
				$output .= substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos );
			}
			$inputPos = $outputPos = $tokenOffset + $tokenLength;
		} else {
			throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
		}
	}
	# Copy the tail, including an unterminated START and everything after it
	if ( $outputPos < $subjectLength ) {
		$output .= substr( $subject, $outputPos );
	}

	return $output;
}
/**
 * Perform an operation equivalent to `preg_replace()` with flags.
 *
 * Matches this code:
 *
 *     preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject );
 *
 * The work is delegated to delimiterReplaceCallback(), which quotes both
 * delimiters, so they are matched as literal strings.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param string $replace Replacement string. May contain $1, which will be
 *   replaced by the text between the delimiters, and $0, which will be
 *   replaced by the whole match including the delimiters
 * @param string $subject String to search
 * @param string $flags Regular expression flags
 * @return string The string with the matches replaced
 */
public static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
	return self::delimiterReplaceCallback(
		$startDelim, $endDelim,
		function ( array $matches ) use ( $replace ) {
			// strtr() substitutes both references in a single pass, so matched
			// text that itself contains "$0" or "$1" is not expanded again.
			return strtr( $replace, [ '$0' => $matches[0], '$1' => $matches[1] ] );
		},
		$subject, $flags
	);
}
/**
 * More or less "markup-safe" explode()
 * Ignores any instances of the separator inside `<...>`
 *
 * Any NUL bytes in $text are removed, because NUL is used internally as a
 * placeholder for the protected separators.
 *
 * @param string $separator
 * @param string $text
 * @return array
 */
public static function explodeMarkup( $separator, $text ) {
	$placeholder = "\x00";

	// Remove placeholder instances
	$text = str_replace( $placeholder, '', $text );

	// Replace instances of the separator inside HTML-like tags with the placeholder
	$cleaned = self::delimiterReplaceCallback(
		'<', '>',
		function ( array $matches ) use ( $separator, $placeholder ) {
			return str_replace( $separator, $placeholder, $matches[0] );
		},
		$text
	);

	// Explode, then put the replaced separators back in. str_replace()
	// accepts an array subject and processes every element, keeping the keys.
	return str_replace( $placeholder, $separator, explode( $separator, $cleaned ) );
}
/**
 * More or less "markup-safe" str_replace()
 * Ignores any instances of the search string inside `<...>`
 *
 * Any NUL bytes in $text are removed, because NUL is used internally as a
 * placeholder for the protected occurrences.
 *
 * @param string $search
 * @param string $replace
 * @param string $text
 * @return string
 */
public static function replaceMarkup( $search, $replace, $text ) {
	$placeholder = "\x00";

	// Remove placeholder instances
	$text = str_replace( $placeholder, '', $text );

	// Replace instances of the search string inside HTML-like tags with the placeholder
	$cleaned = self::delimiterReplaceCallback(
		'<', '>',
		function ( array $matches ) use ( $search, $placeholder ) {
			return str_replace( $search, $placeholder, $matches[0] );
		},
		$text
	);

	// Do the replacement outside the tags, then put the protected
	// occurrences of the search string back in
	$cleaned = str_replace( $search, $replace, $cleaned );
	$text = str_replace( $placeholder, $search, $cleaned );

	return $text;
}
/**
 * Escape a string to make it suitable for inclusion in a preg_replace()
 * replacement parameter.
 *
 * Every backslash and every dollar sign is prefixed with a backslash.
 *
 * @param string $string
 * @return string
 */
public static function escapeRegexReplacement( $string ) {
	// strtr() makes a single pass over the input, so the backslashes added
	// in front of '$' are never escaped a second time.
	return strtr( $string, [
		'\\' => '\\\\',
		'$' => '\\$',
	] );
}
/**
 * Workalike for explode() with limited memory usage.
 *
 * Subjects containing more than 1000 separators are returned as an
 * ExplodeIterator; anything smaller is split eagerly with PHP's explode()
 * and wrapped in an ArrayIterator.
 *
 * @param string $separator
 * @param string $subject
 * @return ArrayIterator|ExplodeIterator
 */
public static function explode( $separator, $subject ) {
	if ( substr_count( $subject, $separator ) > 1000 ) {
		return new ExplodeIterator( $separator, $subject );
	}

	return new ArrayIterator( explode( $separator, $subject ) );
}
/**
 * Utility function to check if the given string is a valid regex. Avoids
 * manually calling suppressWarnings and restoreWarnings, and provides a
 * one-line solution without the need to use @.
 *
 * @param string $string The string you want to check being a valid regex
 * @return bool
 */
public static function isValidRegex( $string ) {
	AtEase::suppressWarnings();
	try {
		// preg_match() returns false when the pattern cannot be used;
		// 0 just means "no match" against the empty subject.
		// @phan-suppress-next-line PhanParamSuspiciousOrder False positive
		$isValid = preg_match( $string, '' );
	} finally {
		// Always undo the suppression, even if preg_match() throws.
		AtEase::restoreWarnings();
	}
	return $isValid !== false;
}