From: C. Scott Ananian
Date: Tue, 13 Dec 2016 19:49:36 +0000 (-0500)
Subject: Protect -{...}- variant constructs in images.
X-Git-Tag: 1.31.0-rc.0~4508
X-Git-Url: http://git.cyclocoop.org/%7D%7Cconcat%7B?a=commitdiff_plain;h=51d54b4b9150c32dec88e76e25ed90c3f026281d;p=lhc%2Fweb%2Fwiklou.git

Protect -{...}- variant constructs in images.

A protected version of explode is factored out as
`StringUtils::delimiterExplode`, since it will be used in follow-up
patches in this series.

The `delimiterExplode` implementation creates an intermediate array of
the exploded results, which is reasonable as the number of image options
is small; but since an Iterator is returned the implementation can be
upgraded in the future (at the cost of additional complexity) to avoid
this. The additional code in that case would be similar to
ExplodeIterator.

Bug: T146305
Change-Id: I1327685e9e8c07ef476dceaa6f6dae4ba40989ef
---

diff --git a/includes/libs/StringUtils.php b/includes/libs/StringUtils.php
index 6b10c0998b..26f3c4ac61 100644
--- a/includes/libs/StringUtils.php
+++ b/includes/libs/StringUtils.php
@@ -54,6 +54,69 @@ class StringUtils {
 			( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
 	}
 
+	/**
+	 * Explode a string, but ignore any instances of the separator inside
+	 * the given start and end delimiters, which may optionally nest.
+	 * The delimiters are literal strings, not regular expressions.
+	 *
+	 * An end delimiter that has no matching start delimiter is treated as
+	 * ordinary text and does not affect how later separators are handled.
+	 *
+	 * @param string $startDelim Start delimiter
+	 * @param string $endDelim End delimiter
+	 * @param string $separator Separator string for the explode.
+	 * @param string $subject Subject string to explode.
+	 * @param bool $nested True iff the delimiters are allowed to nest.
+	 * @return ArrayIterator
+	 */
+	static function delimiterExplode( $startDelim, $endDelim, $separator,
+		$subject, $nested = false ) {
+		$inputPos = 0;
+		$lastPos = 0;
+		$depth = 0;
+		$encStart = preg_quote( $startDelim, '!' );
+		$encEnd = preg_quote( $endDelim, '!' );
+		$encSep = preg_quote( $separator, '!' );
+		$len = strlen( $subject );
+		$m = [];
+		$exploded = [];
+		while (
+			$inputPos < $len &&
+			preg_match(
+				"!$encStart|$encEnd|$encSep!S", $subject, $m,
+				PREG_OFFSET_CAPTURE, $inputPos
+			)
+		) {
+			$match = $m[0][0];
+			$matchPos = $m[0][1];
+			// Resume scanning just past the token that was matched.
+			$inputPos = $matchPos + strlen( $match );
+			if ( $match === $separator ) {
+				// Only separators outside every delimited region split.
+				if ( $depth === 0 ) {
+					$exploded[] = substr(
+						$subject, $lastPos, $matchPos - $lastPos
+					);
+					$lastPos = $inputPos;
+				}
+			} elseif ( $match === $startDelim ) {
+				// Without nesting, a start delimiter inside a region is ignored.
+				if ( $depth === 0 || $nested ) {
+					$depth++;
+				}
+			} else {
+				// Clamp at zero: an unmatched end delimiter must not drive
+				// the depth negative, or later top-level separators would
+				// fail the depth check above and never be split.
+				$depth = max( 0, $depth - 1 );
+			}
+		}
+		$exploded[] = substr( $subject, $lastPos );
+		// This method could be rewritten in the future to avoid creating an
+		// intermediate array, since the return type is just an iterator.
+		return new ArrayIterator( $exploded );
+	}
+
 	/**
 	 * Perform an operation equivalent to `preg_replace()`
 	 *
diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php
index 741854749a..8f9830c188 100644
--- a/includes/parser/Parser.php
+++ b/includes/parser/Parser.php
@@ -5150,7 +5150,10 @@ class Parser {
 		# * bottom
 		# * text-bottom
 
-		$parts = StringUtils::explode( "|", $options );
+		# Protect LanguageConverter markup when splitting into parts
+		$parts = StringUtils::delimiterExplode(
+			'-{', '}-', '|', $options, true /* allow nesting */
+		);
 
 		# Give extensions a chance to select the file revision for us
 		$options = [];
diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt
index 505bc2d25f..edcc2c46ea 100644
--- a/tests/parser/parserTests.txt
+++ b/tests/parser/parserTests.txt
@@ -20617,16 +20617,30 @@ language=sr variant=sr-ec

!! end -# FIXME: This test is currently broken in the PHP parser (bug 52661) !! test -Don't break image parsing if language converter markup is in the caption. +T146305: Don't break image parsing if language converter markup is in the caption. !! options language=sr !! wikitext -[[File:Foobar.jpg|-{R|caption}-]] +[[File:Foobar.jpg|thumb|-{R|caption:}-]] +!! html/php +
caption:
+ !! html/parsoid -

caption -

+
+!! end + +!! test +T146305: Don't break image parsing if nested language converter markup is in the caption. +!! options +language=zh variant=zh-cn +!! wikitext +[[File:Foobar.jpg|thumb|-{zh-cn:blog (hk: -{zh-hans|WEBJOURNAL}-, tw: -{zh-hans|WEBLOG}-)}-]] +!! html/php +
blog (hk: WEBJOURNAL, tw: WEBLOG)
+ +!! html/parsoid +
!! end # FIXME: This test is currently broken in the PHP parser (bug 52661)