From ee002d67c95757eec75106cc4b880eb3b2917dc5 Mon Sep 17 00:00:00 2001 From: "C. Scott Ananian" Date: Tue, 13 Dec 2016 15:37:04 -0500 Subject: [PATCH] Protect -{...}- variant constructs in definition lists. Given the wikitext: ;-{zh-cn:AAA;zh-tw:BBB}- Prevent `doBlockLevels` from trying to split the definition list at the embedded colon and using `AAA;zh-tw:BBB}-` as the `<dd>
` portion. Bug: T153135 Change-Id: I3a4d02f1fbd0d0fe8278d6b7c66005f0dd3dd36b --- includes/parser/BlockLevelPass.php | 71 ++++++++++++++++++------------ tests/parser/parserTests.txt | 52 +++++++++++++++++++--- 2 files changed, 89 insertions(+), 34 deletions(-) diff --git a/includes/parser/BlockLevelPass.php b/includes/parser/BlockLevelPass.php index cbacd34811..e16cfd4688 100644 --- a/includes/parser/BlockLevelPass.php +++ b/includes/parser/BlockLevelPass.php @@ -38,6 +38,7 @@ class BlockLevelPass { const COLON_STATE_COMMENT = 5; const COLON_STATE_COMMENTDASH = 6; const COLON_STATE_COMMENTDASHDASH = 7; + const COLON_STATE_LC = 8; /** * Make lists from lines starting with ':', '*', '#', etc. @@ -389,15 +390,14 @@ class BlockLevelPass { * @return string The position of the ':', or false if none found */ private function findColonNoLinks( $str, &$before, &$after ) { - $colonPos = strpos( $str, ':' ); - if ( $colonPos === false ) { + if ( !preg_match( '/:|<|-\{/', $str, $m, PREG_OFFSET_CAPTURE ) ) { # Nothing to find! return false; } - $ltPos = strpos( $str, '<' ); - if ( $ltPos === false || $ltPos > $colonPos ) { + if ( $m[0][0] === ':' ) { # Easy; no tag nesting to worry about + $colonPos = $m[0][1]; $before = substr( $str, 0, $colonPos ); $after = substr( $str, $colonPos + 1 ); return $colonPos; @@ -405,9 +405,10 @@ class BlockLevelPass { # Ugly state machine to walk through avoiding tags. $state = self::COLON_STATE_TEXT; - $level = 0; + $ltLevel = 0; + $lcLevel = 0; $len = strlen( $str ); - for ( $i = 0; $i < $len; $i++ ) { + for ( $i = $m[0][1]; $i < $len; $i++ ) { $c = $str[$i]; switch ( $state ) { @@ -418,7 +419,7 @@ class BlockLevelPass { $state = self::COLON_STATE_TAGSTART; break; case ":": - if ( $level === 0 ) { + if ( $ltLevel === 0 ) { # We found it! 
$before = substr( $str, 0, $i ); $after = substr( $str, $i + 1 ); @@ -428,35 +429,44 @@ class BlockLevelPass { break; default: # Skip ahead looking for something interesting - $colonPos = strpos( $str, ':', $i ); - if ( $colonPos === false ) { + if ( !preg_match( '/:|<|-\{/', $str, $m, PREG_OFFSET_CAPTURE, $i ) ) { # Nothing else interesting return false; } - $ltPos = strpos( $str, '<', $i ); - if ( $level === 0 ) { - if ( $ltPos === false || $colonPos < $ltPos ) { - # We found it! - $before = substr( $str, 0, $colonPos ); - $after = substr( $str, $colonPos + 1 ); - return $i; - } + if ( $m[0][0] === '-{' ) { + $state = self::COLON_STATE_LC; + $lcLevel++; + $i = $m[0][1] + 1; + } else { + # Skip ahead to next interesting character. + $i = $m[0][1] - 1; } - if ( $ltPos === false ) { - # Nothing else interesting to find; abort! - # We're nested, but there's no close tags left. Abort! - break 2; + break; + } + break; + case self::COLON_STATE_LC: + # In language converter markup -{ ... }- + if ( !preg_match( '/-\{|\}-/', $str, $m, PREG_OFFSET_CAPTURE, $i ) ) { + # Nothing else interesting to find; abort! + # We're nested in language converter markup, but there + # are no close tags left. Abort! + break 2; + } elseif ( $m[0][0] === '-{' ) { + $i = $m[0][1] + 1; + $lcLevel++; + } elseif ( $m[0][0] === '}-' ) { + $i = $m[0][1] + 1; + $lcLevel--; + if ( $lcLevel === 0 ) { + $state = self::COLON_STATE_TEXT; } - # Skip ahead to next tag start - $i = $ltPos; - $state = self::COLON_STATE_TAGSTART; } break; case self::COLON_STATE_TAG: # In a switch ( $c ) { case ">": - $level++; + $ltLevel++; $state = self::COLON_STATE_TEXT; break; case "/": @@ -486,8 +496,8 @@ class BlockLevelPass { case self::COLON_STATE_CLOSETAG: # In a if ( $c === ">" ) { - $level--; - if ( $level < 0 ) { + $ltLevel--; + if ( $ltLevel < 0 ) { wfDebug( __METHOD__ . 
": Invalid input; too many close tags\n" ); return false; } @@ -526,8 +536,11 @@ class BlockLevelPass { throw new MWException( "State machine error in " . __METHOD__ ); } } - if ( $level > 0 ) { - wfDebug( __METHOD__ . ": Invalid input; not enough close tags (level $level, state $state)\n" ); + if ( $ltLevel > 0 || $lcLevel > 0 ) { + wfDebug( + __METHOD__ . ": Invalid input; not enough close tags " . + "(level $ltLevel/$lcLevel, state $state)\n" + ); return false; } return false; diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt index 46e10123be..a94b27633b 100644 --- a/tests/parser/parserTests.txt +++ b/tests/parser/parserTests.txt @@ -20893,19 +20893,61 @@ File:foobar.jpg|{{Test|unamedParam|alt=-{R|param}-}}|alt=galleryalt !! end -# FIXME: This test is currently broken in the PHP parser (bug 52661) !! test -Don't break list handling if language converter markup is in the item. +T153135: Don't break list handling if language converter markup is in the item. !! options language=zh variant=zh-cn !! wikitext ;-{zh-cn:AAA;zh-tw:BBB}- +;-{R|foo:bar}- !! html/php -
在手动语言转换规则中检测到错误
+
AAA
+
foo:bar
!! html/parsoid -
AAA -
+
+
+
+
+!! end + +// Note that parsoid does not protect colons unless language converter +// markup is properly nested, because it is a backtracking parser. +!! test +T153135: Unclosed markup in definition list (code coverage) +!! options +language=zh variant=zh-cn +!! wikitext +;foo:bar +;-{zh-cn:AAA +!! html/php +
foo:bar
+
-{zh-cn:AAA
+ +!! html/parsoid +
+
foo:bar
+ +
-{zh-cn
+
AAA
+
+!! end + +!! test +T153135: Nested language converter markup in definition list (code coverage) +!! options +language=zh variant=zh-cn +!! wikitext +;-{zh-cn:AAA -{zh-hans|foo:bar}- -{R|bat:baz}-}-:def +!! html/php +
AAA foo:bar bat:baz
+
def
+ +!! html/parsoid +
+
+
def
+
!! end !! test -- 2.20.1