From: Bartosz DziewoƄski Date: Thu, 4 Feb 2016 01:13:24 +0000 (+0000) Subject: Preprocessor: Don't allow unclosed extension tags (matching until end of input) X-Git-Tag: 1.31.0-rc.0~7372 X-Git-Url: https://git.cyclocoop.org/%7B%24www_url%7Dadmin/compta/banques/?a=commitdiff_plain;h=674e8388cba9305223ef52f733e726c71c755778;p=lhc%2Fweb%2Fwiklou.git Preprocessor: Don't allow unclosed extension tags (matching until end of input) (Previously done in f51d0d9a819f8f1c181350ced2f015ce97985fcc and reverted in 543f46e9c08e0ff8c5e8b4e917fcc045730ef1bc.) I think it's saner to treat this as invalid syntax, and output the mismatched tag code verbatim. The current behavior is particularly annoying for tags, which often swallow everything afterwards. This does not affect HTML tags, though. Assuming Tidy is enabled, they are still auto-closed at the end of the page content. (For tags that "shadow" a HTML tag name, this results in the tag being treated as a HTML tag. This currently only affects
 tags: if unclosed, they
are still displayed as preformatted text, but without suppressing
wikitext formatting.)

It also does not affect ,  and 
tags. Changing this behavior now would be too disruptive to existing
content, and is the reason why previous attempt was reverted. (They
are already special-cased enough that this isn't too weird, for example
mismatched closing tags are hidden.)

Related to T17712 and T58306. I think this brings the PHP parser closer
to Parsoid's interpretation.

It reduces performance somewhat in the worst case, though. Testing with
https://phabricator.wikimedia.org/F3245989 (a 1 MB page starting with
3000 opening tags of 15 different types), parsing time rises from
~0.2 seconds to ~1.1 seconds on my setup. We go from O(N) to O(kN),
where N is bytes of input and k is the number of types of tags present
on the page. Maximum k shouldn't exceed 30 or so in reasonable setups
(depends on installed extensions, it's 20 on English Wikipedia).

Change-Id: Ide8b034e464eefb1b7c9e2a48ed06e21a7f8d434
---

diff --git a/includes/parser/Preprocessor_DOM.php b/includes/parser/Preprocessor_DOM.php
index 4c94b2ae90..a28c0aa439 100644
--- a/includes/parser/Preprocessor_DOM.php
+++ b/includes/parser/Preprocessor_DOM.php
@@ -196,6 +196,7 @@ class Preprocessor_DOM extends Preprocessor {
 		$forInclusion = $flags & Parser::PTD_FOR_INCLUSION;
 
 		$xmlishElements = $this->parser->getStripList();
+		$xmlishAllowMissingEndTag = [ 'includeonly', 'noinclude', 'onlyinclude' ];
 		$enableOnlyinclude = false;
 		if ( $forInclusion ) {
 			$ignoredTags = [ 'includeonly', '/includeonly' ];
@@ -237,6 +238,8 @@ class Preprocessor_DOM extends Preprocessor {
 		$inHeading = false;
 		// True if there are no more greater-than (>) signs right of $i
 		$noMoreGT = false;
+		// Map of tag name => true if there are no more closing tags of given type right of $i
+		$noMoreClosingTag = [];
 		// True to ignore all input up to the next 
 		$findOnlyinclude = $enableOnlyinclude;
 		// Do a line-start run without outputting an LF character
@@ -457,17 +460,29 @@ class Preprocessor_DOM extends Preprocessor {
 				} else {
 					$attrEnd = $tagEndPos;
 					// Find closing tag
-					if ( preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
+					if (
+						!isset( $noMoreClosingTag[$name] ) &&
+						preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
 							$text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 )
 					) {
 						$inner = substr( $text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1 );
 						$i = $matches[0][1] + strlen( $matches[0][0] );
 						$close = '' . htmlspecialchars( $matches[0][0] ) . '';
 					} else {
-						// No end tag -- let it run out to the end of the text.
-						$inner = substr( $text, $tagEndPos + 1 );
-						$i = $lengthText;
-						$close = '';
+						// No end tag
+						if ( in_array( $name, $xmlishAllowMissingEndTag ) ) {
+							// Let it run out to the end of the text.
+							$inner = substr( $text, $tagEndPos + 1 );
+							$i = $lengthText;
+							$close = '';
+						} else {
+							// Don't match the tag, treat opening tag as literal and resume parsing.
+							$i = $tagEndPos + 1;
+							$accum .= htmlspecialchars( substr( $text, $tagStartPos, $tagEndPos + 1 - $tagStartPos ) );
+							// Cache results, otherwise we have O(N^2) performance for input like ...
+							$noMoreClosingTag[$name] = true;
+							continue;
+						}
 					}
 				}
 				//  and  just become  tags
diff --git a/includes/parser/Preprocessor_Hash.php b/includes/parser/Preprocessor_Hash.php
index f030ccac1c..0e1196793c 100644
--- a/includes/parser/Preprocessor_Hash.php
+++ b/includes/parser/Preprocessor_Hash.php
@@ -120,6 +120,7 @@ class Preprocessor_Hash extends Preprocessor {
 		$forInclusion = $flags & Parser::PTD_FOR_INCLUSION;
 
 		$xmlishElements = $this->parser->getStripList();
+		$xmlishAllowMissingEndTag = [ 'includeonly', 'noinclude', 'onlyinclude' ];
 		$enableOnlyinclude = false;
 		if ( $forInclusion ) {
 			$ignoredTags = [ 'includeonly', '/includeonly' ];
@@ -160,6 +161,8 @@ class Preprocessor_Hash extends Preprocessor {
 		$inHeading = false;
 		// True if there are no more greater-than (>) signs right of $i
 		$noMoreGT = false;
+		// Map of tag name => true if there are no more closing tags of given type right of $i
+		$noMoreClosingTag = [];
 		// True to ignore all input up to the next 
 		$findOnlyinclude = $enableOnlyinclude;
 		// Do a line-start run without outputting an LF character
@@ -380,17 +383,29 @@ class Preprocessor_Hash extends Preprocessor {
 				} else {
 					$attrEnd = $tagEndPos;
 					// Find closing tag
-					if ( preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
+					if (
+						!isset( $noMoreClosingTag[$name] ) &&
+						preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
 							$text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 )
 					) {
 						$inner = substr( $text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1 );
 						$i = $matches[0][1] + strlen( $matches[0][0] );
 						$close = $matches[0][0];
 					} else {
-						// No end tag -- let it run out to the end of the text.
-						$inner = substr( $text, $tagEndPos + 1 );
-						$i = $lengthText;
-						$close = null;
+						// No end tag
+						if ( in_array( $name, $xmlishAllowMissingEndTag ) ) {
+							// Let it run out to the end of the text.
+							$inner = substr( $text, $tagEndPos + 1 );
+							$i = $lengthText;
+							$close = null;
+						} else {
+							// Don't match the tag, treat opening tag as literal and resume parsing.
+							$i = $tagEndPos + 1;
+							$accum->addLiteral( substr( $text, $tagStartPos, $tagEndPos + 1 - $tagStartPos ) );
+							// Cache results, otherwise we have O(N^2) performance for input like ...
+							$noMoreClosingTag[$name] = true;
+							continue;
+						}
 					}
 				}
 				//  and  just become  tags
diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt
index c6eebe40cb..e2d4f14cb5 100644
--- a/tests/parser/parserTests.txt
+++ b/tests/parser/parserTests.txt
@@ -2519,7 +2519,6 @@ Barack Obama  of the United States
 

!! end -## PHP parser discards the "
 !! html/php
 
x
-
+<pre
+!! html/php+tidy +
+x
+
+

<pre

!! html/parsoid
x
diff --git a/tests/phpunit/includes/parser/PreprocessorTest.php b/tests/phpunit/includes/parser/PreprocessorTest.php index 420460195d..a62503a629 100644 --- a/tests/phpunit/includes/parser/PreprocessorTest.php +++ b/tests/phpunit/includes/parser/PreprocessorTest.php @@ -48,7 +48,7 @@ class PreprocessorTest extends MediaWikiTestCase { [ " Foo bar ", "<noinclude> Foo bar </noinclude>" ], [ "\n{{Foo}}\n", "<noinclude>\n\n</noinclude>" ], [ "\n{{Foo}}\n\n", "<noinclude>\n\n</noinclude>\n" ], - [ "foo bar", "galleryfoo bar" ], + [ "foo bar", "<gallery>foo bar" ], [ "<{{foo}}>", "<>" ], [ "<{{{foo}}}>", "<foo>" ], [ "", "gallery</gallery</gallery>" ],