From f51d0d9a819f8f1c181350ced2f015ce97985fcc Mon Sep 17 00:00:00 2001 From: =?utf8?q?Bartosz=20Dziewo=C5=84ski?= Date: Tue, 24 Nov 2015 22:55:24 +0100 Subject: [PATCH] Preprocessor: Don't allow unclosed extension tags (matching until end of input) I think it's saner to treat this as invalid syntax, and output the mismatched tag code verbatim. The current behavior is particularly annoying for tags, which often swallow everything afterwards. This does not affect HTML tags, though. Assuming Tidy is enabled, they are still auto-closed at the end of the page content. Related to T17712 and T58306. I think this brings the PHP parser closer to Parsoid's interpretation. It reduces performance somewhat in the worst case, though. Testing with https://phabricator.wikimedia.org/F3245989 (a 1 MB page starting with 3000 opening tags of 15 different types), parsing time rises from ~0.2 seconds to ~1.1 seconds on my setup. We go from O(N) to O(kN), where N is bytes of input and k is the number of types of tags present on the page. Maximum k shouldn't exceed 30 or so in reasonable setups (depends on installed extensions, it's 20 on English Wikipedia). To consider: * Should we keep previous behavior for unclosed / ? This would be particularly disruptive for these if someone relied on the old behavior, and they're already special-cased in places. * Unclosed
 tags are now treated as HTML tags, and are still
  displayed as preformatted text, but without suppressing wikitext
  formatting.

Change-Id: Ia2f24dbfb3567c4b0778761585e6c0303d11ddd0
---
 includes/parser/Preprocessor_DOM.php             | 16 +++++++++++-----
 includes/parser/Preprocessor_Hash.php            | 16 +++++++++++-----
 tests/parser/parserTests.txt                     | 10 ++++++++--
 .../phpunit/includes/parser/PreprocessorTest.php |  2 +-
 4 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/includes/parser/Preprocessor_DOM.php b/includes/parser/Preprocessor_DOM.php
index 4ca3a878bd..817f1538e3 100644
--- a/includes/parser/Preprocessor_DOM.php
+++ b/includes/parser/Preprocessor_DOM.php
@@ -237,6 +237,8 @@ class Preprocessor_DOM extends Preprocessor {
 		$inHeading = false;
 		// True if there are no more greater-than (>) signs right of $i
 		$noMoreGT = false;
+		// Map of tag name => true if there are no more closing tags of given type right of $i
+		$noMoreClosingTag = array();
 		// True to ignore all input up to the next 
 		$findOnlyinclude = $enableOnlyinclude;
 		// Do a line-start run without outputting an LF character
@@ -457,17 +459,21 @@ class Preprocessor_DOM extends Preprocessor {
 				} else {
 					$attrEnd = $tagEndPos;
 					// Find closing tag
-					if ( preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
+					if (
+						!isset( $noMoreClosingTag[$name] ) &&
+						preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
 							$text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 )
 					) {
 						$inner = substr( $text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1 );
 						$i = $matches[0][1] + strlen( $matches[0][0] );
 						$close = '' . htmlspecialchars( $matches[0][0] ) . '';
 					} else {
-						// No end tag -- let it run out to the end of the text.
-						$inner = substr( $text, $tagEndPos + 1 );
-						$i = $lengthText;
-						$close = '';
+						// No end tag -- don't match the tag, treat opening tag as literal and resume parsing.
+						$i = $tagEndPos + 1;
+						$accum .= htmlspecialchars( substr( $text, $tagStartPos, $tagEndPos + 1 - $tagStartPos ) );
+						// Cache results, otherwise we have O(N^2) performance for input like ...
+						$noMoreClosingTag[$name] = true;
+						continue;
 					}
 				}
 				//  and  just become  tags
diff --git a/includes/parser/Preprocessor_Hash.php b/includes/parser/Preprocessor_Hash.php
index 50eaefbce3..28c49fdd5e 100644
--- a/includes/parser/Preprocessor_Hash.php
+++ b/includes/parser/Preprocessor_Hash.php
@@ -160,6 +160,8 @@ class Preprocessor_Hash extends Preprocessor {
 		$inHeading = false;
 		// True if there are no more greater-than (>) signs right of $i
 		$noMoreGT = false;
+		// Map of tag name => true if there are no more closing tags of given type right of $i
+		$noMoreClosingTag = array();
 		// True to ignore all input up to the next 
 		$findOnlyinclude = $enableOnlyinclude;
 		// Do a line-start run without outputting an LF character
@@ -380,17 +382,21 @@ class Preprocessor_Hash extends Preprocessor {
 				} else {
 					$attrEnd = $tagEndPos;
 					// Find closing tag
-					if ( preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
+					if (
+						!isset( $noMoreClosingTag[$name] ) &&
+						preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
 							$text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 )
 					) {
 						$inner = substr( $text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1 );
 						$i = $matches[0][1] + strlen( $matches[0][0] );
 						$close = $matches[0][0];
 					} else {
-						// No end tag -- let it run out to the end of the text.
-						$inner = substr( $text, $tagEndPos + 1 );
-						$i = $lengthText;
-						$close = null;
+						// No end tag -- don't match the tag, treat opening tag as literal and resume parsing.
+						$i = $tagEndPos + 1;
+						$accum->addLiteral( substr( $text, $tagStartPos, $tagEndPos + 1 - $tagStartPos ) );
+						// Cache results, otherwise we have O(N^2) performance for input like ...
+						$noMoreClosingTag[$name] = true;
+						continue;
 					}
 				}
 				//  and  just become  tags
diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt
index cd2b769a53..438fe314ca 100644
--- a/tests/parser/parserTests.txt
+++ b/tests/parser/parserTests.txt
@@ -2520,7 +2520,6 @@ Barack Obama  of the United States
 

!! end -## PHP parser discards the "
 !! html/php
 
x
-
+<pre
+!! html/php+tidy +
+x
+
+

<pre

!! html/parsoid
x
@@ -10876,6 +10880,8 @@ Un-closed !! wikitext !! html +

<includeonly> +

!! end ## We used to, but no longer wt2wt this test since the default serializer diff --git a/tests/phpunit/includes/parser/PreprocessorTest.php b/tests/phpunit/includes/parser/PreprocessorTest.php index 1ebba1a5e2..b9402305bd 100644 --- a/tests/phpunit/includes/parser/PreprocessorTest.php +++ b/tests/phpunit/includes/parser/PreprocessorTest.php @@ -48,7 +48,7 @@ class PreprocessorTest extends MediaWikiTestCase { array( " Foo bar ", "<noinclude> Foo bar </noinclude>" ), array( "\n{{Foo}}\n", "<noinclude>\n\n</noinclude>" ), array( "\n{{Foo}}\n\n", "<noinclude>\n\n</noinclude>\n" ), - array( "foo bar", "galleryfoo bar" ), + array( "foo bar", "<gallery>foo bar" ), array( "<{{foo}}>", "<>" ), array( "<{{{foo}}}>", "<foo>" ), array( "", "gallery</gallery</gallery>" ), -- 2.20.1