From 674e8388cba9305223ef52f733e726c71c755778 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Bartosz=20Dziewo=C5=84ski?= <matma.rex@gmail.com>
Date: Thu, 4 Feb 2016 01:13:24 +0000
Subject: [PATCH] Preprocessor: Don't allow unclosed extension tags (matching
 until end of input)

(Previously done in f51d0d9a819f8f1c181350ced2f015ce97985fcc and
reverted in 543f46e9c08e0ff8c5e8b4e917fcc045730ef1bc.)

I think it's saner to treat this as invalid syntax, and output the
mismatched tag code verbatim. The current behavior is particularly
annoying for <ref> tags, which often swallow everything afterwards.

This does not affect HTML tags, though. Assuming Tidy is enabled, they
are still auto-closed at the end of the page content. (For tags that
"shadow" a HTML tag name, this results in the tag being treated as a
HTML tag. This currently only affects <pre> tags: if unclosed, they
are still displayed as preformatted text, but without suppressing
wikitext formatting.)

It also does not affect <includeonly>, <noinclude> and <onlyinclude>
tags. Changing this behavior now would be too disruptive to existing
content, and is the reason why previous attempt was reverted. (They
are already special-cased enough that this isn't too weird, for example
mismatched closing tags are hidden.)

Related to T17712 and T58306. I think this brings the PHP parser closer
to Parsoid's interpretation.

It reduces performance somewhat in the worst case, though. Testing with
https://phabricator.wikimedia.org/F3245989 (a 1 MB page starting with
3000 opening tags of 15 different types), parsing time rises from
~0.2 seconds to ~1.1 seconds on my setup. We go from O(N) to O(kN),
where N is bytes of input and k is the number of types of tags present
on the page. Maximum k shouldn't exceed 30 or so in reasonable setups
(depends on installed extensions, it's 20 on English Wikipedia).

Change-Id: Ide8b034e464eefb1b7c9e2a48ed06e21a7f8d434
---
 includes/parser/Preprocessor_DOM.php          | 25 +++++++++++++++----
 includes/parser/Preprocessor_Hash.php         | 25 +++++++++++++++----
 tests/parser/parserTests.txt                  |  8 ++++--
 .../includes/parser/PreprocessorTest.php      |  2 +-
 4 files changed, 47 insertions(+), 13 deletions(-)
diff --git a/includes/parser/Preprocessor_DOM.php b/includes/parser/Preprocessor_DOM.php
index 4c94b2ae90..a28c0aa439 100644
--- a/includes/parser/Preprocessor_DOM.php
+++ b/includes/parser/Preprocessor_DOM.php
@@ -196,6 +196,7 @@ class Preprocessor_DOM extends Preprocessor {
 		$forInclusion = $flags & Parser::PTD_FOR_INCLUSION;
 
 		$xmlishElements = $this->parser->getStripList();
+		$xmlishAllowMissingEndTag = [ 'includeonly', 'noinclude', 'onlyinclude' ];
 		$enableOnlyinclude = false;
 		if ( $forInclusion ) {
 			$ignoredTags = [ 'includeonly', '/includeonly' ];
@@ -237,6 +238,8 @@ class Preprocessor_DOM extends Preprocessor {
 		$inHeading = false;
 		// True if there are no more greater-than (>) signs right of $i
 		$noMoreGT = false;
+		// Map of tag name => true if there are no more closing tags of given type right of $i
+		$noMoreClosingTag = [];
 		// True to ignore all input up to the next <onlyinclude>
 		$findOnlyinclude = $enableOnlyinclude;
 		// Do a line-start run without outputting an LF character
@@ -457,17 +460,29 @@ class Preprocessor_DOM extends Preprocessor {
 				} else {
 					$attrEnd = $tagEndPos;
 					// Find closing tag
-					if ( preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
+					if (
+						!isset( $noMoreClosingTag[$name] ) &&
+						preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
 							$text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 )
 					) {
 						$inner = substr( $text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1 );
 						$i = $matches[0][1] + strlen( $matches[0][0] );
 						$close = '<close>' . htmlspecialchars( $matches[0][0] ) . '</close>';
 					} else {
-						// No end tag -- let it run out to the end of the text.
-						$inner = substr( $text, $tagEndPos + 1 );
-						$i = $lengthText;
-						$close = '';
+						// No end tag
+						if ( in_array( $name, $xmlishAllowMissingEndTag ) ) {
+							// Let it run out to the end of the text.
+							$inner = substr( $text, $tagEndPos + 1 );
+							$i = $lengthText;
+							$close = '';
+						} else {
+							// Don't match the tag, treat opening tag as literal and resume parsing.
+							$i = $tagEndPos + 1;
+							$accum .= htmlspecialchars( substr( $text, $tagStartPos, $tagEndPos + 1 - $tagStartPos ) );
+							// Cache results, otherwise we have O(N^2) performance for input like <foo><foo><foo>...
+							$noMoreClosingTag[$name] = true;
+							continue;
+						}
 					}
 				}
 				// <includeonly> and <noinclude> just become <ignore> tags
diff --git a/includes/parser/Preprocessor_Hash.php b/includes/parser/Preprocessor_Hash.php
index f030ccac1c..0e1196793c 100644
--- a/includes/parser/Preprocessor_Hash.php
+++ b/includes/parser/Preprocessor_Hash.php
@@ -120,6 +120,7 @@ class Preprocessor_Hash extends Preprocessor {
 		$forInclusion = $flags & Parser::PTD_FOR_INCLUSION;
 
 		$xmlishElements = $this->parser->getStripList();
+		$xmlishAllowMissingEndTag = [ 'includeonly', 'noinclude', 'onlyinclude' ];
 		$enableOnlyinclude = false;
 		if ( $forInclusion ) {
 			$ignoredTags = [ 'includeonly', '/includeonly' ];
@@ -160,6 +161,8 @@ class Preprocessor_Hash extends Preprocessor {
 		$inHeading = false;
 		// True if there are no more greater-than (>) signs right of $i
 		$noMoreGT = false;
+		// Map of tag name => true if there are no more closing tags of given type right of $i
+		$noMoreClosingTag = [];
 		// True to ignore all input up to the next <onlyinclude>
 		$findOnlyinclude = $enableOnlyinclude;
 		// Do a line-start run without outputting an LF character
@@ -380,17 +383,29 @@ class Preprocessor_Hash extends Preprocessor {
 				} else {
 					$attrEnd = $tagEndPos;
 					// Find closing tag
-					if ( preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
+					if (
+						!isset( $noMoreClosingTag[$name] ) &&
+						preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
 							$text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 )
 					) {
 						$inner = substr( $text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1 );
 						$i = $matches[0][1] + strlen( $matches[0][0] );
 						$close = $matches[0][0];
 					} else {
-						// No end tag -- let it run out to the end of the text.
-						$inner = substr( $text, $tagEndPos + 1 );
-						$i = $lengthText;
-						$close = null;
+						// No end tag
+						if ( in_array( $name, $xmlishAllowMissingEndTag ) ) {
+							// Let it run out to the end of the text.
+							$inner = substr( $text, $tagEndPos + 1 );
+							$i = $lengthText;
+							$close = null;
+						} else {
+							// Don't match the tag, treat opening tag as literal and resume parsing.
+							$i = $tagEndPos + 1;
+							$accum->addLiteral( substr( $text, $tagStartPos, $tagEndPos + 1 - $tagStartPos ) );
+							// Cache results, otherwise we have O(N^2) performance for input like <foo><foo><foo>...
+							$noMoreClosingTag[$name] = true;
+							continue;
+						}
 					}
 				}
 				// <includeonly> and <noinclude> just become <ignore> tags
diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt
index c6eebe40cb..e2d4f14cb5 100644
--- a/tests/parser/parserTests.txt
+++ b/tests/parser/parserTests.txt
@@ -2519,7 +2519,6 @@ Barack Obama <President> of the United States
 </p>
 !! end
 
-## PHP parser discards the "<pre " string
 !! test
 Handle broken pre-like tags (bug 64025)
 !! options
@@ -2530,8 +2529,13 @@ parsoid=wt2html
 <table><pre </table>
 !! html/php
 <pre>x</pre>
-<table><pre></pre></table>
+<table>&lt;pre </table>
 
+!! html/php+tidy
+<pre>
+x
+</pre>
+<p>&lt;pre</p>
 !! html/parsoid
 <pre about="#mwt1" typeof="mw:Transclusion" data-parsoid='{"a":{"&lt;pre":null},"sa":{"&lt;pre":""},"stx":"html","pi":[[{"k":"1"}]]}' data-mw='{"parts":[{"template":{"target":{"wt":"echo","href":"./Template:Echo"},"params":{"1":{"wt":"&lt;pre &lt;pre>x&lt;/pre>"}},"i":0}}]}'>x</pre>
 
diff --git a/tests/phpunit/includes/parser/PreprocessorTest.php b/tests/phpunit/includes/parser/PreprocessorTest.php
index 420460195d..a62503a629 100644
--- a/tests/phpunit/includes/parser/PreprocessorTest.php
+++ b/tests/phpunit/includes/parser/PreprocessorTest.php
@@ -48,7 +48,7 @@ class PreprocessorTest extends MediaWikiTestCase {
 			[ "<noinclude> Foo bar </noinclude>", "<root><ignore>&lt;noinclude&gt;</ignore> Foo bar <ignore>&lt;/noinclude&gt;</ignore></root>" ],
 			[ "<noinclude>\n{{Foo}}\n</noinclude>", "<root><ignore>&lt;noinclude&gt;</ignore>\n<template lineStart=\"1\"><title>Foo</title></template>\n<ignore>&lt;/noinclude&gt;</ignore></root>" ],
 			[ "<noinclude>\n{{Foo}}\n</noinclude>\n", "<root><ignore>&lt;noinclude&gt;</ignore>\n<template lineStart=\"1\"><title>Foo</title></template>\n<ignore>&lt;/noinclude&gt;</ignore>\n</root>" ],
-			[ "<gallery>foo bar", "<root><ext><name>gallery</name><attr></attr><inner>foo bar</inner></ext></root>" ],
+			[ "<gallery>foo bar", "<root>&lt;gallery&gt;foo bar</root>" ],
 			[ "<{{foo}}>", "<root>&lt;<template><title>foo</title></template>&gt;</root>" ],
 			[ "<{{{foo}}}>", "<root>&lt;<tplarg><title>foo</title></tplarg>&gt;</root>" ],
 			[ "<gallery></gallery</gallery>", "<root><ext><name>gallery</name><attr></attr><inner>&lt;/gallery</inner><close>&lt;/gallery&gt;</close></ext></root>" ],
-- 
2.20.1