From f51d0d9a819f8f1c181350ced2f015ce97985fcc Mon Sep 17 00:00:00 2001
From: =?utf8?q?Bartosz=20Dziewo=C5=84ski?=
Date: Tue, 24 Nov 2015 22:55:24 +0100
Subject: [PATCH] Preprocessor: Don't allow unclosed extension tags (matching
until end of input)
I think it's saner to treat this as invalid syntax, and output the
mismatched tag code verbatim. The current behavior is particularly
annoying for [ tags, which often swallow everything afterwards.
This does not affect HTML tags, though. Assuming Tidy is enabled, they
are still auto-closed at the end of the page content.
Related to T17712 and T58306. I think this brings the PHP parser closer
to Parsoid's interpretation.
It reduces performance somewhat in the worst case, though. Testing with
https://phabricator.wikimedia.org/F3245989 (a 1 MB page starting with
3000 opening tags of 15 different types), parsing time rises from
~0.2 seconds to ~1.1 seconds on my setup. We go from O(N) to O(kN),
where N is bytes of input and k is the number of types of tags present
on the page. Maximum k shouldn't exceed 30 or so in reasonable setups
(depends on installed extensions, it's 20 on English Wikipedia).
To consider:
* Should we keep previous behavior for unclosed /
? This would be particularly disruptive for these if
someone relied on the old behavior, and they're already
special-cased in places.
* Unclosed ] tags are now treated as HTML tags, and are still
displayed as preformatted text, but without suppressing wikitext
formatting.
Change-Id: Ia2f24dbfb3567c4b0778761585e6c0303d11ddd0
---
includes/parser/Preprocessor_DOM.php | 16 +++++++++++-----
includes/parser/Preprocessor_Hash.php | 16 +++++++++++-----
tests/parser/parserTests.txt | 10 ++++++++--
.../phpunit/includes/parser/PreprocessorTest.php | 2 +-
4 files changed, 31 insertions(+), 13 deletions(-)
diff --git a/includes/parser/Preprocessor_DOM.php b/includes/parser/Preprocessor_DOM.php
index 4ca3a878bd..817f1538e3 100644
--- a/includes/parser/Preprocessor_DOM.php
+++ b/includes/parser/Preprocessor_DOM.php
@@ -237,6 +237,8 @@ class Preprocessor_DOM extends Preprocessor {
$inHeading = false;
// True if there are no more greater-than (>) signs right of $i
$noMoreGT = false;
+ // Map of tag name => true if there are no more closing tags of given type right of $i
+ $noMoreClosingTag = array();
// True to ignore all input up to the next
$findOnlyinclude = $enableOnlyinclude;
// Do a line-start run without outputting an LF character
@@ -457,17 +459,21 @@ class Preprocessor_DOM extends Preprocessor {
} else {
$attrEnd = $tagEndPos;
// Find closing tag
- if ( preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
+ if (
+ !isset( $noMoreClosingTag[$name] ) &&
+ preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
$text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 )
) {
$inner = substr( $text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1 );
$i = $matches[0][1] + strlen( $matches[0][0] );
$close = '' . htmlspecialchars( $matches[0][0] ) . '';
} else {
- // No end tag -- let it run out to the end of the text.
- $inner = substr( $text, $tagEndPos + 1 );
- $i = $lengthText;
- $close = '';
+ // No end tag -- don't match the tag, treat opening tag as literal and resume parsing.
+ $i = $tagEndPos + 1;
+ $accum .= htmlspecialchars( substr( $text, $tagStartPos, $tagEndPos + 1 - $tagStartPos ) );
+ // Cache results, otherwise we have O(N^2) performance for input like ...
+ $noMoreClosingTag[$name] = true;
+ continue;
}
}
// and just become tags
diff --git a/includes/parser/Preprocessor_Hash.php b/includes/parser/Preprocessor_Hash.php
index 50eaefbce3..28c49fdd5e 100644
--- a/includes/parser/Preprocessor_Hash.php
+++ b/includes/parser/Preprocessor_Hash.php
@@ -160,6 +160,8 @@ class Preprocessor_Hash extends Preprocessor {
$inHeading = false;
// True if there are no more greater-than (>) signs right of $i
$noMoreGT = false;
+ // Map of tag name => true if there are no more closing tags of given type right of $i
+ $noMoreClosingTag = array();
// True to ignore all input up to the next
$findOnlyinclude = $enableOnlyinclude;
// Do a line-start run without outputting an LF character
@@ -380,17 +382,21 @@ class Preprocessor_Hash extends Preprocessor {
} else {
$attrEnd = $tagEndPos;
// Find closing tag
- if ( preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
+ if (
+ !isset( $noMoreClosingTag[$name] ) &&
+ preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
$text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 )
) {
$inner = substr( $text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1 );
$i = $matches[0][1] + strlen( $matches[0][0] );
$close = $matches[0][0];
} else {
- // No end tag -- let it run out to the end of the text.
- $inner = substr( $text, $tagEndPos + 1 );
- $i = $lengthText;
- $close = null;
+ // No end tag -- don't match the tag, treat opening tag as literal and resume parsing.
+ $i = $tagEndPos + 1;
+ $accum->addLiteral( substr( $text, $tagStartPos, $tagEndPos + 1 - $tagStartPos ) );
+ // Cache results, otherwise we have O(N^2) performance for input like ...
+ $noMoreClosingTag[$name] = true;
+ continue;
}
}
// and just become tags
diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt
index cd2b769a53..438fe314ca 100644
--- a/tests/parser/parserTests.txt
+++ b/tests/parser/parserTests.txt
@@ -2520,7 +2520,6 @@ Barack Obama of the United States
!! end
-## PHP parser discards the "
!! html/php
x
-
+
+!! html/php+tidy
+
+x
+
+<pre
!! html/parsoid
x
@@ -10876,6 +10880,8 @@ Un-closed
!! wikitext
!! html
+<includeonly>
+
!! end
## We used to, but no longer wt2wt this test since the default serializer
diff --git a/tests/phpunit/includes/parser/PreprocessorTest.php b/tests/phpunit/includes/parser/PreprocessorTest.php
index 1ebba1a5e2..b9402305bd 100644
--- a/tests/phpunit/includes/parser/PreprocessorTest.php
+++ b/tests/phpunit/includes/parser/PreprocessorTest.php
@@ -48,7 +48,7 @@ class PreprocessorTest extends MediaWikiTestCase {
array( " Foo bar ", "<noinclude> Foo bar </noinclude>" ),
array( "\n{{Foo}}\n", "<noinclude>\nFoo\n</noinclude>" ),
array( "\n{{Foo}}\n\n", "<noinclude>\nFoo\n</noinclude>\n" ),
- array( "foo bar", "galleryfoo bar" ),
+ array( "foo bar", "<gallery>foo bar" ),
array( "<{{foo}}>", "<foo>" ),
array( "<{{{foo}}}>", "<foo>" ),
array( "", "gallery</gallery</gallery>" ),
--
2.20.1