Preprocessor: Don't allow unclosed extension tags (matching until end of input)

author Bartosz Dziewoński <matma.rex@gmail.com>

Thu, 4 Feb 2016 01:13:24 +0000 (01:13 +0000)

committer Kunal Mehta <legoktm@member.fsf.org>

Tue, 5 Apr 2016 19:28:10 +0000 (12:28 -0700)
author Bartosz Dziewoński <matma.rex@gmail.com>
Thu, 4 Feb 2016 01:13:24 +0000 (01:13 +0000)
committer Kunal Mehta <legoktm@member.fsf.org>
Tue, 5 Apr 2016 19:28:10 +0000 (12:28 -0700)
diff --git a/includes/parser/Preprocessor_DOM.php b/includes/parser/Preprocessor_DOM.php

index 4c94b2a..a28c0aa 100644 (file)
--- a/includes/parser/Preprocessor_DOM.php
+++ b/includes/parser/Preprocessor_DOM.php
@@ -196,6 +196,7 @@ class Preprocessor_DOM extends Preprocessor {
                 $forInclusion = $flags & Parser::PTD_FOR_INCLUSION;
  
                 $xmlishElements = $this->parser->getStripList();
+               $xmlishAllowMissingEndTag = [ 'includeonly', 'noinclude', 'onlyinclude' ];
                 $enableOnlyinclude = false;
                 if ( $forInclusion ) {
                         $ignoredTags = [ 'includeonly', '/includeonly' ];
@@ -237,6 +238,8 @@ class Preprocessor_DOM extends Preprocessor {
                 $inHeading = false;
                 // True if there are no more greater-than (>) signs right of $i
                 $noMoreGT = false;
+               // Map of tag name => true if there are no more closing tags of given type right of $i
+               $noMoreClosingTag = [];
                 // True to ignore all input up to the next <onlyinclude>
                 $findOnlyinclude = $enableOnlyinclude;
                 // Do a line-start run without outputting an LF character
@@ -457,17 +460,29 @@ class Preprocessor_DOM extends Preprocessor {
                                 } else {
                                         $attrEnd = $tagEndPos;
                                         // Find closing tag
-                                       if ( preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
+                                       if (
+                                               !isset( $noMoreClosingTag[$name] ) &&
+                                               preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
                                                         $text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 )
                                         ) {
                                                 $inner = substr( $text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1 );
                                                 $i = $matches[0][1] + strlen( $matches[0][0] );
                                                 $close = '<close>' . htmlspecialchars( $matches[0][0] ) . '</close>';
                                         } else {
-                                               // No end tag -- let it run out to the end of the text.
-                                               $inner = substr( $text, $tagEndPos + 1 );
-                                               $i = $lengthText;
-                                               $close = '';
+                                               // No end tag
+                                               if ( in_array( $name, $xmlishAllowMissingEndTag ) ) {
+                                                       // Let it run out to the end of the text.
+                                                       $inner = substr( $text, $tagEndPos + 1 );
+                                                       $i = $lengthText;
+                                                       $close = '';
+                                               } else {
+                                                       // Don't match the tag, treat opening tag as literal and resume parsing.
+                                                       $i = $tagEndPos + 1;
+                                                       $accum .= htmlspecialchars( substr( $text, $tagStartPos, $tagEndPos + 1 - $tagStartPos ) );
+                                                       // Cache results, otherwise we have O(N^2) performance for input like <foo><foo><foo>...
+                                                       $noMoreClosingTag[$name] = true;
+                                                       continue;
+                                               }
                                         }
                                 }
                                 // <includeonly> and <noinclude> just become <ignore> tags
diff --git a/includes/parser/Preprocessor_Hash.php b/includes/parser/Preprocessor_Hash.php

index f030cca..0e11967 100644 (file)
--- a/includes/parser/Preprocessor_Hash.php
+++ b/includes/parser/Preprocessor_Hash.php
@@ -120,6 +120,7 @@ class Preprocessor_Hash extends Preprocessor {
                 $forInclusion = $flags & Parser::PTD_FOR_INCLUSION;
  
                 $xmlishElements = $this->parser->getStripList();
+               $xmlishAllowMissingEndTag = [ 'includeonly', 'noinclude', 'onlyinclude' ];
                 $enableOnlyinclude = false;
                 if ( $forInclusion ) {
                         $ignoredTags = [ 'includeonly', '/includeonly' ];
@@ -160,6 +161,8 @@ class Preprocessor_Hash extends Preprocessor {
                 $inHeading = false;
                 // True if there are no more greater-than (>) signs right of $i
                 $noMoreGT = false;
+               // Map of tag name => true if there are no more closing tags of given type right of $i
+               $noMoreClosingTag = [];
                 // True to ignore all input up to the next <onlyinclude>
                 $findOnlyinclude = $enableOnlyinclude;
                 // Do a line-start run without outputting an LF character
@@ -380,17 +383,29 @@ class Preprocessor_Hash extends Preprocessor {
                                 } else {
                                         $attrEnd = $tagEndPos;
                                         // Find closing tag
-                                       if ( preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
+                                       if (
+                                               !isset( $noMoreClosingTag[$name] ) &&
+                                               preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
                                                         $text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 )
                                         ) {
                                                 $inner = substr( $text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1 );
                                                 $i = $matches[0][1] + strlen( $matches[0][0] );
                                                 $close = $matches[0][0];
                                         } else {
-                                               // No end tag -- let it run out to the end of the text.
-                                               $inner = substr( $text, $tagEndPos + 1 );
-                                               $i = $lengthText;
-                                               $close = null;
+                                               // No end tag
+                                               if ( in_array( $name, $xmlishAllowMissingEndTag ) ) {
+                                                       // Let it run out to the end of the text.
+                                                       $inner = substr( $text, $tagEndPos + 1 );
+                                                       $i = $lengthText;
+                                                       $close = null;
+                                               } else {
+                                                       // Don't match the tag, treat opening tag as literal and resume parsing.
+                                                       $i = $tagEndPos + 1;
+                                                       $accum->addLiteral( substr( $text, $tagStartPos, $tagEndPos + 1 - $tagStartPos ) );
+                                                       // Cache results, otherwise we have O(N^2) performance for input like <foo><foo><foo>...
+                                                       $noMoreClosingTag[$name] = true;
+                                                       continue;
+                                               }
                                         }
                                 }
                                 // <includeonly> and <noinclude> just become <ignore> tags
diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt

index c6eebe4..e2d4f14 100644 (file)
--- a/tests/parser/parserTests.txt
+++ b/tests/parser/parserTests.txt
@@ -2519,7 +2519,6 @@ Barack Obama <President> of the United States
  </p>
  !! end
  
-## PHP parser discards the "<pre " string
  !! test
  Handle broken pre-like tags (bug 64025)
  !! options
@@ -2530,8 +2529,13 @@ parsoid=wt2html
  <table><pre </table>
  !! html/php
  <pre>x</pre>
-<table><pre></pre></table>
+<table>&lt;pre </table>
  
+!! html/php+tidy
+<pre>
+x
+</pre>
+<p>&lt;pre</p>
  !! html/parsoid
  <pre about="#mwt1" typeof="mw:Transclusion" data-parsoid='{"a":{"&lt;pre":null},"sa":{"&lt;pre":""},"stx":"html","pi":[[{"k":"1"}]]}' data-mw='{"parts":[{"template":{"target":{"wt":"echo","href":"./Template:Echo"},"params":{"1":{"wt":"&lt;pre &lt;pre>x&lt;/pre>"}},"i":0}}]}'>x</pre>
  
diff --git a/tests/phpunit/includes/parser/PreprocessorTest.php b/tests/phpunit/includes/parser/PreprocessorTest.php

index 4204601..a62503a 100644 (file)
--- a/tests/phpunit/includes/parser/PreprocessorTest.php
+++ b/tests/phpunit/includes/parser/PreprocessorTest.php
@@ -48,7 +48,7 @@ class PreprocessorTest extends MediaWikiTestCase {
                         [ "<noinclude> Foo bar </noinclude>", "<root><ignore>&lt;noinclude&gt;</ignore> Foo bar <ignore>&lt;/noinclude&gt;</ignore></root>" ],
                         [ "<noinclude>\n{{Foo}}\n</noinclude>", "<root><ignore>&lt;noinclude&gt;</ignore>\n<template lineStart=\"1\"><title>Foo</title></template>\n<ignore>&lt;/noinclude&gt;</ignore></root>" ],
                         [ "<noinclude>\n{{Foo}}\n</noinclude>\n", "<root><ignore>&lt;noinclude&gt;</ignore>\n<template lineStart=\"1\"><title>Foo</title></template>\n<ignore>&lt;/noinclude&gt;</ignore>\n</root>" ],
-                       [ "<gallery>foo bar", "<root><ext><name>gallery</name><attr></attr><inner>foo bar</inner></ext></root>" ],
+                       [ "<gallery>foo bar", "<root>&lt;gallery&gt;foo bar</root>" ],
                         [ "<{{foo}}>", "<root>&lt;<template><title>foo</title></template>&gt;</root>" ],
                         [ "<{{{foo}}}>", "<root>&lt;<tplarg><title>foo</title></tplarg>&gt;</root>" ],
                         [ "<gallery></gallery</gallery>", "<root><ext><name>gallery</name><attr></attr><inner>&lt;/gallery</inner><close>&lt;/gallery&gt;</close></ext></root>" ],
author	Bartosz Dziewoński <matma.rex@gmail.com>
	Thu, 4 Feb 2016 01:13:24 +0000 (01:13 +0000)
committer	Kunal Mehta <legoktm@member.fsf.org>
	Tue, 5 Apr 2016 19:28:10 +0000 (12:28 -0700)
includes/parser/Preprocessor_DOM.php		patch | blob | history
includes/parser/Preprocessor_Hash.php		patch | blob | history
tests/parser/parserTests.txt		patch | blob | history
tests/phpunit/includes/parser/PreprocessorTest.php		patch | blob | history