From: Arlo Breault
Date: Wed, 7 Jan 2015 20:46:59 +0000 (-0800)
Subject: Non-word characters shouldn't terminate tag names on the tidy side too
X-Git-Tag: 1.31.0-rc.0~12507^2
X-Git-Url: http://git.cyclocoop.org/%22.htmlspecialchars%28%24url_syndic%29.%22?a=commitdiff_plain;h=8e8b15afc6;p=lhc%2Fweb%2Fwiklou.git
Non-word characters shouldn't terminate tag names on the tidy side too
* Follow up to Iceec404f46703065bf080dd2cbfed1f88c204fa5.
* The accepted charset is changed to match the HTML5 parsing spec at:
http://dev.w3.org/html5/spec-preview/tokenization.html#tag-open-state
* Equivalent in parsoid at I462c336f9a00c8ccd11f3220a8738389e8ba7c7c.
Change-Id: I69cb000538fe195dd77273da5f91697fe1e7d283
---
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index f79e94d459..a2de0044a6 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -39,6 +39,12 @@ class Sanitizer {
|&\#[xX]([0-9A-Fa-f]+);
|(&)/x';
+ /**
+ * Regex splitting one tag chunk into: leading slash, tag name, params, closing '>' or '/>', trailing text.
+ * Tag name charset per http://dev.w3.org/html5/spec-preview/tokenization.html#tag-open-state
+ */
+ const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
+
/**
* Blacklist for evil uris like javascript:
* WARNING: DO NOT use this in any place that actually requires blacklisting
@@ -444,7 +450,7 @@ class Sanitizer {
# $params: String between element name and >
# $brace: Ending '>' or '/>'
# $rest: Everything until the next element of $bits
- if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
+ if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
} else {
$slash = $t = $params = $brace = $rest = null;
@@ -567,11 +573,7 @@ class Sanitizer {
} else {
# this might be possible using tidy itself
foreach ( $bits as $x ) {
- preg_match(
- '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
- $x,
- $regs
- );
+ preg_match( self::ELEMENT_BITS_REGEX, $x, $regs );
wfSuppressWarnings();
list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt
index 18aeabb540..04e39ee3fb 100644
--- a/tests/parser/parserTests.txt
+++ b/tests/parser/parserTests.txt
@@ -1191,28 +1191,49 @@ Ruby markup (W3C-style)
!! end
-# There is a tidy bug here: http://sourceforge.net/p/tidy/bugs/946/
+# The next two tests exercise different paths in the sanitizer.
!! test
Non-word characters don't terminate tag names (bug 17663, 40670, 52022)
!! wikitext
- doesn't work!
+ doesn't terminate
- doesn't work!
+ doesn't terminate
- works fine
+ doesn't terminate
-s.foo
+ doesn't terminate
!! html
-<bâ> doesn't work! </bâ>
-
<bä> doesn't work! </bä>
-
<boo> works fine </boo>
-
<s.foo>s.foo</s.foo>
+
<bâ> doesn't terminate </bâ>
+
<bä> doesn't terminate </bä>
+
<boo> doesn't terminate </boo>
+
<s.foo> doesn't terminate </s.foo>
<sub-ID#1>
!! end
+# There is a tidy bug here: http://sourceforge.net/p/tidy/bugs/946/
+!! test
+Non-word characters don't terminate tag names + tidy
+!! wikitext
+ doesn't terminate
+
+ doesn't terminate
+
+ doesn't terminate
+
+ doesn't terminate
+
+
+!! html+tidy
+<bâ> doesn't terminate </bâ>
+<bä> doesn't terminate </bä>
+<boo> doesn't terminate </boo>
+<s.foo> doesn't terminate </s.foo>
+<sub-ID#1>
+!! end
+
!! test
Isolated close tags should be treated as literal text (bug 52760)
!! wikitext