From f8b7cc890d9fa6fbb6c9673391f37e81abde274e Mon Sep 17 00:00:00 2001 From: "C. Scott Ananian" Date: Tue, 6 Aug 2013 11:17:38 -0400 Subject: [PATCH] Non-word characters don't terminate tag names. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The PHP sanitizer was including only \w+ in tag names. This meant that and were converted to tags (bug 17663); and were treated as tags (bug 40670), and was treated as a tag (bug 52022). (But note that *is* actually a valid synonym for .) Fix the sanitizer. Bug: 17663 Change-Id: Iceec404f46703065bf080dd2cbfed1f88c204fa5 --- includes/Sanitizer.php | 2 +- tests/parser/parserTests.txt | 43 ++++++++++++++++++++++++++++++++---- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index f3a5281845..1432a8b02d 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -448,7 +448,7 @@ class Sanitizer { # $params: String between element name and > # $brace: Ending '>' or '/>' # $rest: Everything until the next element of $bits - if ( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) { + if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) { list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs; } else { $slash = $t = $params = $brace = $rest = null; diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt index f4a85bc325..cdd7eed68b 100644 --- a/tests/parser/parserTests.txt +++ b/tests/parser/parserTests.txt @@ -874,6 +874,43 @@ Non-html5 tags should be accepted

!! end +# <strike> is HTML4, <s> is HTML4/5. +!! test +<strike> or <s> for strikethrough +!! input +<strike>strike</strike> + +<s>s</s> +!! result +

<p><strike>strike</strike> +</p><p><s>s</s> +</p>

+!! end + +!! test +Non-word characters don't terminate tag names (bug 17663, 40670, 52022) +!! input +<b→> doesn't work! </b>
+ +<bä> doesn't work! </b> + +<boo> works fine </b> + +<s.foo>foo</s> + +<s.foo>s.foo</s.foo> + +<sub-ID#1> +!! result +

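<p>&lt;b→&gt; doesn't work! &lt;/b&gt; +</p><p>&lt;bä&gt; doesn't work! &lt;/b&gt; +</p><p>&lt;boo&gt; works fine &lt;/b&gt; +</p><p>&lt;s.foo&gt;foo&lt;/s&gt; +</p><p>&lt;s.foo&gt;s.foo&lt;/s.foo&gt; +</p><p>&lt;sub-ID#1&gt; +</p>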

+!! end + ### ### Special characters ### @@ -16129,12 +16166,10 @@ a>b !! end -# This fails in the PHP parser (see bug 40670, -# https://bugzilla.wikimedia.org/show_bug.cgi?id=40670), so disabled for it. +# This was a bug in the PHP parser (see bug 17663 and its dups, +# https://bugzilla.wikimedia.org/show_bug.cgi?id=17663) !! test Tag names followed by punctuation should not be recognized as tags -!! options -parsoid !! input text !! result -- 2.20.1