From 87f43fd8031a0123d2cc17245cbabefa17c250a0 Mon Sep 17 00:00:00 2001 From: Arlo Breault Date: Wed, 5 Aug 2015 11:40:02 -0700 Subject: [PATCH] Match html5 unquoted attribute parsing * Brings us closer to the html5 attribute parsing algorithm described in http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state * There's a similar patch for Parsoid in I2160a23b2a3c914eb369347bbf5d58328440041d * The spec says
hi
should parse as
hi
, which it now does, whereas it used to yield
hi
. * Merge with caution. This is going to break pages like, frwikisource/La_Mirlitantouille_(Lenotre)?oldid=4669681 Bug: T108134 Change-Id: Ic2fc1b573a55a847e6c05707678b58c1189ecc52 --- includes/Sanitizer.php | 11 ++++---- tests/parser/parserTests.txt | 52 +++++++++++++++++++++--------------- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index b84adc3a89..d52bc07324 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -332,7 +332,8 @@ class Sanitizer { /** * Regular expression to match HTML/XML attribute pairs within a tag. - * Allows some... latitude. + * Allows some... latitude. Based on, + * http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes * @return string */ @@ -340,15 +341,15 @@ class Sanitizer { if ( self::$attribsRegex === null ) { $attribFirst = '[:A-Z_a-z0-9]'; $attrib = '[:A-Z_a-z-.0-9]'; - $space = '[\x09\x0a\x0d\x20]'; + $space = '[\x09\x0a\x0c\x0d\x20]'; self::$attribsRegex = "/(?:^|$space)({$attribFirst}{$attrib}*) ($space*=$space* (?: # The attribute value: quoted or alone - \"([^<\"]*)(?:\"|\$) - | '([^<']*)(?:'|\$) - | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+) + \"([^\"]*)(?:\"|\$) + | '([^']*)(?:'|\$) + | (((?!$space|>).)*) ) )?(?=$space|\$)/sx"; } diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt index d4e711977b..9ad7bd547a 100644 --- a/tests/parser/parserTests.txt +++ b/tests/parser/parserTests.txt @@ -6552,14 +6552,18 @@ Table with empty line following the start tag !! end -# FIXME: Preserve the attribute properly (with an empty string as value) in -# the PHP parser. Parsoid implements the behavior below. !! test Table attributes with empty value !! wikitext {| | style=| hello |} +!! html/php + + +
hello +
+ !! html/parsoid @@ -15451,12 +15455,8 @@ div with illegal double attributes !!end -# FIXME: produce empty string instead of "class" in the PHP parser, following -# the HTML5 spec. !! test div with empty attribute value, space before equals -!! options -parsoid !! wikitext
HTML rocks
!! html @@ -15464,26 +15464,36 @@ parsoid !! end +# FIXME: Parsoid doesn't match the html5 spec !! test div with multiple empty attribute values !! options -parsoid +parsoid=wt2html,html2html !! wikitext
HTML rocks
-!! html -
HTML rocks
+!! html/php +
HTML rocks
+!! html/parsoid +
HTML rocks
!! end +# FIXME: Parsoid doesn't match the html5 spec !! test table with multiple empty attribute values !! options -parsoid +parsoid=wt2html,html2html !! wikitext {| title= id= | hi |} -!! html +!! html/php +
+ +
hi +
+ +!! html/parsoid
hi
@@ -15500,13 +15510,6 @@ div with braces in attribute value
Foo
!! end -# This it very inconsistent in the PHP parser: it returns -# class="class" if there is a space between the name and the equal sign (see -# 'div with empty attribute value, space before equals'), but strips the -# attribute completely if the space is missing. We hope that not much content -# depends on this, so are implementing the behavior below in Parsoid for -# consistencies' sake. -# FIXME: fix this behavior in the PHP parser? !! test div with empty attribute value, no space before equals !! options @@ -15514,7 +15517,7 @@ parsoid=wt2html,html2html !! wikitext
HTML rocks
!! html/php -
HTML rocks
+
HTML rocks
!! html/parsoid
HTML rocks
@@ -15836,7 +15839,7 @@ Attribute test: equals, then nothing !! wikitext foo !! html -

foo +

foo

!! end @@ -23909,14 +23912,15 @@ HTML tag with 'unnecessary' entity encoding in attributes !! test HTML tag with broken attribute value quoting +!! options +parsoid=wt2html,html2html !! wikitext Foo

!! html/parsoid -

Foo -

+

Foo

!! end !! test @@ -23934,6 +23938,8 @@ parsoid=wt2html,html2html !! test Table with broken attribute value quoting +!! options +parsoid=wt2html,html2html !! wikitext {| | title="Hello world|Foo @@ -23954,6 +23960,8 @@ Table with broken attribute value quoting !! test Table with broken attribute value quoting on consecutive lines +!! options +parsoid=wt2html,html2html !! wikitext {| | title="Hello world|Foo -- 2.20.1