From: Arlo Breault Date: Fri, 2 Nov 2018 23:20:52 +0000 (-0400) Subject: Quoted attributes don't need to be followed by a space X-Git-Tag: 1.34.0-rc.0~3429^2 X-Git-Url: http://git.cyclocoop.org//%27%40script%40/%27?a=commitdiff_plain;h=59bb8864a23f3df120789c7619ef07acefa27b9c;p=lhc%2Fweb%2Fwiklou.git Quoted attributes don't need to be followed by a space Further, this splits up attribute parsing from filtering. Change-Id: Ib4e0a808a6ca2ba032873e885837233e2f2feefe --- diff --git a/includes/parser/Sanitizer.php b/includes/parser/Sanitizer.php index 85c71eeb44..1c8a797c4c 100644 --- a/includes/parser/Sanitizer.php +++ b/includes/parser/Sanitizer.php @@ -349,18 +349,18 @@ class Sanitizer { /** * Regular expression to match HTML/XML attribute pairs within a tag. - * Allows some... latitude. Based on, - * https://www.w3.org/TR/html5/syntax.html#before-attribute-value-state - * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes + * Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state + * Used in Sanitizer::decodeTagAttributes * @return string */ static function getAttribsRegex() { if ( self::$attribsRegex === null ) { - $attribFirst = "[:_\p{L}\p{N}]"; - $attrib = "[:_\.\-\p{L}\p{N}]"; - $space = '[\x09\x0a\x0c\x0d\x20]'; + $spaceChars = '\x09\x0a\x0c\x0d\x20'; + $space = "[{$spaceChars}]"; + $attrib = "[^{$spaceChars}\/>=]"; + $attribFirst = "(?:{$attrib}|=)"; self::$attribsRegex = - "/(?:^|$space)({$attribFirst}{$attrib}*) + "/({$attribFirst}{$attrib}*) ($space*=$space* (?: # The attribute value: quoted or alone @@ -368,11 +368,29 @@ class Sanitizer { | '([^']*)(?:'|\$) | (((?!$space|>).)*) ) - )?(?=$space|\$)/sxu"; + )?/sxu"; } return self::$attribsRegex; } + /** + * Lazy-initialised attribute name regex, see getAttribNameRegex() + */ + private static $attribNameRegex; + + /** + * Used in Sanitizer::decodeTagAttributes to filter attributes. + * @return string + */ + static function getAttribNameRegex() { + if ( self::$attribNameRegex === null ) { + $attribFirst = "[:_\p{L}\p{N}]"; + $attrib = "[:_\.\-\p{L}\p{N}]"; + self::$attribNameRegex = "/^({$attribFirst}{$attrib}*)$/sxu"; + } + return self::$attribNameRegex; + } + /** * Return the various lists of recognized tags * @param array $extratags For any extra tags to include @@ -1433,18 +1451,24 @@ class Sanitizer { return []; } - $attribs = []; $pairs = []; if ( !preg_match_all( self::getAttribsRegex(), $text, $pairs, PREG_SET_ORDER ) ) { - return $attribs; + return []; } + $attribs = []; foreach ( $pairs as $set ) { $attribute = strtolower( $set[1] ); + + // Filter attribute names with unacceptable characters + if ( !preg_match( self::getAttribNameRegex(), $attribute ) ) { + continue; + } + $value = self::getTagAttributeCallback( $set ); // Normalize whitespace diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt index 6d5054e6ce..62983f5c86 100644 --- a/tests/parser/parserTests.txt +++ b/tests/parser/parserTests.txt @@ -6273,8 +6273,6 @@ parsoid=wt2html !! end -# Note that the PHP parser output appears to be broken when the table -# end tag is not separated by a space from the style attribute !! test A table with stray table end tags on start tag line (wt2html) !! options @@ -6294,13 +6292,13 @@ parsoid=wt2html |foo |} !! html/php+tidy - +
foo
- +
foo
@@ -9648,17 +9646,14 @@ Handling html with a div self-closing tag
-!! html/php -

<div title /> -<div title/> -

-
-

<div title=bar /> -<div title=bar/> -

-
-
- +!! html/php+tidy +
+
+
+
+
+
+
!! html/parsoid
@@ -9699,10 +9694,10 @@ Handling html with a br self-closing tag


-!! html/php +!! html/php+tidy



-
+



@@ -9717,6 +9712,18 @@ Handling html with a br self-closing tag

!! end +!! test +Quoted attributes without spaces +!! options +parsoid=wt2html +!! wikitext +
red
+!! html/php+tidy +
red
+!! html/parsoid +
red
+!! end + !! test Horizontal ruler (should it add that extra space?) !! wikitext @@ -18177,8 +18184,7 @@ HTML tag with leading space is parsed as text

!! end -## Don't expect Parsoid and PHP to match, since PHP isn't exactly following -## the HTML5 parsing spec. +## FIXME: The untrimmed attribute in Parsoid is T205737 !! test Element with broken attribute syntax !! options @@ -18187,7 +18193,7 @@ parsoid=wt2html
hi
ho
!! html/php -
hi
+
hi
ho
!! html/parsoid