/**
* Regular expression to match HTML/XML attribute pairs within a tag.
- * Allows some... latitude. Based on,
- * https://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
- * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
+ * Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
+ * Used in Sanitizer::decodeTagAttributes
+ *
+ * An attribute name (capture group 1) starts with any character other than
+ * whitespace, "/" or ">" (a leading "=" is accepted) and continues with any
+ * run of characters other than whitespace, "/", ">" or "=".
+ *
+ * The optional value part (capture group 2) is "=" surrounded by optional
+ * whitespace, followed by one of:
+ *  - a double-quoted value (capture group 3),
+ *  - a single-quoted value (capture group 4),
+ *  - an unquoted value running up to whitespace or ">" (capture group 5).
+ * A quoted value may be terminated by the end of the subject instead of
+ * its closing quote.
+ *
+ * The pattern is built on first use and cached in self::$attribsRegex.
* @return string
*/
static function getAttribsRegex() {
if ( self::$attribsRegex === null ) {
- $attribFirst = "[:_\p{L}\p{N}]";
- $attrib = "[:_\.\-\p{L}\p{N}]";
- $space = '[\x09\x0a\x0c\x0d\x20]';
+ // Tab, line feed, form feed, carriage return and space
+ $spaceChars = '\x09\x0a\x0c\x0d\x20';
+ $space = "[{$spaceChars}]";
+ $attrib = "[^{$spaceChars}\/>=]";
+ $attribFirst = "(?:{$attrib}|=)";
self::$attribsRegex =
- "/(?:^|$space)({$attribFirst}{$attrib}*)
+ "/({$attribFirst}{$attrib}*)
($space*=$space*
(?:
# The attribute value: quoted or alone
\"([^\"]*)(?:\"|\$)
| '([^']*)(?:'|\$)
| (((?!$space|>).)*)
)
- )?(?=$space|\$)/sxu";
+ )?/sxu";
}
return self::$attribsRegex;
}
+ /**
+ * Lazy-initialised attribute name regex, see getAttribNameRegex()
+ * @var string|null Null until getAttribNameRegex() first builds the pattern
+ */
+ private static $attribNameRegex;
+
+ /**
+ * Used in Sanitizer::decodeTagAttributes to filter attributes.
+ *
+ * The pattern is anchored at both ends and accepts a name whose first
+ * character is a colon, an underscore, a Unicode letter or a Unicode
+ * number, followed by any number of those characters, dots or hyphens.
+ * It is built on first use and cached in self::$attribNameRegex.
+ * @return string PCRE pattern, including delimiters and flags
+ */
+ static function getAttribNameRegex() {
+ if ( self::$attribNameRegex === null ) {
+ $attribFirst = "[:_\p{L}\p{N}]";
+ $attrib = "[:_\.\-\p{L}\p{N}]";
+ self::$attribNameRegex = "/^({$attribFirst}{$attrib}*)$/sxu";
+ }
+ return self::$attribNameRegex;
+ }
+
/**
* Return the various lists of recognized tags
* @param array $extratags For any extra tags to include
return [];
}
- $attribs = [];
$pairs = [];
if ( !preg_match_all(
self::getAttribsRegex(),
$text,
$pairs,
PREG_SET_ORDER ) ) {
- return $attribs;
+ return [];
}
+ $attribs = [];
foreach ( $pairs as $set ) {
$attribute = strtolower( $set[1] );
+
+ // Filter attribute names with unacceptable characters
+ if ( !preg_match( self::getAttribNameRegex(), $attribute ) ) {
+ continue;
+ }
+
$value = self::getTagAttributeCallback( $set );
// Normalize whitespace
!! end
-# Note that the PHP parser output appears to be broken when the table
-# end tag is not separated by a space from the style attribute
!! test
A table with stray table end tags on start tag line (wt2html)
!! options
|foo
|}
!! html/php+tidy
-<table style=""color:">
+<table style="color: red;">
</table><table style="color: red;">
<tbody><tr>
<td>foo
</td></tr></tbody></table>
-<table style=""color:" id="foo">
+<table style="color: red;" id="foo">
<tbody><tr>
<td>foo
</td></tr></tbody></table>
<div title=bar />
<div title=bar/>
<div title=bar/ >
-!! html/php
-<p><div title />
-<div title/>
-</p>
-<div>
-<p><div title=bar />
-<div title=bar/>
-</p>
-<div title="bar/"></div>
-</div>
-
+!! html/php+tidy
+<div title=""></div>
+<div title=""></div>
+<div title="">
+<div title="bar"></div>
+<div title="bar"></div>
+<div title="bar/">
+</div></div>
!! html/parsoid
<div title="" data-parsoid='{"stx":"html","selfClose":true}'></div>
<div title="" data-parsoid='{"stx":"html","selfClose":true}'></div>
<br title=bar />
<br title=bar/>
<br title=bar/ >
-!! html/php
+!! html/php+tidy
<p><br title="" />
<br title="" />
-<br />
+<br title="" />
<br title="bar" />
<br title="bar" />
<br title="bar/" />
</p>
!! end
+!! test
+Quoted attributes without spaces
+!! options
+parsoid=wt2html
+!! wikitext
+<div class="foo"style="color:red">red</div>
+!! html/php+tidy
+<div class="foo" style="color:red">red</div>
+!! html/parsoid
+<div class="foo" style="color:red">red</div>
+!! end
+
!! test
Horizontal ruler (should it add that extra space?)
!! wikitext
</p>
!! end
-## Don't expect Parsoid and PHP to match, since PHP isn't exactly following
-## the HTML5 parsing spec.
+## FIXME: Parsoid leaves the attribute untrimmed; tracked as T205737
!! test
Element with broken attribute syntax
!! options
<div style=" style="123">hi</div>
<div =>ho</div>
!! html/php
-<div style="123">hi</div>
+<div style="style=">hi</div>
<div>ho</div>
!! html/parsoid