* Brings us closer to the html5 attribute parsing algorithm described
in http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
* There's a similar patch for Parsoid in
I2160a23b2a3c914eb369347bbf5d58328440041d
* The spec says <div class= style="123">hi</div> should parse as
<div class="style=\"123\"">hi</div>, which it now does, whereas it
used to yield <div class="" style="123">hi</div>.
* Merge with caution. This is going to break pages like
frwikisource/La_Mirlitantouille_(Lenotre)?oldid=4669681
Bug: T108134
Change-Id: Ic2fc1b573a55a847e6c05707678b58c1189ecc52
/**
* Regular expression to match HTML/XML attribute pairs within a tag.
/**
* Regular expression to match HTML/XML attribute pairs within a tag.
- * Allows some... latitude.
+ * Allows some... latitude. Based on,
+ * http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
* Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
* @return string
*/
* Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
* @return string
*/
if ( self::$attribsRegex === null ) {
$attribFirst = '[:A-Z_a-z0-9]';
$attrib = '[:A-Z_a-z-.0-9]';
if ( self::$attribsRegex === null ) {
$attribFirst = '[:A-Z_a-z0-9]';
$attrib = '[:A-Z_a-z-.0-9]';
- $space = '[\x09\x0a\x0d\x20]';
+ $space = '[\x09\x0a\x0c\x0d\x20]';
self::$attribsRegex =
"/(?:^|$space)({$attribFirst}{$attrib}*)
($space*=$space*
(?:
# The attribute value: quoted or alone
self::$attribsRegex =
"/(?:^|$space)({$attribFirst}{$attrib}*)
($space*=$space*
(?:
# The attribute value: quoted or alone
- \"([^<\"]*)(?:\"|\$)
- | '([^<']*)(?:'|\$)
- | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
+ \"([^\"]*)(?:\"|\$)
+ | '([^']*)(?:'|\$)
+ | (((?!$space|>).)*)
-# FIXME: Preserve the attribute properly (with an empty string as value) in
-# the PHP parser. Parsoid implements the behavior below.
!! test
Table attributes with empty value
!! wikitext
{|
| style=| hello
|}
!! test
Table attributes with empty value
!! wikitext
{|
| style=| hello
|}
+!! html/php
+<table>
+<tr>
+<td style=""> hello
+</td></tr></table>
+
!! html/parsoid
<table>
<tbody>
!! html/parsoid
<table>
<tbody>
-# FIXME: produce empty string instead of "class" in the PHP parser, following
-# the HTML5 spec.
!! test
div with empty attribute value, space before equals
!! test
div with empty attribute value, space before equals
!! wikitext
<div class =>HTML rocks</div>
!! html
!! wikitext
<div class =>HTML rocks</div>
!! html
+# FIXME: Parsoid doesn't match the html5 spec
!! test
div with multiple empty attribute values
!! options
!! test
div with multiple empty attribute values
!! options
+parsoid=wt2html,html2html
!! wikitext
<div id= title=>HTML rocks</div>
!! wikitext
<div id= title=>HTML rocks</div>
-!! html
-<div id="" title="">HTML rocks</div>
+!! html/php
+<div id="title.3D">HTML rocks</div>
+!! html/parsoid
+<div id="" title="">HTML rocks</div>
+# FIXME: Parsoid doesn't match the html5 spec
!! test
table with multiple empty attribute values
!! options
!! test
table with multiple empty attribute values
!! options
+parsoid=wt2html,html2html
!! wikitext
{| title= id=
| hi
|}
!! wikitext
{| title= id=
| hi
|}
+!! html/php
+<table title="id=">
+<tr>
+<td> hi
+</td></tr></table>
+
+!! html/parsoid
<table title="" id="">
<tbody><tr><td> hi</td></tr>
</tbody></table>
<table title="" id="">
<tbody><tr><td> hi</td></tr>
</tbody></table>
<div title="{}">Foo</div>
!! end
<div title="{}">Foo</div>
!! end
-# This it very inconsistent in the PHP parser: it returns
-# class="class" if there is a space between the name and the equal sign (see
-# 'div with empty attribute value, space before equals'), but strips the
-# attribute completely if the space is missing. We hope that not much content
-# depends on this, so are implementing the behavior below in Parsoid for
-# consistencies' sake.
-# FIXME: fix this behavior in the PHP parser?
!! test
div with empty attribute value, no space before equals
!! options
!! test
div with empty attribute value, no space before equals
!! options
!! wikitext
<div class=>HTML rocks</div>
!! html/php
!! wikitext
<div class=>HTML rocks</div>
!! html/php
+<div class="">HTML rocks</div>
!! html/parsoid
<div class="">HTML rocks</div>
!! html/parsoid
<div class="">HTML rocks</div>
!! wikitext
<font color=>foo</font>
!! html
!! wikitext
<font color=>foo</font>
!! html
+<p><font color="">foo</font>
!! test
HTML tag with broken attribute value quoting
!! test
HTML tag with broken attribute value quoting
+!! options
+parsoid=wt2html,html2html
!! wikitext
<span title="Hello world>Foo</span>
!! html/php
<p><span title="Hello world">Foo</span>
</p>
!! html/parsoid
!! wikitext
<span title="Hello world>Foo</span>
!! html/php
<p><span title="Hello world">Foo</span>
</p>
!! html/parsoid
-<p><span title="Hello world">Foo</span>
-</p>
+<p><span title="Hello world">Foo</span></p>
!! test
Table with broken attribute value quoting
!! test
Table with broken attribute value quoting
+!! options
+parsoid=wt2html,html2html
!! wikitext
{|
| title="Hello world|Foo
!! wikitext
{|
| title="Hello world|Foo
!! test
Table with broken attribute value quoting on consecutive lines
!! test
Table with broken attribute value quoting on consecutive lines
+!! options
+parsoid=wt2html,html2html
!! wikitext
{|
| title="Hello world|Foo
!! wikitext
{|
| title="Hello world|Foo