use \ReverseArrayIterator;
use \Sanitizer;
-# A note for future librarization[1] -- this file is a good candidate
-# for splitting into an independent library, except that it is currently
-# highly optimized for MediaWiki use. It only implements the portions
-# of the HTML5 tree builder used by tags supported by MediaWiki, and
-# does not contain a true tokenizer pass, instead relying on
-# comment stripping, attribute normalization, and escaping done by
-# the MediaWiki Sanitizer. It also deliberately avoids building
-# a true DOM in memory, instead serializing elements to an output string
-# as soon as possible (usually as soon as the tag is closed) to reduce
-# its memory footprint.
-
-# We've been gradually lifting some of these restrictions to handle
-# non-sanitized output generated by extensions, but we shortcut the tokenizer
-# for speed (primarily by splitting on `<`) and so rely on syntactic
-# well-formedness.
-
-# On the other hand, I've been pretty careful to note with comments in the
-# code the places where this implementation omits features of the spec or
-# depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
-# implement the missing pieces and make this a standalone PHP HTML5 parser.
-# In order to do so, some sort of MediaWiki-specific API will need
-# to be added to (a) allow the Balancer to bypass the tokenizer,
-# and (b) support on-the-fly flattening instead of DOM node creation.
-
-# [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki
+// A note for future librarization[1] -- this file is a good candidate
+// for splitting into an independent library, except that it is currently
+// highly optimized for MediaWiki use. It only implements the portions
+// of the HTML5 tree builder used by tags supported by MediaWiki, and
+// does not contain a true tokenizer pass, instead relying on
+// comment stripping, attribute normalization, and escaping done by
+// the MediaWiki Sanitizer. It also deliberately avoids building
+// a true DOM in memory, instead serializing elements to an output string
+// as soon as possible (usually as soon as the tag is closed) to reduce
+// its memory footprint.
+
+// We've been gradually lifting some of these restrictions to handle
+// non-sanitized output generated by extensions, but we shortcut the tokenizer
+// for speed (primarily by splitting on `<`) and so rely on syntactic
+// well-formedness.
+
+// On the other hand, I've been pretty careful to note with comments in the
+// code the places where this implementation omits features of the spec or
+// depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
+// implement the missing pieces and make this a standalone PHP HTML5 parser.
+// In order to do so, some sort of MediaWiki-specific API will need
+// to be added to (a) allow the Balancer to bypass the tokenizer,
+// and (b) support on-the-fly flattening instead of DOM node creation.
+
+// [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki
/**
* Utility constants and sets for the HTML5 tree building algorithm.
self::HTML_NAMESPACE => [
'html' => true, 'head' => true, 'body' => true, 'frameset' => true,
'frame' => true,
- 'plaintext' => true, 'isindex' => true,
+ 'plaintext' => true,
'xmp' => true, 'iframe' => true, 'noembed' => true,
'noscript' => true, 'script' => true,
'title' => true
'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true,
'h6' => true, 'head' => true, 'header' => true, 'hgroup' => true,
'hr' => true, 'html' => true, 'iframe' => true, 'img' => true,
- 'input' => true, 'isindex' => true, 'li' => true, 'link' => true,
+ 'input' => true, 'li' => true, 'link' => true,
'listing' => true, 'main' => true, 'marquee' => true,
- 'menu' => true, 'menuitem' => true, 'meta' => true, 'nav' => true,
+ 'menu' => true, 'meta' => true, 'nav' => true,
'noembed' => true, 'noframes' => true, 'noscript' => true,
'object' => true, 'ol' => true, 'p' => true, 'param' => true,
'plaintext' => true, 'pre' => true, 'script' => true,
public static $impliedEndTagsSet = [
self::HTML_NAMESPACE => [
- 'dd' => true, 'dt' => true, 'li' => true, 'optgroup' => true,
+ 'dd' => true, 'dt' => true, 'li' => true,
+ 'menuitem' => true, 'optgroup' => true,
'option' => true, 'p' => true, 'rb' => true, 'rp' => true,
'rt' => true, 'rtc' => true
]
/**
* Parent of this element, or the string "flat" if this element has
* already been flattened into its parent.
- * @var string|null $parent
+ * @var BalanceElement|string|null $parent
*/
public $parent;
* child will be an actual BalanceElement object; the rest will
* be strings, representing either text nodes or flattened
* BalanceElement objects.
- * @var array $children
+ * @var BalanceElement[]|string[] $children
*/
public $children;
* by the HTML serialization specification, and replace this node
* in its parent by that string.
*
+ * @param array $config Balancer configuration; see Balancer::__construct().
+ * @return string
+ *
* @see __toString()
*/
- public function flatten( $tidyCompat = false ) {
+ public function flatten( array $config ) {
Assert::parameter( $this->parent !== null, '$this', 'must be a child' );
Assert::parameter( $this->parent !== 'flat', '$this', 'already flat' );
$idx = array_search( $this, $this->parent->children, true );
Assert::parameter(
$idx !== false, '$this', 'must be a child of its parent'
);
+ $tidyCompat = $config['tidyCompat'];
if ( $tidyCompat ) {
$blank = true;
foreach ( $this->children as $elt ) {
if ( !is_string( $elt ) ) {
- $elt = $elt->flatten( $tidyCompat );
+ $elt = $elt->flatten( $config );
}
if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) {
$blank = false;
$this->attribs = [ 'class' => "mw-empty-elt" ];
}
$blank = false;
+ } elseif (
+ $this->isA( BalanceSets::$extraLinefeedSet ) &&
+ count( $this->children ) > 0 &&
+ substr( $this->children[0], 0, 1 ) == "\n"
+ ) {
+ // Double the linefeed after pre/listing/textarea
+ // according to the (old) HTML5 fragment serialization
+ // algorithm (see https://github.com/whatwg/html/issues/944)
+ // to ensure this will round-trip.
+ array_unshift( $this->children, "\n" );
}
$flat = $blank ? '' : "{$this}";
} else {
$flat = "{$this}";
}
$this->parent->children[$idx] = $flat;
- $this->parent = 'flat'; # for assertion checking
+ $this->parent = 'flat'; // for assertion checking
return $flat;
}
$out .= "{$elt}";
}
$out .= "</{$this->localName}>";
- if (
- $this->isA( BalanceSets::$extraLinefeedSet ) &&
- $out[$len] === "\n"
- ) {
- // Double the linefeed after pre/listing/textarea
- // according to the HTML5 fragment serialization algorithm.
- $out = substr( $out, 0, $len + 1 ) .
- substr( $out, $len );
- }
} else {
$out = "<{$this->localName}{$encAttribs} />";
Assert::invariant(
return $out;
}
- # Utility functions on BalanceElements.
+ // Utility functions on BalanceElements.
/**
* Determine if $this represents a specific HTML tag, is a member of
return isset( $set[$this->namespaceURI] ) &&
isset( $set[$this->namespaceURI][$this->localName] );
} else {
- # assume this is an HTML element name.
+ // assume this is an HTML element name.
return $this->isHtml() && $this->localName === $set;
}
}
class BalanceStack implements IteratorAggregate {
/**
* Backing storage for the stack.
- * @var array $elements
+ * @var BalanceElement[] $elements
*/
private $elements = [];
/**
*/
public $fosterParentMode = false;
/**
- * Tidy compatibility mode, determines behavior of body/blockquote
+ * Configuration options governing flattening.
+ * @var array $config
+ * @see Balancer::__construct()
*/
- public $tidyCompat = false;
+ private $config;
/**
* Reference to the current element
*/
/**
* Create a new BalanceStack with a single BalanceElement on it,
* representing the root <html> node.
+ * @param array $config Balancer configuration; see Balancer::__construct().
*/
- public function __construct() {
- # always a root <html> element on the stack
+ public function __construct( array $config ) {
+ // always a root <html> element on the stack
array_push(
$this->elements,
new BalanceElement( BalanceSets::HTML_NAMESPACE, 'html', [] )
);
$this->currentNode = $this->elements[0];
+ $this->config = $config;
}
/**
$out = '';
foreach ( $this->elements[0]->children as $elt ) {
$out .= is_string( $elt ) ? $elt :
- $elt->flatten( $this->tidyCompat );
+ $elt->flatten( $this->config );
}
return $out;
}
/**
* Insert text at the appropriate place for inserting a node.
* @param string $value
+ * @param bool $isComment
* @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
*/
public function insertText( $value, $isComment = false ) {
) {
$this->fosterParent( $value );
} elseif (
- $this->tidyCompat && !$isComment &&
+ $this->config['tidyCompat'] && !$isComment &&
$this->currentNode->isA( BalanceSets::$tidyPWrapSet )
) {
- $this->insertHTMLELement( 'mw:p-wrap', [] );
+ $this->insertHTMLElement( 'mw:p-wrap', [] );
return $this->insertText( $value );
} else {
$this->currentNode->appendChild( $value );
/**
* Return an iterator over this stack which visits the current node
* first, and the root node last.
- * @return Iterator
+ * @return \Iterator
*/
public function getIterator() {
return new ReverseArrayIterator( $this->elements );
$this->currentNode = null;
}
if ( !$elt->isHtmlNamed( 'mw:p-wrap' ) ) {
- $elt->flatten( $this->tidyCompat );
+ $elt->flatten( $this->config );
}
}
* @param int $idx
*/
public function popTo( $idx ) {
- $length = count( $this->elements );
for ( $length = count( $this->elements ); $length > $idx; $length-- ) {
$this->pop();
}
// otherwise, it will eventually serialize when the parent
// is serialized, we just hold onto the memory for its
// tree of objects a little longer.
- $elt->flatten( $this->tidyCompat );
+ $elt->flatten( $this->config );
}
Assert::postcondition(
array_search( $elt, $this->elements, true ) === false,
}
}
- # Fostering and adoption.
+ // Fostering and adoption.
/**
* Foster parent the given $elt in the stack of open elements.
* @param BalanceElement|string $elt
+ * @return BalanceElement|string
+ *
* @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
*/
private function fosterParent( $elt ) {
$parent = $this->elements[$lastTemplate];
} elseif ( $lastTable >= 0 ) {
$parent = $this->elements[$lastTable]->parent;
- # Assume all tables have parents, since we're not running scripts!
+ // Assume all tables have parents, since we're not running scripts!
Assert::invariant(
$parent !== null, "All tables should have parents"
);
$parent = $this->elements[0]; // the `html` element.
}
- if ( $this->tidyCompat ) {
+ if ( $this->config['tidyCompat'] ) {
if ( is_string( $elt ) ) {
// We're fostering text: do we need a p-wrapper?
if ( $parent->isA( BalanceSets::$tidyPWrapSet ) ) {
return true; // no more handling required
}
- // Let outer loop counter be zero.
- $outer = 0;
-
// Outer loop: If outer loop counter is greater than or
// equal to eight, then abort these steps.
- while ( $outer < 8 ) {
- // Increment outer loop counter by one.
- $outer++;
-
+ for ( $outer = 0; $outer < 8; $outer++ ) {
// Let the formatting element be the last element in the list
// of active formatting elements that: is between the end of
// the list and the last scope marker in the list, if any, or
// the start of the list otherwise, and has the same tag name
// as the token.
- $fmtelt = $afe->findElementByTag( $tag );
+ $fmtElt = $afe->findElementByTag( $tag );
// If there is no such node, then abort these steps and instead
// act as described in the "any other end tag" entry below.
- if ( !$fmtelt ) {
+ if ( !$fmtElt ) {
return false; // false means handle by the default case
}
// Otherwise, if there is such a node, but that node is not in
// the stack of open elements, then this is a parse error;
// remove the element from the list, and abort these steps.
- $index = $this->indexOf( $fmtelt );
+ $index = $this->indexOf( $fmtElt );
if ( $index < 0 ) {
- $afe->remove( $fmtelt );
+ $afe->remove( $fmtElt );
return true; // true means no more handling required
}
// the stack of open elements, but the element is not in scope,
// then this is a parse error; ignore the token, and abort
// these steps.
- if ( !$this->inScope( $fmtelt ) ) {
+ if ( !$this->inScope( $fmtElt ) ) {
return true;
}
// open elements that is lower in the stack than the formatting
// element, and is an element in the special category. There
// might not be one.
- $furthestblock = null;
- $furthestblockindex = -1;
- $stacklen = $this->length();
- for ( $i = $index+1; $i < $stacklen; $i++ ) {
+ $furthestBlock = null;
+ $furthestBlockIndex = -1;
+ $stackLength = $this->length();
+ for ( $i = $index+1; $i < $stackLength; $i++ ) {
if ( $this->node( $i )->isA( BalanceSets::$specialSet ) ) {
- $furthestblock = $this->node( $i );
- $furthestblockindex = $i;
+ $furthestBlock = $this->node( $i );
+ $furthestBlockIndex = $i;
break;
}
}
// up to and including the formatting element, and remove the
// formatting element from the list of active formatting
// elements.
- if ( !$furthestblock ) {
- $this->popTag( $fmtelt );
- $afe->remove( $fmtelt );
- return true;
- } else {
- // Let the common ancestor be the element immediately above
- // the formatting element in the stack of open elements.
- $ancestor = $this->node( $index-1 );
-
- // Let a bookmark note the position of the formatting
- // element in the list of active formatting elements
- // relative to the elements on either side of it in the
- // list.
- $BOOKMARK = new BalanceElement( '[bookmark]', '[bookmark]', [] );
- $afe->insertAfter( $fmtelt, $BOOKMARK );
-
- // Let node and last node be the furthest block.
- $node = $furthestblock;
- $lastnode = $furthestblock;
- $nodeindex = $furthestblockindex;
- $isAFE = false;
-
- // Let inner loop counter be zero.
- $inner = 0;
-
- while ( true ) {
-
- // Increment inner loop counter by one.
- $inner++;
-
- // Let node be the element immediately above node in
- // the stack of open elements, or if node is no longer
- // in the stack of open elements (e.g. because it got
- // removed by this algorithm), the element that was
- // immediately above node in the stack of open elements
- // before node was removed.
- $node = $this->node( --$nodeindex );
-
- // If node is the formatting element, then go
- // to the next step in the overall algorithm.
- if ( $node === $fmtelt ) break;
-
- // If the inner loop counter is greater than three and node
- // is in the list of active formatting elements, then remove
- // node from the list of active formatting elements.
- $isAFE = $afe->isInList( $node );
- if ( $inner > 3 && $isAFE ) {
- $afe->remove( $node );
- $isAFE = false;
- }
-
- // If node is not in the list of active formatting
- // elements, then remove node from the stack of open
- // elements and then go back to the step labeled inner
- // loop.
- if ( !$isAFE ) {
- // Don't flatten here, since we're about to relocate
- // parts of this $node.
- $this->removeElement( $node, false );
- continue;
- }
-
- // Create an element for the token for which the
- // element node was created with common ancestor as
- // the intended parent, replace the entry for node
- // in the list of active formatting elements with an
- // entry for the new element, replace the entry for
- // node in the stack of open elements with an entry for
- // the new element, and let node be the new element.
- $newelt = new BalanceElement(
- $node->namespaceURI, $node->localName, $node->attribs );
- $afe->replace( $node, $newelt );
- $this->replaceAt( $nodeindex, $newelt );
- $node = $newelt;
-
- // If last node is the furthest block, then move the
- // aforementioned bookmark to be immediately after the
- // new node in the list of active formatting elements.
- if ( $lastnode === $furthestblock ) {
- $afe->remove( $BOOKMARK );
- $afe->insertAfter( $newelt, $BOOKMARK );
- }
-
- // Insert last node into node, first removing it from
- // its previous parent node if any.
- $node->appendChild( $lastnode );
-
- // Let last node be node.
- $lastnode = $node;
- }
-
- // If the common ancestor node is a table, tbody, tfoot,
- // thead, or tr element, then, foster parent whatever last
- // node ended up being in the previous step, first removing
- // it from its previous parent node if any.
- if (
- $this->fosterParentMode &&
- $ancestor->isA( BalanceSets::$tableSectionRowSet )
- ) {
- $this->fosterParent( $lastnode );
- } else {
- // Otherwise, append whatever last node ended up being in
- // the previous step to the common ancestor node, first
- // removing it from its previous parent node if any.
- $ancestor->appendChild( $lastnode );
+ if ( !$furthestBlock ) {
+ $this->popTag( $fmtElt );
+ $afe->remove( $fmtElt );
+ return true;
+ }
+
+ // Let the common ancestor be the element immediately above
+ // the formatting element in the stack of open elements.
+ $ancestor = $this->node( $index-1 );
+
+ // Let a bookmark note the position of the formatting
+ // element in the list of active formatting elements
+ // relative to the elements on either side of it in the
+ // list.
+ $BOOKMARK = new BalanceElement( '[bookmark]', '[bookmark]', [] );
+ $afe->insertAfter( $fmtElt, $BOOKMARK );
+
+ // Let node and last node be the furthest block.
+ $node = $furthestBlock;
+ $lastNode = $furthestBlock;
+ $nodeIndex = $furthestBlockIndex;
+ $isAFE = false;
+
+ // Inner loop
+ for ( $inner = 1; true; $inner++ ) {
+ // Let node be the element immediately above node in
+ // the stack of open elements, or if node is no longer
+ // in the stack of open elements (e.g. because it got
+ // removed by this algorithm), the element that was
+ // immediately above node in the stack of open elements
+ // before node was removed.
+ $node = $this->node( --$nodeIndex );
+
+ // If node is the formatting element, then go
+ // to the next step in the overall algorithm.
+ if ( $node === $fmtElt ) break;
+
+ // If the inner loop counter is greater than three and node
+ // is in the list of active formatting elements, then remove
+ // node from the list of active formatting elements.
+ $isAFE = $afe->isInList( $node );
+ if ( $inner > 3 && $isAFE ) {
+ $afe->remove( $node );
+ $isAFE = false;
+ }
+
+ // If node is not in the list of active formatting
+ // elements, then remove node from the stack of open
+ // elements and then go back to the step labeled inner
+ // loop.
+ if ( !$isAFE ) {
+ // Don't flatten here, since we're about to relocate
+ // parts of this $node.
+ $this->removeElement( $node, false );
+ continue;
}
// Create an element for the token for which the
- // formatting element was created, with furthest block
- // as the intended parent.
- $newelt2 = new BalanceElement(
- $fmtelt->namespaceURI, $fmtelt->localName, $fmtelt->attribs );
+ // element node was created with common ancestor as
+ // the intended parent, replace the entry for node
+ // in the list of active formatting elements with an
+ // entry for the new element, replace the entry for
+ // node in the stack of open elements with an entry for
+ // the new element, and let node be the new element.
+ $newElt = new BalanceElement(
+ $node->namespaceURI, $node->localName, $node->attribs );
+ $afe->replace( $node, $newElt );
+ $this->replaceAt( $nodeIndex, $newElt );
+ $node = $newElt;
+
+ // If last node is the furthest block, then move the
+ // aforementioned bookmark to be immediately after the
+ // new node in the list of active formatting elements.
+ if ( $lastNode === $furthestBlock ) {
+ $afe->remove( $BOOKMARK );
+ $afe->insertAfter( $newElt, $BOOKMARK );
+ }
+
+ // Insert last node into node, first removing it from
+ // its previous parent node if any.
+ $node->appendChild( $lastNode );
+
+ // Let last node be node.
+ $lastNode = $node;
+ }
+
+ // If the common ancestor node is a table, tbody, tfoot,
+ // thead, or tr element, then, foster parent whatever last
+ // node ended up being in the previous step, first removing
+ // it from its previous parent node if any.
+ if (
+ $this->fosterParentMode &&
+ $ancestor->isA( BalanceSets::$tableSectionRowSet )
+ ) {
+ $this->fosterParent( $lastNode );
+ } else {
+ // Otherwise, append whatever last node ended up being in
+ // the previous step to the common ancestor node, first
+ // removing it from its previous parent node if any.
+ $ancestor->appendChild( $lastNode );
+ }
- // Take all of the child nodes of the furthest block and
- // append them to the element created in the last step.
- $newelt2->adoptChildren( $furthestblock );
+ // Create an element for the token for which the
+ // formatting element was created, with furthest block
+ // as the intended parent.
+ $newElt2 = new BalanceElement(
+ $fmtElt->namespaceURI, $fmtElt->localName, $fmtElt->attribs );
- // Append that new element to the furthest block.
- $furthestblock->appendChild( $newelt2 );
+ // Take all of the child nodes of the furthest block and
+ // append them to the element created in the last step.
+ $newElt2->adoptChildren( $furthestBlock );
- // Remove the formatting element from the list of active
- // formatting elements, and insert the new element into the
- // list of active formatting elements at the position of
- // the aforementioned bookmark.
- $afe->remove( $fmtelt );
- $afe->replace( $BOOKMARK, $newelt2 );
+ // Append that new element to the furthest block.
+ $furthestBlock->appendChild( $newElt2 );
- // Remove the formatting element from the stack of open
- // elements, and insert the new element into the stack of
- // open elements immediately below the position of the
- // furthest block in that stack.
- $this->removeElement( $fmtelt );
- $this->insertAfter( $furthestblock, $newelt2 );
- }
+ // Remove the formatting element from the list of active
+ // formatting elements, and insert the new element into the
+ // list of active formatting elements at the position of
+ // the aforementioned bookmark.
+ $afe->remove( $fmtElt );
+ $afe->replace( $BOOKMARK, $newElt2 );
+
+ // Remove the formatting element from the stack of open
+ // elements, and insert the new element into the stack of
+ // open elements immediately below the position of the
+ // furthest block in that stack.
+ $this->removeElement( $fmtElt );
+ $this->insertAfter( $furthestBlock, $newElt2 );
}
return true;
private $noahTableStack = [ [] ];
public function __destruct() {
+ $next = null;
for ( $node = $this->head; $node; $node = $next ) {
$next = $node->nextAFE;
$node->prevAFE = $node->nextAFE = $node->nextNoah = null;
/**
* Determine whether an element is in the list of formatting elements.
+ * @param BalanceElement $elt
* @return boolean
*/
public function isInList( BalanceElement $elt ) {
/**
* Find the element $elt in the list and remove it.
* Used when parsing <a> in body mode.
+ *
+ * @param BalanceElement $elt
*/
public function remove( BalanceElement $elt ) {
if ( $this->head !== $elt && !$elt->prevAFE ) {
/**
* Find element $a in the list and replace it with element $b
+ *
+ * @param BalanceElement $a
+ * @param BalanceElement $b
*/
public function replace( BalanceElement $a, BalanceElement $b ) {
if ( $this->head !== $a && !$a->prevAFE ) {
/**
* Find $a in the list and insert $b after it.
+ *
+ * @param BalanceElement $a
+ * @param BalanceElement $b
*/
public function insertAfter( BalanceElement $a, BalanceElement $b ) {
if ( $this->head !== $a && !$a->prevAFE ) {
// Loop backward through the list until we find a marker or an
// open element
- $foundit = false;
+ $foundIt = false;
while ( $entry->prevAFE ) {
$entry = $entry->prevAFE;
if ( $entry instanceof BalanceMarker || $stack->indexOf( $entry ) >= 0 ) {
- $foundit = true;
+ $foundIt = true;
break;
}
}
// the first element if we didn't find a marker or open element),
// recreating formatting elements and pushing them back onto the list
// of open elements.
- if ( $foundit ) {
+ if ( $foundIt ) {
$entry = $entry->nextAFE;
}
do {
* and escaped.
* - All null characters are assumed to have been removed.
* - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
- * <frame>, <plaintext>, <isindex>, <xmp>, <iframe>,
+ * <frame>, <plaintext>, <xmp>, <iframe>,
* <noembed>, <noscript>, <script>, <title>. As a result,
* further simplifications can be made:
* - `frameset-ok` is not tracked.
* <style>/<noframes> want RAWTEXT modes which we only loosely emulate.)
*
* We generally mark places where we omit cases from the spec due to
- * disallowed elements with a comment: `# OMITTED: <element-name>`.
+ * disallowed elements with a comment: `// OMITTED: <element-name>`.
*
* The HTML spec keeps a flag during the parsing process to track
* whether or not a "parse error" has been encountered. We don't
*/
class Balancer {
private $parseMode;
+ /** @var \Iterator */
private $bitsIterator;
private $allowedHtmlElements;
+ /** @var BalanceActiveFormattingElements */
private $afe;
+ /** @var BalanceStack */
private $stack;
private $strict;
- private $tidyCompat;
private $allowComments;
+ private $config;
private $textIntegrationMode;
private $pendingTableText;
private $inRCDATA;
private $inRAWTEXT;
+ /** @var callable|null */
+ private $processingCallback;
+ /** @var array */
+ private $processingArgs;
+
/**
* Valid HTML5 comments.
* Regex borrowed from Tim Starling's "remex-html" project.
*/
const VALID_COMMENT_REGEX = "~ !--
- ( # 1. Comment match detector
+ ( # 1. Comment match detector
> | -> | # Invalid short close
( # 2. Comment contents
(?:
( # 3. Comment close
--> | # Normal close
--!> | # Comment end bang
- ( # 4. Indicate matches requiring EOF
- --! | # EOF in comment end bang state
- -- | # EOF in comment end state
- - | # EOF in comment end dash state
- # EOF in comment state
+ ( # 4. Indicate matches requiring EOF
+ --! | # EOF in comment end bang state
+ -- | # EOF in comment end state
+ - | # EOF in comment end dash state
+ (?#nothing) # EOF in comment state
)
)
)
- ([^<]*) \z # 5. Non-tag text after the comment
+ ([^<]*) \z # 5. Non-tag text after the comment
~xs";
/**
* provide historical compatibility with the old "tidy"
* program: <p>-wrapping is done to the children of
* <body> and <blockquote> elements, and empty elements
- * are removed.
+ * are removed. The <pre>/<listing>/<textarea> serialization
+ * is also tweaked to allow lossless round trips.
+ * (See: https://github.com/whatwg/html/issues/944)
* 'allowComments': boolean, defaults to true.
* When true, allows HTML comments in the input.
* The Sanitizer generally strips all comments, so if you
* false to get a bit more performance.
*/
public function __construct( array $config = [] ) {
- $config = $config + [
+ $this->config = $config = $config + [
'strict' => false,
'allowedHtmlElements' => null,
'tidyCompat' => false,
];
$this->allowedHtmlElements = $config['allowedHtmlElements'];
$this->strict = $config['strict'];
- $this->tidyCompat = $config['tidyCompat'];
$this->allowComments = $config['allowComments'];
if ( $this->allowedHtmlElements !== null ) {
- # Sanity check!
+ // Sanity check!
$bad = array_uintersect_assoc(
$this->allowedHtmlElements,
BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE],
$this->parseMode = 'inBodyMode';
$this->bitsIterator = new ExplodeIterator( '<', $text );
$this->afe = new BalanceActiveFormattingElements();
- $this->stack = new BalanceStack();
- $this->stack->tidyCompat = $this->tidyCompat;
+ $this->stack = new BalanceStack( $this->config );
$this->processingCallback = $processingCallback;
$this->processingArgs = $processingArgs;
$this->inRCDATA =
$this->inRAWTEXT = false;
- # The stack is constructed with an <html> element already on it.
- # Set this up as a fragment parsed with <body> as the context.
+ // The stack is constructed with an <html> element already on it.
+ // Set this up as a fragment parsed with <body> as the context.
$this->fragmentContext =
new BalanceElement( BalanceSets::HTML_NAMESPACE, 'body', [] );
$this->resetInsertionMode();
* Pass a token to the tree builder. The $token will be one of the
* strings "tag", "endtag", "text", or "comment".
*/
- private function insertToken( $token, $value, $attribs = null, $selfclose = false ) {
+ private function insertToken( $token, $value, $attribs = null, $selfClose = false ) {
// validate tags against $unsupportedSet
if ( $token === 'tag' || $token === 'endtag' ) {
if ( isset( BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE][$value] ) ) {
- # As described in "simplifications" above, these tags are
- # not supported in the balancer.
+ // As described in "simplifications" above, these tags are
+ // not supported in the balancer.
Assert::invariant(
!$this->strict,
"Unsupported $token <$value> found."
return false;
}
} elseif ( $token === 'text' && $value === '' ) {
- # Don't actually inject the empty string as a text token.
+ // Don't actually inject the empty string as a text token.
return true;
}
// Support pre/listing/textarea by suppressing initial linefeed
if ( $token === 'text' ) {
if ( $value[0] === "\n" ) {
if ( $value === "\n" ) {
- # Nothing would be left, don't inject the empty string.
+ // Nothing would be left, don't inject the empty string.
return true;
}
$value = substr( $value, 1 );
// Some hoops we have to jump through
$adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );
+ // The spec calls this the "tree construction dispatcher".
$isForeign = true;
if (
$this->stack->length() === 0 ||
$isForeign = false;
}
if ( $isForeign ) {
- return $this->insertForeignToken( $token, $value, $attribs, $selfclose );
+ return $this->insertForeignToken( $token, $value, $attribs, $selfClose );
} else {
$func = $this->parseMode;
- return $this->$func( $token, $value, $attribs, $selfclose );
+ return $this->$func( $token, $value, $attribs, $selfClose );
}
}
- private function insertForeignToken( $token, $value, $attribs = null, $selfclose = false ) {
+ private function insertForeignToken( $token, $value, $attribs = null, $selfClose = false ) {
if ( $token === 'text' ) {
$this->stack->insertText( $value );
return true;
+ } elseif ( $token === 'comment' ) {
+ $this->stack->insertComment( $value );
+ return true;
} elseif ( $token === 'tag' ) {
switch ( $value ) {
case 'font':
) {
break;
}
- /* otherwise, fall through */
+ // otherwise, fall through
case 'b':
case 'big':
case 'blockquote':
break;
}
}
- return $this->insertToken( $token, $value, $attribs, $selfclose );
+ return $this->insertToken( $token, $value, $attribs, $selfClose );
}
// "Any other start tag"
$adjusted = ( $this->fragmentContext && $this->stack->length()===1 ) ?
$this->stack->insertForeignElement(
$adjusted->namespaceURI, $value, $attribs
);
- if ( $selfclose ) {
+ if ( $selfClose ) {
$this->stack->pop();
}
return true;
if ( $node->isHtml() && !$first ) {
// process the end tag as HTML
$func = $this->parseMode;
- return $this->$func( $token, $value, $attribs, $selfclose );
+ return $this->$func( $token, $value, $attribs, $selfClose );
} elseif ( $i === 0 ) {
return true;
} elseif ( $node->localName === $value ) {
$x = $this->bitsIterator->current();
$this->bitsIterator->next();
$regs = [];
- # Handle comments. These won't be generated by mediawiki (they
- # are stripped in the Sanitizer) but may be generated by extensions.
+ // Handle comments. These won't be generated by mediawiki (they
+ // are stripped in the Sanitizer) but may be generated by extensions.
if (
$this->allowComments &&
!( $this->inRCDATA || $this->inRAWTEXT ) &&
preg_match( Balancer::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
- /* verify EOF condition where necessary */
+ // verify EOF condition where necessary
( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
) {
$contents = $regs[2][0];
$this->insertToken( 'text', str_replace( '>', '>', $rest ) );
return;
}
- # $slash: Does the current element start with a '/'?
- # $t: Current element name
- # $attribStr: String between element name and >
- # $brace: Ending '>' or '/>'
- # $rest: Everything until the next element from the $bitsIterator
+ // $slash: Does the current element start with a '/'?
+ // $t: Current element name
+ // $attribStr: String between element name and >
+ // $brace: Ending '>' or '/>'
+ // $rest: Everything until the next element from the $bitsIterator
if ( preg_match( Sanitizer::ELEMENT_BITS_REGEX, $x, $regs ) ) {
list( /* $qbar */, $slash, $t, $attribStr, $brace, $rest ) = $regs;
$t = strtolower( $t );
if ( $this->strict ) {
- /* Verify that attributes are all properly double-quoted */
+ // Verify that attributes are all properly double-quoted
Assert::invariant(
preg_match(
'/^( [:_A-Z0-9][-.:_A-Z0-9]*="[^"]*")*[ ]*$/i', $attribStr
);
$slash = $t = $attribStr = $brace = $rest = null;
}
- $goodtag = $t;
+ $goodTag = $t;
if ( $this->inRCDATA ) {
if ( $slash && $t === $this->inRCDATA ) {
$this->inRCDATA = false;
} else {
// No tags allowed; this emulates the "rcdata" tokenizer mode.
- $goodtag = false;
+ $goodTag = false;
}
}
if ( $this->inRAWTEXT ) {
$this->inRAWTEXT = false;
} else {
// No tags allowed, no entity-escaping done.
- $goodtag = false;
+ $goodTag = false;
}
}
$sanitize = $this->allowedHtmlElements !== null;
if ( $sanitize ) {
- $goodtag = $t && isset( $this->allowedHtmlElements[$t] );
+ $goodTag = $t && isset( $this->allowedHtmlElements[$t] );
}
- if ( $goodtag ) {
+ if ( $goodTag ) {
if ( is_callable( $this->processingCallback ) ) {
call_user_func_array( $this->processingCallback, [ &$attribStr, $this->processingArgs ] );
}
if ( $sanitize ) {
- $goodtag = Sanitizer::validateTag( $attribStr, $t );
+ $goodTag = Sanitizer::validateTag( $attribStr, $t );
}
}
- if ( $goodtag ) {
+ if ( $goodTag ) {
if ( $sanitize ) {
$attribs = Sanitizer::decodeTagAttributes( $attribStr );
$attribs = Sanitizer::validateTagAttributes( $attribs, $t );
} else {
$attribs = Sanitizer::decodeTagAttributes( $attribStr );
}
- $goodtag = $this->insertToken(
+ $goodTag = $this->insertToken(
$slash ? 'endtag' : 'tag', $t, $attribs, $brace === '/>'
);
}
- if ( $goodtag ) {
+ if ( $goodTag ) {
$rest = str_replace( '>', '>', $rest );
$this->insertToken( 'text', str_replace( '>', '>', $rest ) );
} elseif ( $this->inRAWTEXT ) {
$this->insertToken( 'text', "<$x" );
} else {
- # bad tag; serialize entire thing as text.
+ // bad tag; serialize entire thing as text.
$this->insertToken( 'text', '<' . str_replace( '>', '>', $x ) );
}
}
return $oldMode;
}
/**
 * Change the insertion mode and then process the same token again.
 *
 * Calls switchMode() with $mode, then passes the token arguments,
 * unchanged, to insertToken() and returns whatever that call returns.
 *
 * @param string $mode Forwarded to switchMode(); callers in this class pass
 *   handler names such as 'inBodyMode' or a previously saved mode.
 * @param string $token Forwarded unchanged to insertToken()
 * @param mixed $value Forwarded unchanged to insertToken()
 * @param mixed $attribs Forwarded unchanged to insertToken()
 * @param mixed $selfClose Forwarded unchanged to insertToken()
 * @return mixed The result of insertToken()
 */
- private function switchModeAndReprocess( $mode, $token, $value, $attribs, $selfclose ) {
+ private function switchModeAndReprocess( $mode, $token, $value, $attribs, $selfClose ) {
// The mode must be switched first so the reprocessed token is seen under the new mode.
$this->switchMode( $mode );
- return $this->insertToken( $token, $value, $attribs, $selfclose );
+ return $this->insertToken( $token, $value, $attribs, $selfClose );
}
private function resetInsertionMode() {
if ( $node->isHtml() ) {
switch ( $node->localName ) {
case 'select':
- $stacklen = $this->stack->length();
- for ( $j = $i + 1; $j < $stacklen-1; $j++ ) {
- $ancestor = $this->stack->node( $stacklen-$j-1 );
+ $stackLength = $this->stack->length();
+ for ( $j = $i + 1; $j < $stackLength-1; $j++ ) {
+ $ancestor = $this->stack->node( $stackLength-$j-1 );
if ( $ancestor->isHtmlNamed( 'template' ) ) {
break;
}
case 'body':
$this->switchMode( 'inBodyMode' );
return;
- # OMITTED: <frameset>
- # OMITTED: <html>
- # OMITTED: <head>
+ // OMITTED: <frameset>
+ // OMITTED: <html>
+ // OMITTED: <head>
default:
if ( !$last ) {
- # OMITTED: <head>
+ // OMITTED: <head>
if ( $node->isA( BalanceSets::$tableCellSet ) ) {
$this->switchMode( 'inCellMode' );
return;
}
/**
 * Finish the parse: discard the active formatting elements (AFE) list and
 * pop the stack of open elements.
 *
 * Takes no arguments and has no return statement; it only changes
 * $this->afe and $this->stack.
 */
private function stopParsing() {
- # Most of the spec methods are inapplicable, other than step 2:
- # "pop all the nodes off the stack of open elements".
- # We're going to keep the top-most <html> element on the stack, though.
-
- # Clear the AFE list first, otherwise the element objects will stay live
- # during serialization, potentially using O(N^2) memory. Note that
- # popping the stack will never result in reconstructing the active
- # formatting elements.
+ // Most of the spec methods are inapplicable, other than step 2:
+ // "pop all the nodes off the stack of open elements".
+ // We're going to keep the top-most <html> element on the stack, though.
+
+ // Clear the AFE list first, otherwise the element objects will stay live
+ // during serialization, potentially using O(N^2) memory. Note that
+ // popping the stack will never result in reconstructing the active
+ // formatting elements.
// The list is dropped outright (set to null), not emptied entry by entry.
$this->afe = null;
// NOTE(review): per the comment above, popTo( 1 ) is expected to leave only
// the top-most <html> element on the stack -- confirm against popTo().
$this->stack->popTo( 1 );
}
return true;
}
- private function inTextMode( $token, $value, $attribs = null, $selfclose = false ) {
+ private function inTextMode( $token, $value, $attribs = null, $selfClose = false ) {
if ( $token === 'text' ) {
$this->stack->insertText( $value );
return true;
} elseif ( $token === 'eof' ) {
$this->stack->pop();
return $this->switchModeAndReprocess(
- $this->originalInsertionMode, $token, $value, $attribs, $selfclose
+ $this->originalInsertionMode, $token, $value, $attribs, $selfClose
);
} elseif ( $token === 'endtag' ) {
$this->stack->pop();
return true;
}
- private function inHeadMode( $token, $value, $attribs = null, $selfclose = false ) {
+ private function inHeadMode( $token, $value, $attribs = null, $selfClose = false ) {
if ( $token === 'text' ) {
if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $matches ) ) {
$this->stack->insertText( $matches[0] );
} elseif ( $token === 'tag' ) {
switch ( $value ) {
case 'meta':
- # OMITTED: in a full HTML parser, this might change the encoding.
- /* falls through */
- # OMITTED: <html>
+ // OMITTED: in a full HTML parser, this might change the encoding.
+ // falls through
+ // OMITTED: <html>
case 'base':
case 'basefont':
case 'bgsound':
$this->stack->insertHTMLElement( $value, $attribs );
$this->stack->pop();
return true;
- # OMITTED: <title>
- # OMITTED: <noscript>
+ // OMITTED: <title>
+ // OMITTED: <noscript>
case 'noframes':
case 'style':
return $this->parseRawText( $value, $attribs );
- # OMITTED: <script>
+ // OMITTED: <script>
case 'template':
$this->stack->insertHTMLElement( $value, $attribs );
$this->afe->insertMarker();
- # OMITTED: frameset_ok
+ // OMITTED: frameset_ok
$this->switchMode( 'inTemplateMode' );
$this->templateInsertionModes[] = $this->parseMode;
return true;
- # OMITTED: <head>
+ // OMITTED: <head>
}
} elseif ( $token === 'endtag' ) {
switch ( $value ) {
- # OMITTED: <head>
- # OMITTED: <body>
- # OMITTED: <html>
+ // OMITTED: <head>
+ // OMITTED: <body>
+ // OMITTED: <html>
case 'br':
break; // handle at the bottom of the function
case 'template':
// If not handled above
$this->inHeadMode( 'endtag', 'head' ); // synthetic </head>
// Then redo this one
- return $this->insertToken( $token, $value, $attribs, $selfclose );
+ return $this->insertToken( $token, $value, $attribs, $selfClose );
}
- private function inBodyMode( $token, $value, $attribs = null, $selfclose = false ) {
+ private function inBodyMode( $token, $value, $attribs = null, $selfClose = false ) {
if ( $token === 'text' ) {
$this->afe->reconstruct( $this->stack );
$this->stack->insertText( $value );
return true;
} elseif ( $token === 'eof' ) {
if ( !empty( $this->templateInsertionModes ) ) {
- return $this->inTemplateMode( $token, $value, $attribs, $selfclose );
+ return $this->inTemplateMode( $token, $value, $attribs, $selfClose );
}
$this->stopParsing();
return true;
} elseif ( $token === 'tag' ) {
switch ( $value ) {
- # OMITTED: <html>
+ // OMITTED: <html>
case 'base':
case 'basefont':
case 'bgsound':
case 'link':
case 'meta':
case 'noframes':
- # OMITTED: <script>
+ // OMITTED: <script>
case 'style':
case 'template':
- # OMITTED: <title>
- return $this->inHeadMode( $token, $value, $attribs, $selfclose );
- # OMITTED: <body>
- # OMITTED: <frameset>
+ // OMITTED: <title>
+ return $this->inHeadMode( $token, $value, $attribs, $selfClose );
+ // OMITTED: <body>
+ // OMITTED: <frameset>
case 'address':
case 'article':
case 'header':
case 'hgroup':
case 'main':
- case 'menu':
case 'nav':
case 'ol':
case 'p':
$this->stack->insertHTMLElement( $value, $attribs );
return true;
+ case 'menu':
+ if ( $this->stack->inButtonScope( "p" ) ) {
+ $this->inBodyMode( 'endtag', 'p' );
+ }
+ if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
+ $this->stack->pop();
+ }
+ $this->stack->insertHTMLElement( $value, $attribs );
+ return true;
+
case 'h1':
case 'h2':
case 'h3':
}
$this->stack->insertHTMLElement( $value, $attribs );
$this->ignoreLinefeed = true;
- # OMITTED: frameset_ok
+ // OMITTED: frameset_ok
return true;
case 'form':
return true;
case 'li':
- # OMITTED: frameset_ok
+ // OMITTED: frameset_ok
foreach ( $this->stack as $node ) {
if ( $node->isHtmlNamed( 'li' ) ) {
$this->inBodyMode( 'endtag', 'li' );
case 'dd':
case 'dt':
- # OMITTED: frameset_ok
+ // OMITTED: frameset_ok
foreach ( $this->stack as $node ) {
if ( $node->isHtmlNamed( 'dd' ) ) {
$this->inBodyMode( 'endtag', 'dd' );
$this->stack->insertHTMLElement( $value, $attribs );
return true;
- # OMITTED: <plaintext>
+ // OMITTED: <plaintext>
case 'button':
if ( $this->stack->inScope( 'button' ) ) {
$this->inBodyMode( 'endtag', 'button' );
- return $this->insertToken( $token, $value, $attribs, $selfclose );
+ return $this->insertToken( $token, $value, $attribs, $selfClose );
}
$this->afe->reconstruct( $this->stack );
$this->stack->insertHTMLElement( $value, $attribs );
$this->stack->removeElement( $activeElement, false );
}
}
- /* Falls through */
+ // Falls through
case 'b':
case 'big':
case 'code':
case 'tt':
case 'u':
$this->afe->reconstruct( $this->stack );
- $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ), $attribs );
+ $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ) );
return true;
case 'nobr':
$this->inBodyMode( 'endtag', 'nobr' );
$this->afe->reconstruct( $this->stack );
}
- $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ), $attribs );
+ $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ) );
return true;
case 'applet':
$this->afe->reconstruct( $this->stack );
$this->stack->insertHTMLElement( $value, $attribs );
$this->afe->insertMarker();
- # OMITTED: frameset_ok
+ // OMITTED: frameset_ok
return true;
case 'table':
- # The document is never in "quirks mode"; see simplifications
- # above.
+ // The document is never in "quirks mode"; see simplifications
+ // above.
if ( $this->stack->inButtonScope( 'p' ) ) {
$this->inBodyMode( 'endtag', 'p' );
}
$this->stack->insertHTMLElement( $value, $attribs );
- # OMITTED: frameset_ok
+ // OMITTED: frameset_ok
$this->switchMode( 'inTableMode' );
return true;
$this->afe->reconstruct( $this->stack );
$this->stack->insertHTMLElement( $value, $attribs );
$this->stack->pop();
- # OMITTED: frameset_ok
+ // OMITTED: frameset_ok
return true;
case 'input':
$this->afe->reconstruct( $this->stack );
$this->stack->insertHTMLElement( $value, $attribs );
$this->stack->pop();
- # OMITTED: frameset_ok
- # (hence we don't need to examine the tag's "type" attribute)
+ // OMITTED: frameset_ok
+ // (hence we don't need to examine the tag's "type" attribute)
return true;
- case 'menuitem':
case 'param':
case 'source':
case 'track':
if ( $this->stack->inButtonScope( 'p' ) ) {
$this->inBodyMode( 'endtag', 'p' );
}
+ if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
+ $this->stack->pop();
+ }
$this->stack->insertHTMLElement( $value, $attribs );
$this->stack->pop();
return true;
case 'image':
- # warts!
- return $this->inBodyMode( $token, 'img', $attribs, $selfclose );
-
- # OMITTED: <isindex>
+ // warts!
+ return $this->inBodyMode( $token, 'img', $attribs, $selfClose );
case 'textarea':
$this->stack->insertHTMLElement( $value, $attribs );
$this->ignoreLinefeed = true;
$this->inRCDATA = $value; // emulate rcdata tokenizer mode
- # OMITTED: frameset_ok
+ // OMITTED: frameset_ok
return true;
- # OMITTED: <xmp>
- # OMITTED: <iframe>
- # OMITTED: <noembed>
- # OMITTED: <noscript>
+ // OMITTED: <xmp>
+ // OMITTED: <iframe>
+ // OMITTED: <noembed>
+ // OMITTED: <noscript>
case 'select':
$this->afe->reconstruct( $this->stack );
$this->stack->insertHTMLElement( $value, $attribs );
return true;
+ case 'menuitem':
+ if ( $this->stack->currentNode->isHtmlNamed( 'menuitem' ) ) {
+ $this->stack->pop();
+ }
+ $this->afe->reconstruct( $this->stack );
+ $this->stack->insertHTMLElement( $value, $attribs );
+ return true;
+
case 'rb':
case 'rtc':
if ( $this->stack->inScope( 'ruby' ) ) {
case 'math':
$this->afe->reconstruct( $this->stack );
- # We skip the spec's "adjust MathML attributes" and
- # "adjust foreign attributes" steps, since the browser will
- # do this later when it parses the output and it doesn't affect
- # balancing.
+ // We skip the spec's "adjust MathML attributes" and
+ // "adjust foreign attributes" steps, since the browser will
+ // do this later when it parses the output and it doesn't affect
+ // balancing.
$this->stack->insertForeignElement(
BalanceSets::MATHML_NAMESPACE, $value, $attribs
);
- if ( $selfclose ) {
- # emit explicit </math> tag.
+ if ( $selfClose ) {
+ // emit explicit </math> tag.
$this->stack->pop();
}
return true;
case 'svg':
$this->afe->reconstruct( $this->stack );
- # We skip the spec's "adjust SVG attributes" and
- # "adjust foreign attributes" steps, since the browser will
- # do this later when it parses the output and it doesn't affect
- # balancing.
+ // We skip the spec's "adjust SVG attributes" and
+ // "adjust foreign attributes" steps, since the browser will
+ // do this later when it parses the output and it doesn't affect
+ // balancing.
$this->stack->insertForeignElement(
BalanceSets::SVG_NAMESPACE, $value, $attribs
);
- if ( $selfclose ) {
- # emit explicit </svg> tag.
+ if ( $selfClose ) {
+ // emit explicit </svg> tag.
$this->stack->pop();
}
return true;
case 'caption':
case 'col':
case 'colgroup':
- # OMITTED: <frame>
+ // OMITTED: <frame>
case 'head':
case 'tbody':
case 'td':
return true;
} elseif ( $token === 'endtag' ) {
switch ( $value ) {
- # </body>,</html> are unsupported.
+ // </body>,</html> are unsupported.
case 'template':
- return $this->inHeadMode( $token, $value, $attribs, $selfclose );
+ return $this->inHeadMode( $token, $value, $attribs, $selfClose );
case 'address':
case 'article':
case 'p':
if ( !$this->stack->inButtonScope( 'p' ) ) {
$this->inBodyMode( 'tag', 'p', [] );
- return $this->insertToken( $token, $value, $attribs, $selfclose );
+ return $this->insertToken( $token, $value, $attribs, $selfClose );
}
$this->stack->generateImpliedEndTags( $value );
$this->stack->popTag( $value );
case 'li':
if ( !$this->stack->inListItemScope( $value ) ) {
- return true; # ignore
+ return true; // ignore
}
$this->stack->generateImpliedEndTags( $value );
$this->stack->popTag( $value );
case 'dd':
case 'dt':
if ( !$this->stack->inScope( $value ) ) {
- return true; # ignore
+ return true; // ignore
}
$this->stack->generateImpliedEndTags( $value );
$this->stack->popTag( $value );
case 'h5':
case 'h6':
if ( !$this->stack->inScope( BalanceSets::$headingSet ) ) {
- return true; # ignore
+ return true; // ignore
}
$this->stack->generateImpliedEndTags();
$this->stack->popTag( BalanceSets::$headingSet );
return true;
case 'sarcasm':
- # Take a deep breath, then:
+ // Take a deep breath, then:
break;
case 'a':
case 'tt':
case 'u':
if ( $this->stack->adoptionAgency( $value, $this->afe ) ) {
- return true; # If we did something, we're done.
+ return true; // If we did something, we're done.
}
- break; # Go to the "any other end tag" case.
+ break; // Go to the "any other end tag" case.
case 'applet':
case 'marquee':
case 'object':
if ( !$this->stack->inScope( $value ) ) {
- return true; # ignore
+ return true; // ignore
}
$this->stack->generateImpliedEndTags();
$this->stack->popTag( $value );
return true;
case 'br':
- # Turn </br> into <br>
+ // Turn </br> into <br>
return $this->inBodyMode( 'tag', $value, [] );
}
foreach ( $this->stack as $i => $node ) {
if ( $node->isHtmlNamed( $value ) ) {
$this->stack->generateImpliedEndTags( $value );
- $this->stack->popTo( $i ); # including $i
+ $this->stack->popTo( $i ); // including $i
break;
} elseif ( $node->isA( BalanceSets::$specialSet ) ) {
return true; // ignore this close token.
}
}
- private function inTableMode( $token, $value, $attribs = null, $selfclose = false ) {
+ private function inTableMode( $token, $value, $attribs = null, $selfClose = false ) {
if ( $token === 'text' ) {
if ( $this->textIntegrationMode ) {
- return $this->inBodyMode( $token, $value, $attribs, $selfclose );
+ return $this->inBodyMode( $token, $value, $attribs, $selfClose );
} elseif ( $this->stack->currentNode->isA( BalanceSets::$tableSectionRowSet ) ) {
$this->pendingTableText = '';
$this->originalInsertionMode = $this->parseMode;
- return $this->switchModeAndReprocess( 'inTableTextMode', $token, $value, $attribs, $selfclose );
+ return $this->switchModeAndReprocess( 'inTableTextMode',
+ $token, $value, $attribs, $selfClose );
}
// fall through to default case.
} elseif ( $token === 'eof' ) {
return true;
case 'col':
$this->inTableMode( 'tag', 'colgroup', [] );
- return $this->insertToken( $token, $value, $attribs, $selfclose );
+ return $this->insertToken( $token, $value, $attribs, $selfClose );
case 'tbody':
case 'tfoot':
case 'thead':
case 'th':
case 'tr':
$this->inTableMode( 'tag', 'tbody', [] );
- return $this->insertToken( $token, $value, $attribs, $selfclose );
+ return $this->insertToken( $token, $value, $attribs, $selfClose );
case 'table':
if ( !$this->stack->inTableScope( $value ) ) {
return true; // Ignore this tag.
}
$this->inTableMode( 'endtag', $value );
- return $this->insertToken( $token, $value, $attribs, $selfclose );
+ return $this->insertToken( $token, $value, $attribs, $selfClose );
case 'style':
- # OMITTED: <script>
+ // OMITTED: <script>
case 'template':
- return $this->inHeadMode( $token, $value, $attribs, $selfclose );
+ return $this->inHeadMode( $token, $value, $attribs, $selfClose );
case 'input':
if ( !isset( $attribs['type'] ) || strcasecmp( $attribs['type'], 'hidden' ) !== 0 ) {
$this->stack->popTag( $value );
$this->resetInsertionMode();
return true;
- # OMITTED: <body>
+ // OMITTED: <body>
case 'caption':
case 'col':
case 'colgroup':
- # OMITTED: <html>
+ // OMITTED: <html>
case 'tbody':
case 'td':
case 'tfoot':
case 'tr':
return true; // Ignore the token.
case 'template':
- return $this->inHeadMode( $token, $value, $attribs, $selfclose );
+ return $this->inHeadMode( $token, $value, $attribs, $selfClose );
}
// Fall through for "anything else" clause.
} elseif ( $token === 'comment' ) {
}
// This is the "anything else" case:
$this->stack->fosterParentMode = true;
- $this->inBodyMode( $token, $value, $attribs, $selfclose );
+ $this->inBodyMode( $token, $value, $attribs, $selfClose );
$this->stack->fosterParentMode = false;
return true;
}
- private function inTableTextMode( $token, $value, $attribs = null, $selfclose = false ) {
+ private function inTableTextMode( $token, $value, $attribs = null, $selfClose = false ) {
if ( $token === 'text' ) {
$this->pendingTableText .= $value;
return true;
$this->stack->insertText( $text );
}
return $this->switchModeAndReprocess(
- $this->originalInsertionMode, $token, $value, $attribs, $selfclose
+ $this->originalInsertionMode, $token, $value, $attribs, $selfClose
);
}
return true;
}
- private function inCaptionMode( $token, $value, $attribs = null, $selfclose = false ) {
+ private function inCaptionMode( $token, $value, $attribs = null, $selfClose = false ) {
if ( $token === 'tag' ) {
switch ( $value ) {
case 'caption':
case 'thead':
case 'tr':
if ( $this->endCaption() ) {
- $this->insertToken( $token, $value, $attribs, $selfclose );
+ $this->insertToken( $token, $value, $attribs, $selfClose );
}
return true;
}
return true;
case 'table':
if ( $this->endCaption() ) {
- $this->insertToken( $token, $value, $attribs, $selfclose );
+ $this->insertToken( $token, $value, $attribs, $selfClose );
}
return true;
case 'body':
case 'col':
case 'colgroup':
- # OMITTED: <html>
+ // OMITTED: <html>
case 'tbody':
case 'td':
case 'tfoot':
// Fall through to "anything else" case.
}
// The Anything Else case
- return $this->inBodyMode( $token, $value, $attribs, $selfclose );
+ return $this->inBodyMode( $token, $value, $attribs, $selfClose );
}
- private function inColumnGroupMode( $token, $value, $attribs = null, $selfclose = false ) {
+ private function inColumnGroupMode( $token, $value, $attribs = null, $selfClose = false ) {
if ( $token === 'text' ) {
if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $matches ) ) {
$this->stack->insertText( $matches[0] );
// Fall through to handle non-whitespace below.
} elseif ( $token === 'tag' ) {
switch ( $value ) {
- # OMITTED: <html>
+ // OMITTED: <html>
case 'col':
$this->stack->insertHTMLElement( $value, $attribs );
$this->stack->pop();
return true;
case 'template':
- return $this->inHeadMode( $token, $value, $attribs, $selfclose );
+ return $this->inHeadMode( $token, $value, $attribs, $selfClose );
}
// Fall through for "anything else".
} elseif ( $token === 'endtag' ) {
case 'col':
return true; // Ignore the token.
case 'template':
- return $this->inHeadMode( $token, $value, $attribs, $selfclose );
+ return $this->inHeadMode( $token, $value, $attribs, $selfClose );
}
// Fall through for "anything else".
} elseif ( $token === 'eof' ) {
- return $this->inBodyMode( $token, $value, $attribs, $selfclose );
+ return $this->inBodyMode( $token, $value, $attribs, $selfClose );
} elseif ( $token === 'comment' ) {
$this->stack->insertComment( $value );
return true;
return true; // Ignore the token.
}
$this->inColumnGroupMode( 'endtag', 'colgroup' );
- return $this->insertToken( $token, $value, $attribs, $selfclose );
+ return $this->insertToken( $token, $value, $attribs, $selfClose );
}
// Helper function for inTableBodyMode
$this->switchMode( 'inTableMode' );
return true;
}
- private function inTableBodyMode( $token, $value, $attribs = null, $selfclose = false ) {
+ private function inTableBodyMode( $token, $value, $attribs = null, $selfClose = false ) {
if ( $token === 'tag' ) {
switch ( $value ) {
case 'tr':
case 'th':
case 'td':
$this->inTableBodyMode( 'tag', 'tr', [] );
- $this->insertToken( $token, $value, $attribs, $selfclose );
+ $this->insertToken( $token, $value, $attribs, $selfClose );
return true;
case 'caption':
case 'col':
case 'tfoot':
case 'thead':
if ( $this->endSection() ) {
- $this->insertToken( $token, $value, $attribs, $selfclose );
+ $this->insertToken( $token, $value, $attribs, $selfClose );
}
return true;
}
switch ( $value ) {
case 'table':
if ( $this->endSection() ) {
- $this->insertToken( $token, $value, $attribs, $selfclose );
+ $this->insertToken( $token, $value, $attribs, $selfClose );
}
return true;
case 'tbody':
$this->endSection();
}
return true;
- # OMITTED: <body>
+ // OMITTED: <body>
case 'caption':
case 'col':
case 'colgroup':
- # OMITTED: <html>
+ // OMITTED: <html>
case 'td':
case 'th':
case 'tr':
}
}
// Anything else:
- return $this->inTableMode( $token, $value, $attribs, $selfclose );
+ return $this->inTableMode( $token, $value, $attribs, $selfClose );
}
// Helper function for inRowMode
$this->switchMode( 'inTableBodyMode' );
return true;
}
- private function inRowMode( $token, $value, $attribs = null, $selfclose = false ) {
+ private function inRowMode( $token, $value, $attribs = null, $selfClose = false ) {
if ( $token === 'tag' ) {
switch ( $value ) {
case 'th':
case 'thead':
case 'tr':
if ( $this->endRow() ) {
- $this->insertToken( $token, $value, $attribs, $selfclose );
+ $this->insertToken( $token, $value, $attribs, $selfClose );
}
return true;
}
return true;
case 'table':
if ( $this->endRow() ) {
- $this->insertToken( $token, $value, $attribs, $selfclose );
+ $this->insertToken( $token, $value, $attribs, $selfClose );
}
return true;
case 'tbody':
$this->stack->inTableScope( $value ) &&
$this->endRow()
) {
- $this->insertToken( $token, $value, $attribs, $selfclose );
+ $this->insertToken( $token, $value, $attribs, $selfClose );
}
return true;
- # OMITTED: <body>
+ // OMITTED: <body>
case 'caption':
case 'col':
case 'colgroup':
- # OMITTED: <html>
+ // OMITTED: <html>
case 'td':
case 'th':
return true; // Ignore the token.
}
}
// Anything else:
- return $this->inTableMode( $token, $value, $attribs, $selfclose );
+ return $this->inTableMode( $token, $value, $attribs, $selfClose );
}
// Helper for inCellMode
return false;
}
}
- private function inCellMode( $token, $value, $attribs = null, $selfclose = false ) {
+ private function inCellMode( $token, $value, $attribs = null, $selfClose = false ) {
if ( $token === 'tag' ) {
switch ( $value ) {
case 'caption':
case 'thead':
case 'tr':
if ( $this->endCell() ) {
- $this->insertToken( $token, $value, $attribs, $selfclose );
+ $this->insertToken( $token, $value, $attribs, $selfClose );
}
return true;
}
$this->switchMode( 'inRowMode' );
}
return true;
- # OMITTED: <body>
+ // OMITTED: <body>
case 'caption':
case 'col':
case 'colgroup':
- # OMITTED: <html>
+ // OMITTED: <html>
return true;
case 'table':
$this->stack->popTag( BalanceSets::$tableCellSet );
$this->afe->clearToMarker();
$this->switchMode( 'inRowMode' );
- $this->insertToken( $token, $value, $attribs, $selfclose );
+ $this->insertToken( $token, $value, $attribs, $selfClose );
}
return true;
}
}
// Anything else:
- return $this->inBodyMode( $token, $value, $attribs, $selfclose );
+ return $this->inBodyMode( $token, $value, $attribs, $selfClose );
}
- private function inSelectMode( $token, $value, $attribs = null, $selfclose = false ) {
+ private function inSelectMode( $token, $value, $attribs = null, $selfClose = false ) {
if ( $token === 'text' ) {
$this->stack->insertText( $value );
return true;
} elseif ( $token === 'eof' ) {
- return $this->inBodyMode( $token, $value, $attribs, $selfclose );
+ return $this->inBodyMode( $token, $value, $attribs, $selfClose );
} elseif ( $token === 'tag' ) {
switch ( $value ) {
- # OMITTED: <html>
+ // OMITTED: <html>
case 'option':
if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
$this->stack->pop();
return true; // ignore token (fragment case)
}
$this->inSelectMode( 'endtag', 'select' );
- return $this->insertToken( $token, $value, $attribs, $selfclose );
+ return $this->insertToken( $token, $value, $attribs, $selfClose );
case 'script':
case 'template':
- return $this->inHeadMode( $token, $value, $attribs, $selfclose );
+ return $this->inHeadMode( $token, $value, $attribs, $selfClose );
}
} elseif ( $token === 'endtag' ) {
switch ( $value ) {
$this->resetInsertionMode();
return true;
case 'template':
- return $this->inHeadMode( $token, $value, $attribs, $selfclose );
+ return $this->inHeadMode( $token, $value, $attribs, $selfClose );
}
} elseif ( $token === 'comment' ) {
$this->stack->insertComment( $value );
return true;
}
/**
 * Insertion-mode handler used while a <select> is open inside a table.
 *
 * For the table-structure tag names listed in the switch below:
 *  - a start tag ('tag') first feeds a synthetic </select> end tag through
 *    this handler, then reprocesses the original token via insertToken();
 *  - an end tag ('endtag') does the same, but only when the stack reports
 *    that element in table scope; otherwise the token is ignored and true
 *    is returned.
 * Every other token is delegated to inSelectMode().
 *
 * @param string $token Token kind; compared against 'tag' and 'endtag' here
 * @param mixed $value Switched on as a tag name; otherwise passed through
 * @param mixed $attribs Passed through to insertToken() / inSelectMode()
 * @param bool $selfClose Passed through to insertToken() / inSelectMode()
 * @return mixed true when an end tag is ignored, else the delegate's result
 */
- private function inSelectInTableMode( $token, $value, $attribs = null, $selfclose = false ) {
+ private function inSelectInTableMode( $token, $value, $attribs = null, $selfClose = false ) {
switch ( $value ) {
case 'caption':
case 'table':
case 'th':
if ( $token === 'tag' ) {
// Act as if </select> had been seen, then process this start tag again.
$this->inSelectInTableMode( 'endtag', 'select' );
- return $this->insertToken( $token, $value, $attribs, $selfclose );
+ return $this->insertToken( $token, $value, $attribs, $selfClose );
} elseif ( $token === 'endtag' ) {
// Only act on the end tag if a matching element is in table scope.
if ( $this->stack->inTableScope( $value ) ) {
$this->inSelectInTableMode( 'endtag', 'select' );
- return $this->insertToken( $token, $value, $attribs, $selfclose );
+ return $this->insertToken( $token, $value, $attribs, $selfClose );
}
// No matching element in table scope: ignore the end tag.
return true;
}
}
// anything else
- return $this->inSelectMode( $token, $value, $attribs, $selfclose );
+ return $this->inSelectMode( $token, $value, $attribs, $selfClose );
}
- private function inTemplateMode( $token, $value, $attribs = null, $selfclose = false ) {
+ private function inTemplateMode( $token, $value, $attribs = null, $selfClose = false ) {
if ( $token === 'text' || $token === 'comment' ) {
- return $this->inBodyMode( $token, $value, $attribs, $selfclose );
+ return $this->inBodyMode( $token, $value, $attribs, $selfClose );
} elseif ( $token === 'eof' ) {
if ( $this->stack->indexOf( 'template' ) < 0 ) {
$this->stopParsing();
$this->afe->clearToMarker();
array_pop( $this->templateInsertionModes );
$this->resetInsertionMode();
- $this->insertToken( $token, $value, $attribs, $selfclose );
+ $this->insertToken( $token, $value, $attribs, $selfClose );
}
return true;
} elseif ( $token === 'tag' ) {
case 'link':
case 'meta':
case 'noframes':
- # OMITTED: <script>
+ // OMITTED: <script>
case 'style':
case 'template':
- # OMITTED: <title>
- return $this->inHeadMode( $token, $value, $attribs, $selfclose );
+ // OMITTED: <title>
+ return $this->inHeadMode( $token, $value, $attribs, $selfClose );
case 'caption':
case 'colgroup':
case 'tfoot':
case 'thead':
return $this->switchModeAndReprocess(
- 'inTableMode', $token, $value, $attribs, $selfclose
+ 'inTableMode', $token, $value, $attribs, $selfClose
);
case 'col':
return $this->switchModeAndReprocess(
- 'inColumnGroupMode', $token, $value, $attribs, $selfclose
+ 'inColumnGroupMode', $token, $value, $attribs, $selfClose
);
case 'tr':
return $this->switchModeAndReprocess(
- 'inTableBodyMode', $token, $value, $attribs, $selfclose
+ 'inTableBodyMode', $token, $value, $attribs, $selfClose
);
case 'td':
case 'th':
return $this->switchModeAndReprocess(
- 'inRowMode', $token, $value, $attribs, $selfclose
+ 'inRowMode', $token, $value, $attribs, $selfClose
);
}
return $this->switchModeAndReprocess(
- 'inBodyMode', $token, $value, $attribs, $selfclose
+ 'inBodyMode', $token, $value, $attribs, $selfClose
);
} elseif ( $token === 'endtag' ) {
switch ( $value ) {
case 'template':
- return $this->inHeadMode( $token, $value, $attribs, $selfclose );
+ return $this->inHeadMode( $token, $value, $attribs, $selfClose );
}
return true;
} else {