3 * An implementation of the tree building portion of the HTML5 parsing
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
24 * @author C. Scott Ananian, 2016
26 namespace MediaWiki\Tidy
;
28 use Wikimedia\Assert\Assert
;
29 use Wikimedia\Assert\ParameterAssertionException
;
31 use \IteratorAggregate
;
32 use \ReverseArrayIterator
;
35 # A note for future librarization[1] -- this file is a good candidate
36 # for splitting into an independent library, except that it is currently
37 # highly optimized for MediaWiki use. It only implements the portions
38 # of the HTML5 tree builder used by tags supported by MediaWiki, and
39 # does not contain a true tokenizer pass, instead relying on
40 # comment stripping, attribute normalization, and escaping done by
41 # the MediaWiki Sanitizer. It also deliberately avoids building
42 # a true DOM in memory, instead serializing elements to an output string
43 # as soon as possible (usually as soon as the tag is closed) to reduce
44 # its memory footprint.
46 # On the other hand, I've been pretty careful to note with comments in the
47 # code the places where this implementation omits features of the spec or
48 # depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
49 # implement the missing pieces and make this a standalone PHP HTML5 parser.
50 # In order to do so, some sort of MediaWiki-specific API will need
51 # to be added to (a) allow the Balancer to bypass the tokenizer,
52 # and (b) support on-the-fly flattening instead of DOM node creation.
54 # [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki
57 * Utility constants and sets for the HTML5 tree building algorithm.
58 * Sets are associative arrays indexed first by namespace and then by
59 * lower-cased tag name.
65 const HTML_NAMESPACE
= 'http://www.w3.org/1999/xhtml';
66 const MATHML_NAMESPACE
= 'http://www.w3.org/1998/Math/MathML';
67 const SVG_NAMESPACE
= 'http://www.w3.org/2000/svg';
69 public static $unsupportedSet = [
70 self
::HTML_NAMESPACE
=> [
71 'html' => true, 'head' => true, 'body' => true, 'frameset' => true,
72 'form' => true, 'frame' => true,
73 'plaintext' => true, 'isindex' => true, 'textarea' => true,
74 'xmp' => true, 'iframe' => true, 'noembed' => true,
75 'noscript' => true, 'select' => true, 'script' => true,
80 public static $emptyElementSet = [
81 self
::HTML_NAMESPACE
=> [
82 'area' => true, 'base' => true, 'basefont' => true,
83 'bgsound' => true, 'br' => true, 'col' => true, 'command' => true,
84 'embed' => true, 'frame' => true, 'hr' => true, 'img' => true,
85 'input' => true, 'keygen' => true, 'link' => true, 'meta' => true,
86 'param' => true, 'source' => true, 'track' => true, 'wbr' => true
90 public static $headingSet = [
91 self
::HTML_NAMESPACE
=> [
92 'h1' => true, 'h2' => true, 'h3' => true,
93 'h4' => true, 'h5' => true, 'h6' => true
97 public static $specialSet = [
98 self
::HTML_NAMESPACE
=> [
99 'address' => true, 'applet' => true, 'area' => true,
100 'article' => true, 'aside' => true, 'base' => true,
101 'basefont' => true, 'bgsound' => true, 'blockquote' => true,
102 'body' => true, 'br' => true, 'button' => true, 'caption' => true,
103 'center' => true, 'col' => true, 'colgroup' => true, 'dd' => true,
104 'details' => true, 'dir' => true, 'div' => true, 'dl' => true,
105 'dt' => true, 'embed' => true, 'fieldset' => true,
106 'figcaption' => true, 'figure' => true, 'footer' => true,
107 'form' => true, 'frame' => true, 'frameset' => true, 'h1' => true,
108 'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true,
109 'h6' => true, 'head' => true, 'header' => true, 'hgroup' => true,
110 'hr' => true, 'html' => true, 'iframe' => true, 'img' => true,
111 'input' => true, 'isindex' => true, 'li' => true, 'link' => true,
112 'listing' => true, 'main' => true, 'marquee' => true,
113 'menu' => true, 'menuitem' => true, 'meta' => true, 'nav' => true,
114 'noembed' => true, 'noframes' => true, 'noscript' => true,
115 'object' => true, 'ol' => true, 'p' => true, 'param' => true,
116 'plaintext' => true, 'pre' => true, 'script' => true,
117 'section' => true, 'select' => true, 'source' => true,
118 'style' => true, 'summary' => true, 'table' => true,
119 'tbody' => true, 'td' => true, 'template' => true,
120 'textarea' => true, 'tfoot' => true, 'th' => true, 'thead' => true,
121 'title' => true, 'tr' => true, 'track' => true, 'ul' => true,
122 'wbr' => true, 'xmp' => true
124 self
::SVG_NAMESPACE
=> [
125 'foreignobject' => true, 'desc' => true, 'title' => true
127 self
::MATHML_NAMESPACE
=> [
128 'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
129 'mtext' => true, 'annotation-xml' => true
133 public static $addressDivPSet = [
134 self
::HTML_NAMESPACE
=> [
135 'address' => true, 'div' => true, 'p' => true
139 public static $tableSectionRowSet = [
140 self
::HTML_NAMESPACE
=> [
141 'table' => true, 'thead' => true, 'tbody' => true,
142 'tfoot' => true, 'tr' => true
146 public static $impliedEndTagsSet = [
147 self
::HTML_NAMESPACE
=> [
148 'dd' => true, 'dt' => true, 'li' => true, 'optgroup' => true,
149 'option' => true, 'p' => true, 'rb' => true, 'rp' => true,
150 'rt' => true, 'rtc' => true
154 public static $thoroughImpliedEndTagsSet = [
155 self
::HTML_NAMESPACE
=> [
156 'caption' => true, 'colgroup' => true, 'dd' => true, 'dt' => true,
157 'li' => true, 'optgroup' => true, 'option' => true, 'p' => true,
158 'rb' => true, 'rp' => true, 'rt' => true, 'rtc' => true,
159 'tbody' => true, 'td' => true, 'tfoot' => true, 'th' => true,
160 'thead' => true, 'tr' => true
164 public static $tableCellSet = [
165 self
::HTML_NAMESPACE
=> [
166 'td' => true, 'th' => true
169 public static $tableContextSet = [
170 self
::HTML_NAMESPACE
=> [
171 'table' => true, 'template' => true, 'html' => true
175 public static $tableBodyContextSet = [
176 self
::HTML_NAMESPACE
=> [
177 'tbody' => true, 'tfoot' => true, 'thead' => true,
178 'template' => true, 'html' => true
182 public static $tableRowContextSet = [
183 self
::HTML_NAMESPACE
=> [
184 'tr' => true, 'template' => true, 'html' => true
188 # OMITTED: formAssociatedSet, since we don't allow <form>
190 public static $inScopeSet = [
191 self
::HTML_NAMESPACE
=> [
192 'applet' => true, 'caption' => true, 'html' => true,
193 'marquee' => true, 'object' => true,
194 'table' => true, 'td' => true, 'template' => true,
197 self
::SVG_NAMESPACE
=> [
198 'foreignobject' => true, 'desc' => true, 'title' => true
200 self
::MATHML_NAMESPACE
=> [
201 'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
202 'mtext' => true, 'annotation-xml' => true
206 private static $inListItemScopeSet = null;
207 public static function inListItemScopeSet() {
208 if ( self
::$inListItemScopeSet === null ) {
209 self
::$inListItemScopeSet = self
::$inScopeSet;
210 self
::$inListItemScopeSet[self
::HTML_NAMESPACE
]['ol'] = true;
211 self
::$inListItemScopeSet[self
::HTML_NAMESPACE
]['ul'] = true;
213 return self
::$inListItemScopeSet;
216 private static $inButtonScopeSet = null;
217 public static function inButtonScopeSet() {
218 if ( self
::$inButtonScopeSet === null ) {
219 self
::$inButtonScopeSet = self
::$inScopeSet;
220 self
::$inButtonScopeSet[self
::HTML_NAMESPACE
]['button'] = true;
222 return self
::$inButtonScopeSet;
225 public static $inTableScopeSet = [
226 self
::HTML_NAMESPACE
=> [
227 'html' => true, 'table' => true, 'template' => true
231 public static $mathmlTextIntegrationPointSet = [
232 self
::MATHML_NAMESPACE
=> [
233 'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
238 public static $htmlIntegrationPointSet = [
239 self
::SVG_NAMESPACE
=> [
240 'foreignobject' => true,
246 // For tidy compatibility.
247 public static $tidyPWrapSet = [
248 self
::HTML_NAMESPACE
=> [
249 'body' => true, 'blockquote' => true,
250 // We parse with <body> as the fragment context, but the top-level
251 // element on the stack is actually <html>. We could use the
252 // "adjusted current node" everywhere to work around this, but it's
253 // easier just to add <html> to the p-wrap set.
257 public static $tidyInlineSet = [
258 self
::HTML_NAMESPACE
=> [
259 'a' => true, 'abbr' => true, 'acronym' => true, 'applet' => true,
260 'b' => true, 'basefont' => true, 'bdo' => true, 'big' => true,
261 'br' => true, 'button' => true, 'cite' => true, 'code' => true,
262 'dfn' => true, 'em' => true, 'font' => true, 'i' => true,
263 'iframe' => true, 'img' => true, 'input' => true, 'kbd' => true,
264 'label' => true, 'legend' => true, 'map' => true, 'object' => true,
265 'param' => true, 'q' => true, 'rb' => true, 'rbc' => true,
266 'rp' => true, 'rt' => true, 'rtc' => true, 'ruby' => true,
267 's' => true, 'samp' => true, 'select' => true, 'small' => true,
268 'span' => true, 'strike' => true, 'strong' => true, 'sub' => true,
269 'sup' => true, 'textarea' => true, 'tt' => true, 'u' => true,
276 * A BalanceElement is a simplified version of a DOM Node. The main
277 * difference is that we only keep BalanceElements around for nodes
278 * currently on the BalanceStack of open elements. As soon as an
279 * element is closed, with some minor exceptions relating to the
280 * tree builder "adoption agency algorithm", the element and all its
281 * children are serialized to a string using the flatten() method.
282 * This keeps our memory usage low.
287 class BalanceElement
{
289 * The namespace of the element.
290 * @var string $namespaceURI
292 public $namespaceURI;
294 * The lower-cased name of the element.
295 * @var string $localName
299 * Attributes for the element, in array form
300 * @var array $attribs
305 * Parent of this element, or the string "flat" if this element has
306 * already been flattened into its parent.
307 * @var BalanceElement|string|null $parent
312 * An array of children of this element. Typically only the last
313 * child will be an actual BalanceElement object; the rest will
314 * be strings, representing either text nodes or flattened
315 * BalanceElement objects.
316 * @var array $children
321 * A unique string identifier for Noah's Ark purposes, lazily initialized
326 * The next active formatting element in the list, or null if this is the
327 * end of the AFE list or if the element is not in the AFE list.
332 * The previous active formatting element in the list, or null if this is
333 * the start of the list or if the element is not in the AFE list.
338 * The next element in the Noah's Ark species bucket.
343 * Make a new BalanceElement corresponding to the HTML DOM Element
344 * with the given localname, namespace, and attributes.
346 * @param string $namespaceURI The namespace of the element.
347 * @param string $localName The lowercased name of the tag.
348 * @param array $attribs Attributes of the element
350 public function __construct( $namespaceURI, $localName, array $attribs ) {
351 Assert
::parameterType( 'string', $namespaceURI, '$namespaceURI' );
352 Assert
::parameterType( 'string', $localName, '$localName' );
354 $this->localName
= $localName;
355 $this->namespaceURI
= $namespaceURI;
356 $this->attribs
= $attribs;
357 $this->contents
= '';
358 $this->parent
= null;
359 $this->children
= [];
363 * Remove the given child from this element.
364 * @param BalanceElement $elt
366 private function removeChild( $elt ) {
367 Assert
::precondition(
368 $this->parent
!== 'flat', "Can't removeChild after flattening $this"
370 Assert
::parameterType( 'MediaWiki\Tidy\BalanceElement', $elt, '$elt' );
372 $elt->parent
=== $this, 'elt', 'must have $this as a parent'
374 $idx = array_search( $elt, $this->children
, true );
375 Assert
::parameter( $idx !== false, '$elt', 'must be a child of $this' );
377 array_splice( $this->children
, $idx, 1 );
381 * Find $a in the list of children and insert $b before it.
382 * @param BalanceElement $a
383 * @param BalanceElement|string $b
385 public function insertBefore( $a, $b ) {
386 Assert
::precondition(
387 $this->parent
!== 'flat', "Can't insertBefore after flattening."
389 Assert
::parameterType( 'MediaWiki\Tidy\BalanceElement', $a, '$a' );
390 Assert
::parameterType( 'MediaWiki\Tidy\BalanceElement|string', $b, '$b' );
391 $idx = array_search( $a, $this->children
, true );
392 Assert
::parameter( $idx !== false, '$a', 'must be a child of $this' );
393 if ( is_string( $b ) ) {
394 array_splice( $this->children
, $idx, 0, [ $b ] );
396 Assert
::parameter( $b->parent
!== 'flat', '$b', "Can't be flat" );
397 if ( $b->parent
!== null ) {
398 $b->parent
->removeChild( $b );
400 array_splice( $this->children
, $idx, 0, [ $b ] );
406 * Append $elt to the end of the list of children.
407 * @param BalanceElement|string $elt
409 public function appendChild( $elt ) {
410 Assert
::precondition(
411 $this->parent
!== 'flat', "Can't appendChild after flattening."
413 Assert
::parameterType( 'MediaWiki\Tidy\BalanceElement|string', $elt, '$elt' );
414 if ( is_string( $elt ) ) {
415 array_push( $this->children
, $elt );
418 // Remove $elt from parent, if it had one.
419 if ( $elt->parent
!== null ) {
420 $elt->parent
->removeChild( $elt );
422 array_push( $this->children
, $elt );
423 $elt->parent
= $this;
427 * Transfer all of the children of $elt to $this.
428 * @param BalanceElement $elt
430 public function adoptChildren( $elt ) {
431 Assert
::parameterType( 'MediaWiki\Tidy\BalanceElement', $elt, '$elt' );
432 Assert
::precondition(
433 $elt->parent
!== 'flat', "Can't adoptChildren after flattening."
435 foreach ( $elt->children
as $child ) {
436 if ( !is_string( $child ) ) {
437 // This is an optimization which avoids an O(n^2) set of
438 // array_splice operations.
439 $child->parent
= null;
441 $this->appendChild( $child );
447 * Flatten this node and all of its children into a string, as specified
448 * by the HTML serialization specification, and replace this node
449 * in its parent by that string.
453 public function flatten( $tidyCompat = false ) {
454 Assert
::parameter( $this->parent
!== null, '$this', 'must be a child' );
455 Assert
::parameter( $this->parent
!== 'flat', '$this', 'already flat' );
456 $idx = array_search( $this, $this->parent
->children
, true );
458 $idx !== false, '$this', 'must be a child of its parent'
462 foreach ( $this->children
as $elt ) {
463 if ( !is_string( $elt ) ) {
464 $elt = $elt->flatten( $tidyCompat );
466 if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) {
470 if ( $this->isA( 'mw:p-wrap' ) ) {
471 $this->localName
= 'p';
472 } elseif ( $blank ) {
473 // Add 'mw-empty-elt' class so elements can be hidden via CSS
474 // for compatibility with legacy tidy.
475 if ( !count( $this->attribs
) &&
476 ( $this->localName
=== 'tr' ||
$this->localName
=== 'li' )
478 $this->attribs
= [ 'class' => "mw-empty-elt" ];
482 $flat = $blank ?
'' : "{$this}";
486 $this->parent
->children
[$idx] = $flat;
487 $this->parent
= 'flat'; # for assertion checking
492 * Serialize this node and all of its children to a string, as specified
493 * by the HTML serialization specification.
495 * @return string The serialization of the BalanceElement
496 * @see https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments
498 public function __toString() {
500 foreach ( $this->attribs
as $name => $value ) {
501 $encValue = Sanitizer
::encodeAttribute( $value );
502 $encAttribs .= " $name=\"$encValue\"";
504 if ( !$this->isA( BalanceSets
::$emptyElementSet ) ) {
505 $out = "<{$this->localName}{$encAttribs}>";
507 foreach ( $this->children
as $elt ) {
510 $out .= "</{$this->localName}>";
512 $out = "<{$this->localName}{$encAttribs} />";
514 count( $this->children
) === 0,
515 "Empty elements shouldn't have children."
521 # Utility functions on BalanceElements.
524 * Determine if $this represents a specific HTML tag, is a member of
525 * a tag set, or is equal to another BalanceElement.
527 * @param BalanceElement|array|string $set The target BalanceElement,
528 * set (from the BalanceSets class), or string (HTML tag name).
531 public function isA( $set ) {
532 Assert
::parameterType( 'MediaWiki\Tidy\BalanceElement|array|string', $set, '$set' );
533 if ( $set instanceof BalanceElement
) {
534 return $this === $set;
535 } elseif ( is_array( $set ) ) {
536 return isset( $set[$this->namespaceURI
] ) &&
537 isset( $set[$this->namespaceURI
][$this->localName
] );
539 # assume this is an HTML element name.
540 return $this->isHtml() && $this->localName
=== $set;
545 * Determine if $this represents an element in the HTML namespace.
549 public function isHtml() {
550 return $this->namespaceURI
=== BalanceSets
::HTML_NAMESPACE
;
554 * Determine if $this represents a MathML text integration point,
555 * as defined in the HTML5 specification.
558 * @see https://html.spec.whatwg.org/multipage/syntax.html#mathml-text-integration-point
560 public function isMathmlTextIntegrationPoint() {
561 return $this->isA( BalanceSets
::$mathmlTextIntegrationPointSet );
565 * Determine if $this represents an HTML integration point,
566 * as defined in the HTML5 specification.
569 * @see https://html.spec.whatwg.org/multipage/syntax.html#html-integration-point
571 public function isHtmlIntegrationPoint() {
572 if ( $this->isA( BalanceSets
::$htmlIntegrationPointSet ) ) {
576 $this->namespaceURI
=== BalanceSets
::MATHML_NAMESPACE
&&
577 $this->localName
=== 'annotation-xml' &&
578 isset( $this->attribs
['encoding'] ) &&
579 ( strcasecmp( $this->attribs
['encoding'], 'text/html' ) == 0 ||
580 strcasecmp( $this->attribs
['encoding'], 'application/xhtml+xml' ) == 0 )
588 * Get a string key for the Noah's Ark algorithm
590 public function getNoahKey() {
591 if ( $this->noahKey
=== null ) {
592 $attribs = $this->attribs
;
594 $this->noahKey
= serialize( [ $this->namespaceURI
, $this->localName
, $attribs ] );
596 return $this->noahKey
;
601 * The "stack of open elements" as defined in the HTML5 tree builder
602 * spec. This contains methods to ensure that content (start tags, text)
603 * are inserted at the correct place in the output string, and to
604 * flatten BalanceElements as they are closed to avoid holding onto
605 * a complete DOM tree for the document in memory.
607 * The stack defines a PHP iterator to traverse it in "reverse order",
608 * that is, the most-recently-added element is visited first in a
613 * @see https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
615 class BalanceStack
implements IteratorAggregate
{
617 * Backing storage for the stack.
618 * @var array $elements
620 private $elements = [];
622 * Foster parent mode determines how nodes are inserted into the
624 * @var bool $fosterParentMode
625 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
627 public $fosterParentMode = false;
629 * Tidy compatibility mode, determines behavior of body/blockquote
631 public $tidyCompat = false;
634 * Create a new BalanceStack with a single BalanceElement on it,
635 * representing the root <html> node.
637 public function __construct() {
638 # always a root <html> element on the stack
641 new BalanceElement( BalanceSets
::HTML_NAMESPACE
, 'html', [] )
646 * Return a string representing the output of the tree builder:
647 * all the children of the root <html> node.
650 public function getOutput() {
651 // Don't include the outer '<html>....</html>'
653 foreach ( $this->elements
[0]->children
as $elt ) {
654 $out .= is_string( $elt ) ?
$elt :
655 $elt->flatten( $this->tidyCompat
);
661 * Insert text at the appropriate place for inserting a node.
662 * @param string $value
663 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
665 public function insertText( $value ) {
666 Assert
::parameterType( 'string', $value, '$value' );
668 $this->fosterParentMode
&&
669 $this->currentNode()->isA( BalanceSets
::$tableSectionRowSet )
671 $this->fosterParent( $value );
674 $this->currentNode()->isA( BalanceSets
::$tidyPWrapSet )
676 $this->insertHTMLELement( 'mw:p-wrap', [] );
677 return $this->insertText( $value );
679 $this->currentNode()->appendChild( $value );
684 * Insert a BalanceElement at the appropriate place, pushing it
685 * on to the open elements stack.
686 * @param string $namespaceURI The element namespace
687 * @param string $tag The tag name
688 * @param array $attribs Normalized attributes, in array form.
689 * @return BalanceElement
690 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-foreign-element
692 public function insertForeignElement( $namespaceURI, $tag, $attribs ) {
693 return $this->insertElement(
694 new BalanceElement( $namespaceURI, $tag, $attribs )
699 * Insert an HTML element at the appropriate place, pushing it on to
700 * the open elements stack.
701 * @param string $tag The tag name
702 * @param array $attribs Normalized attributes, in array form.
703 * @return BalanceElement
704 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-an-html-element
706 public function insertHTMLElement( $tag, $attribs ) {
707 return $this->insertForeignElement(
708 BalanceSets
::HTML_NAMESPACE
, $tag, $attribs
713 * Insert an element at the appropriate place and push it on to the
714 * open elements stack.
715 * @param BalanceElement $elt
716 * @return BalanceElement
717 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
719 public function insertElement( $elt ) {
720 Assert
::parameterType( 'MediaWiki\Tidy\BalanceElement', $elt, '$elt' );
722 $this->currentNode()->isA( 'mw:p-wrap' ) &&
723 !$elt->isA( BalanceSets
::$tidyInlineSet )
725 // Tidy compatibility.
729 $this->fosterParentMode
&&
730 $this->currentNode()->isA( BalanceSets
::$tableSectionRowSet )
732 $elt = $this->fosterParent( $elt );
734 $this->currentNode()->appendChild( $elt );
736 Assert
::invariant( $elt->parent
!== null, "$elt must be in tree" );
737 Assert
::invariant( $elt->parent
!== 'flat', "$elt must not have been previous flattened" );
738 array_push( $this->elements
, $elt );
743 * Determine if the stack has $tag in scope.
744 * @param BalanceElement|array|string $tag
746 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-scope
748 public function inScope( $tag ) {
749 return $this->inSpecificScope( $tag, BalanceSets
::$inScopeSet );
753 * Determine if the stack has $tag in button scope.
754 * @param BalanceElement|array|string $tag
756 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-button-scope
758 public function inButtonScope( $tag ) {
759 return $this->inSpecificScope( $tag, BalanceSets
::inButtonScopeSet() );
763 * Determine if the stack has $tag in list item scope.
764 * @param BalanceElement|array|string $tag
766 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-list-item-scope
768 public function inListItemScope( $tag ) {
769 return $this->inSpecificScope( $tag, BalanceSets
::inListItemScopeSet() );
773 * Determine if the stack has $tag in table scope.
774 * @param BalanceElement|array|string $tag
776 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-table-scope
778 public function inTableScope( $tag ) {
779 return $this->inSpecificScope( $tag, BalanceSets
::$inTableScopeSet );
783 * Determine if the stack has $tag in a specific scope, $set.
784 * @param BalanceElement|array|string $tag
785 * @param BalanceElement|array|string $set
787 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-the-specific-scope
789 public function inSpecificScope( $tag, $set ) {
790 foreach ( $this as $elt ) {
791 if ( $elt->isA( $tag ) ) {
794 if ( $elt->isA( $set ) ) {
802 * Generate implied end tags.
803 * @param BalanceElement|array|string|null $butnot
804 * @param bool $thorough True if we should generate end tags thoroughly.
805 * @see https://html.spec.whatwg.org/multipage/syntax.html#generate-implied-end-tags
807 public function generateImpliedEndTags( $butnot = null, $thorough = false ) {
808 $endTagSet = $thorough ?
809 BalanceSets
::$thoroughImpliedEndTagsSet :
810 BalanceSets
::$impliedEndTagsSet;
811 while ( $this->length() > 0 ) {
812 if ( $butnot !== null && $this->currentNode()->isA( $butnot ) ) {
815 if ( !$this->currentNode()->isA( $endTagSet ) ) {
823 * Return the current node (the element in the stack with the largest
825 * @return BalanceElement
826 * @see https://html.spec.whatwg.org/multipage/syntax.html#current-node
828 public function currentNode() {
829 return $this->node( count( $this->elements
) - 1 );
833 * Return the adjusted current node.
835 public function adjustedCurrentNode( $fragmentContext ) {
836 return ( $fragmentContext && $this->length() === 1 ) ?
837 $fragmentContext : $this->currentNode();
841 * Return an iterator over this stack which visits the current node
842 * first, and the root node last.
845 public function getIterator() {
846 return new ReverseArrayIterator( $this->elements
);
850 * Return the BalanceElement at the given position $idx, where
851 * position 0 represents the root element.
853 * @return BalanceElement
855 public function node( $idx ) {
856 return $this->elements
[ $idx ];
860 * Replace the element at position $idx in the BalanceStack with $elt.
862 * @param BalanceElement $elt
864 public function replaceAt( $idx, $elt ) {
865 Assert
::parameterType( 'MediaWiki\Tidy\BalanceElement', $elt, '$elt' );
866 Assert
::precondition(
867 $this->elements
[$idx]->parent
!== 'flat',
868 'Replaced element should not have already been flattened.'
870 Assert
::precondition(
871 $elt->parent
!== 'flat',
872 'New element should not have already been flattened.'
874 $this->elements
[$idx] = $elt;
878 * Return the position of the given BalanceElement, set, or
879 * HTML tag name string in the BalanceStack.
880 * @param BalanceElement|array|string $tag
883 public function indexOf( $tag ) {
884 for ( $i = count( $this->elements
) - 1; $i >= 0; $i-- ) {
885 if ( $this->elements
[$i]->isA( $tag ) ) {
893 * Return the number of elements currently in the BalanceStack.
896 public function length() {
897 return count( $this->elements
);
901 * Remove the current node from the BalanceStack, flattening it
904 public function pop() {
905 $elt = array_pop( $this->elements
);
906 if ( !$elt->isA( 'mw:p-wrap' ) ) {
907 $elt->flatten( $this->tidyCompat
);
912 * Remove all nodes up to and including position $idx from the
913 * BalanceStack, flattening them in the process.
916 public function popTo( $idx ) {
917 while ( $this->length() > $idx ) {
923 * Pop elements off the stack up to and including the first
924 * element with the specified HTML tagname (or matching the given
926 * @param BalanceElement|array|string $tag
928 public function popTag( $tag ) {
929 while ( $this->length() > 0 ) {
930 if ( $this->currentNode()->isA( $tag ) ) {
939 * Pop elements off the stack *not including* the first element
940 * in the specified set.
941 * @param BalanceElement|array|string $set
943 public function clearToContext( $set ) {
944 // Note that we don't loop to 0. Never pop the <html> elt off.
945 while ( $this->length() > 1 ) {
946 if ( $this->currentNode()->isA( $set ) ) {
954 * Remove the given $elt from the BalanceStack, optionally
955 * flattening it in the process.
956 * @param BalanceElement $elt The element to remove.
957 * @param bool $flatten Whether to flatten the removed element.
959 public function removeElement( $elt, $flatten = true ) {
960 Assert
::parameterType( 'MediaWiki\Tidy\BalanceElement', $elt, '$elt' );
962 $elt->parent
!== 'flat',
964 '$elt should not already have been flattened.'
967 $elt->parent
->parent
!== 'flat',
969 'The parent of $elt should not already have been flattened.'
971 $idx = array_search( $elt, $this->elements
, true );
972 Assert
::parameter( $idx !== false, '$elt', 'must be in stack' );
973 array_splice( $this->elements
, $idx, 1 );
975 // serialize $elt into its parent
976 // otherwise, it will eventually serialize when the parent
977 // is serialized, we just hold onto the memory for its
978 // tree of objects a little longer.
979 $elt->flatten( $this->tidyCompat
);
981 Assert
::postcondition(
982 array_search( $elt, $this->elements
, true ) === false,
983 '$elt should no longer be in open elements stack'
988 * Find $a in the BalanceStack and insert $b after it.
989 * @param BalanceElement $a
990 * @param BalanceElement $b
992 public function insertAfter( $a, $b ) {
993 Assert
::parameterType( 'MediaWiki\Tidy\BalanceElement', $a, '$a' );
994 Assert
::parameterType( 'MediaWiki\Tidy\BalanceElement', $b, '$b' );
995 $idx = $this->indexOf( $a );
996 Assert
::parameter( $idx !== false, '$a', 'must be in stack' );
997 array_splice( $this->elements
, $idx +
1, 0, [ $b ] );
1000 # Fostering and adoption.
1003 * Foster parent the given $elt in the stack of open elements.
1004 * @param BalanceElement|string $elt
1005 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
1007 private function fosterParent( $elt ) {
1008 Assert
::parameterType( 'MediaWiki\Tidy\BalanceElement|string', $elt, '$elt' );
1009 $lastTable = $this->indexOf( 'table' );
1010 $lastTemplate = $this->indexOf( 'template' );
1014 if ( $lastTemplate >= 0 && ( $lastTable < 0 ||
$lastTemplate > $lastTable ) ) {
1015 $parent = $this->elements
[$lastTemplate];
1016 } elseif ( $lastTable >= 0 ) {
1017 $parent = $this->elements
[$lastTable]->parent
;
1018 # Assume all tables have parents, since we're not running scripts!
1020 $parent !== null, "All tables should have parents"
1022 $before = $this->elements
[$lastTable];
1024 $parent = $this->elements
[0]; // the `html` element.
1027 if ( $this->tidyCompat
) {
1028 if ( is_string( $elt ) ) {
1029 // We're fostering text: do we need a p-wrapper?
1030 if ( $parent->isA( BalanceSets
::$tidyPWrapSet ) ) {
1031 $this->insertHTMLElement( 'mw:p-wrap', [] );
1032 $this->insertText( $elt );
1036 // We're fostering an element; do we need to merge p-wrappers?
1037 if ( $elt->isA( 'mw:p-wrap' ) ) {
1039 array_search( $before, $parent->children
, true ) :
1040 count( $parent->children
);
1041 $after = $idx > 0 ?
$parent->children
[$idx - 1] : '';
1043 $after instanceof BalanceElement
&&
1044 $after->isA( 'mw:p-wrap' )
1046 return $after; // Re-use existing p-wrapper.
1053 $parent->insertBefore( $before, $elt );
1055 $parent->appendChild( $elt );
1061 * Run the "adoption agency algorithm" (AAA) for the given subject
1063 * @param string $tag The subject tag name.
1064 * @param BalanceActiveFormattingElements $afe The current
1065 * active formatting elements list.
1066 * @return bool True if the adoption agency algorithm "did something", false
1067 * if more processing is required by the caller.
1068 * @see https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
1070 public function adoptionAgency( $tag, $afe ) {
1071 // If the current node is an HTML element whose tag name is subject,
1072 // and the current node is not in the list of active formatting
1073 // elements, then pop the current node off the stack of open
1074 // elements and abort these steps.
1076 $this->currentNode()->isA( $tag ) &&
1077 !$afe->isInList( $this->currentNode() )
1080 return true; // no more handling required
1083 // Let outer loop counter be zero.
1086 // Outer loop: If outer loop counter is greater than or
1087 // equal to eight, then abort these steps.
1088 while ( $outer < 8 ) {
1089 // Increment outer loop counter by one.
1092 // Let the formatting element be the last element in the list
1093 // of active formatting elements that: is between the end of
1094 // the list and the last scope marker in the list, if any, or
1095 // the start of the list otherwise, and has the same tag name
1097 $fmtelt = $afe->findElementByTag( $tag );
1099 // If there is no such node, then abort these steps and instead
1100 // act as described in the "any other end tag" entry below.
1102 return false; // false means handle by the default case
1105 // Otherwise, if there is such a node, but that node is not in
1106 // the stack of open elements, then this is a parse error;
1107 // remove the element from the list, and abort these steps.
1108 $index = $this->indexOf( $fmtelt );
1110 $afe->remove( $fmtelt );
1111 return true; // true means no more handling required
1114 // Otherwise, if there is such a node, and that node is also in
1115 // the stack of open elements, but the element is not in scope,
1116 // then this is a parse error; ignore the token, and abort
1118 if ( !$this->inScope( $fmtelt ) ) {
1122 // Let the furthest block be the topmost node in the stack of
1123 // open elements that is lower in the stack than the formatting
1124 // element, and is an element in the special category. There
1125 // might not be one.
1126 $furthestblock = null;
1127 $furthestblockindex = -1;
1128 $stacklen = $this->length();
1129 for ( $i = $index+
1; $i < $stacklen; $i++
) {
1130 if ( $this->node( $i )->isA( BalanceSets
::$specialSet ) ) {
1131 $furthestblock = $this->node( $i );
1132 $furthestblockindex = $i;
1137 // If there is no furthest block, then the UA must skip the
1138 // subsequent steps and instead just pop all the nodes from the
1139 // bottom of the stack of open elements, from the current node
1140 // up to and including the formatting element, and remove the
1141 // formatting element from the list of active formatting
1143 if ( !$furthestblock ) {
1144 $this->popTag( $fmtelt );
1145 $afe->remove( $fmtelt );
1148 // Let the common ancestor be the element immediately above
1149 // the formatting element in the stack of open elements.
1150 $ancestor = $this->node( $index-1 );
1152 // Let a bookmark note the position of the formatting
1153 // element in the list of active formatting elements
1154 // relative to the elements on either side of it in the
1156 $BOOKMARK = new BalanceElement( '[bookmark]', '[bookmark]', [] );
1157 $afe->insertAfter( $fmtelt, $BOOKMARK );
1159 // Let node and last node be the furthest block.
1160 $node = $furthestblock;
1161 $lastnode = $furthestblock;
1162 $nodeindex = $furthestblockindex;
1165 // Let inner loop counter be zero.
1170 // Increment inner loop counter by one.
1173 // Let node be the element immediately above node in
1174 // the stack of open elements, or if node is no longer
1175 // in the stack of open elements (e.g. because it got
1176 // removed by this algorithm), the element that was
1177 // immediately above node in the stack of open elements
1178 // before node was removed.
1179 $node = $this->node( --$nodeindex );
1181 // If node is the formatting element, then go
1182 // to the next step in the overall algorithm.
1183 if ( $node === $fmtelt ) break;
1185 // If the inner loop counter is greater than three and node
1186 // is in the list of active formatting elements, then remove
1187 // node from the list of active formatting elements.
1188 $isAFE = $afe->isInList( $node );
1189 if ( $inner > 3 && $isAFE ) {
1190 $afe->remove( $node );
1194 // If node is not in the list of active formatting
1195 // elements, then remove node from the stack of open
1196 // elements and then go back to the step labeled inner
1199 // Don't flatten here, since we're about to relocate
1200 // parts of this $node.
1201 $this->removeElement( $node, false );
1205 // Create an element for the token for which the
1206 // element node was created with common ancestor as
1207 // the intended parent, replace the entry for node
1208 // in the list of active formatting elements with an
1209 // entry for the new element, replace the entry for
1210 // node in the stack of open elements with an entry for
1211 // the new element, and let node be the new element.
1212 $newelt = new BalanceElement(
1213 $node->namespaceURI
, $node->localName
, $node->attribs
);
1214 $afe->replace( $node, $newelt );
1215 $this->replaceAt( $nodeindex, $newelt );
1218 // If last node is the furthest block, then move the
1219 // aforementioned bookmark to be immediately after the
1220 // new node in the list of active formatting elements.
1221 if ( $lastnode === $furthestblock ) {
1222 $afe->remove( $BOOKMARK );
1223 $afe->insertAfter( $newelt, $BOOKMARK );
1226 // Insert last node into node, first removing it from
1227 // its previous parent node if any.
1228 $node->appendChild( $lastnode );
1230 // Let last node be node.
1234 // If the common ancestor node is a table, tbody, tfoot,
1235 // thead, or tr element, then, foster parent whatever last
1236 // node ended up being in the previous step, first removing
1237 // it from its previous parent node if any.
1239 $this->fosterParentMode
&&
1240 $ancestor->isA( BalanceSets
::$tableSectionRowSet )
1242 $this->fosterParent( $lastnode );
1244 // Otherwise, append whatever last node ended up being in
1245 // the previous step to the common ancestor node, first
1246 // removing it from its previous parent node if any.
1247 $ancestor->appendChild( $lastnode );
1250 // Create an element for the token for which the
1251 // formatting element was created, with furthest block
1252 // as the intended parent.
1253 $newelt2 = new BalanceElement(
1254 $fmtelt->namespaceURI
, $fmtelt->localName
, $fmtelt->attribs
);
1256 // Take all of the child nodes of the furthest block and
1257 // append them to the element created in the last step.
1258 $newelt2->adoptChildren( $furthestblock );
1260 // Append that new element to the furthest block.
1261 $furthestblock->appendChild( $newelt2 );
1263 // Remove the formatting element from the list of active
1264 // formatting elements, and insert the new element into the
1265 // list of active formatting elements at the position of
1266 // the aforementioned bookmark.
1267 $afe->remove( $fmtelt );
1268 $afe->replace( $BOOKMARK, $newelt2 );
1270 // Remove the formatting element from the stack of open
1271 // elements, and insert the new element into the stack of
1272 // open elements immediately below the position of the
1273 // furthest block in that stack.
1274 $this->removeElement( $fmtelt );
1275 $this->insertAfter( $furthestblock, $newelt2 );
/**
 * Return the contents of the open elements stack as a string: the local
 * name of every element in array order (the `html` element at index 0
 * comes first), separated by single spaces.
 *
 * @return string
 */
public function __toString() {
	$names = [];
	foreach ( $this->elements as $elt ) {
		$names[] = $elt->localName;
	}
	// The separator has to be the first argument: the legacy
	// implode( $array, $separator ) order was deprecated in PHP 7.4 and
	// removed in PHP 8.0.
	return implode( ' ', $names );
}
1297 * A pseudo-element used as a marker in the list of active formatting elements
1302 class BalanceMarker
{
1308 * The list of active formatting elements, which is used to handle
1309 * mis-nested formatting element tags in the HTML5 tree builder
1314 * @see https://html.spec.whatwg.org/multipage/syntax.html#list-of-active-formatting-elements
1316 class BalanceActiveFormattingElements
{
1317 /** The last (most recent) element in the list */
1320 /** The first (least recent) element in the list */
1324 * An array of arrays representing the population of elements in each bucket
1325 * according to the Noah's Ark clause. The outer array is stack-like, with each
1326 * integer-indexed element representing a segment of the list, bounded by
1327 * markers. The first element represents the segment of the list before the
1330 * The inner arrays are indexed by "Noah key", which is a string which uniquely
1331 * identifies each bucket according to the rules in the spec. The value in
1332 * the inner array is the first (least recently inserted) element in the bucket,
1333 * and subsequent members of the bucket can be found by iterating through the
1334 * singly-linked list via $node->nextNoah.
1336 * This is optimised for the most common case of inserting into a bucket
1337 * with zero members, and deleting a bucket containing one member. In the
1338 * worst case, iteration through the list is still O(1) in the document
1339 * size, since each bucket can have at most 3 members.
1341 private $noahTableStack = [ [] ];
1343 public function __destruct() {
1344 for ( $node = $this->head
; $node; $node = $next ) {
1345 $next = $node->nextAFE
;
1346 $node->prevAFE
= $node->nextAFE
= $node->nextNoah
= null;
1348 $this->head
= $this->tail
= $this->noahTableStack
= null;
1351 public function insertMarker() {
1352 $elt = new BalanceMarker
;
1353 if ( $this->tail
) {
1354 $this->tail
->nextAFE
= $elt;
1355 $elt->prevAFE
= $this->tail
;
1360 $this->noahTableStack
[] = [];
1364 * Follow the steps required when the spec requires us to "push onto the
1365 * list of active formatting elements".
1366 * @param BalanceElement $elt
1368 public function push( BalanceElement
$elt ) {
1369 // Must not be in the list already
1370 if ( $elt->prevAFE
!== null ||
$this->head
=== $elt ) {
1371 throw new ParameterAssertionException( '$elt',
1372 'Cannot insert a node into the AFE list twice' );
1375 // "Noah's Ark clause" -- if there are already three copies of
1376 // this element before we encounter a marker, then drop the last
1378 $noahKey = $elt->getNoahKey();
1379 $table =& $this->noahTableStack
[ count( $this->noahTableStack
) - 1 ];
1380 if ( !isset( $table[$noahKey] ) ) {
1381 $table[$noahKey] = $elt;
1384 $head = $tail = $table[$noahKey];
1385 while ( $tail->nextNoah
) {
1386 $tail = $tail->nextNoah
;
1389 if ( $count >= 3 ) {
1390 $this->remove( $head );
1392 $tail->nextNoah
= $elt;
1394 // Add to the main AFE list
1395 if ( $this->tail
) {
1396 $this->tail
->nextAFE
= $elt;
1397 $elt->prevAFE
= $this->tail
;
1405 * Follow the steps required when the spec asks us to "clear the list of
1406 * active formatting elements up to the last marker".
1408 public function clearToMarker() {
1409 // Iterate back through the list starting from the tail
1410 $tail = $this->tail
;
1411 while ( $tail && !( $tail instanceof BalanceMarker
) ) {
1412 // Unlink the element
1413 $prev = $tail->prevAFE
;
1414 $tail->prevAFE
= null;
1416 $prev->nextAFE
= null;
1418 $tail->nextNoah
= null;
1421 // If we finished on a marker, unlink it and pop it off the Noah table stack
1423 $prev = $tail->prevAFE
;
1425 $prev->nextAFE
= null;
1428 array_pop( $this->noahTableStack
);
1430 // No marker: wipe the top-level Noah table (which is the only one)
1431 $this->noahTableStack
[0] = [];
1433 // If we removed all the elements, clear the head pointer
1437 $this->tail
= $tail;
1441 * Find and return the last element with the specified tag between the
1442 * end of the list and the last marker on the list.
1443 * Used when parsing <a> "in body mode".
1445 public function findElementByTag( $tag ) {
1447 while ( $elt && !( $elt instanceof BalanceMarker
) ) {
1448 if ( $elt->localName
=== $tag ) {
1451 $elt = $elt->prevAFE
;
1457 * Determine whether an element is in the list of formatting elements.
1460 public function isInList( BalanceElement
$elt ) {
1461 return $this->head
=== $elt ||
$elt->prevAFE
;
1465 * Find the element $elt in the list and remove it.
1466 * Used when parsing <a> in body mode.
1468 public function remove( BalanceElement
$elt ) {
1469 if ( $this->head
!== $elt && !$elt->prevAFE
) {
1470 throw new ParameterAssertionException( '$elt',
1471 "Attempted to remove an element which is not in the AFE list" );
1473 // Update head and tail pointers
1474 if ( $this->head
=== $elt ) {
1475 $this->head
= $elt->nextAFE
;
1477 if ( $this->tail
=== $elt ) {
1478 $this->tail
= $elt->prevAFE
;
1480 // Update previous element
1481 if ( $elt->prevAFE
) {
1482 $elt->prevAFE
->nextAFE
= $elt->nextAFE
;
1484 // Update next element
1485 if ( $elt->nextAFE
) {
1486 $elt->nextAFE
->prevAFE
= $elt->prevAFE
;
1488 // Clear pointers so that isInList() etc. will work
1489 $elt->prevAFE
= $elt->nextAFE
= null;
1491 $this->removeFromNoahList( $elt );
// Append $elt to the end of its Noah's Ark bucket (keyed by getNoahKey())
// in the table for the most recent segment, i.e. the last entry of
// $noahTableStack.  Bucket members are chained through nextNoah.
1494 private function addToNoahList( BalanceElement
$elt ) {
1495 $noahKey = $elt->getNoahKey();
// Bind by reference so the writes below update the stacked table itself.
1496 $table =& $this->noahTableStack
[ count( $this->noahTableStack
) - 1 ];
1497 if ( !isset( $table[$noahKey] ) ) {
// No bucket for this key yet: $elt becomes its first member.
1498 $table[$noahKey] = $elt;
// Bucket already exists: follow nextNoah to the last member and link
// $elt after it.
1500 $tail = $table[$noahKey];
1501 while ( $tail->nextNoah
) {
1502 $tail = $tail->nextNoah
;
1504 $tail->nextNoah
= $elt;
// Unlink $elt from its Noah's Ark bucket chain (linked through nextNoah).
// NOTE(review): only the table for the most recent segment (the last entry
// of $noahTableStack) is consulted, and the bucket for $elt's key is read
// without an isset() check -- presumably callers only pass elements that
// live in that segment; confirm against the callers.
1508 private function removeFromNoahList( BalanceElement
$elt ) {
// Bind by reference so the bucket updates below modify the stacked table.
1509 $table =& $this->noahTableStack
[ count( $this->noahTableStack
) - 1 ];
1510 $key = $elt->getNoahKey();
1511 $noahElt = $table[$key];
// Case 1: $elt is the first member of its bucket.
1512 if ( $noahElt === $elt ) {
1513 if ( $noahElt->nextNoah
) {
// Promote the second member to bucket head and detach $elt.
1514 $table[$key] = $noahElt->nextNoah
;
1515 $noahElt->nextNoah
= null;
// $elt was the only member: drop the bucket entirely.
1517 unset( $table[$key] );
// Case 2: walk the chain, keeping the predecessor, until $elt is found.
1521 $prevNoahElt = $noahElt;
1522 $noahElt = $prevNoahElt->nextNoah
;
1523 if ( $noahElt === $elt ) {
// Splice $elt out: the predecessor now points at $elt's successor.
1525 $prevNoahElt->nextNoah
= $elt->nextNoah
;
1526 $elt->nextNoah
= null;
1529 } while ( $noahElt );
1534 * Find element $a in the list and replace it with element $b
1536 public function replace( BalanceElement
$a, BalanceElement
$b ) {
1537 if ( $this->head
!== $a && !$a->prevAFE
) {
1538 throw new ParameterAssertionException( '$a',
1539 "Attempted to replace an element which is not in the AFE list" );
1541 // Update head and tail pointers
1542 if ( $this->head
=== $a ) {
1545 if ( $this->tail
=== $a ) {
1548 // Update previous element
1549 if ( $a->prevAFE
) {
1550 $a->prevAFE
->nextAFE
= $b;
1552 // Update next element
1553 if ( $a->nextAFE
) {
1554 $a->nextAFE
->prevAFE
= $b;
1556 $b->prevAFE
= $a->prevAFE
;
1557 $b->nextAFE
= $a->nextAFE
;
1558 $a->nextAFE
= $a->prevAFE
= null;
1560 $this->removeFromNoahList( $a );
1561 $this->addToNoahList( $b );
1565 * Find $a in the list and insert $b after it.
1567 public function insertAfter( BalanceElement
$a, BalanceElement
$b ) {
1568 if ( $this->head
!== $a && !$a->prevAFE
) {
1569 throw new ParameterAssertionException( '$a',
1570 "Attempted to insert after an element which is not in the AFE list" );
1572 if ( $this->tail
=== $a ) {
1575 if ( $a->nextAFE
) {
1576 $a->nextAFE
->prevAFE
= $b;
1578 $b->nextAFE
= $a->nextAFE
;
1581 $this->addToNoahList( $b );
// @codingStandardsIgnoreStart Generic.Files.LineLength.TooLong
/**
 * Reconstruct the active formatting elements.
 *
 * Rewinds from the end of the list to the nearest marker or element that
 * is still on the open elements stack, then re-creates every entry after
 * that point: each one is inserted into $stack as a new HTML element and
 * the new element takes the old entry's place in this list.
 *
 * @param BalanceStack $stack The open elements stack
 * @see https://html.spec.whatwg.org/multipage/syntax.html#reconstruct-the-active-formatting-elements
 */
// @codingStandardsIgnoreEnd
public function reconstruct( $stack ) {
	$entry = $this->tail;
	// If there are no entries in the list of active formatting elements,
	// then there is nothing to reconstruct
	if ( !$entry ) {
		return;
	}
	// If the last is a marker, do nothing.
	if ( $entry instanceof BalanceMarker ) {
		return;
	}
	// Or if it is an open element, do nothing.
	if ( $stack->indexOf( $entry ) >= 0 ) {
		return;
	}

	// Loop backward through the list until we find a marker or an
	// open element.  Record explicitly whether the rewind stopped on
	// such an entry: testing $entry->prevAFE afterwards is not enough,
	// because the marker/open element may itself be the first entry of
	// the list (prevAFE === null), in which case it must still be
	// skipped rather than re-created.
	$found = false;
	while ( $entry->prevAFE ) {
		$entry = $entry->prevAFE;
		if ( $entry instanceof BalanceMarker || $stack->indexOf( $entry ) >= 0 ) {
			$found = true;
			break;
		}
	}

	// Now loop forward, starting from the element after the one found
	// (or from the first element if no marker or open element was found),
	// recreating formatting elements and pushing them back onto the list
	// of open elements.
	if ( $found ) {
		// Never null: the rewind arrived here from a later entry.
		$entry = $entry->nextAFE;
	}
	while ( $entry ) {
		$newElement = $stack->insertHTMLElement(
			$entry->localName, $entry->attribs
		);
		// replace() hands the old entry's list links to the new element,
		// so the walk continues from the new element's successor.
		$this->replace( $entry, $newElement );
		$entry = $newElement->nextAFE;
	}
}
1633 * Get a string representation of the AFE list, for debugging
1635 public function __toString() {
1638 for ( $node = $this->head
; $node; $prev = $node, $node = $node->nextAFE
) {
1639 if ( $node instanceof BalanceMarker
) {
1643 $s .= $node->localName
. '#' . substr( md5( spl_object_hash( $node ) ), 0, 8 );
1644 if ( $node->nextNoah
) {
1645 $s .= " (noah sibling: {$node->nextNoah->localName}#" .
1646 substr( md5( spl_object_hash( $node->nextNoah
) ), 0, 8 ) .
1649 if ( $node->nextAFE
&& $node->nextAFE
->prevAFE
!== $node ) {
1650 $s .= " (reverse link is wrong!)";
1654 if ( $prev !== $this->tail
) {
1655 $s .= "(tail pointer is wrong!)\n";
1662 * An implementation of the tree building portion of the HTML5 parsing
1665 * This is used to balance and tidy output so that the result can
1666 * always be cleanly serialized/deserialized by an HTML5 parser. It
1667 * does *not* guarantee "conforming" output -- the HTML5 spec contains
1668 * a number of constraints which are not enforced by the HTML5 parsing
1669 * process. But the result will be free of gross errors: misnested or
1670 * unclosed tags, for example, and will be unchanged by spec-compliant
1671 * parsing followed by serialization.
1673 * The tree building stage is structured as a state machine.
1674 * When comparing the implementation to
1675 * https://www.w3.org/TR/html5/syntax.html#tree-construction
1676 * note that each state is implemented as a function with a
1677 * name ending in `Mode` (because the HTML spec refers to them
1678 * as insertion modes). The current insertion mode is held by
1679 * the $parseMode property.
1681 * The following simplifications have been made:
1682 * - We handle body content only (ie, we start `in body`.)
1683 * - The document is never in "quirks mode".
1684 * - All occurrences of < and > have been entity escaped, so we
1685 * can parse tags by simply splitting on those two characters.
1686 * Similarly, all attributes have been "cleaned" and are double-quoted
1688 * - All comments and null characters are assumed to have been removed.
1689 * - We don't alter linefeeds after <pre>/<listing>.
1690 * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
1691 * <form>, <frame>, <plaintext>, <isindex>, <textarea>, <xmp>, <iframe>,
1692 * <noembed>, <noscript>, <select>, <script>, <title>. As a result,
1693 * further simplifications can be made:
1694 * - `frameset-ok` is not tracked.
1695 * - `form element pointer` is not tracked.
1696 * - `head element pointer` is not tracked (but presumed non-null)
1697 * - Tokenizer has only a single mode.
1699 * We generally mark places where we omit cases from the spec due to
1700 * disallowed elements with a comment: `# OMITTED: <element-name>`.
1702 * The HTML spec keeps a flag during the parsing process to track
1703 * whether or not a "parse error" has been encountered. We don't
1704 * bother to track that flag, we just implement the error-handling
1705 * process as specified.
1709 * @see https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
1713 private $bitsIterator;
1714 private $allowedHtmlElements;
1718 private $tidyCompat;
1720 private $textIntegrationMode = false;
1721 private $pendingTableText;
1722 private $originalInsertionMode;
1723 private $fragmentContext;
1726 * Create a new Balancer.
1727 * @param array $config Balancer configuration. Includes:
1728 * 'strict' : boolean, defaults to false.
1729 * When true, enforces syntactic constraints on input:
1730 * all non-tag '<' must be escaped, all attributes must be
1731 * separated by a single space and double-quoted. This is
1732 * consistent with the output of the Sanitizer.
1733 * 'allowedHtmlElements' : array, defaults to null.
1734 * When present, the keys of this associative array give
1735 * the acceptable HTML tag names. When not present, no
1736 * tag sanitization is done.
1737 * 'tidyCompat' : boolean, defaults to false.
1738 * When true, the serialization algorithm is tweaked to
1739 * provide historical compatibility with the old "tidy"
1740 * program: <p>-wrapping is done to the children of
1741 * <body> and <blockquote> elements, and empty elements
public function __construct( array $config = [] ) {
	// Fill in the documented default for every option the caller omitted.
	$config = $config + [
		'strict' => false,
		'allowedHtmlElements' => null,
		'tidyCompat' => false,
	];
	$this->allowedHtmlElements = $config['allowedHtmlElements'];
	$this->strict = $config['strict'];
	$this->tidyCompat = $config['tidyCompat'];
	if ( $this->allowedHtmlElements !== null ) {
		// Sanity check: none of the tags the caller wants to allow may be
		// one the balancer cannot handle.  Both arrays are keyed by tag
		// name and the values are irrelevant, so intersect on keys only.
		$bad = array_intersect_key(
			$this->allowedHtmlElements,
			BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE]
		);
		if ( count( $bad ) > 0 ) {
			// Separator first: the legacy implode( $array, $separator )
			// order was deprecated in PHP 7.4 and removed in PHP 8.0.
			$badstr = implode( ',', array_keys( $bad ) );
			throw new ParameterAssertionException(
				'$config',
				'Balance attempted with sanitization including ' .
				"unsupported elements: {$badstr}"
			);
		}
	}
}
1776 * Return a balanced HTML string for the HTML fragment given by $text,
1777 * subject to the caveats listed in the class description. The result
1778 * will typically be idempotent -- that is, rebalancing the output
1779 * would result in no change.
1781 * @param string $text The markup to be balanced
1782 * @param callable $processingCallback Callback to do any variable or
1783 * parameter replacements in HTML attributes values
1784 * @param array|bool $processingArgs Arguments for the processing callback
1785 * @return string The balanced markup
public function balance( $text, $processingCallback = null, $processingArgs = [] ) {
	// Per-run state: start in body mode with a fresh token source,
	// active formatting elements list and open elements stack.
	$this->parseMode = 'inBodyMode';
	$this->bitsIterator = new ExplodeIterator( '<', $text );
	$this->afe = new BalanceActiveFormattingElements();
	$this->stack = new BalanceStack();
	$this->stack->tidyCompat = $this->tidyCompat;
	$this->processingCallback = $processingCallback;
	$this->processingArgs = $processingArgs;

	# The stack is constructed with an <html> element already on it.
	# Set this up as a fragment parsed with <body> as the context.
	$this->fragmentContext =
		new BalanceElement( BalanceSets::HTML_NAMESPACE, 'body', [] );
	$this->resetInsertionMode();

	// First element is text not tag.  Any literal '>' in it must be
	// entity-escaped so that it cannot be mistaken for markup in the
	// output (replacing '>' with itself would leave it untouched).
	$x = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$this->insertToken( 'text', str_replace( '>', '&gt;', $x ) );
	// Now process each tag.
	while ( $this->bitsIterator->valid() ) {
		$this->advance();
	}
	$this->insertToken( 'eof', null );
	$result = $this->stack->getOutput();
	// Free memory before returning.
	$this->bitsIterator = null;
	$this->afe = null;
	$this->stack = null;
	$this->fragmentContext = null;
	return $result;
}
1821 * Pass a token to the tree builder. The $token will be one of the
1822 * strings "tag", "endtag", "text", or "eof".
1824 private function insertToken( $token, $value, $attribs = null, $selfclose = false ) {
1825 // validate tags against $unsupportedSet
1826 if ( $token === 'tag' ||
$token === 'endtag' ) {
1827 if ( isset( BalanceSets
::$unsupportedSet[BalanceSets
::HTML_NAMESPACE
][$value] ) ) {
1828 # As described in "simplifications" above, these tags are
1829 # not supported in the balancer.
1832 "Unsupported $token <$value> found."
1836 } elseif ( $token === 'text' && $value === '' ) {
1837 # Don't actually inject the empty string as a text token.
1840 // Some hoops we have to jump through
1841 $adjusted = $this->stack
->adjustedCurrentNode( $this->fragmentContext
);
1845 $this->stack
->length() === 0 ||
1846 $adjusted->isHtml() ||
1850 } elseif ( $adjusted->isMathmlTextIntegrationPoint() ) {
1851 if ( $token === 'text' ) {
1855 $value !== 'mglyph' && $value !== 'malignmark'
1860 $adjusted->namespaceURI
=== BalanceSets
::MATHML_NAMESPACE
&&
1861 $adjusted->localName
=== 'annotation-xml' &&
1862 $token === 'tag' && $value === 'svg'
1866 $adjusted->isHtmlIntegrationPoint() &&
1867 ( $token === 'tag' ||
$token === 'text' )
1872 return $this->insertForeignToken( $token, $value, $attribs, $selfclose );
1874 $func = $this->parseMode
;
1875 return $this->$func( $token, $value, $attribs, $selfclose );
1879 private function insertForeignToken( $token, $value, $attribs = null, $selfclose = false ) {
1880 if ( $token === 'text' ) {
1881 $this->stack
->insertText( $value );
1883 } elseif ( $token === 'tag' ) {
1886 if ( isset( $attribs['color'] )
1887 ||
isset( $attribs['face'] )
1888 ||
isset( $attribs['size'] )
1892 /* otherwise, fall through */
1937 if ( $this->fragmentContext
) {
1941 $this->stack
->pop();
1942 $node = $this->stack
->currentNode();
1944 $node->isMathmlTextIntegrationPoint() ||
1945 $node->isHtmlIntegrationPoint() ||
1951 return $this->insertToken( $token, $value, $attribs, $selfclose );
1953 // "Any other start tag"
1954 $adjusted = ( $this->fragmentContext
&& $this->stack
->length()===1 ) ?
1955 $this->fragmentContext
: $this->stack
->currentNode();
1956 $this->stack
->insertForeignElement(
1957 $adjusted->namespaceURI
, $value, $attribs
1960 $this->stack
->pop();
1963 } elseif ( $token === 'endtag' ) {
1965 foreach ( $this->stack
as $i => $node ) {
1966 if ( $node->isHtml() && !$first ) {
1967 // process the end tag as HTML
1968 $func = $this->parseMode
;
1969 return $this->$func( $token, $value, $attribs, $selfclose );
1970 } elseif ( $i === 0 ) {
1972 } elseif ( $node->localName
=== $value ) {
1973 $this->stack
->popTag( $node );
1982 * Grab the next "token" from $bitsIterator. This is either an open/close
1983 * tag or text, depending on whether the Sanitizer approves.
1985 private function advance() {
1986 $x = $this->bitsIterator
->current();
1987 $this->bitsIterator
->next();
1989 # $slash: Does the current element start with a '/'?
1990 # $t: Current element name
1991 # $attribStr: String between element name and >
1992 # $brace: Ending '>' or '/>'
1993 # $rest: Everything until the next element from the $bitsIterator
1994 if ( preg_match( Sanitizer
::ELEMENT_BITS_REGEX
, $x, $regs ) ) {
1995 list( /* $qbar */, $slash, $t, $attribStr, $brace, $rest ) = $regs;
1996 $t = strtolower( $t );
1997 if ( $this->strict
) {
1998 /* Verify that attributes are all properly double-quoted */
2001 '/^( [:_A-Z0-9][-.:_A-Z0-9]*="[^"]*")*[ ]*$/i', $attribStr
2003 "Bad attribute string found"
2008 !$this->strict
, "< found which does not start a valid tag"
2010 $slash = $t = $attribStr = $brace = $rest = null;
2013 $sanitize = $this->allowedHtmlElements
!== null;
2015 $goodtag = $t && isset( $this->allowedHtmlElements
[$t] );
2018 if ( is_callable( $this->processingCallback
) ) {
2019 call_user_func_array( $this->processingCallback
, [ &$attribStr, $this->processingArgs
] );
2022 $goodtag = Sanitizer
::validateTag( $attribStr, $t );
2027 $attribs = Sanitizer
::decodeTagAttributes( $attribStr );
2028 $attribs = Sanitizer
::validateTagAttributes( $attribs, $t );
2030 $attribs = Sanitizer
::decodeTagAttributes( $attribStr );
2032 $goodtag = $this->insertToken(
2033 $slash ?
'endtag' : 'tag', $t, $attribs, $brace === '/>'
2037 $rest = str_replace( '>', '>', $rest );
2038 $this->insertToken( 'text', str_replace( '>', '>', $rest ) );
2040 # bad tag; serialize entire thing as text.
2041 $this->insertToken( 'text', '<' . str_replace( '>', '>', $x ) );
// Make $mode the current insertion mode by storing it in $this->parseMode,
// the property insertToken() reads to pick the handler method to call.
// NOTE(review): parseRawText() assigns this method's return value to
// $originalInsertionMode, so it presumably returns $oldMode; the return
// statement is not visible in this excerpt -- confirm.
2045 private function switchMode( $mode ) {
// These look like the arguments of a parameter assertion requiring $mode
// to end in "Mode"; the call itself is not visible here (TODO confirm).
2047 substr( $mode, -4 )==='Mode', '$mode', 'should end in Mode'
// Remember the mode being replaced before overwriting it.
2049 $oldMode = $this->parseMode
;
2050 $this->parseMode
= $mode;
// Switch to insertion mode $mode, then hand the very same token back to
// insertToken() so that it is processed again; insertToken()'s result is
// returned to the caller unchanged.
2054 private function switchModeAndReprocess( $mode, $token, $value, $attribs, $selfclose ) {
2055 $this->switchMode( $mode );
2056 return $this->insertToken( $token, $value, $attribs, $selfclose );
2059 private function resetInsertionMode() {
2061 foreach ( $this->stack
as $i => $node ) {
2064 if ( $this->fragmentContext
) {
2065 $node = $this->fragmentContext
;
2068 if ( $node->isHtml() ) {
2069 switch ( $node->localName
) {
2073 $stacklen = $this->stack->length();
2074 for ( $j = $i + 1; $j < $stacklen-1; $j++ ) {
2075 $ancestor = $this->stack->node( $stacklen-$j-1 );
2076 if ( $ancestor->isA( 'template' ) ) {
2079 if ( $ancestor->isA( 'table' ) ) {
2080 $this->switchMode( 'inSelectInTableMode' );
2084 $this->switchMode( 'inSelectMode' );
2088 $this->switchMode( 'inRowMode' );
2093 $this->switchMode( 'inTableBodyMode' );
2096 $this->switchMode( 'inCaptionMode' );
2099 $this->switchMode( 'inColumnGroupMode' );
2102 $this->switchMode( 'inTableMode' );
2106 array_slice( $this->templateInsertionModes
, -1 )[0]
2110 $this->switchMode( 'inBodyMode' );
2112 # OMITTED: <frameset>
2118 if ( $node->isA( BalanceSets
::$tableCellSet ) ) {
2119 $this->switchMode( 'inCellMode' );
2126 $this->switchMode( 'inBodyMode' );
2132 private function stopParsing() {
2133 # Most of the spec methods are inapplicable, other than step 2:
2134 # "pop all the nodes off the stack of open elements".
2135 # We're going to keep the top-most <html> element on the stack, though.
2137 # Clear the AFE list first, otherwise the element objects will stay live
2138 # during serialization, potentially using O(N^2) memory. Note that
2139 # popping the stack will never result in reconstructing the active
2140 # formatting elements.
2142 $this->stack
->popTo( 1 );
2145 private function parseRawText( $value, $attribs = null ) {
2146 $this->stack
->insertHTMLElement( $value, $attribs );
2147 // XXX switch tokenizer to rawtext state?
2148 $this->originalInsertionMode
= $this->switchMode( 'inTextMode' );
2152 private function inTextMode( $token, $value, $attribs = null, $selfclose = false ) {
2153 if ( $token === 'text' ) {
2154 $this->stack
->insertText( $value );
2156 } elseif ( $token === 'eof' ) {
2157 $this->stack
->pop();
2158 return $this->switchModeAndReprocess(
2159 $this->originalInsertionMode
, $token, $value, $attribs, $selfclose
2161 } elseif ( $token === 'endtag' ) {
2162 $this->stack
->pop();
2163 $this->switchMode( $this->originalInsertionMode
);
2169 private function inHeadMode( $token, $value, $attribs = null, $selfclose = false ) {
2170 if ( $token === 'text' ) {
2171 if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $matches ) ) {
2172 $this->stack
->insertText( $matches[0] );
2173 $value = substr( $value, strlen( $matches[0] ) );
2175 if ( strlen( $value ) === 0 ) {
2176 return true; // All text handled.
2178 // Fall through to handle non-whitespace below.
2179 } elseif ( $token === 'tag' ) {
2182 # OMITTED: in a full HTML parser, this might change the encoding.
2189 $this->stack
->insertHTMLElement( $value, $attribs );
2190 $this->stack
->pop();
2193 # OMITTED: <noscript>
2196 return $this->parseRawText( $value, $attribs );
2199 $this->stack
->insertHTMLElement( $value, $attribs );
2200 $this->afe
->insertMarker();
2201 # OMITTED: frameset_ok
2202 $this->switchMode( 'inTemplateMode' );
2203 $this->templateInsertionModes
[] = $this->parseMode
;
2207 } elseif ( $token === 'endtag' ) {
2213 break; // handle at the bottom of the function
2215 if ( $this->stack
->indexOf( $value ) < 0 ) {
2216 return true; // Ignore the token.
2218 $this->stack
->generateImpliedEndTags( null, true /* thorough */ );
2219 $this->stack
->popTag( $value );
2220 $this->afe
->clearToMarker();
2221 array_pop( $this->templateInsertionModes
);
2222 $this->resetInsertionMode();
2225 // ignore any other end tag
2230 // If not handled above
2231 $this->inHeadMode( 'endtag', 'head' ); // synthetic </head>
2232 // Then redo this one
2233 return $this->insertToken( $token, $value, $attribs, $selfclose );
// Token handler for the 'inBodyMode' insertion mode.
//
// $token is compared against 'text', 'eof', 'tag' and 'endtag'; any other
// value reaches the Assert::invariant() "Bad token type" check at the end.
// $value is inserted as text for 'text' tokens and used as the tag name for
// 'tag'/'endtag' tokens. $attribs and $selfclose are forwarded unchanged
// whenever the token is handed to another mode handler or to insertToken().
//
// NOTE(review): the case labels selecting each branch below are not visible
// in this listing; comments naming a tag rely only on literals in the code.
2236 private function inBodyMode( $token, $value, $attribs = null, $selfclose = false ) {
2237 if ( $token === 'text' ) {
// Text: reconstruct the active formatting elements, then append the text.
2238 $this->afe
->reconstruct( $this->stack
);
2239 $this->stack
->insertText( $value );
2241 } elseif ( $token === 'eof' ) {
// EOF while template insertion modes are pending is delegated to
// inTemplateMode; otherwise parsing stops.
2242 if ( !empty( $this->templateInsertionModes
) ) {
2243 return $this->inTemplateMode( $token, $value, $attribs, $selfclose );
2245 $this->stopParsing();
2247 } elseif ( $token === 'tag' ) {
// Some start tags are handed to the inHeadMode rules.
2260 return $this->inHeadMode( $token, $value, $attribs, $selfclose );
2262 # OMITTED: <frameset>
// Recurring pattern: if a <p> is open in button scope, close it by feeding
// a synthetic </p> through this handler, then insert the new element.
2288 if ( $this->stack
->inButtonScope( 'p' ) ) {
2289 $this->inBodyMode( 'endtag', 'p' );
2291 $this->stack
->insertHTMLElement( $value, $attribs );
2300 if ( $this->stack
->inButtonScope( 'p' ) ) {
2301 $this->inBodyMode( 'endtag', 'p' );
// A current node belonging to $headingSet is popped before the insert.
2303 if ( $this->stack
->currentNode()->isA( BalanceSets
::$headingSet ) ) {
2304 $this->stack
->pop();
2306 $this->stack
->insertHTMLElement( $value, $attribs );
2311 if ( $this->stack
->inButtonScope( 'p' ) ) {
2312 $this->inBodyMode( 'endtag', 'p' );
2314 $this->stack
->insertHTMLElement( $value, $attribs );
2315 # As described in "simplifications" above:
2316 # 1. We don't touch the next token, even if it's a linefeed.
2317 # 2. OMITTED: frameset_ok
2323 # OMITTED: frameset_ok
// Scan the stack of open elements; an open <li> is closed with a
// synthetic </li>.
2324 foreach ( $this->stack
as $node ) {
2325 if ( $node->isA( 'li' ) ) {
2326 $this->inBodyMode( 'endtag', 'li' );
// Tests for a node in $specialSet that is not in $addressDivPSet
// (the statement guarded by this test is not visible here).
2330 $node->isA( BalanceSets
::$specialSet ) &&
2331 !$node->isA( BalanceSets
::$addressDivPSet )
2336 if ( $this->stack
->inButtonScope( 'p' ) ) {
2337 $this->inBodyMode( 'endtag', 'p' );
2339 $this->stack
->insertHTMLElement( $value, $attribs );
2344 # OMITTED: frameset_ok
// Same scan as above, closing an open <dd> or <dt> instead.
2345 foreach ( $this->stack
as $node ) {
2346 if ( $node->isA( 'dd' ) ) {
2347 $this->inBodyMode( 'endtag', 'dd' );
2350 if ( $node->isA( 'dt' ) ) {
2351 $this->inBodyMode( 'endtag', 'dt' );
2355 $node->isA( BalanceSets
::$specialSet ) &&
2356 !$node->isA( BalanceSets
::$addressDivPSet )
2361 if ( $this->stack
->inButtonScope( 'p' ) ) {
2362 $this->inBodyMode( 'endtag', 'p' );
2364 $this->stack
->insertHTMLElement( $value, $attribs );
2367 # OMITTED: <plaintext>
// A <button> already in scope is closed with a synthetic </button>, and
// the current token is then re-dispatched through insertToken().
2370 if ( $this->stack
->inScope( 'button' ) ) {
2371 $this->inBodyMode( 'endtag', 'button' );
2372 return $this->insertToken( $token, $value, $attribs, $selfclose );
2374 $this->afe
->reconstruct( $this->stack
);
2375 $this->stack
->insertHTMLElement( $value, $attribs );
// If the active formatting list holds an <a>, close it with a synthetic
// </a>; if it is still listed afterwards, drop it from the list and the
// stack.
2379 $activeElement = $this->afe
->findElementByTag( 'a' );
2380 if ( $activeElement ) {
2381 $this->inBodyMode( 'endtag', 'a' );
2382 if ( $this->afe
->isInList( $activeElement ) ) {
2383 $this->afe
->remove( $activeElement );
2384 // Don't flatten here, since when we fall
2385 // through below we might foster parent
2386 // the new <a> tag inside this one.
2387 $this->stack
->removeElement( $activeElement, false );
// Insert the element and record it, with its attributes, in the active
// formatting list.
2403 $this->afe
->reconstruct( $this->stack
);
2404 $this->afe
->push( $this->stack
->insertHTMLElement( $value, $attribs ), $attribs );
// Same as above, but a <nobr> already in scope is closed first and the
// formatting elements are reconstructed again.
2408 $this->afe
->reconstruct( $this->stack
);
2409 if ( $this->stack
->inScope( 'nobr' ) ) {
2410 $this->inBodyMode( 'endtag', 'nobr' );
2411 $this->afe
->reconstruct( $this->stack
);
2413 $this->afe
->push( $this->stack
->insertHTMLElement( $value, $attribs ), $attribs );
// Insert the element, then add a marker to the active formatting list.
2419 $this->afe
->reconstruct( $this->stack
);
2420 $this->stack
->insertHTMLElement( $value, $attribs );
2421 $this->afe
->insertMarker();
2422 # OMITTED: frameset_ok
2426 # The document is never in "quirks mode"; see simplifications
2428 if ( $this->stack
->inButtonScope( 'p' ) ) {
2429 $this->inBodyMode( 'endtag', 'p' );
// After inserting this element the parser switches to 'inTableMode'.
2431 $this->stack
->insertHTMLElement( $value, $attribs );
2432 # OMITTED: frameset_ok
2433 $this->switchMode( 'inTableMode' );
// The next few branches insert an element and pop it again right away.
2442 $this->afe
->reconstruct( $this->stack
);
2443 $this->stack
->insertHTMLElement( $value, $attribs );
2444 $this->stack
->pop();
2445 # OMITTED: frameset_ok
2449 $this->afe
->reconstruct( $this->stack
);
2450 $this->stack
->insertHTMLElement( $value, $attribs );
2451 $this->stack
->pop();
2452 # OMITTED: frameset_ok
2453 # (hence we don't need to examine the tag's "type" attribute)
2460 $this->stack
->insertHTMLElement( $value, $attribs );
2461 $this->stack
->pop();
2465 if ( $this->stack
->inButtonScope( 'p' ) ) {
2466 $this->inBodyMode( 'endtag', 'p' );
2468 $this->stack
->insertHTMLElement( $value, $attribs );
2469 $this->stack
->pop();
// Re-dispatch the token to this handler under the tag name 'img'.
2474 return $this->inBodyMode( $token, 'img', $attribs, $selfclose );
2476 # OMITTED: <isindex>
2477 # OMITTED: <textarea>
2480 # OMITTED: <noembed>
2481 # OMITTED: <noscript>
2486 $this->afe->reconstruct( $this->stack );
2487 $this->stack->insertHTMLElement( $value, $attribs );
// The next mode depends on the current parse mode: the listed cases lead
// to 'inSelectInTableMode'. NOTE(review): 'inSelectMode' is presumably the
// default branch -- confirm against the full switch.
2488 switch ( $this->parseMode ) {
2490 case 'inCaptionMode':
2491 case 'inTableBodyMode':
2494 $this->switchMode( 'inSelectInTableMode' );
2497 $this->switchMode( 'inSelectMode' );
// An <option> that is the current node is closed before the insert.
2504 if ( $this->stack
->currentNode()->isA( 'option' ) ) {
2505 $this->inBodyMode( 'endtag', 'option' );
2507 $this->afe
->reconstruct( $this->stack
);
2508 $this->stack
->insertHTMLElement( $value, $attribs );
// With a <ruby> in scope, implied end tags are generated before the insert.
2513 if ( $this->stack
->inScope( 'ruby' ) ) {
2514 $this->stack
->generateImpliedEndTags();
2516 $this->stack
->insertHTMLElement( $value, $attribs );
// As above, but 'rtc' is passed to generateImpliedEndTags().
2521 if ( $this->stack
->inScope( 'ruby' ) ) {
2522 $this->stack
->generateImpliedEndTags( 'rtc' );
2524 $this->stack
->insertHTMLElement( $value, $attribs );
// Foreign content: the element is inserted in the MathML namespace.
2528 $this->afe
->reconstruct( $this->stack
);
2529 # We skip the spec's "adjust MathML attributes" and
2530 # "adjust foreign attributes" steps, since the browser will
2531 # do this later when it parses the output and it doesn't affect
2533 $this->stack
->insertForeignElement(
2534 BalanceSets
::MATHML_NAMESPACE
, $value, $attribs
2537 # emit explicit </math> tag.
2538 $this->stack
->pop();
// Foreign content: the element is inserted in the SVG namespace.
2543 $this->afe
->reconstruct( $this->stack
);
2544 # We skip the spec's "adjust SVG attributes" and
2545 # "adjust foreign attributes" steps, since the browser will
2546 # do this later when it parses the output and it doesn't affect
2548 $this->stack
->insertForeignElement(
2549 BalanceSets
::SVG_NAMESPACE
, $value, $attribs
2552 # emit explicit </svg> tag.
2553 $this->stack
->pop();
2568 // Ignore table tags if we're not inTableMode
2572 // Handle any other start tag here
2573 $this->afe
->reconstruct( $this->stack
);
2574 $this->stack
->insertHTMLElement( $value, $attribs );
2576 } elseif ( $token === 'endtag' ) {
2578 # </body>,</html> are unsupported.
// Some end tags are handed to the inHeadMode rules.
2581 return $this->inHeadMode( $token, $value, $attribs, $selfclose );
2609 // Ignore if there is not a matching open tag
2610 if ( !$this->stack
->inScope( $value ) ) {
2613 $this->stack
->generateImpliedEndTags();
2614 $this->stack
->popTag( $value );
// No <p> open in button scope: synthesize an attribute-less <p> start tag,
// then reprocess this end tag through insertToken().
2620 if ( !$this->stack
->inButtonScope( 'p' ) ) {
2621 $this->inBodyMode( 'tag', 'p', [] );
2622 return $this->insertToken( $token, $value, $attribs, $selfclose );
2624 $this->stack
->generateImpliedEndTags( $value );
2625 $this->stack
->popTag( $value );
// The following branches differ mainly in which scope test is applied
// before the element is popped.
2629 if ( !$this->stack
->inListItemScope( $value ) ) {
2630 return true; # ignore
2632 $this->stack
->generateImpliedEndTags( $value );
2633 $this->stack
->popTag( $value );
2638 if ( !$this->stack
->inScope( $value ) ) {
2639 return true; # ignore
2641 $this->stack
->generateImpliedEndTags( $value );
2642 $this->stack
->popTag( $value );
// Here the scope test and the pop use the whole $headingSet rather than
// just $value.
2651 if ( !$this->stack
->inScope( BalanceSets
::$headingSet ) ) {
2654 $this->stack
->generateImpliedEndTags();
2655 $this->stack
->popTag( BalanceSets
::$headingSet );
2659 # Take a deep breath, then:
// Delegate to the stack's adoptionAgency(); a truthy result means the
// token was fully handled.
2676 if ( $this->stack
->adoptionAgency( $value, $this->afe
) ) {
2677 return true; # If we did something, we're done.
2679 break; # Go to the "any other end tag" case.
// Close the element and clear the active formatting list back to its
// last marker.
2684 if ( !$this->stack
->inScope( $value ) ) {
2685 return true; # ignore
2687 $this->stack
->generateImpliedEndTags();
2688 $this->stack
->popTag( $value );
2689 $this->afe
->clearToMarker();
2693 # Turn </br> into <br>
2694 return $this->inBodyMode( 'tag', $value, [] );
2697 // Any other end tag goes here
// Scan the stack for a node matching $value: on a match, generate implied
// end tags and pop the stack to that index. Reaching a node from
// $specialSet first means the end tag is ignored.
2698 foreach ( $this->stack
as $i => $node ) {
2699 if ( $node->isA( $value ) ) {
2700 $this->stack
->generateImpliedEndTags( $value );
2701 $this->stack
->popTo( $i ); # including $i
2703 } elseif ( $node->isA( BalanceSets
::$specialSet ) ) {
2704 return true; // ignore this close token.
// Reached only for a $token value that none of the branches above accept.
2709 Assert
::invariant( false, "Bad token type: $token" );
// Token handler for the 'inTableMode' insertion mode.
//
// Takes the same ( $token, $value, $attribs, $selfclose ) arguments as the
// other mode handlers. Tokens that no branch handles end up in the
// "anything else" code at the bottom, which runs them through inBodyMode()
// with the stack's fosterParentMode flag switched on.
//
// NOTE(review): the case labels selecting each branch are not visible in
// this listing.
2713 private function inTableMode( $token, $value, $attribs = null, $selfclose = false ) {
2714 if ( $token === 'text' ) {
// In text integration mode the text is handled by the inBodyMode rules.
2715 if ( $this->textIntegrationMode
) {
2716 return $this->inBodyMode( $token, $value, $attribs, $selfclose );
// If the current node is in $tableSectionRowSet: reset the pending-text
// buffer, remember the current mode, and reprocess the token in
// 'inTableTextMode'.
2717 } elseif ( $this->stack
->currentNode()->isA( BalanceSets
::$tableSectionRowSet ) ) {
2718 $this->pendingTableText
= '';
2719 $this->originalInsertionMode
= $this->parseMode
;
2720 return $this->switchModeAndReprocess( 'inTableTextMode', $token, $value, $attribs, $selfclose );
2722 // fall through to default case.
2723 } elseif ( $token === 'eof' ) {
2724 $this->stopParsing();
2726 } elseif ( $token === 'tag' ) {
// Add a marker to the active formatting list, insert the element and
// continue in 'inCaptionMode'.
2729 $this->afe
->insertMarker();
2730 $this->stack
->insertHTMLElement( $value, $attribs );
2731 $this->switchMode( 'inCaptionMode' );
// Clear the stack to $tableContextSet, insert the element and continue in
// 'inColumnGroupMode'.
2734 $this->stack
->clearToContext( BalanceSets
::$tableContextSet );
2735 $this->stack
->insertHTMLElement( $value, $attribs );
2736 $this->switchMode( 'inColumnGroupMode' );
// Synthesize an attribute-less <colgroup>, then reprocess this token.
2739 $this->inTableMode( 'tag', 'colgroup', [] );
2740 return $this->insertToken( $token, $value, $attribs, $selfclose );
// Clear the stack to $tableContextSet, insert the element and continue in
// 'inTableBodyMode'.
2744 $this->stack
->clearToContext( BalanceSets
::$tableContextSet );
2745 $this->stack
->insertHTMLElement( $value, $attribs );
2746 $this->switchMode( 'inTableBodyMode' );
// Synthesize an attribute-less <tbody>, then reprocess this token.
2751 $this->inTableMode( 'tag', 'tbody', [] );
2752 return $this->insertToken( $token, $value, $attribs, $selfclose );
// Ignored unless an element named $value is in table scope; otherwise a
// synthetic end tag closes it and the start tag is reprocessed.
2754 if ( !$this->stack
->inTableScope( $value ) ) {
2755 return true; // Ignore this tag.
2757 $this->inTableMode( 'endtag', $value );
2758 return $this->insertToken( $token, $value, $attribs, $selfclose );
2763 return $this->inHeadMode( $token, $value, $attribs, $selfclose );
// Only an element whose 'type' attribute equals 'hidden' (compared
// case-insensitively) is inserted and popped here; any other goes to
// "everything else".
2766 if ( !isset( $attribs['type'] ) ||
strcasecmp( $attribs['type'], 'hidden' ) !== 0 ) {
2767 break; // Handle this as "everything else"
2769 $this->stack
->insertHTMLElement( $value, $attribs );
2770 $this->stack
->pop();
2775 // Fall through for "anything else" clause.
2776 } elseif ( $token === 'endtag' ) {
// Pop the named element and recompute the insertion mode, provided the
// element is in table scope.
2779 if ( !$this->stack
->inTableScope( $value ) ) {
2780 return true; // Ignore.
2782 $this->stack
->popTag( $value );
2783 $this->resetInsertionMode();
2796 return true; // Ignore the token.
2798 return $this->inHeadMode( $token, $value, $attribs, $selfclose );
2800 // Fall through for "anything else" clause.
2802 // This is the "anything else" case:
// fosterParentMode is enabled only for the duration of the inBodyMode()
// call.
2803 $this->stack
->fosterParentMode
= true;
2804 $this->inBodyMode( $token, $value, $attribs, $selfclose );
2805 $this->stack
->fosterParentMode
= false;
// Token handler for the 'inTableTextMode' insertion mode.
//
// 'text' tokens are appended to $this->pendingTableText. The remaining code
// flushes that buffer and reprocesses the token in the mode saved in
// $this->originalInsertionMode.
// NOTE(review): the flush presumably runs only for non-text tokens; the
// statement ending the 'text' branch is not visible in this listing.
2809 private function inTableTextMode( $token, $value, $attribs = null, $selfclose = false ) {
2810 if ( $token === 'text' ) {
2811 $this->pendingTableText
.= $value;
// Take the buffered text and reset the buffer.
2815 $text = $this->pendingTableText
;
2816 $this->pendingTableText
= '';
// The pattern matches any character other than tab, LF, FF, CR or space,
// i.e. the buffer contains something besides whitespace.
2817 if ( preg_match( '/[^\x09\x0A\x0C\x0D\x20]/', $text ) ) {
2818 // This should match the "anything else" case inTableMode
2819 $this->stack
->fosterParentMode
= true;
2820 $this->inBodyMode( 'text', $text );
2821 $this->stack
->fosterParentMode
= false;
2823 // Pending text is just whitespace.
2824 $this->stack
->insertText( $text );
// Return to the saved mode and reprocess the current token there.
2826 return $this->switchModeAndReprocess(
2827 $this->originalInsertionMode
, $token, $value, $attribs, $selfclose
2831 // helper for inCaptionMode
// Closes an open <caption>: generates implied end tags, pops the stack
// through the 'caption' element, clears the active formatting list back to
// its last marker and switches to 'inTableMode'.
// Begins with a test for no 'caption' in table scope.
// NOTE(review): callers use the result as a boolean; the return statements
// are not visible in this listing -- presumably false when nothing was
// closed, true otherwise.
2832 private function endCaption() {
2833 if ( !$this->stack
->inTableScope( 'caption' ) ) {
2836 $this->stack
->generateImpliedEndTags();
2837 $this->stack
->popTag( 'caption' );
2838 $this->afe
->clearToMarker();
2839 $this->switchMode( 'inTableMode' );
// Token handler for the 'inCaptionMode' insertion mode.
//
// Certain start and end tags close the caption through endCaption(); where
// that call is used as a condition, a truthy result leads to the token
// being re-dispatched through insertToken(). Everything else is handled by
// the inBodyMode rules.
// NOTE(review): the case labels choosing between these branches are not
// visible in this listing.
2843 private function inCaptionMode( $token, $value, $attribs = null, $selfclose = false ) {
2844 if ( $token === 'tag' ) {
2855 if ( $this->endCaption() ) {
2856 $this->insertToken( $token, $value, $attribs, $selfclose );
2860 // Fall through to "anything else" case.
2861 } elseif ( $token === 'endtag' ) {
// Here the caption is closed without reprocessing the token.
2864 $this->endCaption();
2867 if ( $this->endCaption() ) {
2868 $this->insertToken( $token, $value, $attribs, $selfclose );
2884 // Fall through to "anything else" case.
2886 // The Anything Else case
2887 return $this->inBodyMode( $token, $value, $attribs, $selfclose );
// Token handler for the 'inColumnGroupMode' insertion mode.
//
// Leading whitespace of a text token is inserted directly. Some tags are
// inserted-and-popped, delegated to inHeadMode, or ignored. The final
// "anything else" code requires the current node to be a 'colgroup', closes
// it with a synthetic end tag and reprocesses the token.
2890 private function inColumnGroupMode( $token, $value, $attribs = null, $selfclose = false ) {
2891 if ( $token === 'text' ) {
// Split off a leading run of tab/LF/FF/CR/space characters, insert it as
// text, and keep the remainder in $value.
2892 if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $matches ) ) {
2893 $this->stack
->insertText( $matches[0] );
2894 $value = substr( $value, strlen( $matches[0] ) );
2896 if ( strlen( $value ) === 0 ) {
2897 return true; // All text handled.
2899 // Fall through to handle non-whitespace below.
2900 } elseif ( $token === 'tag' ) {
// The element is inserted and popped again right away.
2904 $this->stack
->insertHTMLElement( $value, $attribs );
2905 $this->stack
->pop();
2908 return $this->inHeadMode( $token, $value, $attribs, $selfclose );
2910 // Fall through for "anything else".
2911 } elseif ( $token === 'endtag' ) {
// Pop the current node and return to 'inTableMode', but only when that
// node is a 'colgroup'.
2914 if ( !$this->stack
->currentNode()->isA( 'colgroup' ) ) {
2915 return true; // Ignore the token.
2917 $this->stack
->pop();
2918 $this->switchMode( 'inTableMode' );
2921 return true; // Ignore the token.
2923 return $this->inHeadMode( $token, $value, $attribs, $selfclose );
2925 // Fall through for "anything else".
2926 } elseif ( $token === 'eof' ) {
2927 return $this->inBodyMode( $token, $value, $attribs, $selfclose );
// Anything else: ignored unless the current node is a 'colgroup';
// otherwise close it with a synthetic </colgroup> and reprocess the token.
2931 if ( !$this->stack
->currentNode()->isA( 'colgroup' ) ) {
2932 return true; // Ignore the token.
2934 $this->inColumnGroupMode( 'endtag', 'colgroup' );
2935 return $this->insertToken( $token, $value, $attribs, $selfclose );
2938 // Helper function for inTableBodyMode
// Closes the current table section: clears the stack to
// $tableBodyContextSet, pops the current node and switches to
// 'inTableMode'.
// The visible condition tests whether 'tbody', 'thead' or 'tfoot' is in
// table scope.
// NOTE(review): the statement wrapping that condition and the return
// statements are not visible in this listing; callers use the result as a
// boolean.
2939 private function endSection() {
2941 $this->stack
->inTableScope( 'tbody' ) ||
2942 $this->stack
->inTableScope( 'thead' ) ||
2943 $this->stack
->inTableScope( 'tfoot' )
2947 $this->stack
->clearToContext( BalanceSets
::$tableBodyContextSet );
2948 $this->stack
->pop();
2949 $this->switchMode( 'inTableMode' );
// Token handler for the 'inTableBodyMode' insertion mode.
//
// Start tags either open a row, synthesize a <tr> first, or close the
// section through endSection() and reprocess. End tags close the section
// or are ignored. The last visible statement delegates to the inTableMode
// rules.
// NOTE(review): the case labels selecting each branch are not visible in
// this listing.
2952 private function inTableBodyMode( $token, $value, $attribs = null, $selfclose = false ) {
2953 if ( $token === 'tag' ) {
// Clear the stack to $tableBodyContextSet, insert the element and continue
// in 'inRowMode'.
2956 $this->stack
->clearToContext( BalanceSets
::$tableBodyContextSet );
2957 $this->stack
->insertHTMLElement( $value, $attribs );
2958 $this->switchMode( 'inRowMode' );
// Synthesize an attribute-less <tr>, then reprocess this token.
2962 $this->inTableBodyMode( 'tag', 'tr', [] );
2963 $this->insertToken( $token, $value, $attribs, $selfclose );
// When endSection() returns a truthy value, the token is reprocessed.
2971 if ( $this->endSection() ) {
2972 $this->insertToken( $token, $value, $attribs, $selfclose );
2976 } elseif ( $token === 'endtag' ) {
2979 if ( $this->endSection() ) {
2980 $this->insertToken( $token, $value, $attribs, $selfclose );
// Close the section only when an element named $value is in table scope.
2986 if ( $this->stack
->inTableScope( $value ) ) {
2987 $this->endSection();
2998 return true; // Ignore the token.
3002 return $this->inTableMode( $token, $value, $attribs, $selfclose );
3005 // Helper function for inRowMode
// Closes the current table row: clears the stack to $tableRowContextSet,
// pops the current node and switches to 'inTableBodyMode'.
// Begins with a test for no 'tr' in table scope.
// NOTE(review): callers use the result as a boolean; the return statements
// are not visible in this listing.
3006 private function endRow() {
3007 if ( !$this->stack
->inTableScope( 'tr' ) ) {
3010 $this->stack
->clearToContext( BalanceSets
::$tableRowContextSet );
3011 $this->stack
->pop();
3012 $this->switchMode( 'inTableBodyMode' );
// Token handler for the 'inRowMode' insertion mode.
//
// Start tags either open a cell or close the row through endRow() and
// reprocess. End tags close the row or are ignored. The last visible
// statement delegates to the inTableMode rules.
// NOTE(review): the case labels selecting each branch are not visible in
// this listing.
3015 private function inRowMode( $token, $value, $attribs = null, $selfclose = false ) {
3016 if ( $token === 'tag' ) {
// Clear the stack to $tableRowContextSet, insert the element, switch to
// 'inCellMode' and add a marker to the active formatting list.
3020 $this->stack
->clearToContext( BalanceSets
::$tableRowContextSet );
3021 $this->stack
->insertHTMLElement( $value, $attribs );
3022 $this->switchMode( 'inCellMode' );
3023 $this->afe
->insertMarker();
// When endRow() returns a truthy value, the token is reprocessed.
3032 if ( $this->endRow() ) {
3033 $this->insertToken( $token, $value, $attribs, $selfclose );
3037 } elseif ( $token === 'endtag' ) {
3043 if ( $this->endRow() ) {
3044 $this->insertToken( $token, $value, $attribs, $selfclose );
// Part of a compound condition requiring an element named $value in table
// scope (the rest of the condition is not visible here); the token is then
// reprocessed.
3051 $this->stack
->inTableScope( $value ) &&
3054 $this->insertToken( $token, $value, $attribs, $selfclose );
3064 return true; // Ignore the token.
3068 return $this->inTableMode( $token, $value, $attribs, $selfclose );
3071 // Helper for inCellMode
// Closes the open table cell by feeding a synthetic end tag to
// inCellMode(): </td> when a 'td' is in table scope, otherwise </th> when a
// 'th' is in table scope.
// NOTE(review): callers use the result as a boolean; the return statements
// are not visible in this listing.
3072 private function endCell() {
3073 if ( $this->stack
->inTableScope( 'td' ) ) {
3074 $this->inCellMode( 'endtag', 'td' );
3076 } elseif ( $this->stack
->inTableScope( 'th' ) ) {
3077 $this->inCellMode( 'endtag', 'th' );
// Token handler for the 'inCellMode' insertion mode.
//
// Certain start tags close the cell through endCell() and are reprocessed.
// End tags close the cell themselves. The last visible statement delegates
// to the inBodyMode rules.
// NOTE(review): the case labels selecting each branch are not visible in
// this listing.
3083 private function inCellMode( $token, $value, $attribs = null, $selfclose = false ) {
3084 if ( $token === 'tag' ) {
// When endCell() returns a truthy value, the token is reprocessed.
3095 if ( $this->endCell() ) {
3096 $this->insertToken( $token, $value, $attribs, $selfclose );
3100 } elseif ( $token === 'endtag' ) {
// With $value in table scope: generate implied end tags, pop through that
// element, clear the active formatting list back to its last marker and
// return to 'inRowMode'.
3104 if ( $this->stack
->inTableScope( $value ) ) {
3105 $this->stack
->generateImpliedEndTags();
3106 $this->stack
->popTag( $value );
3107 $this->afe
->clearToMarker();
3108 $this->switchMode( 'inRowMode' );
// Same closing sequence, except that the pop targets $tableCellSet and the
// token is reprocessed afterwards.
3123 if ( $this->stack
->inTableScope( $value ) ) {
3124 $this->stack
->generateImpliedEndTags();
3125 $this->stack
->popTag( BalanceSets
::$tableCellSet );
3126 $this->afe
->clearToMarker();
3127 $this->switchMode( 'inRowMode' );
3128 $this->insertToken( $token, $value, $attribs, $selfclose );
3134 return $this->inBodyMode( $token, $value, $attribs, $selfclose );
// Placeholder handler for the 'inSelectMode' insertion mode: it takes the
// usual ( $token, $value, $attribs, $selfclose ) arguments, uses none of
// them, and only asserts a false invariant with the message 'Unimplemented'.
3139 private function inSelectMode( $token, $value, $attribs = null, $selfclose = false ) {
3140 Assert::invariant( false, 'Unimplemented' );
// Placeholder handler for the 'inSelectInTableMode' insertion mode: it
// takes the usual ( $token, $value, $attribs, $selfclose ) arguments, uses
// none of them, and only asserts a false invariant with the message
// 'Unimplemented'.
3143 private function inSelectInTableMode( $token, $value, $attribs = null, $selfclose = false ) {
3144 Assert::invariant( false, 'Unimplemented' );
// Token handler for the 'inTemplateMode' insertion mode.
//
// Text is handled by the inBodyMode rules. EOF closes a still-open
// 'template' element and reprocesses the token. Start tags are delegated to
// inHeadMode or reprocessed after switching to another mode. A $token value
// that no branch accepts reaches the Assert::invariant() "Bad token type"
// check.
// NOTE(review): the case labels selecting each branch are not visible in
// this listing.
3148 private function inTemplateMode( $token, $value, $attribs = null, $selfclose = false ) {
3149 if ( $token === 'text' ) {
3150 return $this->inBodyMode( $token, $value, $attribs, $selfclose );
3151 } elseif ( $token === 'eof' ) {
// With no 'template' element on the stack, parsing simply stops.
3152 if ( $this->stack
->indexOf( 'template' ) < 0 ) {
3153 $this->stopParsing();
// Otherwise: pop through the 'template' element, clear the active
// formatting list back to its last marker, discard the innermost template
// insertion mode, recompute the insertion mode and reprocess the EOF.
3155 $this->stack
->popTag( 'template' );
3156 $this->afe
->clearToMarker();
3157 array_pop( $this->templateInsertionModes
);
3158 $this->resetInsertionMode();
3159 $this->insertToken( $token, $value, $attribs, $selfclose );
3162 } elseif ( $token === 'tag' ) {
3174 return $this->inHeadMode( $token, $value, $attribs, $selfclose );
// Each remaining start-tag branch switches to the named mode and
// reprocesses the token there.
3181 return $this->switchModeAndReprocess(
3182 'inTableMode', $token, $value, $attribs, $selfclose
3186 return $this->switchModeAndReprocess(
3187 'inColumnGroupMode', $token, $value, $attribs, $selfclose
3191 return $this->switchModeAndReprocess(
3192 'inTableBodyMode', $token, $value, $attribs, $selfclose
3197 return $this->switchModeAndReprocess(
3198 'inRowMode', $token, $value, $attribs, $selfclose
3201 return $this->switchModeAndReprocess(
3202 'inBodyMode', $token, $value, $attribs, $selfclose
3204 } elseif ( $token === 'endtag' ) {
3207 return $this->inHeadMode( $token, $value, $attribs, $selfclose );
3211 Assert
::invariant( false, "Bad token type: $token" );