3 * An implementation of the tree building portion of the HTML5 parsing
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
24 * @author C. Scott Ananian, 2016
26 namespace MediaWiki\Tidy
;
28 use Wikimedia\Assert\Assert
;
29 use Wikimedia\Assert\ParameterAssertionException
;
31 use \IteratorAggregate
;
32 use \ReverseArrayIterator
;
35 // A note for future librarization[1] -- this file is a good candidate
36 // for splitting into an independent library, except that it is currently
37 // highly optimized for MediaWiki use. It only implements the portions
38 // of the HTML5 tree builder used by tags supported by MediaWiki, and
39 // does not contain a true tokenizer pass, instead relying on
40 // comment stripping, attribute normalization, and escaping done by
41 // the MediaWiki Sanitizer. It also deliberately avoids building
42 // a true DOM in memory, instead serializing elements to an output string
43 // as soon as possible (usually as soon as the tag is closed) to reduce
44 // its memory footprint.
46 // We've been gradually lifting some of these restrictions to handle
47 // non-sanitized output generated by extensions, but we shortcut the tokenizer
48 // for speed (primarily by splitting on `<`) and so rely on syntactic
51 // On the other hand, I've been pretty careful to note with comments in the
52 // code the places where this implementation omits features of the spec or
53 // depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
54 // implement the missing pieces and make this a standalone PHP HTML5 parser.
55 // In order to do so, some sort of MediaWiki-specific API will need
56 // to be added to (a) allow the Balancer to bypass the tokenizer,
57 // and (b) support on-the-fly flattening instead of DOM node creation.
59 // [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki
62 * Utility constants and sets for the HTML5 tree building algorithm.
63 * Sets are associative arrays indexed first by namespace and then by
64 * lower-cased tag name.
70 const HTML_NAMESPACE
= 'http://www.w3.org/1999/xhtml';
71 const MATHML_NAMESPACE
= 'http://www.w3.org/1998/Math/MathML';
72 const SVG_NAMESPACE
= 'http://www.w3.org/2000/svg';
74 public static $unsupportedSet = [
75 self
::HTML_NAMESPACE
=> [
76 'html' => true, 'head' => true, 'body' => true, 'frameset' => true,
78 'plaintext' => true, 'isindex' => true,
79 'xmp' => true, 'iframe' => true, 'noembed' => true,
80 'noscript' => true, 'script' => true,
85 public static $emptyElementSet = [
86 self
::HTML_NAMESPACE
=> [
87 'area' => true, 'base' => true, 'basefont' => true,
88 'bgsound' => true, 'br' => true, 'col' => true, 'command' => true,
89 'embed' => true, 'frame' => true, 'hr' => true, 'img' => true,
90 'input' => true, 'keygen' => true, 'link' => true, 'meta' => true,
91 'param' => true, 'source' => true, 'track' => true, 'wbr' => true
95 public static $extraLinefeedSet = [
96 self
::HTML_NAMESPACE
=> [
97 'pre' => true, 'textarea' => true, 'listing' => true,
101 public static $headingSet = [
102 self
::HTML_NAMESPACE
=> [
103 'h1' => true, 'h2' => true, 'h3' => true,
104 'h4' => true, 'h5' => true, 'h6' => true
108 public static $specialSet = [
109 self
::HTML_NAMESPACE
=> [
110 'address' => true, 'applet' => true, 'area' => true,
111 'article' => true, 'aside' => true, 'base' => true,
112 'basefont' => true, 'bgsound' => true, 'blockquote' => true,
113 'body' => true, 'br' => true, 'button' => true, 'caption' => true,
114 'center' => true, 'col' => true, 'colgroup' => true, 'dd' => true,
115 'details' => true, 'dir' => true, 'div' => true, 'dl' => true,
116 'dt' => true, 'embed' => true, 'fieldset' => true,
117 'figcaption' => true, 'figure' => true, 'footer' => true,
118 'form' => true, 'frame' => true, 'frameset' => true, 'h1' => true,
119 'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true,
120 'h6' => true, 'head' => true, 'header' => true, 'hgroup' => true,
121 'hr' => true, 'html' => true, 'iframe' => true, 'img' => true,
122 'input' => true, 'isindex' => true, 'li' => true, 'link' => true,
123 'listing' => true, 'main' => true, 'marquee' => true,
124 'menu' => true, 'menuitem' => true, 'meta' => true, 'nav' => true,
125 'noembed' => true, 'noframes' => true, 'noscript' => true,
126 'object' => true, 'ol' => true, 'p' => true, 'param' => true,
127 'plaintext' => true, 'pre' => true, 'script' => true,
128 'section' => true, 'select' => true, 'source' => true,
129 'style' => true, 'summary' => true, 'table' => true,
130 'tbody' => true, 'td' => true, 'template' => true,
131 'textarea' => true, 'tfoot' => true, 'th' => true, 'thead' => true,
132 'title' => true, 'tr' => true, 'track' => true, 'ul' => true,
133 'wbr' => true, 'xmp' => true
135 self
::SVG_NAMESPACE
=> [
136 'foreignobject' => true, 'desc' => true, 'title' => true
138 self
::MATHML_NAMESPACE
=> [
139 'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
140 'mtext' => true, 'annotation-xml' => true
144 public static $addressDivPSet = [
145 self
::HTML_NAMESPACE
=> [
146 'address' => true, 'div' => true, 'p' => true
150 public static $tableSectionRowSet = [
151 self
::HTML_NAMESPACE
=> [
152 'table' => true, 'thead' => true, 'tbody' => true,
153 'tfoot' => true, 'tr' => true
157 public static $impliedEndTagsSet = [
158 self
::HTML_NAMESPACE
=> [
159 'dd' => true, 'dt' => true, 'li' => true, 'optgroup' => true,
160 'option' => true, 'p' => true, 'rb' => true, 'rp' => true,
161 'rt' => true, 'rtc' => true
165 public static $thoroughImpliedEndTagsSet = [
166 self
::HTML_NAMESPACE
=> [
167 'caption' => true, 'colgroup' => true, 'dd' => true, 'dt' => true,
168 'li' => true, 'optgroup' => true, 'option' => true, 'p' => true,
169 'rb' => true, 'rp' => true, 'rt' => true, 'rtc' => true,
170 'tbody' => true, 'td' => true, 'tfoot' => true, 'th' => true,
171 'thead' => true, 'tr' => true
175 public static $tableCellSet = [
176 self
::HTML_NAMESPACE
=> [
177 'td' => true, 'th' => true
180 public static $tableContextSet = [
181 self
::HTML_NAMESPACE
=> [
182 'table' => true, 'template' => true, 'html' => true
186 public static $tableBodyContextSet = [
187 self
::HTML_NAMESPACE
=> [
188 'tbody' => true, 'tfoot' => true, 'thead' => true,
189 'template' => true, 'html' => true
193 public static $tableRowContextSet = [
194 self
::HTML_NAMESPACE
=> [
195 'tr' => true, 'template' => true, 'html' => true
199 // See https://html.spec.whatwg.org/multipage/forms.html#form-associated-element
200 public static $formAssociatedSet = [
201 self
::HTML_NAMESPACE
=> [
202 'button' => true, 'fieldset' => true, 'input' => true,
203 'keygen' => true, 'object' => true, 'output' => true,
204 'select' => true, 'textarea' => true, 'img' => true
208 public static $inScopeSet = [
209 self
::HTML_NAMESPACE
=> [
210 'applet' => true, 'caption' => true, 'html' => true,
211 'marquee' => true, 'object' => true,
212 'table' => true, 'td' => true, 'template' => true,
215 self
::SVG_NAMESPACE
=> [
216 'foreignobject' => true, 'desc' => true, 'title' => true
218 self
::MATHML_NAMESPACE
=> [
219 'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
220 'mtext' => true, 'annotation-xml' => true
224 private static $inListItemScopeSet = null;
/**
 * Return the "list item scope" set: a copy of self::$inScopeSet with the
 * HTML 'ol' and 'ul' elements added. The set is built on first use and
 * cached in self::$inListItemScopeSet.
 *
 * @return array Set indexed by namespace, then by tag name.
 */
public static function inListItemScopeSet() {
	if ( self::$inListItemScopeSet !== null ) {
		// Already built by an earlier call.
		return self::$inListItemScopeSet;
	}
	// PHP arrays are copied by value, so self::$inScopeSet is untouched.
	$scope = self::$inScopeSet;
	$scope[self::HTML_NAMESPACE]['ol'] = true;
	$scope[self::HTML_NAMESPACE]['ul'] = true;
	self::$inListItemScopeSet = $scope;
	return self::$inListItemScopeSet;
}
234 private static $inButtonScopeSet = null;
/**
 * Return the "button scope" set: a copy of self::$inScopeSet with the
 * HTML 'button' element added. The set is built on first use and cached
 * in self::$inButtonScopeSet.
 *
 * @return array Set indexed by namespace, then by tag name.
 */
public static function inButtonScopeSet() {
	if ( self::$inButtonScopeSet !== null ) {
		// Already built by an earlier call.
		return self::$inButtonScopeSet;
	}
	// PHP arrays are copied by value, so self::$inScopeSet is untouched.
	$scope = self::$inScopeSet;
	$scope[self::HTML_NAMESPACE]['button'] = true;
	self::$inButtonScopeSet = $scope;
	return self::$inButtonScopeSet;
}
243 public static $inTableScopeSet = [
244 self
::HTML_NAMESPACE
=> [
245 'html' => true, 'table' => true, 'template' => true
249 public static $inInvertedSelectScopeSet = [
250 self
::HTML_NAMESPACE
=> [
251 'option' => true, 'optgroup' => true
255 public static $mathmlTextIntegrationPointSet = [
256 self
::MATHML_NAMESPACE
=> [
257 'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
262 public static $htmlIntegrationPointSet = [
263 self
::SVG_NAMESPACE
=> [
264 'foreignobject' => true,
270 // For tidy compatibility.
271 public static $tidyPWrapSet = [
272 self
::HTML_NAMESPACE
=> [
273 'body' => true, 'blockquote' => true,
274 // We parse with <body> as the fragment context, but the top-level
275 // element on the stack is actually <html>. We could use the
276 // "adjusted current node" everywhere to work around this, but it's
277 // easier just to add <html> to the p-wrap set.
281 public static $tidyInlineSet = [
282 self
::HTML_NAMESPACE
=> [
283 'a' => true, 'abbr' => true, 'acronym' => true, 'applet' => true,
284 'b' => true, 'basefont' => true, 'bdo' => true, 'big' => true,
285 'br' => true, 'button' => true, 'cite' => true, 'code' => true,
286 'dfn' => true, 'em' => true, 'font' => true, 'i' => true,
287 'iframe' => true, 'img' => true, 'input' => true, 'kbd' => true,
288 'label' => true, 'legend' => true, 'map' => true, 'object' => true,
289 'param' => true, 'q' => true, 'rb' => true, 'rbc' => true,
290 'rp' => true, 'rt' => true, 'rtc' => true, 'ruby' => true,
291 's' => true, 'samp' => true, 'select' => true, 'small' => true,
292 'span' => true, 'strike' => true, 'strong' => true, 'sub' => true,
293 'sup' => true, 'textarea' => true, 'tt' => true, 'u' => true,
300 * A BalanceElement is a simplified version of a DOM Node. The main
301 * difference is that we only keep BalanceElements around for nodes
302 * currently on the BalanceStack of open elements. As soon as an
303 * element is closed, with some minor exceptions relating to the
304 * tree builder "adoption agency algorithm", the element and all its
305 * children are serialized to a string using the flatten() method.
306 * This keeps our memory usage low.
311 class BalanceElement
{
313 * The namespace of the element.
314 * @var string $namespaceURI
316 public $namespaceURI;
318 * The lower-cased name of the element.
319 * @var string $localName
323 * Attributes for the element, in array form
324 * @var array $attribs
329 * Parent of this element, or the string "flat" if this element has
330 * already been flattened into its parent.
331 * @var BalanceElement|string|null $parent
336 * An array of children of this element. Typically only the last
337 * child will be an actual BalanceElement object; the rest will
338 * be strings, representing either text nodes or flattened
339 * BalanceElement objects.
340 * @var array $children
345 * A unique string identifier for Noah's Ark purposes, lazy initialized
350 * The next active formatting element in the list, or null if this is the
351 * end of the AFE list or if the element is not in the AFE list.
356 * The previous active formatting element in the list, or null if this is
357 * the start of the list or if the element is not in the AFE list.
362 * The next element in the Noah's Ark species bucket.
/**
 * Create the BalanceElement standing in for the HTML DOM Element with
 * the given namespace, local name and attributes. A new element has no
 * parent, no children and empty contents.
 *
 * @param string $namespaceURI The namespace of the element.
 * @param string $localName The lowercased name of the tag.
 * @param array $attribs Attributes of the element
 */
public function __construct( $namespaceURI, $localName, array $attribs ) {
	// What the element is.
	$this->localName = $localName;
	$this->namespaceURI = $namespaceURI;
	$this->attribs = $attribs;

	// Where the element is: nowhere yet.
	$this->contents = '';
	$this->parent = null;
	$this->children = [];
}
384 * Remove the given child from this element.
385 * @param BalanceElement $elt
387 private function removeChild( BalanceElement
$elt ) {
388 Assert
::precondition(
389 $this->parent
!== 'flat', "Can't removeChild after flattening $this"
392 $elt->parent
=== $this, 'elt', 'must have $this as a parent'
394 $idx = array_search( $elt, $this->children
, true );
395 Assert
::parameter( $idx !== false, '$elt', 'must be a child of $this' );
397 array_splice( $this->children
, $idx, 1 );
401 * Find $a in the list of children and insert $b before it.
402 * @param BalanceElement $a
403 * @param BalanceElement|string $b
405 public function insertBefore( BalanceElement
$a, $b ) {
406 Assert
::precondition(
407 $this->parent
!== 'flat', "Can't insertBefore after flattening."
409 $idx = array_search( $a, $this->children
, true );
410 Assert
::parameter( $idx !== false, '$a', 'must be a child of $this' );
411 if ( is_string( $b ) ) {
412 array_splice( $this->children
, $idx, 0, [ $b ] );
414 Assert
::parameter( $b->parent
!== 'flat', '$b', "Can't be flat" );
415 if ( $b->parent
!== null ) {
416 $b->parent
->removeChild( $b );
418 array_splice( $this->children
, $idx, 0, [ $b ] );
/**
 * Add $elt as the last child of this element.
 *
 * A string is stored as-is. A BalanceElement is first detached from its
 * current parent (if any) and then re-parented to $this.
 *
 * @param BalanceElement|string $elt
 */
public function appendChild( $elt ) {
	Assert::precondition(
		$this->parent !== 'flat', "Can't appendChild after flattening."
	);
	if ( !is_string( $elt ) ) {
		// Remove $elt from parent, if it had one.
		$oldParent = $elt->parent;
		if ( $oldParent !== null ) {
			$oldParent->removeChild( $elt );
		}
	}
	$this->children[] = $elt;
	if ( !is_string( $elt ) ) {
		$elt->parent = $this;
	}
}
444 * Transfer all of the children of $elt to $this.
445 * @param BalanceElement $elt
447 public function adoptChildren( BalanceElement
$elt ) {
448 Assert
::precondition(
449 $elt->parent
!== 'flat', "Can't adoptChildren after flattening."
451 foreach ( $elt->children
as $child ) {
452 if ( !is_string( $child ) ) {
453 // This is an optimization which avoids an O(n^2) set of
454 // array_splice operations.
455 $child->parent
= null;
457 $this->appendChild( $child );
463 * Flatten this node and all of its children into a string, as specified
464 * by the HTML serialization specification, and replace this node
465 * in its parent by that string.
469 public function flatten( $tidyCompat = false ) {
470 Assert
::parameter( $this->parent
!== null, '$this', 'must be a child' );
471 Assert
::parameter( $this->parent
!== 'flat', '$this', 'already flat' );
472 $idx = array_search( $this, $this->parent
->children
, true );
474 $idx !== false, '$this', 'must be a child of its parent'
478 foreach ( $this->children
as $elt ) {
479 if ( !is_string( $elt ) ) {
480 $elt = $elt->flatten( $tidyCompat );
482 if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) {
486 if ( $this->isHtmlNamed( 'mw:p-wrap' ) ) {
487 $this->localName
= 'p';
488 } elseif ( $blank ) {
489 // Add 'mw-empty-elt' class so elements can be hidden via CSS
490 // for compatibility with legacy tidy.
491 if ( !count( $this->attribs
) &&
492 ( $this->localName
=== 'tr' ||
$this->localName
=== 'li' )
494 $this->attribs
= [ 'class' => "mw-empty-elt" ];
498 $flat = $blank ?
'' : "{$this}";
502 $this->parent
->children
[$idx] = $flat;
503 $this->parent
= 'flat'; // for assertion checking
508 * Serialize this node and all of its children to a string, as specified
509 * by the HTML serialization specification.
511 * @return string The serialization of the BalanceElement
512 * @see https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments
514 public function __toString() {
516 foreach ( $this->attribs
as $name => $value ) {
517 $encValue = Sanitizer
::encodeAttribute( $value );
518 $encAttribs .= " $name=\"$encValue\"";
520 if ( !$this->isA( BalanceSets
::$emptyElementSet ) ) {
521 $out = "<{$this->localName}{$encAttribs}>";
522 $len = strlen( $out );
524 foreach ( $this->children
as $elt ) {
527 $out .= "</{$this->localName}>";
529 $this->isA( BalanceSets
::$extraLinefeedSet ) &&
532 // Double the linefeed after pre/listing/textarea
533 // according to the HTML5 fragment serialization algorithm.
534 $out = substr( $out, 0, $len +
1 ) .
535 substr( $out, $len );
538 $out = "<{$this->localName}{$encAttribs} />";
540 count( $this->children
) === 0,
541 "Empty elements shouldn't have children."
547 // Utility functions on BalanceElements.
/**
 * Test $this against a target, which may be another BalanceElement
 * (identity comparison), a tag set indexed by namespace and then by
 * local name (membership test), or a string (treated as an HTML tag
 * name).
 *
 * @param BalanceElement|array|string $set The target BalanceElement,
 *   set (from the BalanceSets class), or string (HTML tag name).
 * @return bool
 */
public function isA( $set ) {
	if ( $set instanceof BalanceElement ) {
		return $set === $this;
	}
	if ( is_array( $set ) ) {
		// A nested isset() is false when either level is missing.
		return isset( $set[$this->namespaceURI][$this->localName] );
	}
	// Anything else is assumed to be an HTML element name.
	return $this->isHtmlNamed( $set );
}
/**
 * Check whether this element is in the HTML namespace and has exactly
 * the given local name.
 *
 * @param string $tagName
 * @return bool
 */
public function isHtmlNamed( $tagName ) {
	if ( $this->namespaceURI !== BalanceSets::HTML_NAMESPACE ) {
		return false;
	}
	return $this->localName === $tagName;
}
/**
 * Check whether this element's namespace is the HTML namespace.
 *
 * @return bool
 */
public function isHtml() {
	$ns = $this->namespaceURI;
	return BalanceSets::HTML_NAMESPACE === $ns;
}
/**
 * Check whether this element is a MathML text integration point, i.e.
 * a member of BalanceSets::$mathmlTextIntegrationPointSet.
 *
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#mathml-text-integration-point
 */
public function isMathmlTextIntegrationPoint() {
	$integrationPoints = BalanceSets::$mathmlTextIntegrationPointSet;
	return $this->isA( $integrationPoints );
}
600 * Determine if $this represents an HTML integration point,
601 * as defined in the HTML5 specification.
604 * @see https://html.spec.whatwg.org/multipage/syntax.html#html-integration-point
606 public function isHtmlIntegrationPoint() {
607 if ( $this->isA( BalanceSets
::$htmlIntegrationPointSet ) ) {
611 $this->namespaceURI
=== BalanceSets
::MATHML_NAMESPACE
&&
612 $this->localName
=== 'annotation-xml' &&
613 isset( $this->attribs
['encoding'] ) &&
614 ( strcasecmp( $this->attribs
['encoding'], 'text/html' ) == 0 ||
615 strcasecmp( $this->attribs
['encoding'], 'application/xhtml+xml' ) == 0 )
623 * Get a string key for the Noah's Ark algorithm
625 public function getNoahKey() {
626 if ( $this->noahKey
=== null ) {
627 $attribs = $this->attribs
;
629 $this->noahKey
= serialize( [ $this->namespaceURI
, $this->localName
, $attribs ] );
631 return $this->noahKey
;
636 * The "stack of open elements" as defined in the HTML5 tree builder
637 * spec. This contains methods to ensure that content (start tags, text)
638 * are inserted at the correct place in the output string, and to
639 * flatten BalanceElements as they are closed to avoid holding onto
640 * a complete DOM tree for the document in memory.
642 * The stack defines a PHP iterator to traverse it in "reverse order",
643 * that is, the most-recently-added element is visited first in a
648 * @see https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
650 class BalanceStack
implements IteratorAggregate
{
652 * Backing storage for the stack.
653 * @var array $elements
655 private $elements = [];
657 * Foster parent mode determines how nodes are inserted into the
659 * @var bool $fosterParentMode
660 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
662 public $fosterParentMode = false;
664 * Tidy compatibility mode, determines behavior of body/blockquote
666 public $tidyCompat = false;
668 * Reference to the current element
673 * Create a new BalanceStack with a single BalanceElement on it,
674 * representing the root <html> node.
676 public function __construct() {
677 // always a root <html> element on the stack
680 new BalanceElement( BalanceSets
::HTML_NAMESPACE
, 'html', [] )
682 $this->currentNode
= $this->elements
[0];
686 * Return a string representing the output of the tree builder:
687 * all the children of the root <html> node.
690 public function getOutput() {
691 // Don't include the outer '<html>....</html>'
693 foreach ( $this->elements
[0]->children
as $elt ) {
694 $out .= is_string( $elt ) ?
$elt :
695 $elt->flatten( $this->tidyCompat
);
/**
 * Insert a comment at the appropriate place for inserting a node.
 *
 * The comment is serialized as '<!--' . $value . '-->' and handed to
 * insertText() with the comment flag set.
 *
 * @param string $value Content of the comment.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-comment
 */
public function insertComment( $value ) {
	// Just another type of text node, except for tidy p-wrapping.
	$comment = '<!--' . $value . '-->';
	return $this->insertText( $comment, true );
}
/**
 * Insert text at the appropriate place for inserting a node.
 *
 * The cases are checked in this order:
 *  - foster parent mode is on and the current node is in
 *    BalanceSets::$tableSectionRowSet: the text is foster-parented;
 *  - tidy compatibility is on, $value is not a comment, and the current
 *    node is in BalanceSets::$tidyPWrapSet: an 'mw:p-wrap' element is
 *    inserted first and the insertion is retried;
 *  - otherwise the text is appended to the current node.
 *
 * @param string $value
 * @param bool $isComment True when $value is a serialized comment (as
 *   passed by insertComment()); comments never trigger p-wrapping.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertText( $value, $isComment = false ) {
	if (
		$this->fosterParentMode &&
		$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
	) {
		$this->fosterParent( $value );
	} elseif (
		$this->tidyCompat && !$isComment &&
		$this->currentNode->isA( BalanceSets::$tidyPWrapSet )
	) {
		// Spelled as declared; the previous 'insertHTMLELement' spelling
		// resolved only because PHP method names are case-insensitive.
		$this->insertHTMLElement( 'mw:p-wrap', [] );
		return $this->insertText( $value );
	} else {
		$this->currentNode->appendChild( $value );
	}
}
/**
 * Create a BalanceElement in the given namespace and insert it at the
 * appropriate place via insertElement().
 *
 * @param string $namespaceURI The element namespace
 * @param string $tag The tag name
 * @param array $attribs Attributes of the element, passed unchanged to
 *   the BalanceElement constructor (which requires an array).
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-foreign-element
 */
public function insertForeignElement( $namespaceURI, $tag, $attribs ) {
	$elt = new BalanceElement( $namespaceURI, $tag, $attribs );
	return $this->insertElement( $elt );
}
/**
 * Insert an element in the HTML namespace at the appropriate place, by
 * delegating to insertForeignElement().
 *
 * @param string $tag The tag name
 * @param array $attribs Attributes of the element, in array form.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-an-html-element
 */
public function insertHTMLElement( $tag, $attribs ) {
	$ns = BalanceSets::HTML_NAMESPACE;
	return $this->insertForeignElement( $ns, $tag, $attribs );
}
762 * Insert an element at the appropriate place and push it on to the
763 * open elements stack.
764 * @param BalanceElement $elt
765 * @return BalanceElement
766 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
768 public function insertElement( BalanceElement
$elt ) {
770 $this->currentNode
->isHtmlNamed( 'mw:p-wrap' ) &&
771 !$elt->isA( BalanceSets
::$tidyInlineSet )
773 // Tidy compatibility.
777 $this->fosterParentMode
&&
778 $this->currentNode
->isA( BalanceSets
::$tableSectionRowSet )
780 $elt = $this->fosterParent( $elt );
782 $this->currentNode
->appendChild( $elt );
784 Assert
::invariant( $elt->parent
!== null, "$elt must be in tree" );
785 Assert
::invariant( $elt->parent
!== 'flat', "$elt must not have been previous flattened" );
786 array_push( $this->elements
, $elt );
787 $this->currentNode
= $elt;
/**
 * Check whether the stack has $tag in scope, using
 * BalanceSets::$inScopeSet as the scope boundary set.
 *
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-scope
 */
public function inScope( $tag ) {
	$boundary = BalanceSets::$inScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
/**
 * Check whether the stack has $tag in button scope, using
 * BalanceSets::inButtonScopeSet() as the scope boundary set.
 *
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-button-scope
 */
public function inButtonScope( $tag ) {
	$boundary = BalanceSets::inButtonScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
/**
 * Check whether the stack has $tag in list item scope, using
 * BalanceSets::inListItemScopeSet() as the scope boundary set.
 *
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-list-item-scope
 */
public function inListItemScope( $tag ) {
	$boundary = BalanceSets::inListItemScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
/**
 * Check whether the stack has $tag in table scope, using
 * BalanceSets::$inTableScopeSet as the scope boundary set.
 *
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-table-scope
 */
public function inTableScope( $tag ) {
	$boundary = BalanceSets::$inTableScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
832 * Determine if the stack has $tag in select scope.
833 * @param BalanceElement|array|string $tag
835 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-select-scope
837 public function inSelectScope( $tag ) {
838 // Can't use inSpecificScope to implement this, since it involves
839 // *inverting* a set of tags. Implement manually.
840 foreach ( $this as $elt ) {
841 if ( $elt->isA( $tag ) ) {
844 if ( !$elt->isA( BalanceSets
::$inInvertedSelectScopeSet ) ) {
852 * Determine if the stack has $tag in a specific scope, $set.
853 * @param BalanceElement|array|string $tag
854 * @param BalanceElement|array|string $set
856 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-the-specific-scope
858 public function inSpecificScope( $tag, $set ) {
859 foreach ( $this as $elt ) {
860 if ( $elt->isA( $tag ) ) {
863 if ( $elt->isA( $set ) ) {
871 * Generate implied end tags.
872 * @param string|null $butnot
873 * @param bool $thorough True if we should generate end tags thoroughly.
874 * @see https://html.spec.whatwg.org/multipage/syntax.html#generate-implied-end-tags
876 public function generateImpliedEndTags( $butnot = null, $thorough = false ) {
877 $endTagSet = $thorough ?
878 BalanceSets
::$thoroughImpliedEndTagsSet :
879 BalanceSets
::$impliedEndTagsSet;
880 while ( $this->currentNode
) {
881 if ( $butnot !== null && $this->currentNode
->isHtmlNamed( $butnot ) ) {
884 if ( !$this->currentNode
->isA( $endTagSet ) ) {
/**
 * Return the adjusted current node: $fragmentContext when one is given
 * and the stack holds exactly one element, otherwise the current node.
 *
 * @param mixed $fragmentContext Fragment context, or a falsy value if
 *   there is none (presumably a BalanceElement; confirm against callers).
 * @return mixed $fragmentContext or the current node
 */
public function adjustedCurrentNode( $fragmentContext ) {
	if ( $fragmentContext && count( $this->elements ) === 1 ) {
		return $fragmentContext;
	}
	return $this->currentNode;
}
/**
 * Return an iterator over this stack which visits the current node
 * first, and the root node last.
 *
 * @return ReverseArrayIterator
 */
public function getIterator() {
	$reversed = new ReverseArrayIterator( $this->elements );
	return $reversed;
}
/**
 * Look up the stack entry at position $idx; position 0 is the root
 * element.
 *
 * @param int $idx
 * @return BalanceElement
 */
public function node( $idx ) {
	$elt = $this->elements[ $idx ];
	return $elt;
}
/**
 * Overwrite the stack entry at position $idx with $elt. Neither the
 * entry being replaced nor $elt may already have been flattened. If
 * $idx is the top of the stack, $elt also becomes the current node.
 *
 * @param int $idx
 * @param BalanceElement $elt
 */
public function replaceAt( $idx, BalanceElement $elt ) {
	$previous = $this->elements[$idx];
	Assert::precondition(
		$previous->parent !== 'flat',
		'Replaced element should not have already been flattened.'
	);
	Assert::precondition(
		$elt->parent !== 'flat',
		'New element should not have already been flattened.'
	);
	$this->elements[$idx] = $elt;
	// Keep the cached current node in step with the top of the stack.
	$top = count( $this->elements ) - 1;
	if ( $idx === $top ) {
		$this->currentNode = $elt;
	}
}
939 * Return the position of the given BalanceElement, set, or
940 * HTML tag name string in the BalanceStack.
941 * @param BalanceElement|array|string $tag
944 public function indexOf( $tag ) {
945 for ( $i = count( $this->elements
) - 1; $i >= 0; $i-- ) {
946 if ( $this->elements
[$i]->isA( $tag ) ) {
/**
 * Report how many elements are currently on the BalanceStack.
 *
 * @return int
 */
public function length() {
	$size = count( $this->elements );
	return $size;
}
/**
 * Remove the current node from the BalanceStack and make the new top of
 * the stack (or null, if the stack is now empty) the current node.
 *
 * The removed element is flattened, unless it is an 'mw:p-wrap'
 * element, which is left unflattened.
 */
public function pop() {
	$elt = array_pop( $this->elements );
	$remaining = count( $this->elements );
	$this->currentNode = $remaining ?
		$this->elements[ $remaining - 1 ] :
		null;
	if ( $elt->isHtmlNamed( 'mw:p-wrap' ) ) {
		return;
	}
	$elt->flatten( $this->tidyCompat );
}
978 * Remove all nodes up to and including position $idx from the
979 * BalanceStack, flattening them in the process.
982 public function popTo( $idx ) {
983 for ( $length = count( $this->elements
); $length > $idx; $length-- ) {
989 * Pop elements off the stack up to and including the first
990 * element with the specified HTML tagname (or matching the given
992 * @param BalanceElement|array|string $tag
994 public function popTag( $tag ) {
995 while ( $this->currentNode
) {
996 if ( $this->currentNode
->isA( $tag ) ) {
1005 * Pop elements off the stack *not including* the first element
1006 * in the specified set.
1007 * @param BalanceElement|array|string $set
1009 public function clearToContext( $set ) {
1010 // Note that we don't loop to 0. Never pop the <html> elt off.
1011 for ( $length = count( $this->elements
); $length > 1; $length-- ) {
1012 if ( $this->currentNode
->isA( $set ) ) {
1020 * Remove the given $elt from the BalanceStack, optionally
1021 * flattening it in the process.
1022 * @param BalanceElement $elt The element to remove.
1023 * @param bool $flatten Whether to flatten the removed element.
1025 public function removeElement( BalanceElement
$elt, $flatten = true ) {
1027 $elt->parent
!== 'flat',
1029 '$elt should not already have been flattened.'
1032 $elt->parent
->parent
!== 'flat',
1034 'The parent of $elt should not already have been flattened.'
1036 $idx = array_search( $elt, $this->elements
, true );
1037 Assert
::parameter( $idx !== false, '$elt', 'must be in stack' );
1038 array_splice( $this->elements
, $idx, 1 );
1039 if ( $idx === count( $this->elements
) ) {
1040 $this->currentNode
= $this->elements
[$idx - 1];
1043 // serialize $elt into its parent
1044 // otherwise, it will eventually serialize when the parent
1045 // is serialized, we just hold onto the memory for its
1046 // tree of objects a little longer.
1047 $elt->flatten( $this->tidyCompat
);
1049 Assert
::postcondition(
1050 array_search( $elt, $this->elements
, true ) === false,
1051 '$elt should no longer be in open elements stack'
/**
 * Find $a in the BalanceStack and insert $b after it.
 *
 * If $a is the top of the stack, $b is pushed and becomes the current
 * node; otherwise $b is spliced in directly above $a and the current
 * node is unchanged.
 *
 * @param BalanceElement $a Element already on the stack.
 * @param BalanceElement $b Element to insert after $a.
 */
public function insertAfter( BalanceElement $a, BalanceElement $b ) {
	$idx = $this->indexOf( $a );
	// fosterParent() tests indexOf() results with `>= 0` / `< 0`, i.e. a
	// miss is a negative index. Comparing against `false` alone never
	// fired, so a missing $a (index -1) either took the push branch on an
	// empty stack or spliced $b in at position 0.
	Assert::parameter(
		$idx !== false && $idx >= 0, '$a', 'must be in stack'
	);
	if ( $idx === count( $this->elements ) - 1 ) {
		array_push( $this->elements, $b );
		$this->currentNode = $b;
	} else {
		array_splice( $this->elements, $idx + 1, 0, [ $b ] );
	}
}
1071 // Fostering and adoption.
1074 * Foster parent the given $elt in the stack of open elements.
1075 * @param BalanceElement|string $elt
1076 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
// Foster-parent $elt (a BalanceElement or a text string): choose a parent
// from the last `template` / `table` on the open-elements stack, falling
// back to the root `html` element, then insert $elt before the table or
// append it to the chosen parent.
// NOTE(review): this listing appears to have lost several lines of this
// method (the plain `else` arms, the opener of the assertion call, the
// head of the `$idx = ... ? ... : ...` expression and the closing braces /
// returns) -- confirm against the upstream file before changing logic.
1078 private function fosterParent( $elt ) {
1079 $lastTable = $this->indexOf( 'table' );
1080 $lastTemplate = $this->indexOf( 'template' );
// A negative index means "not on the stack". A template wins only when it
// sits above (after) the last table.
1084 if ( $lastTemplate >= 0 && ( $lastTable < 0 ||
$lastTemplate > $lastTable ) ) {
1085 $parent = $this->elements
[$lastTemplate];
1086 } elseif ( $lastTable >= 0 ) {
1087 $parent = $this->elements
[$lastTable]->parent
;
1088 // Assume all tables have parents, since we're not running scripts!
1090 $parent !== null, "All tables should have parents"
1092 $before = $this->elements
[$lastTable];
1094 $parent = $this->elements
[0]; // the `html` element.
// tidyCompat only: maintain the `mw:p-wrap` pseudo-elements while fostering.
1097 if ( $this->tidyCompat
) {
1098 if ( is_string( $elt ) ) {
1099 // We're fostering text: do we need a p-wrapper?
1100 if ( $parent->isA( BalanceSets
::$tidyPWrapSet ) ) {
1101 $this->insertHTMLElement( 'mw:p-wrap', [] );
1102 $this->insertText( $elt );
1106 // We're fostering an element; do we need to merge p-wrappers?
1107 if ( $elt->isHtmlNamed( 'mw:p-wrap' ) ) {
1109 array_search( $before, $parent->children
, true ) :
1110 count( $parent->children
);
1111 $after = $idx > 0 ?
$parent->children
[$idx - 1] : '';
1113 $after instanceof BalanceElement
&&
1114 $after->isHtmlNamed( 'mw:p-wrap' )
1116 return $after; // Re-use existing p-wrapper.
1123 $parent->insertBefore( $before, $elt );
1125 $parent->appendChild( $elt );
1131 * Run the "adoption agency algorithm" (AAA) for the given subject
1133 * @param string $tag The subject tag name.
1134 * @param BalanceActiveFormattingElements $afe The current
1135 * active formatting elements list.
1136 * @return true if the adoption agency algorithm "did something", false
1137 * if more processing is required by the caller.
1138 * @see https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
// Adoption agency algorithm for the end tag named $tag, operating on this
// open-elements stack and the active formatting elements list $afe.
// Returns true when the end tag has been fully handled here, false when the
// caller must fall back to its "any other end tag" handling.
// NOTE(review): this listing appears to have lost a number of lines of this
// method (several `if (` openers, `return`/`break` statements, the
// `$lastNode = $node;` step and closing braces) -- confirm against the
// upstream file before changing logic.
1140 public function adoptionAgency( $tag, $afe ) {
1141 // If the current node is an HTML element whose tag name is subject,
1142 // and the current node is not in the list of active formatting
1143 // elements, then pop the current node off the stack of open
1144 // elements and abort these steps.
1146 $this->currentNode
->isHtmlNamed( $tag ) &&
1147 !$afe->isInList( $this->currentNode
)
1150 return true; // no more handling required
1153 // Outer loop: If outer loop counter is greater than or
1154 // equal to eight, then abort these steps.
1155 for ( $outer = 0; $outer < 8; $outer++
) {
1156 // Let the formatting element be the last element in the list
1157 // of active formatting elements that: is between the end of
1158 // the list and the last scope marker in the list, if any, or
1159 // the start of the list otherwise, and has the same tag name
1161 $fmtElt = $afe->findElementByTag( $tag );
1163 // If there is no such node, then abort these steps and instead
1164 // act as described in the "any other end tag" entry below.
1166 return false; // false means handle by the default case
1169 // Otherwise, if there is such a node, but that node is not in
1170 // the stack of open elements, then this is a parse error;
1171 // remove the element from the list, and abort these steps.
1172 $index = $this->indexOf( $fmtElt );
1174 $afe->remove( $fmtElt );
1175 return true; // true means no more handling required
1178 // Otherwise, if there is such a node, and that node is also in
1179 // the stack of open elements, but the element is not in scope,
1180 // then this is a parse error; ignore the token, and abort
1182 if ( !$this->inScope( $fmtElt ) ) {
1186 // Let the furthest block be the topmost node in the stack of
1187 // open elements that is lower in the stack than the formatting
1188 // element, and is an element in the special category. There
1189 // might not be one.
1190 $furthestBlock = null;
1191 $furthestBlockIndex = -1;
1192 $stackLength = $this->length();
// Scan from just below the formatting element towards the current node.
1193 for ( $i = $index+
1; $i < $stackLength; $i++
) {
1194 if ( $this->node( $i )->isA( BalanceSets
::$specialSet ) ) {
1195 $furthestBlock = $this->node( $i );
1196 $furthestBlockIndex = $i;
1201 // If there is no furthest block, then the UA must skip the
1202 // subsequent steps and instead just pop all the nodes from the
1203 // bottom of the stack of open elements, from the current node
1204 // up to and including the formatting element, and remove the
1205 // formatting element from the list of active formatting
1207 if ( !$furthestBlock ) {
1208 $this->popTag( $fmtElt );
1209 $afe->remove( $fmtElt );
1213 // Let the common ancestor be the element immediately above
1214 // the formatting element in the stack of open elements.
1215 $ancestor = $this->node( $index-1 );
1217 // Let a bookmark note the position of the formatting
1218 // element in the list of active formatting elements
1219 // relative to the elements on either side of it in the
// The bookmark is a throwaway placeholder element inserted into $afe; it is
// swapped for the real replacement element near the end of the iteration.
1221 $BOOKMARK = new BalanceElement( '[bookmark]', '[bookmark]', [] );
1222 $afe->insertAfter( $fmtElt, $BOOKMARK );
1224 // Let node and last node be the furthest block.
1225 $node = $furthestBlock;
1226 $lastNode = $furthestBlock;
1227 $nodeIndex = $furthestBlockIndex;
1231 for ( $inner = 1; true; $inner++
) {
1232 // Let node be the element immediately above node in
1233 // the stack of open elements, or if node is no longer
1234 // in the stack of open elements (e.g. because it got
1235 // removed by this algorithm), the element that was
1236 // immediately above node in the stack of open elements
1237 // before node was removed.
1238 $node = $this->node( --$nodeIndex );
1240 // If node is the formatting element, then go
1241 // to the next step in the overall algorithm.
1242 if ( $node === $fmtElt ) break;
1244 // If the inner loop counter is greater than three and node
1245 // is in the list of active formatting elements, then remove
1246 // node from the list of active formatting elements.
1247 $isAFE = $afe->isInList( $node );
1248 if ( $inner > 3 && $isAFE ) {
1249 $afe->remove( $node );
1253 // If node is not in the list of active formatting
1254 // elements, then remove node from the stack of open
1255 // elements and then go back to the step labeled inner
1258 // Don't flatten here, since we're about to relocate
1259 // parts of this $node.
1260 $this->removeElement( $node, false );
1264 // Create an element for the token for which the
1265 // element node was created with common ancestor as
1266 // the intended parent, replace the entry for node
1267 // in the list of active formatting elements with an
1268 // entry for the new element, replace the entry for
1269 // node in the stack of open elements with an entry for
1270 // the new element, and let node be the new element.
1271 $newElt = new BalanceElement(
1272 $node->namespaceURI
, $node->localName
, $node->attribs
);
1273 $afe->replace( $node, $newElt );
1274 $this->replaceAt( $nodeIndex, $newElt );
1277 // If last node is the furthest block, then move the
1278 // aforementioned bookmark to be immediately after the
1279 // new node in the list of active formatting elements.
1280 if ( $lastNode === $furthestBlock ) {
1281 $afe->remove( $BOOKMARK );
1282 $afe->insertAfter( $newElt, $BOOKMARK );
1285 // Insert last node into node, first removing it from
1286 // its previous parent node if any.
1287 $node->appendChild( $lastNode );
1289 // Let last node be node.
1293 // If the common ancestor node is a table, tbody, tfoot,
1294 // thead, or tr element, then, foster parent whatever last
1295 // node ended up being in the previous step, first removing
1296 // it from its previous parent node if any.
1298 $this->fosterParentMode
&&
1299 $ancestor->isA( BalanceSets
::$tableSectionRowSet )
1301 $this->fosterParent( $lastNode );
1303 // Otherwise, append whatever last node ended up being in
1304 // the previous step to the common ancestor node, first
1305 // removing it from its previous parent node if any.
1306 $ancestor->appendChild( $lastNode );
1309 // Create an element for the token for which the
1310 // formatting element was created, with furthest block
1311 // as the intended parent.
1312 $newElt2 = new BalanceElement(
1313 $fmtElt->namespaceURI
, $fmtElt->localName
, $fmtElt->attribs
);
1315 // Take all of the child nodes of the furthest block and
1316 // append them to the element created in the last step.
1317 $newElt2->adoptChildren( $furthestBlock );
1319 // Append that new element to the furthest block.
1320 $furthestBlock->appendChild( $newElt2 );
1322 // Remove the formatting element from the list of active
1323 // formatting elements, and insert the new element into the
1324 // list of active formatting elements at the position of
1325 // the aforementioned bookmark.
1326 $afe->remove( $fmtElt );
1327 $afe->replace( $BOOKMARK, $newElt2 );
1329 // Remove the formatting element from the stack of open
1330 // elements, and insert the new element into the stack of
1331 // open elements immediately below the position of the
1332 // furthest block in that stack.
1333 $this->removeElement( $fmtElt );
1334 $this->insertAfter( $furthestBlock, $newElt2 );
/**
 * Return the contents of the open elements stack as a string for
 * debugging: the local names of the elements, in stack order, separated
 * by single spaces.
 *
 * @return string
 */
public function __toString() {
	$names = [];
	foreach ( $this->elements as $elt ) {
		$names[] = $elt->localName;
	}
	// Separator first: the legacy implode( $array, $separator ) order
	// is deprecated in PHP 7.4 and rejected by PHP 8.
	return implode( ' ', $names );
}
1355 * A pseudo-element used as a marker in the list of active formatting elements
1360 class BalanceMarker
{
1366 * The list of active formatting elements, which is used to handle
1367 * mis-nested formatting element tags in the HTML5 tree builder
1372 * @see https://html.spec.whatwg.org/multipage/syntax.html#list-of-active-formatting-elements
1374 class BalanceActiveFormattingElements
{
1375 /** The last (most recent) element in the list */
1378 /** The first (least recent) element in the list */
1382 * An array of arrays representing the population of elements in each bucket
1383 * according to the Noah's Ark clause. The outer array is stack-like, with each
1384 * integer-indexed element representing a segment of the list, bounded by
1385 * markers. The first element represents the segment of the list before the
1388 * The inner arrays are indexed by "Noah key", which is a string which uniquely
1389 * identifies each bucket according to the rules in the spec. The value in
1390 * the inner array is the first (least recently inserted) element in the bucket,
1391 * and subsequent members of the bucket can be found by iterating through the
1392 * singly-linked list via $node->nextNoah.
1394 * This is optimised for the most common case of inserting into a bucket
1395 * with zero members, and deleting a bucket containing one member. In the
1396 * worst case, iteration through the list is still O(1) in the document
1397 * size, since each bucket can have at most 3 members.
1399 private $noahTableStack = [ [] ];
/**
 * Break every link between list nodes, then drop the list's own
 * references, when the list object is destroyed.
 */
public function __destruct() {
	$cursor = $this->head;
	while ( $cursor ) {
		// Remember the successor before severing the links.
		$following = $cursor->nextAFE;
		$cursor->nextNoah = null;
		$cursor->nextAFE = null;
		$cursor->prevAFE = null;
		$cursor = $following;
	}
	$this->noahTableStack = null;
	$this->tail = null;
	$this->head = null;
}
/**
 * Append a marker to the list of active formatting elements and open a
 * fresh Noah's Ark table for the segment that follows it.
 */
public function insertMarker() {
	$elt = new BalanceMarker;
	if ( $this->tail ) {
		$this->tail->nextAFE = $elt;
		$elt->prevAFE = $this->tail;
	} else {
		// Empty list: the marker is also the first entry.
		$this->head = $elt;
	}
	// The marker is now the most recent entry. Without this the next
	// append would overwrite the old tail's forward link and orphan it.
	$this->tail = $elt;
	$this->noahTableStack[] = [];
}
/**
 * Follow the steps required when the spec requires us to "push onto the
 * list of active formatting elements".
 *
 * @param BalanceElement $elt Element to append; must not already be in
 *   the list.
 * @throws ParameterAssertionException If $elt is already in the list.
 */
public function push( BalanceElement $elt ) {
	// Must not be in the list already
	if ( $elt->prevAFE !== null || $this->head === $elt ) {
		throw new ParameterAssertionException( '$elt',
			'Cannot insert a node into the AFE list twice' );
	}

	// "Noah's Ark clause" -- if there are already three copies of
	// this element before we encounter a marker, then drop the
	// earliest one.
	$noahKey = $elt->getNoahKey();
	$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
	if ( !isset( $table[$noahKey] ) ) {
		$table[$noahKey] = $elt;
	} else {
		// Walk the bucket to find its last member, counting as we go.
		$count = 1;
		$head = $tail = $table[$noahKey];
		while ( $tail->nextNoah ) {
			$tail = $tail->nextNoah;
			$count++;
		}
		if ( $count >= 3 ) {
			// remove() also advances the bucket head; $tail is a
			// different node here, so it stays valid.
			$this->remove( $head );
		}
		$tail->nextNoah = $elt;
	}

	// Add to the main AFE list
	if ( $this->tail ) {
		$this->tail->nextAFE = $elt;
		$elt->prevAFE = $this->tail;
	} else {
		$this->head = $elt;
	}
	$this->tail = $elt;
}
/**
 * Follow the steps required when the spec asks us to "clear the list of
 * active formatting elements up to the last marker".
 *
 * Entries are unlinked from the tail backwards; the marker itself, if one
 * is found, is removed as well together with its Noah's Ark table.
 */
public function clearToMarker() {
	// Iterate back through the list starting from the tail
	$tail = $this->tail;
	while ( $tail && !( $tail instanceof BalanceMarker ) ) {
		// Unlink the element
		$prev = $tail->prevAFE;
		$tail->prevAFE = null;
		if ( $prev ) {
			$prev->nextAFE = null;
		}
		$tail->nextNoah = null;
		$tail = $prev;
	}
	if ( $tail ) {
		// We finished on a marker: unlink it and pop its Noah table.
		$prev = $tail->prevAFE;
		if ( $prev ) {
			$prev->nextAFE = null;
		}
		$tail = $prev;
		array_pop( $this->noahTableStack );
	} else {
		// No marker: wipe the top-level Noah table (which is the only one)
		$this->noahTableStack[0] = [];
	}
	// If we removed all the elements, clear the head pointer
	if ( !$tail ) {
		$this->head = null;
	}
	$this->tail = $tail;
}
/**
 * Find and return the last element with the specified tag between the
 * end of the list and the last marker on the list.
 * Used when parsing <a> "in body mode".
 *
 * @param string $tag Local name to look for.
 * @return BalanceElement|null The most recent matching entry, or null if
 *   a marker or the start of the list is reached first.
 */
public function findElementByTag( $tag ) {
	// Search backwards from the most recent entry.
	$elt = $this->tail;
	while ( $elt && !( $elt instanceof BalanceMarker ) ) {
		if ( $elt->localName === $tag ) {
			return $elt;
		}
		$elt = $elt->prevAFE;
	}
	return null;
}
/**
 * Determine whether an element is in the list of formatting elements.
 *
 * @param BalanceElement $elt
 * @return bool True when $elt is the list head or has a predecessor link.
 */
public function isInList( BalanceElement $elt ) {
	// remove() and replace() null out prevAFE on detached nodes, so a
	// predecessor link (or being the head) identifies list members.
	if ( $this->head === $elt ) {
		return true;
	}
	return (bool)$elt->prevAFE;
}
/**
 * Find the element $elt in the list and remove it.
 * Used when parsing <a> in body mode.
 *
 * @param BalanceElement $elt
 * @throws ParameterAssertionException If $elt is not in the list.
 */
public function remove( BalanceElement $elt ) {
	$before = $elt->prevAFE;
	$after = $elt->nextAFE;
	if ( !$before && $this->head !== $elt ) {
		throw new ParameterAssertionException( '$elt',
			"Attempted to remove an element which is not in the AFE list" );
	}
	// Re-point the list ends if $elt was one of them.
	if ( $this->head === $elt ) {
		$this->head = $after;
	}
	if ( $this->tail === $elt ) {
		$this->tail = $before;
	}
	// Bridge the neighbours over the removed node.
	if ( $before ) {
		$before->nextAFE = $after;
	}
	if ( $after ) {
		$after->prevAFE = $before;
	}
	// Clear pointers so that isInList() etc. will work
	$elt->nextAFE = null;
	$elt->prevAFE = null;
	$this->removeFromNoahList( $elt );
}
/**
 * Append $elt to its Noah's Ark bucket in the table for the current
 * (last) segment, creating the bucket when it does not exist yet.
 *
 * @param BalanceElement $elt
 */
private function addToNoahList( BalanceElement $elt ) {
	$key = $elt->getNoahKey();
	$bucket =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
	if ( isset( $bucket[$key] ) ) {
		// Existing bucket: chain $elt after its last member.
		$last = $bucket[$key];
		while ( $last->nextNoah ) {
			$last = $last->nextNoah;
		}
		$last->nextNoah = $elt;
		return;
	}
	$bucket[$key] = $elt;
}
/**
 * Unlink $elt from its Noah's Ark bucket in the table for the current
 * (last) segment.
 *
 * @param BalanceElement $elt
 */
private function removeFromNoahList( BalanceElement $elt ) {
	$segment =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
	$key = $elt->getNoahKey();
	$cursor = $segment[$key];
	if ( $cursor === $elt ) {
		// $elt heads the bucket: promote its successor, or drop the
		// bucket when $elt was the only member.
		if ( $elt->nextNoah ) {
			$segment[$key] = $elt->nextNoah;
			$elt->nextNoah = null;
		} else {
			unset( $segment[$key] );
		}
		return;
	}
	// Otherwise walk the singly-linked bucket to find $elt's predecessor.
	do {
		$previous = $cursor;
		$cursor = $previous->nextNoah;
		if ( $cursor === $elt ) {
			$previous->nextNoah = $elt->nextNoah;
			$elt->nextNoah = null;
			break;
		}
	} while ( $cursor );
}
/**
 * Find element $a in the list and replace it with element $b
 *
 * @param BalanceElement $a Element currently in the list.
 * @param BalanceElement $b Element that takes over $a's position.
 * @throws ParameterAssertionException If $a is not in the list.
 */
public function replace( BalanceElement $a, BalanceElement $b ) {
	if ( $this->head !== $a && !$a->prevAFE ) {
		throw new ParameterAssertionException( '$a',
			"Attempted to replace an element which is not in the AFE list" );
	}
	// Update head and tail pointers; otherwise they would keep
	// referring to the detached $a.
	if ( $this->head === $a ) {
		$this->head = $b;
	}
	if ( $this->tail === $a ) {
		$this->tail = $b;
	}
	// Update previous element
	if ( $a->prevAFE ) {
		$a->prevAFE->nextAFE = $b;
	}
	// Update next element
	if ( $a->nextAFE ) {
		$a->nextAFE->prevAFE = $b;
	}
	$b->prevAFE = $a->prevAFE;
	$b->nextAFE = $a->nextAFE;
	// Clear $a's links so isInList( $a ) reports false from now on.
	$a->nextAFE = $a->prevAFE = null;
	// Keep the Noah's Ark buckets in step with the main list.
	$this->removeFromNoahList( $a );
	$this->addToNoahList( $b );
}
/**
 * Find $a in the list and insert $b after it.
 *
 * @param BalanceElement $a Element currently in the list.
 * @param BalanceElement $b Element to link in directly after $a.
 * @throws ParameterAssertionException If $a is not in the list.
 */
public function insertAfter( BalanceElement $a, BalanceElement $b ) {
	if ( $this->head !== $a && !$a->prevAFE ) {
		throw new ParameterAssertionException( '$a',
			"Attempted to insert after an element which is not in the AFE list" );
	}
	if ( $this->tail === $a ) {
		$this->tail = $b;
	}
	if ( $a->nextAFE ) {
		$a->nextAFE->prevAFE = $b;
	}
	$b->nextAFE = $a->nextAFE;
	// Complete the splice in both directions. Without the back link,
	// isInList( $b ) would be false and $b unreachable from $a.
	$b->prevAFE = $a;
	$a->nextAFE = $b;
	$this->addToNoahList( $b );
}
1642 // @codingStandardsIgnoreStart Generic.Files.LineLength.TooLong
1644 * Reconstruct the active formatting elements.
1645 * @param BalanceStack $stack The open elements stack
1646 * @see https://html.spec.whatwg.org/multipage/syntax.html#reconstruct-the-active-formatting-elements
1648 // @codingStandardsIgnoreEnd
// Reconstruct the active formatting elements: starting from the tail, walk
// back to the nearest marker or entry that is still on $stack, then walk
// forward re-inserting each later entry into $stack and swapping the new
// element into this list in place of the old entry.
// NOTE(review): this listing appears to have lost several lines of this
// method (early `return`s, the bookkeeping around the backward search, the
// forward loop header and the remaining insertHTMLElement() arguments) --
// confirm against the upstream file before changing logic.
1649 public function reconstruct( $stack ) {
1650 $entry = $this->tail
;
1651 // If there are no entries in the list of active formatting elements,
1652 // then there is nothing to reconstruct
1656 // If the last is a marker, do nothing.
1657 if ( $entry instanceof BalanceMarker
) {
1660 // Or if it is an open element, do nothing.
1661 if ( $stack->indexOf( $entry ) >= 0 ) {
1665 // Loop backward through the list until we find a marker or an
1668 while ( $entry->prevAFE
) {
1669 $entry = $entry->prevAFE
;
1670 if ( $entry instanceof BalanceMarker ||
$stack->indexOf( $entry ) >= 0 ) {
1676 // Now loop forward, starting from the element after the current one (or
1677 // the first element if we didn't find a marker or open element),
1678 // recreating formatting elements and pushing them back onto the list
1679 // of open elements.
1681 $entry = $entry->nextAFE
;
1684 $newElement = $stack->insertHTMLElement(
// replace() hands $entry's list position over to $newElement, so the walk
// continues from the new element's successor.
1687 $this->replace( $entry, $newElement );
1688 $entry = $newElement->nextAFE
;
1693 * Get a string representation of the AFE list, for debugging
// Debug dump of the AFE list, walking from head to tail. Each element is
// rendered as `localName#<8 hex chars>`, the tag being a prefix of the md5
// of its spl_object_hash(). The dump also flags broken back-links and a
// tail pointer that does not match the last node visited.
// NOTE(review): this listing appears to have lost several lines of this
// method (initialisation of $s / $prev, the marker branch body, line
// terminators and the final return) -- confirm against the upstream file.
1695 public function __toString() {
1698 for ( $node = $this->head
; $node; $prev = $node, $node = $node->nextAFE
) {
1699 if ( $node instanceof BalanceMarker
) {
1703 $s .= $node->localName
. '#' . substr( md5( spl_object_hash( $node ) ), 0, 8 );
1704 if ( $node->nextNoah
) {
1705 $s .= " (noah sibling: {$node->nextNoah->localName}#" .
1706 substr( md5( spl_object_hash( $node->nextNoah
) ), 0, 8 ) .
// Consistency check: the successor must point back at this node.
1709 if ( $node->nextAFE
&& $node->nextAFE
->prevAFE
!== $node ) {
1710 $s .= " (reverse link is wrong!)";
// After the loop $prev holds the last node visited, which must be the tail.
1714 if ( $prev !== $this->tail
) {
1715 $s .= "(tail pointer is wrong!)\n";
1722 * An implementation of the tree building portion of the HTML5 parsing
1725 * This is used to balance and tidy output so that the result can
1726 * always be cleanly serialized/deserialized by an HTML5 parser. It
1727 * does *not* guarantee "conforming" output -- the HTML5 spec contains
1728 * a number of constraints which are not enforced by the HTML5 parsing
1729 * process. But the result will be free of gross errors: misnested or
1730 * unclosed tags, for example, and will be unchanged by spec-compliant
1731 * parsing followed by serialization.
1733 * The tree building stage is structured as a state machine.
1734 * When comparing the implementation to
1735 * https://www.w3.org/TR/html5/syntax.html#tree-construction
1736 * note that each state is implemented as a function with a
1737 * name ending in `Mode` (because the HTML spec refers to them
1738 * as insertion modes). The current insertion mode is held by
1739 * the $parseMode property.
1741 * The following simplifications have been made:
1742 * - We handle body content only (ie, we start `in body`.)
1743 * - The document is never in "quirks mode".
1744 * - All occurrences of < and > have been entity escaped, so we
1745 * can parse tags by simply splitting on those two characters.
1746 * (This also simplifies the handling of < inside <textarea>.)
1747 * The character < must not appear inside comments.
1748 * Similarly, all attributes have been "cleaned" and are double-quoted
1750 * - All null characters are assumed to have been removed.
1751 * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
1752 * <frame>, <plaintext>, <isindex>, <xmp>, <iframe>,
1753 * <noembed>, <noscript>, <script>, <title>. As a result,
1754 * further simplifications can be made:
1755 * - `frameset-ok` is not tracked.
1756 * - `head element pointer` is not tracked (but presumed non-null)
1757 * - Tokenizer has only a single mode. (<textarea> wants RCDATA and
1758 * <style>/<noframes> want RAWTEXT modes which we only loosely emulate.)
1760 * We generally mark places where we omit cases from the spec due to
1761 * disallowed elements with a comment: `// OMITTED: <element-name>`.
1763 * The HTML spec keeps a flag during the parsing process to track
1764 * whether or not a "parse error" has been encountered. We don't
1765 * bother to track that flag, we just implement the error-handling
1766 * process as specified.
1770 * @see https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
1774 private $bitsIterator;
1775 private $allowedHtmlElements;
1779 private $tidyCompat;
1780 private $allowComments;
1782 private $textIntegrationMode;
1783 private $pendingTableText;
1784 private $originalInsertionMode;
1785 private $fragmentContext;
1786 private $formElementPointer;
1787 private $ignoreLinefeed;
1792 * Valid HTML5 comments.
1793 * Regex borrowed from Tim Starling's "remex-html" project.
// Pattern matched by advance() against the text following a `<`. Of its
// numbered groups, advance() reads group 2 (comment contents), group 4
// (matches that are only valid at end of input) and group 5 (trailing
// non-tag text).
// NOTE(review): the pattern below appears truncated in this listing (the
// body of group 2, the closing parentheses and the closing delimiter with
// its modifiers are not visible) -- do not edit it without the upstream
// file at hand.
1795 const VALID_COMMENT_REGEX
= "~ !--
1796 ( # 1. Comment match detector
1797 > | -> | # Invalid short close
1798 ( # 2. Comment contents
1808 ( # 3. Comment close
1809 --> | # Normal close
1810 --!> | # Comment end bang
1811 ( # 4. Indicate matches requiring EOF
1812 --! | # EOF in comment end bang state
1813 -- | # EOF in comment end state
1814 - | # EOF in comment end dash state
1815 # EOF in comment state
1819 ([^<]*) \z # 5. Non-tag text after the comment
/**
 * Create a new Balancer.
 * @param array $config Balancer configuration.  Includes:
 *     'strict' : boolean, defaults to false.
 *         When true, enforces syntactic constraints on input:
 *         all non-tag '<' must be escaped, all attributes must be
 *         separated by a single space and double-quoted.  This is
 *         consistent with the output of the Sanitizer.
 *     'allowedHtmlElements' : array, defaults to null.
 *         When present, the keys of this associative array give
 *         the acceptable HTML tag names.  When not present, no
 *         tag sanitization is done.
 *     'tidyCompat' : boolean, defaults to false.
 *         When true, the serialization algorithm is tweaked to
 *         provide historical compatibility with the old "tidy"
 *         program: <p>-wrapping is done to the children of
 *         <body> and <blockquote> elements, and empty elements
 *         are removed.
 *     'allowComments': boolean, defaults to true.
 *         When true, allows HTML comments in the input.
 *         The Sanitizer generally strips all comments, so if you
 *         are running on sanitized output you can set this to
 *         false to get a bit more performance.
 * @throws ParameterAssertionException If 'allowedHtmlElements' names an
 *   element the balancer does not support.
 */
public function __construct( array $config = [] ) {
	// Caller-supplied keys win; the right-hand side only fills gaps.
	$config = $config + [
		'strict' => false,
		'allowedHtmlElements' => null,
		'tidyCompat' => false,
		'allowComments' => true,
	];
	$this->allowedHtmlElements = $config['allowedHtmlElements'];
	$this->strict = $config['strict'];
	$this->tidyCompat = $config['tidyCompat'];
	$this->allowComments = $config['allowComments'];
	if ( $this->allowedHtmlElements !== null ) {
		// Sanity check: only the keys (tag names) matter, so intersect
		// by key instead of using a value callback that always
		// reports "equal".
		$bad = array_intersect_key(
			$this->allowedHtmlElements,
			BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE]
		);
		if ( count( $bad ) > 0 ) {
			// Separator first: the legacy implode( $array, $separator )
			// order is deprecated in PHP 7.4 and rejected by PHP 8.
			$badstr = implode( ',', array_keys( $bad ) );
			throw new ParameterAssertionException(
				'$config',
				'Balance attempted with sanitization including ' .
				"unsupported elements: {$badstr}"
			);
		}
	}
}
/**
 * Return a balanced HTML string for the HTML fragment given by $text,
 * subject to the caveats listed in the class description.  The result
 * will typically be idempotent -- that is, rebalancing the output
 * would result in no change.
 *
 * @param string $text The markup to be balanced
 * @param callable $processingCallback Callback to do any variable or
 *   parameter replacements in HTML attributes values
 * @param array|bool $processingArgs Arguments for the processing callback
 * @return string The balanced markup
 */
public function balance( $text, $processingCallback = null, $processingArgs = [] ) {
	$this->parseMode = 'inBodyMode';
	$this->bitsIterator = new ExplodeIterator( '<', $text );
	$this->afe = new BalanceActiveFormattingElements();
	$this->stack = new BalanceStack();
	$this->stack->tidyCompat = $this->tidyCompat;
	$this->processingCallback = $processingCallback;
	$this->processingArgs = $processingArgs;

	// Reset all per-run tokenizer flags.
	$this->textIntegrationMode =
		$this->ignoreLinefeed =
		$this->inRCDATA =
		$this->inRAWTEXT = false;

	// The stack is constructed with an <html> element already on it.
	// Set this up as a fragment parsed with <body> as the context.
	$this->fragmentContext =
		new BalanceElement( BalanceSets::HTML_NAMESPACE, 'body', [] );
	$this->resetInsertionMode();
	$this->formElementPointer = null;
	for ( $e = $this->fragmentContext; $e != null; $e = $e->parent ) {
		if ( $e->isHtmlNamed( 'form' ) ) {
			$this->formElementPointer = $e;
			break;
		}
	}

	// First element is text not tag.  A bare '>' in text is emitted as
	// an entity; replacing '>' with itself would leave it unescaped.
	$x = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$this->insertToken( 'text', str_replace( '>', '&gt;', $x ) );
	// Now process each tag.
	while ( $this->bitsIterator->valid() ) {
		$this->advance();
	}
	$this->insertToken( 'eof', null );
	$result = $this->stack->getOutput();
	// Free memory before returning.
	$this->bitsIterator = null;
	$this->afe = null;
	$this->stack = null;
	$this->fragmentContext = null;
	$this->formElementPointer = null;
	return $result;
}
1938 * Pass a token to the tree builder. The $token will be one of the
1939 * strings "tag", "endtag", "text", "comment", or "eof".
// Dispatch one token: rejects tags from the unsupported set, skips empty
// text, applies the pending "ignore first linefeed" flag, then routes the
// token either to insertForeignToken() or to the handler method named by
// $this->parseMode.
// NOTE(review): this listing appears to have lost several lines of this
// method (the assertion opener, early `return`s, the `if (` / `elseif (`
// openers of the dispatch conditions and the statements inside those
// branches) -- confirm against the upstream file before changing logic.
1941 private function insertToken( $token, $value, $attribs = null, $selfClose = false ) {
1942 // validate tags against $unsupportedSet
1943 if ( $token === 'tag' ||
$token === 'endtag' ) {
1944 if ( isset( BalanceSets
::$unsupportedSet[BalanceSets
::HTML_NAMESPACE
][$value] ) ) {
1945 // As described in "simplifications" above, these tags are
1946 // not supported in the balancer.
1949 "Unsupported $token <$value> found."
1953 } elseif ( $token === 'text' && $value === '' ) {
1954 // Don't actually inject the empty string as a text token.
1957 // Support pre/listing/textarea by suppressing initial linefeed
1958 if ( $this->ignoreLinefeed
) {
// The flag is one-shot: it is cleared whatever the next token turns out to be.
1959 $this->ignoreLinefeed
= false;
1960 if ( $token === 'text' ) {
1961 if ( $value[0] === "\n" ) {
1962 if ( $value === "\n" ) {
1963 // Nothing would be left, don't inject the empty string.
1966 $value = substr( $value, 1 );
1970 // Some hoops we have to jump through
1971 $adjusted = $this->stack
->adjustedCurrentNode( $this->fragmentContext
);
1975 $this->stack
->length() === 0 ||
1976 $adjusted->isHtml() ||
1980 } elseif ( $adjusted->isMathmlTextIntegrationPoint() ) {
1981 if ( $token === 'text' ) {
1985 $value !== 'mglyph' && $value !== 'malignmark'
1990 $adjusted->namespaceURI
=== BalanceSets
::MATHML_NAMESPACE
&&
1991 $adjusted->localName
=== 'annotation-xml' &&
1992 $token === 'tag' && $value === 'svg'
1996 $adjusted->isHtmlIntegrationPoint() &&
1997 ( $token === 'tag' ||
$token === 'text' )
2002 return $this->insertForeignToken( $token, $value, $attribs, $selfClose );
// Otherwise call the handler method for the current insertion mode by name.
2004 $func = $this->parseMode
;
2005 return $this->$func( $token, $value, $attribs, $selfClose );
// Handle a token while in foreign (non-HTML namespace) content: text is
// inserted directly, start tags either break out to HTML processing or are
// inserted as foreign elements, and end tags are matched against the stack.
// NOTE(review): a large part of this method is not visible in this listing
// (original lines 2023-2066, which presumably hold the list of start tags
// that break out of foreign content, plus various returns, braces and the
// initialisation of $first) -- confirm against the upstream file before
// changing logic.
2009 private function insertForeignToken( $token, $value, $attribs = null, $selfClose = false ) {
2010 if ( $token === 'text' ) {
2011 $this->stack
->insertText( $value );
2013 } elseif ( $token === 'tag' ) {
2016 if ( isset( $attribs['color'] )
2017 ||
isset( $attribs['face'] )
2018 ||
isset( $attribs['size'] )
2022 // otherwise, fall through
2067 if ( $this->fragmentContext
) {
2071 $this->stack
->pop();
2072 $node = $this->stack
->currentNode
;
2074 $node->isMathmlTextIntegrationPoint() ||
2075 $node->isHtmlIntegrationPoint() ||
// Reprocess the same token through the normal dispatcher.
2081 return $this->insertToken( $token, $value, $attribs, $selfClose );
2083 // "Any other start tag"
2084 $adjusted = ( $this->fragmentContext
&& $this->stack
->length()===1 ) ?
2085 $this->fragmentContext
: $this->stack
->currentNode
;
2086 $this->stack
->insertForeignElement(
2087 $adjusted->namespaceURI
, $value, $attribs
2090 $this->stack
->pop();
2093 } elseif ( $token === 'endtag' ) {
2095 foreach ( $this->stack
as $i => $node ) {
2096 if ( $node->isHtml() && !$first ) {
2097 // process the end tag as HTML
2098 $func = $this->parseMode
;
2099 return $this->$func( $token, $value, $attribs, $selfClose );
2100 } elseif ( $i === 0 ) {
2102 } elseif ( $node->localName
=== $value ) {
2103 $this->stack
->popTag( $node );
/**
 * Grab the next "token" from $bitsIterator.  This is either a open/close
 * tag or text or a comment, depending on whether the Sanitizer approves.
 *
 * Each piece from the iterator is the text that followed one `<` in the
 * input.  A piece that turns out not to be an acceptable comment or tag
 * is emitted as text, with the `<` restored.
 */
private function advance() {
	$x = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$regs = [];
	// Handle comments.  These won't be generated by mediawiki (they
	// are stripped in the Sanitizer) but may be generated by extensions.
	if (
		$this->allowComments &&
		!( $this->inRCDATA || $this->inRAWTEXT ) &&
		preg_match( Balancer::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
		// verify EOF condition where necessary
		( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
	) {
		$contents = $regs[2][0];
		$rest = $regs[5][0];
		$this->insertToken( 'comment', $contents );
		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
		return;
	}
	// $slash: Does the current element start with a '/'?
	// $t: Current element name
	// $attribStr: String between element name and >
	// $brace: Ending '>' or '/>'
	// $rest: Everything until the next element from the $bitsIterator
	if ( preg_match( Sanitizer::ELEMENT_BITS_REGEX, $x, $regs ) ) {
		list( /* $qbar */, $slash, $t, $attribStr, $brace, $rest ) = $regs;
		$t = strtolower( $t );
		if ( $this->strict ) {
			// Verify that attributes are all properly double-quoted
			Assert::invariant(
				preg_match(
					'/^( [:_A-Z0-9][-.:_A-Z0-9]*="[^"]*")*[ ]*$/i', $attribStr
				),
				"Bad attribute string found"
			);
		}
	} else {
		Assert::invariant(
			!$this->strict, "< found which does not start a valid tag"
		);
		$slash = $t = $attribStr = $brace = $rest = null;
	}
	$goodTag = $t;
	if ( $this->inRCDATA ) {
		if ( $slash && $t === $this->inRCDATA ) {
			$this->inRCDATA = false;
		} else {
			// No tags allowed; this emulates the "rcdata" tokenizer mode.
			$goodTag = false;
		}
	}
	if ( $this->inRAWTEXT ) {
		if ( $slash && $t === $this->inRAWTEXT ) {
			$this->inRAWTEXT = false;
		} else {
			// No tags allowed, no entity-escaping done.
			$goodTag = false;
		}
	}
	$sanitize = $this->allowedHtmlElements !== null;
	if ( $sanitize ) {
		$goodTag = $t && isset( $this->allowedHtmlElements[$t] );
	}
	if ( $goodTag ) {
		if ( is_callable( $this->processingCallback ) ) {
			call_user_func_array( $this->processingCallback, [ &$attribStr, $this->processingArgs ] );
		}
		if ( $sanitize ) {
			$goodTag = Sanitizer::validateTag( $attribStr, $t );
		}
	}
	if ( $goodTag ) {
		if ( $sanitize ) {
			$attribs = Sanitizer::decodeTagAttributes( $attribStr );
			$attribs = Sanitizer::validateTagAttributes( $attribs, $t );
		} else {
			$attribs = Sanitizer::decodeTagAttributes( $attribStr );
		}
		$goodTag = $this->insertToken(
			$slash ? 'endtag' : 'tag', $t, $attribs, $brace === '/>'
		);
	}
	if ( $goodTag ) {
		// Escape any stray '>' in the trailing text.  Replacing '>'
		// with itself would be a no-op; one pass with the entity is
		// enough, since the entity contains no '>'.
		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
	} elseif ( $this->inRAWTEXT ) {
		// Raw text is passed through verbatim, including the '<'.
		$this->insertToken( 'text', "<$x" );
	} else {
		// bad tag; serialize entire thing as text.  Both angle
		// brackets must be entity-escaped so the output cannot be
		// re-read as markup.
		$this->insertToken( 'text', '&lt;' . str_replace( '>', '&gt;', $x ) );
	}
}
/**
 * Switch the tree builder to a new insertion mode.
 *
 * @param string $mode Name of the handler method for the new mode; must
 *   end in "Mode".
 * @return string The insertion mode that was active before the switch,
 *   so callers can save it and restore it later.
 */
private function switchMode( $mode ) {
	Assert::parameter(
		substr( $mode, -4 )==='Mode', '$mode', 'should end in Mode'
	);
	$oldMode = $this->parseMode;
	$this->parseMode = $mode;
	// parseRawText() stores this as $originalInsertionMode; without
	// the return it would save null and be unable to switch back.
	return $oldMode;
}
/**
 * Switch to insertion mode $mode, then feed the same token through the
 * dispatcher again so the new mode handles it.
 *
 * @param string $mode Name of the new insertion mode handler.
 * @param string $token Token kind, passed through unchanged.
 * @param string|null $value Token value, passed through unchanged.
 * @param array|null $attribs Tag attributes, passed through unchanged.
 * @param bool $selfClose Self-closing flag, passed through unchanged.
 * @return mixed Whatever insertToken() returns for the reprocessed token.
 */
private function switchModeAndReprocess( $mode, $token, $value, $attribs, $selfClose ) {
	$this->switchMode( $mode );
	$handled = $this->insertToken( $token, $value, $attribs, $selfClose );
	return $handled;
}
// Choose the insertion mode from the open-elements stack: iterate over
// $this->stack (substituting the fragment context where applicable) and
// switch mode according to the first recognised HTML element; the visible
// fallback is 'inBodyMode'.
// NOTE(review): this listing appears to have lost most of the `case`
// labels, the `return` / `break` statements and several braces of this
// method, so which element maps to which mode cannot be read off here --
// confirm against the upstream file before changing logic.
2222 private function resetInsertionMode() {
2224 foreach ( $this->stack
as $i => $node ) {
2227 if ( $this->fragmentContext
) {
2228 $node = $this->fragmentContext
;
2231 if ( $node->isHtml() ) {
2232 switch ( $node->localName
) {
2234 $stackLength = $this->stack
->length();
2235 for ( $j = $i +
1; $j < $stackLength-1; $j++
) {
2236 $ancestor = $this->stack
->node( $stackLength-$j-1 );
2237 if ( $ancestor->isHtmlNamed( 'template' ) ) {
2240 if ( $ancestor->isHtmlNamed( 'table' ) ) {
2241 $this->switchMode( 'inSelectInTableMode' );
2245 $this->switchMode( 'inSelectMode' );
2248 $this->switchMode( 'inRowMode' );
2253 $this->switchMode( 'inTableBodyMode' );
2256 $this->switchMode( 'inCaptionMode' );
2259 $this->switchMode( 'inColumnGroupMode' );
2262 $this->switchMode( 'inTableMode' );
// Uses the last entry of $this->templateInsertionModes.
2266 array_slice( $this->templateInsertionModes
, -1 )[0]
2270 $this->switchMode( 'inBodyMode' );
2272 // OMITTED: <frameset>
2278 if ( $node->isA( BalanceSets
::$tableCellSet ) ) {
2279 $this->switchMode( 'inCellMode' );
2286 $this->switchMode( 'inBodyMode' );
// Finish parsing by unwinding the stack of open elements.
// The only statement visible here is $this->stack->popTo( 1 ); the comments
// inside the body describe the intent.
// NOTE(review): the comment about clearing the AFE list refers to a statement
// that is not visible in this listing (embedded line 2301 is skipped).
2292 private function stopParsing() {
2293 // Most of the spec methods are inapplicable, other than step 2:
2294 // "pop all the nodes off the stack of open elements".
2295 // We're going to keep the top-most <html> element on the stack, though.
2297 // Clear the AFE list first, otherwise the element objects will stay live
2298 // during serialization, potentially using O(N^2) memory. Note that
2299 // popping the stack will never result in reconstructing the active
2300 // formatting elements.
2302 $this->stack
->popTo( 1 );
// Begin a raw-text element.
// Inserts the HTML element $value with $attribs on the stack, records the tag
// name in $this->inRAWTEXT, and switches to 'inTextMode', keeping
// switchMode()'s result in $this->originalInsertionMode.
// $value: tag name; $attribs: attributes passed to insertHTMLElement().
// NOTE(review): no return statement is visible in this listing, although
// inHeadMode() returns this method's result -- confirm against the full file.
2305 private function parseRawText( $value, $attribs = null ) {
2306 $this->stack
->insertHTMLElement( $value, $attribs );
2307 $this->inRAWTEXT
= $value;
2308 $this->originalInsertionMode
= $this->switchMode( 'inTextMode' );
// Insertion mode that parseRawText() switches to.
// - 'text': $value is passed to $this->stack->insertText().
// - 'eof': the current node is popped and the token is reprocessed in
//   $this->originalInsertionMode.
// - 'endtag': the current node is popped and the original mode is restored.
// NOTE(review): the return statements of the 'text' and 'endtag' branches and
// any trailing fallback are not visible in this listing.
2312 private function inTextMode( $token, $value, $attribs = null, $selfClose = false ) {
2313 if ( $token === 'text' ) {
2314 $this->stack
->insertText( $value );
2316 } elseif ( $token === 'eof' ) {
// EOF inside the element: drop it from the stack and let the previous mode
// see the EOF token.
2317 $this->stack
->pop();
2318 return $this->switchModeAndReprocess(
2319 $this->originalInsertionMode
, $token, $value, $attribs, $selfClose
2321 } elseif ( $token === 'endtag' ) {
2322 $this->stack
->pop();
2323 $this->switchMode( $this->originalInsertionMode
);
// Insertion mode handler for head content.
// Other mode handlers in this file also delegate individual tokens here.
// $token is one of 'text', 'tag', 'endtag', 'comment' (other values fall to
// the bottom); $value is the text or tag name; $attribs and $selfClose are
// forwarded when the token is reprocessed.
// Tokens not handled by a branch reach the bottom, where a synthetic </head>
// is processed and the token is re-submitted through insertToken().
// NOTE(review): the switch statements and case labels are not visible in this
// listing, so the tag names tied to each group of statements cannot be
// confirmed from here.
2329 private function inHeadMode( $token, $value, $attribs = null, $selfClose = false ) {
2330 if ( $token === 'text' ) {
// Leading whitespace (tab, LF, FF, CR, space) is inserted as text and removed
// from $value.
2331 if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $matches ) ) {
2332 $this->stack
->insertText( $matches[0] );
2333 $value = substr( $value, strlen( $matches[0] ) );
2335 if ( strlen( $value ) === 0 ) {
2336 return true; // All text handled.
2338 // Fall through to handle non-whitespace below.
2339 } elseif ( $token === 'tag' ) {
2342 // OMITTED: in a full HTML parser, this might change the encoding.
// Insert the element and pop it again straight away.
2349 $this->stack
->insertHTMLElement( $value, $attribs );
2350 $this->stack
->pop();
2353 // OMITTED: <noscript>
2356 return $this->parseRawText( $value, $attribs );
2357 // OMITTED: <script>
// Insert the element, add a marker to the active formatting elements, switch
// to inTemplateMode and push that mode onto templateInsertionModes.
2359 $this->stack
->insertHTMLElement( $value, $attribs );
2360 $this->afe
->insertMarker();
2361 // OMITTED: frameset_ok
2362 $this->switchMode( 'inTemplateMode' );
2363 $this->templateInsertionModes
[] = $this->parseMode
;
2367 } elseif ( $token === 'endtag' ) {
2373 break; // handle at the bottom of the function
// End tag with no matching entry on the stack is ignored; otherwise close it,
// clear the formatting list to the last marker, drop one template insertion
// mode and re-select the insertion mode.
2375 if ( $this->stack
->indexOf( $value ) < 0 ) {
2376 return true; // Ignore the token.
2378 $this->stack
->generateImpliedEndTags( null, true /* thorough */ );
2379 $this->stack
->popTag( $value );
2380 $this->afe
->clearToMarker();
2381 array_pop( $this->templateInsertionModes
);
2382 $this->resetInsertionMode();
2385 // ignore any other end tag
2388 } elseif ( $token === 'comment' ) {
2389 $this->stack
->insertComment( $value );
2393 // If not handled above
2394 $this->inHeadMode( 'endtag', 'head' ); // synthetic </head>
2395 // Then redo this one
2396 return $this->insertToken( $token, $value, $attribs, $selfClose );
// Insertion mode handler for body content.
// $token is one of 'text', 'eof', 'tag', 'endtag', 'comment'; any other value
// reaches the Assert::invariant( false, ... ) call at the end of the method.
// $value is the text or tag name, $attribs the tag attributes, and $selfClose
// is forwarded when a token is reprocessed.
// The method calls itself with synthetic tokens (for example
// inBodyMode( 'endtag', 'p' )) to close elements before inserting new ones.
// NOTE(review): the switch statements, case labels, break statements and most
// return statements are not visible in this listing, so the tag names tied to
// each group of statements below cannot be confirmed from here.
2399 private function inBodyMode( $token, $value, $attribs = null, $selfClose = false ) {
2400 if ( $token === 'text' ) {
// Text: reconstruct the active formatting elements, then insert the text.
2401 $this->afe
->reconstruct( $this->stack
);
2402 $this->stack
->insertText( $value );
2404 } elseif ( $token === 'eof' ) {
// While template insertion modes are pending, EOF is delegated to
// inTemplateMode(); the stopParsing() call follows.
2405 if ( !empty( $this->templateInsertionModes
) ) {
2406 return $this->inTemplateMode( $token, $value, $attribs, $selfClose );
2408 $this->stopParsing();
2410 } elseif ( $token === 'tag' ) {
2419 // OMITTED: <script>
2423 return $this->inHeadMode( $token, $value, $attribs, $selfClose );
2425 // OMITTED: <frameset>
// An open <p> in button scope is closed with a synthetic </p> before the new
// element is inserted. The same pattern recurs several times below.
2451 if ( $this->stack
->inButtonScope( 'p' ) ) {
2452 $this->inBodyMode( 'endtag', 'p' );
2454 $this->stack
->insertHTMLElement( $value, $attribs );
2463 if ( $this->stack
->inButtonScope( 'p' ) ) {
2464 $this->inBodyMode( 'endtag', 'p' );
// A current node in BalanceSets::$headingSet is popped before inserting.
2466 if ( $this->stack
->currentNode
->isA( BalanceSets
::$headingSet ) ) {
2467 $this->stack
->pop();
2469 $this->stack
->insertHTMLElement( $value, $attribs );
2474 if ( $this->stack
->inButtonScope( 'p' ) ) {
2475 $this->inBodyMode( 'endtag', 'p' );
2477 $this->stack
->insertHTMLElement( $value, $attribs );
2478 $this->ignoreLinefeed
= true;
2479 // OMITTED: frameset_ok
// With a form element pointer set and no 'template' on the stack, the token
// is dropped (returns true).
2484 $this->formElementPointer
&&
2485 $this->stack
->indexOf( 'template' ) < 0
2487 return true; // in a form, not in a template.
2489 if ( $this->stack
->inButtonScope( "p" ) ) {
2490 $this->inBodyMode( 'endtag', 'p' );
// The inserted element becomes the form element pointer only when no
// 'template' is on the stack.
2492 $elt = $this->stack
->insertHTMLElement( $value, $attribs );
2493 if ( $this->stack
->indexOf( 'template' ) < 0 ) {
2494 $this->formElementPointer
= $elt;
2499 // OMITTED: frameset_ok
// Walk the stack: an open 'li' is closed with a synthetic end tag; the second
// test looks for a node in $specialSet that is not in $addressDivPSet (what
// happens on a match is not visible here).
2500 foreach ( $this->stack
as $node ) {
2501 if ( $node->isHtmlNamed( 'li' ) ) {
2502 $this->inBodyMode( 'endtag', 'li' );
2506 $node->isA( BalanceSets
::$specialSet ) &&
2507 !$node->isA( BalanceSets
::$addressDivPSet )
2512 if ( $this->stack
->inButtonScope( 'p' ) ) {
2513 $this->inBodyMode( 'endtag', 'p' );
2515 $this->stack
->insertHTMLElement( $value, $attribs );
2520 // OMITTED: frameset_ok
// Same walk as above, closing an open 'dd' or 'dt' instead of 'li'.
2521 foreach ( $this->stack
as $node ) {
2522 if ( $node->isHtmlNamed( 'dd' ) ) {
2523 $this->inBodyMode( 'endtag', 'dd' );
2526 if ( $node->isHtmlNamed( 'dt' ) ) {
2527 $this->inBodyMode( 'endtag', 'dt' );
2531 $node->isA( BalanceSets
::$specialSet ) &&
2532 !$node->isA( BalanceSets
::$addressDivPSet )
2537 if ( $this->stack
->inButtonScope( 'p' ) ) {
2538 $this->inBodyMode( 'endtag', 'p' );
2540 $this->stack
->insertHTMLElement( $value, $attribs );
2543 // OMITTED: <plaintext>
// A 'button' already in scope is closed with a synthetic end tag and the
// token is then reprocessed through insertToken().
2546 if ( $this->stack
->inScope( 'button' ) ) {
2547 $this->inBodyMode( 'endtag', 'button' );
2548 return $this->insertToken( $token, $value, $attribs, $selfClose );
2550 $this->afe
->reconstruct( $this->stack
);
2551 $this->stack
->insertHTMLElement( $value, $attribs );
// An 'a' entry already in the active formatting list is closed with a
// synthetic end tag; if it is still in the list afterwards it is removed from
// the list and from the stack.
2555 $activeElement = $this->afe
->findElementByTag( 'a' );
2556 if ( $activeElement ) {
2557 $this->inBodyMode( 'endtag', 'a' );
2558 if ( $this->afe
->isInList( $activeElement ) ) {
2559 $this->afe
->remove( $activeElement );
2560 // Don't flatten here, since when we fall
2561 // through below we might foster parent
2562 // the new <a> tag inside this one.
2563 $this->stack
->removeElement( $activeElement, false );
// Insert the element and push it onto the active formatting elements.
2579 $this->afe
->reconstruct( $this->stack
);
2580 $this->afe
->push( $this->stack
->insertHTMLElement( $value, $attribs ), $attribs );
// A 'nobr' already in scope is closed first, followed by another
// reconstruct() call.
2584 $this->afe
->reconstruct( $this->stack
);
2585 if ( $this->stack
->inScope( 'nobr' ) ) {
2586 $this->inBodyMode( 'endtag', 'nobr' );
2587 $this->afe
->reconstruct( $this->stack
);
2589 $this->afe
->push( $this->stack
->insertHTMLElement( $value, $attribs ), $attribs );
// Insert the element and add a marker to the active formatting elements.
2595 $this->afe
->reconstruct( $this->stack
);
2596 $this->stack
->insertHTMLElement( $value, $attribs );
2597 $this->afe
->insertMarker();
2598 // OMITTED: frameset_ok
2602 // The document is never in "quirks mode"; see simplifications
2604 if ( $this->stack
->inButtonScope( 'p' ) ) {
2605 $this->inBodyMode( 'endtag', 'p' );
2607 $this->stack
->insertHTMLElement( $value, $attribs );
2608 // OMITTED: frameset_ok
2609 $this->switchMode( 'inTableMode' );
// Elements inserted and popped again straight away.
2618 $this->afe
->reconstruct( $this->stack
);
2619 $this->stack
->insertHTMLElement( $value, $attribs );
2620 $this->stack
->pop();
2621 // OMITTED: frameset_ok
2625 $this->afe
->reconstruct( $this->stack
);
2626 $this->stack
->insertHTMLElement( $value, $attribs );
2627 $this->stack
->pop();
2628 // OMITTED: frameset_ok
2629 // (hence we don't need to examine the tag's "type" attribute)
2636 $this->stack
->insertHTMLElement( $value, $attribs );
2637 $this->stack
->pop();
2641 if ( $this->stack
->inButtonScope( 'p' ) ) {
2642 $this->inBodyMode( 'endtag', 'p' );
2644 $this->stack
->insertHTMLElement( $value, $attribs );
2645 $this->stack
->pop();
// The token is handled again with the tag name replaced by 'img'.
2650 return $this->inBodyMode( $token, 'img', $attribs, $selfClose );
2652 // OMITTED: <isindex>
2655 $this->stack
->insertHTMLElement( $value, $attribs );
2656 $this->ignoreLinefeed
= true;
2657 $this->inRCDATA
= $value; // emulate rcdata tokenizer mode
2658 // OMITTED: frameset_ok
2662 // OMITTED: <iframe>
2663 // OMITTED: <noembed>
2664 // OMITTED: <noscript>
// After inserting the element, the next mode depends on the current parse
// mode: the listed table-related modes lead to inSelectInTableMode, the other
// visible call selects inSelectMode.
2667 $this->afe
->reconstruct( $this->stack
);
2668 $this->stack
->insertHTMLElement( $value, $attribs );
2669 switch ( $this->parseMode
) {
2671 case 'inCaptionMode':
2672 case 'inTableBodyMode':
2675 $this->switchMode( 'inSelectInTableMode' );
2678 $this->switchMode( 'inSelectMode' );
// An 'option' as current node is closed with a synthetic end tag first.
2684 if ( $this->stack
->currentNode
->isHtmlNamed( 'option' ) ) {
2685 $this->inBodyMode( 'endtag', 'option' );
2687 $this->afe
->reconstruct( $this->stack
);
2688 $this->stack
->insertHTMLElement( $value, $attribs );
// With 'ruby' in scope, implied end tags are generated before inserting; the
// second variant passes 'rtc' to generateImpliedEndTags().
2693 if ( $this->stack
->inScope( 'ruby' ) ) {
2694 $this->stack
->generateImpliedEndTags();
2696 $this->stack
->insertHTMLElement( $value, $attribs );
2701 if ( $this->stack
->inScope( 'ruby' ) ) {
2702 $this->stack
->generateImpliedEndTags( 'rtc' );
2704 $this->stack
->insertHTMLElement( $value, $attribs );
2708 $this->afe
->reconstruct( $this->stack
);
2709 // We skip the spec's "adjust MathML attributes" and
2710 // "adjust foreign attributes" steps, since the browser will
2711 // do this later when it parses the output and it doesn't affect
2713 $this->stack
->insertForeignElement(
2714 BalanceSets
::MATHML_NAMESPACE
, $value, $attribs
2717 // emit explicit </math> tag.
2718 $this->stack
->pop();
2723 $this->afe
->reconstruct( $this->stack
);
2724 // We skip the spec's "adjust SVG attributes" and
2725 // "adjust foreign attributes" steps, since the browser will
2726 // do this later when it parses the output and it doesn't affect
2728 $this->stack
->insertForeignElement(
2729 BalanceSets
::SVG_NAMESPACE
, $value, $attribs
2732 // emit explicit </svg> tag.
2733 $this->stack
->pop();
2748 // Ignore table tags if we're not inTableMode
2752 // Handle any other start tag here
2753 $this->afe
->reconstruct( $this->stack
);
2754 $this->stack
->insertHTMLElement( $value, $attribs );
2756 } elseif ( $token === 'endtag' ) {
2758 // </body>,</html> are unsupported.
2761 return $this->inHeadMode( $token, $value, $attribs, $selfClose );
2789 // Ignore if there is not a matching open tag
2790 if ( !$this->stack
->inScope( $value ) ) {
2793 $this->stack
->generateImpliedEndTags();
2794 $this->stack
->popTag( $value );
// Outside a template the form element pointer is read into $openform and then
// cleared, and that element is removed from the stack; when a 'template' is on
// the stack the branch further down pops 'form' by name instead.
2798 if ( $this->stack
->indexOf( 'template' ) < 0 ) {
2799 $openform = $this->formElementPointer
;
2800 $this->formElementPointer
= null;
2801 if ( !$openform ||
!$this->stack
->inScope( $openform ) ) {
2804 $this->stack
->generateImpliedEndTags();
2805 // Don't flatten yet if we're removing a <form> element
2806 // out-of-order. (eg. `<form><div></form>`)
2807 $flatten = ( $this->stack
->currentNode
=== $openform );
2808 $this->stack
->removeElement( $openform, $flatten );
2810 if ( !$this->stack
->inScope( 'form' ) ) {
2813 $this->stack
->generateImpliedEndTags();
2814 $this->stack
->popTag( 'form' );
// With no <p> in button scope, a synthetic <p> start tag is processed and the
// end tag is then reprocessed through insertToken().
2819 if ( !$this->stack
->inButtonScope( 'p' ) ) {
2820 $this->inBodyMode( 'tag', 'p', [] );
2821 return $this->insertToken( $token, $value, $attribs, $selfClose );
2823 $this->stack
->generateImpliedEndTags( $value );
2824 $this->stack
->popTag( $value );
2828 if ( !$this->stack
->inListItemScope( $value ) ) {
2829 return true; // ignore
2831 $this->stack
->generateImpliedEndTags( $value );
2832 $this->stack
->popTag( $value );
2837 if ( !$this->stack
->inScope( $value ) ) {
2838 return true; // ignore
2840 $this->stack
->generateImpliedEndTags( $value );
2841 $this->stack
->popTag( $value );
// The scope test and the pop both use the whole heading set rather than the
// tag named by the token.
2850 if ( !$this->stack
->inScope( BalanceSets
::$headingSet ) ) {
2851 return true; // ignore
2853 $this->stack
->generateImpliedEndTags();
2854 $this->stack
->popTag( BalanceSets
::$headingSet );
2858 // Take a deep breath, then:
2875 if ( $this->stack
->adoptionAgency( $value, $this->afe
) ) {
2876 return true; // If we did something, we're done.
2878 break; // Go to the "any other end tag" case.
2883 if ( !$this->stack
->inScope( $value ) ) {
2884 return true; // ignore
2886 $this->stack
->generateImpliedEndTags();
2887 $this->stack
->popTag( $value );
2888 $this->afe
->clearToMarker();
2892 // Turn </br> into <br>
2893 return $this->inBodyMode( 'tag', $value, [] );
2896 // Any other end tag goes here
// Walk the stack: pop through the first node named $value, or stop and ignore
// the token at the first node in $specialSet.
2897 foreach ( $this->stack
as $i => $node ) {
2898 if ( $node->isHtmlNamed( $value ) ) {
2899 $this->stack
->generateImpliedEndTags( $value );
2900 $this->stack
->popTo( $i ); // including $i
2902 } elseif ( $node->isA( BalanceSets
::$specialSet ) ) {
2903 return true; // ignore this close token.
2907 } elseif ( $token === 'comment' ) {
2908 $this->stack
->insertComment( $value );
2911 Assert
::invariant( false, "Bad token type: $token" );
// Insertion mode handler for table content.
// $token is 'text', 'eof', 'tag', 'endtag' or 'comment'; $value is the text or
// tag name; $attribs and $selfClose are forwarded on reprocessing.
// Tokens not handled by a specific branch reach the "anything else" case at
// the bottom, which runs inBodyMode() with $this->stack->fosterParentMode set
// to true for the duration of the call.
// NOTE(review): the switch statements, case labels and several return
// statements are not visible in this listing, so the tag names tied to each
// group of statements cannot be confirmed from here.
2915 private function inTableMode( $token, $value, $attribs = null, $selfClose = false ) {
2916 if ( $token === 'text' ) {
2917 if ( $this->textIntegrationMode
) {
2918 return $this->inBodyMode( $token, $value, $attribs, $selfClose );
2919 } elseif ( $this->stack
->currentNode
->isA( BalanceSets
::$tableSectionRowSet ) ) {
// Start collecting table text: reset the buffer, remember the current mode
// and reprocess the text in inTableTextMode.
2920 $this->pendingTableText
= '';
2921 $this->originalInsertionMode
= $this->parseMode
;
2922 return $this->switchModeAndReprocess( 'inTableTextMode',
2923 $token, $value, $attribs, $selfClose );
2925 // fall through to default case.
2926 } elseif ( $token === 'eof' ) {
2927 $this->stopParsing();
2929 } elseif ( $token === 'tag' ) {
// Add a formatting-list marker, insert the element and enter inCaptionMode.
2932 $this->afe
->insertMarker();
2933 $this->stack
->insertHTMLElement( $value, $attribs );
2934 $this->switchMode( 'inCaptionMode' );
2937 $this->stack
->clearToContext( BalanceSets
::$tableContextSet );
2938 $this->stack
->insertHTMLElement( $value, $attribs );
2939 $this->switchMode( 'inColumnGroupMode' );
// Process a synthetic <colgroup> start tag, then reprocess the token.
2942 $this->inTableMode( 'tag', 'colgroup', [] );
2943 return $this->insertToken( $token, $value, $attribs, $selfClose );
2947 $this->stack
->clearToContext( BalanceSets
::$tableContextSet );
2948 $this->stack
->insertHTMLElement( $value, $attribs );
2949 $this->switchMode( 'inTableBodyMode' );
// Process a synthetic <tbody> start tag, then reprocess the token.
2954 $this->inTableMode( 'tag', 'tbody', [] );
2955 return $this->insertToken( $token, $value, $attribs, $selfClose );
// If $value is in table scope, process a synthetic end tag for it and then
// reprocess the start tag; otherwise ignore the tag.
2957 if ( !$this->stack
->inTableScope( $value ) ) {
2958 return true; // Ignore this tag.
2960 $this->inTableMode( 'endtag', $value );
2961 return $this->insertToken( $token, $value, $attribs, $selfClose );
2964 // OMITTED: <script>
2966 return $this->inHeadMode( $token, $value, $attribs, $selfClose );
// Only a tag whose 'type' attribute equals 'hidden' (case-insensitive) is
// inserted and popped here; anything else goes to the default handling.
2969 if ( !isset( $attribs['type'] ) ||
strcasecmp( $attribs['type'], 'hidden' ) !== 0 ) {
2970 break; // Handle this as "everything else"
2972 $this->stack
->insertHTMLElement( $value, $attribs );
2973 $this->stack
->pop();
// Ignored when a form element pointer is already set or a 'template' is on the
// stack; otherwise the new element becomes the form element pointer and is
// popped again straight away.
2978 $this->formElementPointer ||
2979 $this->stack
->indexOf( 'template' ) >= 0
2981 return true; // ignore this token
2983 $this->formElementPointer
=
2984 $this->stack
->insertHTMLElement( $value, $attribs );
2985 $this->stack
->popTag( $this->formElementPointer
);
2988 // Fall through for "anything else" clause.
2989 } elseif ( $token === 'endtag' ) {
2992 if ( !$this->stack
->inTableScope( $value ) ) {
2993 return true; // Ignore.
2995 $this->stack
->popTag( $value );
2996 $this->resetInsertionMode();
3009 return true; // Ignore the token.
3011 return $this->inHeadMode( $token, $value, $attribs, $selfClose );
3013 // Fall through for "anything else" clause.
3014 } elseif ( $token === 'comment' ) {
3015 $this->stack
->insertComment( $value );
3018 // This is the "anything else" case:
3019 $this->stack
->fosterParentMode
= true;
3020 $this->inBodyMode( $token, $value, $attribs, $selfClose );
3021 $this->stack
->fosterParentMode
= false;
// Insertion mode that buffers text seen inside a table.
// 'text' tokens are appended to $this->pendingTableText. The remaining
// statements take the buffered text and clear the buffer, then either send
// text containing a non-whitespace character through inBodyMode() with
// fosterParentMode enabled, or insert whitespace-only text directly; finally
// the token is reprocessed in $this->originalInsertionMode.
// NOTE(review): the return after the append and the else keywords/closing
// braces are not visible in this listing, so the exact branch structure
// should be confirmed against the full file.
3025 private function inTableTextMode( $token, $value, $attribs = null, $selfClose = false ) {
3026 if ( $token === 'text' ) {
3027 $this->pendingTableText
.= $value;
3031 $text = $this->pendingTableText
;
3032 $this->pendingTableText
= '';
// True when $text has a character other than tab, LF, FF, CR or space.
3033 if ( preg_match( '/[^\x09\x0A\x0C\x0D\x20]/', $text ) ) {
3034 // This should match the "anything else" case inTableMode
3035 $this->stack
->fosterParentMode
= true;
3036 $this->inBodyMode( 'text', $text );
3037 $this->stack
->fosterParentMode
= false;
3039 // Pending text is just whitespace.
3040 $this->stack
->insertText( $text );
3042 return $this->switchModeAndReprocess(
3043 $this->originalInsertionMode
, $token, $value, $attribs, $selfClose
3047 // helper for inCaptionMode
// Close an open 'caption' element: generate implied end tags, pop the
// 'caption' tag, clear the active formatting elements to the last marker and
// switch to inTableMode.
// Starts with a guard on whether 'caption' is in table scope.
// NOTE(review): the body of the guard and the return statements are not
// visible in this listing. inCaptionMode() tests this method's result as a
// boolean, so presumably it returns false when no caption is in table scope
// and true otherwise -- confirm.
3048 private function endCaption() {
3049 if ( !$this->stack
->inTableScope( 'caption' ) ) {
3052 $this->stack
->generateImpliedEndTags();
3053 $this->stack
->popTag( 'caption' );
3054 $this->afe
->clearToMarker();
3055 $this->switchMode( 'inTableMode' );
// Insertion mode handler for caption content.
// For some start and end tags the caption is closed through endCaption() and,
// when that succeeds, the token is re-submitted through insertToken(); one
// end-tag branch only calls endCaption(). Everything else is delegated to
// inBodyMode().
// NOTE(review): the switch statements, case labels and return statements of
// the individual branches are not visible in this listing.
3059 private function inCaptionMode( $token, $value, $attribs = null, $selfClose = false ) {
3060 if ( $token === 'tag' ) {
3071 if ( $this->endCaption() ) {
3072 $this->insertToken( $token, $value, $attribs, $selfClose );
3076 // Fall through to "anything else" case.
3077 } elseif ( $token === 'endtag' ) {
3080 $this->endCaption();
3083 if ( $this->endCaption() ) {
3084 $this->insertToken( $token, $value, $attribs, $selfClose );
3100 // Fall through to "anything else" case.
3102 // The Anything Else case
3103 return $this->inBodyMode( $token, $value, $attribs, $selfClose );
// Insertion mode handler for column group content.
// - 'text': leading whitespace is inserted as text; any remainder falls to
//   the bottom of the method.
// - 'tag' / 'endtag': some tags are handled here or delegated to
//   inHeadMode(); others fall to the bottom.
// - 'eof': delegated to inBodyMode().
// - 'comment': inserted as a comment.
// At the bottom, the token is ignored unless the current node is a
// 'colgroup'; otherwise a synthetic </colgroup> is processed and the token is
// re-submitted through insertToken().
// NOTE(review): the switch statements, case labels and some return statements
// are not visible in this listing.
3106 private function inColumnGroupMode( $token, $value, $attribs = null, $selfClose = false ) {
3107 if ( $token === 'text' ) {
// Leading whitespace (tab, LF, FF, CR, space) is inserted as text and removed
// from $value.
3108 if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $matches ) ) {
3109 $this->stack
->insertText( $matches[0] );
3110 $value = substr( $value, strlen( $matches[0] ) );
3112 if ( strlen( $value ) === 0 ) {
3113 return true; // All text handled.
3115 // Fall through to handle non-whitespace below.
3116 } elseif ( $token === 'tag' ) {
// Insert the element and pop it again straight away.
3120 $this->stack
->insertHTMLElement( $value, $attribs );
3121 $this->stack
->pop();
3124 return $this->inHeadMode( $token, $value, $attribs, $selfClose );
3126 // Fall through for "anything else".
3127 } elseif ( $token === 'endtag' ) {
// Only pops when the current node is a 'colgroup'; then back to inTableMode.
3130 if ( !$this->stack
->currentNode
->isHtmlNamed( 'colgroup' ) ) {
3131 return true; // Ignore the token.
3133 $this->stack
->pop();
3134 $this->switchMode( 'inTableMode' );
3137 return true; // Ignore the token.
3139 return $this->inHeadMode( $token, $value, $attribs, $selfClose );
3141 // Fall through for "anything else".
3142 } elseif ( $token === 'eof' ) {
3143 return $this->inBodyMode( $token, $value, $attribs, $selfClose );
3144 } elseif ( $token === 'comment' ) {
3145 $this->stack
->insertComment( $value );
3150 if ( !$this->stack
->currentNode
->isHtmlNamed( 'colgroup' ) ) {
3151 return true; // Ignore the token.
3153 $this->inColumnGroupMode( 'endtag', 'colgroup' );
3154 return $this->insertToken( $token, $value, $attribs, $selfClose );
3157 // Helper function for inTableBodyMode
// Close the current table section: clear the stack back to
// BalanceSets::$tableBodyContextSet, pop the current node and switch to
// inTableMode.
// The visible condition tests whether 'tbody', 'thead' or 'tfoot' is in table
// scope.
// NOTE(review): the statement wrapping that condition and the return
// statements are not visible in this listing. inTableBodyMode() tests this
// method's result as a boolean, so presumably it returns false when none of
// the three is in table scope and true otherwise -- confirm.
3158 private function endSection() {
3160 $this->stack
->inTableScope( 'tbody' ) ||
3161 $this->stack
->inTableScope( 'thead' ) ||
3162 $this->stack
->inTableScope( 'tfoot' )
3166 $this->stack
->clearToContext( BalanceSets
::$tableBodyContextSet );
3167 $this->stack
->pop();
3168 $this->switchMode( 'inTableMode' );
// Insertion mode handler for table body content.
// Start tags either open a row (clear to the table body context, insert,
// switch to inRowMode), trigger a synthetic <tr> followed by reprocessing, or
// close the section through endSection() before reprocessing. End tags close
// the section or are ignored. Everything else is delegated to inTableMode().
// NOTE(review): the switch statements, case labels and return statements of
// the individual branches are not visible in this listing.
3171 private function inTableBodyMode( $token, $value, $attribs = null, $selfClose = false ) {
3172 if ( $token === 'tag' ) {
3175 $this->stack
->clearToContext( BalanceSets
::$tableBodyContextSet );
3176 $this->stack
->insertHTMLElement( $value, $attribs );
3177 $this->switchMode( 'inRowMode' );
// Process a synthetic <tr> start tag, then re-submit the token.
3181 $this->inTableBodyMode( 'tag', 'tr', [] );
3182 $this->insertToken( $token, $value, $attribs, $selfClose );
// Re-submit the token only when a section was actually closed.
3190 if ( $this->endSection() ) {
3191 $this->insertToken( $token, $value, $attribs, $selfClose );
3195 } elseif ( $token === 'endtag' ) {
3198 if ( $this->endSection() ) {
3199 $this->insertToken( $token, $value, $attribs, $selfClose );
3205 if ( $this->stack
->inTableScope( $value ) ) {
3206 $this->endSection();
3217 return true; // Ignore the token.
3221 return $this->inTableMode( $token, $value, $attribs, $selfClose );
3224 // Helper function for inRowMode
// Close the current table row: clear the stack back to
// BalanceSets::$tableRowContextSet, pop the current node and switch to
// inTableBodyMode.
// Starts with a guard on whether 'tr' is in table scope.
// NOTE(review): the body of the guard and the return statements are not
// visible in this listing. inRowMode() tests this method's result as a
// boolean, so presumably it returns false when no 'tr' is in table scope and
// true otherwise -- confirm.
3225 private function endRow() {
3226 if ( !$this->stack
->inTableScope( 'tr' ) ) {
3229 $this->stack
->clearToContext( BalanceSets
::$tableRowContextSet );
3230 $this->stack
->pop();
3231 $this->switchMode( 'inTableBodyMode' );
// Insertion mode handler for table row content.
// Start tags either open a cell (clear to the table row context, insert,
// switch to inCellMode, add a formatting-list marker) or close the row
// through endRow() before the token is re-submitted. End tags close the row
// or are ignored. Everything else is delegated to inTableMode().
// NOTE(review): the switch statements, case labels, return statements and
// part of one end-tag condition are not visible in this listing.
3234 private function inRowMode( $token, $value, $attribs = null, $selfClose = false ) {
3235 if ( $token === 'tag' ) {
3239 $this->stack
->clearToContext( BalanceSets
::$tableRowContextSet );
3240 $this->stack
->insertHTMLElement( $value, $attribs );
3241 $this->switchMode( 'inCellMode' );
3242 $this->afe
->insertMarker();
// Re-submit the token only when a row was actually closed.
3251 if ( $this->endRow() ) {
3252 $this->insertToken( $token, $value, $attribs, $selfClose );
3256 } elseif ( $token === 'endtag' ) {
3262 if ( $this->endRow() ) {
3263 $this->insertToken( $token, $value, $attribs, $selfClose );
3270 $this->stack
->inTableScope( $value ) &&
3273 $this->insertToken( $token, $value, $attribs, $selfClose );
3283 return true; // Ignore the token.
3287 return $this->inTableMode( $token, $value, $attribs, $selfClose );
3290 // Helper for inCellMode
// Close the open table cell by processing a synthetic end tag through
// inCellMode(): 'td' when a 'td' is in table scope, otherwise 'th' when a
// 'th' is in table scope.
// NOTE(review): the return statements are not visible in this listing.
// inCellMode() tests this method's result as a boolean, so presumably it
// reports whether a cell was closed -- confirm.
3291 private function endCell() {
3292 if ( $this->stack
->inTableScope( 'td' ) ) {
3293 $this->inCellMode( 'endtag', 'td' );
3295 } elseif ( $this->stack
->inTableScope( 'th' ) ) {
3296 $this->inCellMode( 'endtag', 'th' );
// Insertion mode handler for table cell content.
// Some start tags close the cell through endCell() and are then re-submitted.
// End tags whose name is in table scope close the cell (implied end tags, pop,
// clear the formatting list to the last marker, switch to inRowMode); one
// variant pops any element in BalanceSets::$tableCellSet and re-submits the
// token. Everything else is delegated to inBodyMode().
// NOTE(review): the switch statements, case labels and return statements of
// the individual branches are not visible in this listing.
3302 private function inCellMode( $token, $value, $attribs = null, $selfClose = false ) {
3303 if ( $token === 'tag' ) {
3314 if ( $this->endCell() ) {
3315 $this->insertToken( $token, $value, $attribs, $selfClose );
3319 } elseif ( $token === 'endtag' ) {
// Close the cell named by the end tag itself.
3323 if ( $this->stack
->inTableScope( $value ) ) {
3324 $this->stack
->generateImpliedEndTags();
3325 $this->stack
->popTag( $value );
3326 $this->afe
->clearToMarker();
3327 $this->switchMode( 'inRowMode' );
// Close whichever table cell is open, then re-submit the end tag.
3342 if ( $this->stack
->inTableScope( $value ) ) {
3343 $this->stack
->generateImpliedEndTags();
3344 $this->stack
->popTag( BalanceSets
::$tableCellSet );
3345 $this->afe
->clearToMarker();
3346 $this->switchMode( 'inRowMode' );
3347 $this->insertToken( $token, $value, $attribs, $selfClose );
3353 return $this->inBodyMode( $token, $value, $attribs, $selfClose );
// Insertion mode handler for select content.
// - 'text': inserted as text.
// - 'eof': delegated to inBodyMode().
// - 'tag' / 'endtag': option/optgroup bookkeeping on the current node, closing
//   of the select itself, or delegation to inHeadMode().
// - 'comment': inserted as a comment.
// Any other token is ignored.
// NOTE(review): the switch statements, case labels and most return statements
// are not visible in this listing, so the tag names tied to each group of
// statements cannot be confirmed from here.
3356 private function inSelectMode( $token, $value, $attribs = null, $selfClose = false ) {
3357 if ( $token === 'text' ) {
3358 $this->stack
->insertText( $value );
3360 } elseif ( $token === 'eof' ) {
3361 return $this->inBodyMode( $token, $value, $attribs, $selfClose );
3362 } elseif ( $token === 'tag' ) {
// An 'option' as current node is popped before the new element is inserted.
3366 if ( $this->stack
->currentNode
->isHtmlNamed( 'option' ) ) {
3367 $this->stack
->pop();
3369 $this->stack
->insertHTMLElement( $value, $attribs );
// Here both an open 'option' and then an open 'optgroup' are popped first.
3372 if ( $this->stack
->currentNode
->isHtmlNamed( 'option' ) ) {
3373 $this->stack
->pop();
3375 if ( $this->stack
->currentNode
->isHtmlNamed( 'optgroup' ) ) {
3376 $this->stack
->pop();
3378 $this->stack
->insertHTMLElement( $value, $attribs );
3381 $this->inSelectMode( 'endtag', $value ); // treat it like endtag
// With 'select' in select scope, process a synthetic </select> and re-submit
// the token; otherwise ignore it.
3386 if ( !$this->stack
->inSelectScope( 'select' ) ) {
3387 return true; // ignore token (fragment case)
3389 $this->inSelectMode( 'endtag', 'select' );
3390 return $this->insertToken( $token, $value, $attribs, $selfClose );
3393 return $this->inHeadMode( $token, $value, $attribs, $selfClose );
3395 } elseif ( $token === 'endtag' ) {
// Condition: the current node is an 'option' whose immediate parent on the
// stack is an 'optgroup'. The pop() that follows removes the current node;
// the next test pops an 'optgroup' current node.
3399 $this->stack
->currentNode
->isHtmlNamed( 'option' ) &&
3400 $this->stack
->length() >= 2 &&
3401 $this->stack
->node( $this->stack
->length() - 2 )->isHtmlNamed( 'optgroup' )
3403 $this->stack
->pop();
3405 if ( $this->stack
->currentNode
->isHtmlNamed( 'optgroup' ) ) {
3406 $this->stack
->pop();
3410 if ( $this->stack
->currentNode
->isHtmlNamed( 'option' ) ) {
3411 $this->stack
->pop();
// Pop the named tag and re-select the insertion mode, unless the tag is not in
// select scope.
3415 if ( !$this->stack
->inSelectScope( $value ) ) {
3416 return true; // fragment case
3418 $this->stack
->popTag( $value );
3419 $this->resetInsertionMode();
3422 return $this->inHeadMode( $token, $value, $attribs, $selfClose );
3424 } elseif ( $token === 'comment' ) {
3425 $this->stack
->insertComment( $value );
3428 // anything else: just ignore the token
// Insertion mode handler for a select element nested inside a table.
// In the visible branches a synthetic </select> is processed and the token is
// re-submitted through insertToken(): for start tags, and for end tags whose
// name is in table scope. Everything else is delegated to inSelectMode().
// NOTE(review): the statements between the function header and the first
// visible condition (embedded lines 3433-3441) are not visible in this
// listing; they presumably restrict these branches to particular tag
// names -- confirm against the full file.
3432 private function inSelectInTableMode( $token, $value, $attribs = null, $selfClose = false ) {
3442 if ( $token === 'tag' ) {
3443 $this->inSelectInTableMode( 'endtag', 'select' );
3444 return $this->insertToken( $token, $value, $attribs, $selfClose );
3445 } elseif ( $token === 'endtag' ) {
3446 if ( $this->stack
->inTableScope( $value ) ) {
3447 $this->inSelectInTableMode( 'endtag', 'select' );
3448 return $this->insertToken( $token, $value, $attribs, $selfClose );
3454 return $this->inSelectMode( $token, $value, $attribs, $selfClose );
3457 private function inTemplateMode( $token, $value, $attribs = null, $selfClose = false ) {
3458 if ( $token === 'text' ||
$token === 'comment' ) {
3459 return $this->inBodyMode( $token, $value, $attribs, $selfClose );
3460 } elseif ( $token === 'eof' ) {
3461 if ( $this->stack
->indexOf( 'template' ) < 0 ) {
3462 $this->stopParsing();
3464 $this->stack
->popTag( 'template' );
3465 $this->afe
->clearToMarker();
3466 array_pop( $this->templateInsertionModes
);
3467 $this->resetInsertionMode();
3468 $this->insertToken( $token, $value, $attribs, $selfClose );
3471 } elseif ( $token === 'tag' ) {
3479 // OMITTED: <script>
3483 return $this->inHeadMode( $token, $value, $attribs, $selfClose );
3490 return $this->switchModeAndReprocess(
3491 'inTableMode', $token, $value, $attribs, $selfClose
3495 return $this->switchModeAndReprocess(
3496 'inColumnGroupMode', $token, $value, $attribs, $selfClose
3500 return $this->switchModeAndReprocess(
3501 'inTableBodyMode', $token, $value, $attribs, $selfClose
3506 return $this->switchModeAndReprocess(
3507 'inRowMode', $token, $value, $attribs, $selfClose
3510 return $this->switchModeAndReprocess(
3511 'inBodyMode', $token, $value, $attribs, $selfClose
3513 } elseif ( $token === 'endtag' ) {
3516 return $this->inHeadMode( $token, $value, $attribs, $selfClose );
3520 Assert
::invariant( false, "Bad token type: $token" );