Merge "Provide onTransaction* callbacks with the transaction result"
[lhc/web/wiklou.git] / includes / tidy / Balancer.php
1 <?php
2 /**
3 * An implementation of the tree building portion of the HTML5 parsing
4 * spec.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
20 *
21 * @file
22 * @ingroup Parser
23 * @since 1.27
24 * @author C. Scott Ananian, 2016
25 */
26 namespace MediaWiki\Tidy;
27
28 use Wikimedia\Assert\Assert;
29 use Wikimedia\Assert\ParameterAssertionException;
30 use \ExplodeIterator;
31 use \IteratorAggregate;
32 use \ReverseArrayIterator;
33 use \Sanitizer;
34
35 # A note for future librarization[1] -- this file is a good candidate
36 # for splitting into an independent library, except that it is currently
37 # highly optimized for MediaWiki use. It only implements the portions
38 # of the HTML5 tree builder used by tags supported by MediaWiki, and
39 # does not contain a true tokenizer pass, instead relying on
40 # comment stripping, attribute normalization, and escaping done by
41 # the MediaWiki Sanitizer. It also deliberately avoids building
42 # a true DOM in memory, instead serializing elements to an output string
43 # as soon as possible (usually as soon as the tag is closed) to reduce
44 # its memory footprint.
45
46 # On the other hand, I've been pretty careful to note with comments in the
47 # code the places where this implementation omits features of the spec or
48 # depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
49 # implement the missing pieces and make this a standalone PHP HTML5 parser.
50 # In order to do so, some sort of MediaWiki-specific API will need
51 # to be added to (a) allow the Balancer to bypass the tokenizer,
52 # and (b) support on-the-fly flattening instead of DOM node creation.
53
54 # [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki
55
/**
 * Constants and tag-name sets consulted by the HTML5 tree builder.
 *
 * Every set is an associative array keyed first by namespace URI and
 * then by lower-cased tag name; membership is signalled by the key
 * being present (the value is always true).
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceSets {
	const HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml';
	const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
	const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';

	/**
	 * HTML tags listed as unsupported.
	 * NOTE(review): the consumer of this set is outside this class.
	 * @var array
	 */
	public static $unsupportedSet = [
		self::HTML_NAMESPACE => [
			'html' => true,
			'head' => true,
			'body' => true,
			'frameset' => true,
			'form' => true,
			'frame' => true,
			'plaintext' => true,
			'isindex' => true,
			'textarea' => true,
			'xmp' => true,
			'iframe' => true,
			'noembed' => true,
			'noscript' => true,
			'script' => true,
			'title' => true
		]
	];

	/**
	 * Elements which never have children.
	 * @var array
	 */
	public static $emptyElementSet = [
		self::HTML_NAMESPACE => [
			'area' => true,
			'base' => true,
			'basefont' => true,
			'bgsound' => true,
			'br' => true,
			'col' => true,
			'command' => true,
			'embed' => true,
			'frame' => true,
			'hr' => true,
			'img' => true,
			'input' => true,
			'keygen' => true,
			'link' => true,
			'meta' => true,
			'param' => true,
			'source' => true,
			'track' => true,
			'wbr' => true
		]
	];

	/**
	 * The six HTML heading tags.
	 * @var array
	 */
	public static $headingSet = [
		self::HTML_NAMESPACE => [
			'h1' => true,
			'h2' => true,
			'h3' => true,
			'h4' => true,
			'h5' => true,
			'h6' => true
		]
	];

	/**
	 * Tags grouped as "special", across the HTML, SVG and MathML
	 * namespaces.
	 * @var array
	 */
	public static $specialSet = [
		self::HTML_NAMESPACE => [
			'address' => true, 'applet' => true, 'area' => true, 'article' => true,
			'aside' => true, 'base' => true, 'basefont' => true, 'bgsound' => true,
			'blockquote' => true, 'body' => true, 'br' => true, 'button' => true,
			'caption' => true, 'center' => true, 'col' => true, 'colgroup' => true,
			'dd' => true, 'details' => true, 'dir' => true, 'div' => true,
			'dl' => true, 'dt' => true, 'embed' => true, 'fieldset' => true,
			'figcaption' => true, 'figure' => true, 'footer' => true, 'form' => true,
			'frame' => true, 'frameset' => true, 'h1' => true, 'h2' => true,
			'h3' => true, 'h4' => true, 'h5' => true, 'h6' => true,
			'head' => true, 'header' => true, 'hgroup' => true, 'hr' => true,
			'html' => true, 'iframe' => true, 'img' => true, 'input' => true,
			'isindex' => true, 'li' => true, 'link' => true, 'listing' => true,
			'main' => true, 'marquee' => true, 'menu' => true, 'menuitem' => true,
			'meta' => true, 'nav' => true, 'noembed' => true, 'noframes' => true,
			'noscript' => true, 'object' => true, 'ol' => true, 'p' => true,
			'param' => true, 'plaintext' => true, 'pre' => true, 'script' => true,
			'section' => true, 'select' => true, 'source' => true, 'style' => true,
			'summary' => true, 'table' => true, 'tbody' => true, 'td' => true,
			'template' => true, 'textarea' => true, 'tfoot' => true, 'th' => true,
			'thead' => true, 'title' => true, 'tr' => true, 'track' => true,
			'ul' => true, 'wbr' => true, 'xmp' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true,
			'desc' => true,
			'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true,
			'mo' => true,
			'mn' => true,
			'ms' => true,
			'mtext' => true,
			'annotation-xml' => true
		]
	];

	/**
	 * The tags address, div and p.
	 * @var array
	 */
	public static $addressDivPSet = [
		self::HTML_NAMESPACE => [
			'address' => true,
			'div' => true,
			'p' => true
		]
	];

	/**
	 * Table, table-section and table-row tags.
	 * @var array
	 */
	public static $tableSectionRowSet = [
		self::HTML_NAMESPACE => [
			'table' => true,
			'thead' => true,
			'tbody' => true,
			'tfoot' => true,
			'tr' => true
		]
	];

	/**
	 * Tags closed when generating implied end tags.
	 * @var array
	 */
	public static $impliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'dd' => true, 'dt' => true, 'li' => true,
			'optgroup' => true, 'option' => true, 'p' => true,
			'rb' => true, 'rp' => true, 'rt' => true, 'rtc' => true
		]
	];

	/**
	 * Tags closed when generating implied end tags "thoroughly".
	 * @var array
	 */
	public static $thoroughImpliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'caption' => true, 'colgroup' => true, 'dd' => true,
			'dt' => true, 'li' => true, 'optgroup' => true,
			'option' => true, 'p' => true, 'rb' => true,
			'rp' => true, 'rt' => true, 'rtc' => true,
			'tbody' => true, 'td' => true, 'tfoot' => true,
			'th' => true, 'thead' => true, 'tr' => true
		]
	];

	/**
	 * Table cell tags.
	 * @var array
	 */
	public static $tableCellSet = [
		self::HTML_NAMESPACE => [
			'td' => true,
			'th' => true
		]
	];

	/**
	 * Tags forming a table context.
	 * @var array
	 */
	public static $tableContextSet = [
		self::HTML_NAMESPACE => [
			'table' => true,
			'template' => true,
			'html' => true
		]
	];

	/**
	 * Tags forming a table body context.
	 * @var array
	 */
	public static $tableBodyContextSet = [
		self::HTML_NAMESPACE => [
			'tbody' => true,
			'tfoot' => true,
			'thead' => true,
			'template' => true,
			'html' => true
		]
	];

	/**
	 * Tags forming a table row context.
	 * @var array
	 */
	public static $tableRowContextSet = [
		self::HTML_NAMESPACE => [
			'tr' => true,
			'template' => true,
			'html' => true
		]
	];

	# OMITTED: formAssociatedSet, since we don't allow <form>

	/**
	 * Boundary tags for the plain "in scope" test; also the base from
	 * which the list-item and button scope sets are derived.
	 * @var array
	 */
	public static $inScopeSet = [
		self::HTML_NAMESPACE => [
			'applet' => true, 'caption' => true, 'html' => true,
			'marquee' => true, 'object' => true, 'table' => true,
			'td' => true, 'template' => true, 'th' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true,
			'desc' => true,
			'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true,
			'mo' => true,
			'mn' => true,
			'ms' => true,
			'mtext' => true,
			'annotation-xml' => true
		]
	];

	/** @var array|null Cache for inListItemScopeSet() */
	private static $inListItemScopeSet = null;

	/**
	 * Boundary tags for the "list item scope" test: $inScopeSet plus
	 * the HTML tags ol and ul. Built on first use and cached.
	 * @return array
	 */
	public static function inListItemScopeSet() {
		if ( self::$inListItemScopeSet !== null ) {
			return self::$inListItemScopeSet;
		}
		$set = self::$inScopeSet;
		$set[self::HTML_NAMESPACE]['ol'] = true;
		$set[self::HTML_NAMESPACE]['ul'] = true;
		self::$inListItemScopeSet = $set;
		return $set;
	}

	/** @var array|null Cache for inButtonScopeSet() */
	private static $inButtonScopeSet = null;

	/**
	 * Boundary tags for the "button scope" test: $inScopeSet plus the
	 * HTML tag button. Built on first use and cached.
	 * @return array
	 */
	public static function inButtonScopeSet() {
		if ( self::$inButtonScopeSet !== null ) {
			return self::$inButtonScopeSet;
		}
		$set = self::$inScopeSet;
		$set[self::HTML_NAMESPACE]['button'] = true;
		self::$inButtonScopeSet = $set;
		return $set;
	}

	/**
	 * Boundary tags for the "table scope" test.
	 * @var array
	 */
	public static $inTableScopeSet = [
		self::HTML_NAMESPACE => [
			'html' => true,
			'table' => true,
			'template' => true
		]
	];

	/**
	 * Tags which do NOT end a "select scope" search; any other tag
	 * terminates it.
	 * @var array
	 */
	public static $inInvertedSelectScopeSet = [
		self::HTML_NAMESPACE => [
			'option' => true,
			'optgroup' => true
		]
	];

	/**
	 * MathML elements which are text integration points.
	 * @var array
	 */
	public static $mathmlTextIntegrationPointSet = [
		self::MATHML_NAMESPACE => [
			'mi' => true,
			'mo' => true,
			'mn' => true,
			'ms' => true,
			'mtext' => true
		]
	];

	/**
	 * SVG elements which are HTML integration points.
	 * @var array
	 */
	public static $htmlIntegrationPointSet = [
		self::SVG_NAMESPACE => [
			'foreignobject' => true, 'desc' => true, 'title' => true
		]
	];

	// For tidy compatibility.

	/**
	 * Elements whose direct text content gets wrapped in a paragraph
	 * in tidy compatibility mode.
	 * @var array
	 */
	public static $tidyPWrapSet = [
		self::HTML_NAMESPACE => [
			'body' => true,
			'blockquote' => true,
			// We parse with <body> as the fragment context, but the top-level
			// element on the stack is actually <html>. We could use the
			// "adjusted current node" everywhere to work around this, but it's
			// easier just to add <html> to the p-wrap set.
			'html' => true,
		],
	];

	/**
	 * Elements treated as inline for tidy compatibility.
	 * @var array
	 */
	public static $tidyInlineSet = [
		self::HTML_NAMESPACE => [
			'a' => true, 'abbr' => true, 'acronym' => true,
			'applet' => true, 'b' => true, 'basefont' => true,
			'bdo' => true, 'big' => true, 'br' => true,
			'button' => true, 'cite' => true, 'code' => true,
			'dfn' => true, 'em' => true, 'font' => true,
			'i' => true, 'iframe' => true, 'img' => true,
			'input' => true, 'kbd' => true, 'label' => true,
			'legend' => true, 'map' => true, 'object' => true,
			'param' => true, 'q' => true, 'rb' => true,
			'rbc' => true, 'rp' => true, 'rt' => true,
			'rtc' => true, 'ruby' => true, 's' => true,
			'samp' => true, 'select' => true, 'small' => true,
			'span' => true, 'strike' => true, 'strong' => true,
			'sub' => true, 'sup' => true, 'textarea' => true,
			'tt' => true, 'u' => true, 'var' => true,
		],
	];
}
280
/**
 * A BalanceElement is a simplified version of a DOM Node. The main
 * difference is that we only keep BalanceElements around for nodes
 * currently on the BalanceStack of open elements. As soon as an
 * element is closed, with some minor exceptions relating to the
 * tree builder "adoption agency algorithm", the element and all its
 * children are serialized to a string using the flatten() method.
 * This keeps our memory usage low.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceElement {
	/**
	 * The namespace of the element.
	 * @var string $namespaceURI
	 */
	public $namespaceURI;
	/**
	 * The lower-cased name of the element.
	 * @var string $localName
	 */
	public $localName;
	/**
	 * Attributes for the element, in array form
	 * @var array $attribs
	 */
	public $attribs;

	/**
	 * Set to the empty string by the constructor. Declared explicitly
	 * so that the constructor does not create a dynamic property.
	 * NOTE(review): nothing in this class reads it; confirm whether
	 * any other code still depends on it before removing.
	 * @var string $contents
	 */
	public $contents;

	/**
	 * Parent of this element, null if it has no parent, or the string
	 * "flat" if this element has already been flattened into its parent.
	 * @var BalanceElement|string|null $parent
	 */
	public $parent;

	/**
	 * An array of children of this element. Typically only the last
	 * child will be an actual BalanceElement object; the rest will
	 * be strings, representing either text nodes or flattened
	 * BalanceElement objects.
	 * @var array $children
	 */
	public $children;

	/**
	 * A unique string identifier for Noah's Ark purposes, lazy initialized
	 * @var string|null $noahKey
	 */
	private $noahKey;

	/**
	 * The next active formatting element in the list, or null if this is the
	 * end of the AFE list or if the element is not in the AFE list.
	 */
	public $nextAFE;

	/**
	 * The previous active formatting element in the list, or null if this is
	 * the start of the list or if the element is not in the AFE list.
	 */
	public $prevAFE;

	/**
	 * The next element in the Noah's Ark species bucket.
	 */
	public $nextNoah;

	/**
	 * Make a new BalanceElement corresponding to the HTML DOM Element
	 * with the given localname, namespace, and attributes.
	 *
	 * @param string $namespaceURI The namespace of the element.
	 * @param string $localName The lowercased name of the tag.
	 * @param array $attribs Attributes of the element
	 */
	public function __construct( $namespaceURI, $localName, array $attribs ) {
		$this->localName = $localName;
		$this->namespaceURI = $namespaceURI;
		$this->attribs = $attribs;
		$this->contents = '';
		$this->parent = null;
		$this->children = [];
	}

	/**
	 * Remove the given child from this element.
	 * @param BalanceElement $elt
	 */
	private function removeChild( BalanceElement $elt ) {
		if ( $this->parent === 'flat' ) {
			// The message interpolates $this, which serializes the whole
			// subtree; only pay for that when the check actually fails.
			Assert::precondition(
				false, "Can't removeChild after flattening $this"
			);
		}
		Assert::parameter(
			$elt->parent === $this, 'elt', 'must have $this as a parent'
		);
		$idx = array_search( $elt, $this->children, true );
		Assert::parameter( $idx !== false, '$elt', 'must be a child of $this' );
		$elt->parent = null;
		array_splice( $this->children, $idx, 1 );
	}

	/**
	 * Find $a in the list of children and insert $b before it.
	 * If $b is an element which already has a parent, it is first
	 * removed from that parent.
	 * @param BalanceElement $a
	 * @param BalanceElement|string $b
	 */
	public function insertBefore( BalanceElement $a, $b ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't insertBefore after flattening."
		);
		$idx = array_search( $a, $this->children, true );
		Assert::parameter( $idx !== false, '$a', 'must be a child of $this' );
		if ( !is_string( $b ) ) {
			Assert::parameter( $b->parent !== 'flat', '$b', "Can't be flat" );
			if ( $b->parent !== null ) {
				$b->parent->removeChild( $b );
			}
			$b->parent = $this;
		}
		array_splice( $this->children, $idx, 0, [ $b ] );
	}

	/**
	 * Append $elt to the end of the list of children.
	 * If $elt is an element which already has a parent, it is first
	 * removed from that parent.
	 * @param BalanceElement|string $elt
	 */
	public function appendChild( $elt ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't appendChild after flattening."
		);
		if ( is_string( $elt ) ) {
			array_push( $this->children, $elt );
			return;
		}
		// Remove $elt from parent, if it had one.
		if ( $elt->parent !== null ) {
			$elt->parent->removeChild( $elt );
		}
		array_push( $this->children, $elt );
		$elt->parent = $this;
	}

	/**
	 * Transfer all of the children of $elt to $this.
	 * @param BalanceElement $elt
	 */
	public function adoptChildren( BalanceElement $elt ) {
		Assert::precondition(
			$elt->parent !== 'flat', "Can't adoptChildren after flattening."
		);
		foreach ( $elt->children as $child ) {
			if ( !is_string( $child ) ) {
				// This is an optimization which avoids an O(n^2) set of
				// array_splice operations.
				$child->parent = null;
			}
			$this->appendChild( $child );
		}
		$elt->children = [];
	}

	/**
	 * Flatten this node and all of its children into a string, as specified
	 * by the HTML serialization specification, and replace this node
	 * in its parent by that string.
	 *
	 * In tidy compatibility mode the children are flattened first, an
	 * 'mw:p-wrap' element is renamed to 'p' (and dropped entirely, i.e.
	 * flattened to the empty string, when its content is only
	 * whitespace), and a whitespace-only 'tr' or 'li' without attributes
	 * receives the class 'mw-empty-elt'.
	 *
	 * @param bool $tidyCompat Whether to apply the tidy compatibility
	 *   rules described above.
	 * @return string The string which replaced this node in its parent.
	 * @see __toString()
	 */
	public function flatten( $tidyCompat = false ) {
		Assert::parameter( $this->parent !== null, '$this', 'must be a child' );
		Assert::parameter( $this->parent !== 'flat', '$this', 'already flat' );
		$idx = array_search( $this, $this->parent->children, true );
		Assert::parameter(
			$idx !== false, '$this', 'must be a child of its parent'
		);
		if ( $tidyCompat ) {
			$blank = true;
			foreach ( $this->children as $elt ) {
				if ( !is_string( $elt ) ) {
					$elt = $elt->flatten( $tidyCompat );
				}
				if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) {
					$blank = false;
				}
			}
			if ( $this->isHtmlNamed( 'mw:p-wrap' ) ) {
				$this->localName = 'p';
			} elseif ( $blank ) {
				// Add 'mw-empty-elt' class so elements can be hidden via CSS
				// for compatibility with legacy tidy.
				if ( !count( $this->attribs ) &&
					( $this->localName === 'tr' || $this->localName === 'li' )
				) {
					$this->attribs = [ 'class' => "mw-empty-elt" ];
				}
				// Only blank p-wrap elements are dropped; everything
				// else is still serialized.
				$blank = false;
			}
			$flat = $blank ? '' : "{$this}";
		} else {
			$flat = "{$this}";
		}
		$this->parent->children[$idx] = $flat;
		$this->parent = 'flat'; # for assertion checking
		return $flat;
	}

	/**
	 * Serialize this node and all of its children to a string, as specified
	 * by the HTML serialization specification.
	 *
	 * @return string The serialization of the BalanceElement
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments
	 */
	public function __toString() {
		$encAttribs = '';
		foreach ( $this->attribs as $name => $value ) {
			$encValue = Sanitizer::encodeAttribute( $value );
			$encAttribs .= " $name=\"$encValue\"";
		}
		if ( !$this->isA( BalanceSets::$emptyElementSet ) ) {
			$out = "<{$this->localName}{$encAttribs}>";
			// flatten children
			foreach ( $this->children as $elt ) {
				$out .= "{$elt}";
			}
			$out .= "</{$this->localName}>";
		} else {
			$out = "<{$this->localName}{$encAttribs} />";
			Assert::invariant(
				count( $this->children ) === 0,
				"Empty elements shouldn't have children."
			);
		}
		return $out;
	}

	# Utility functions on BalanceElements.

	/**
	 * Determine if $this represents a specific HTML tag, is a member of
	 * a tag set, or is equal to another BalanceElement.
	 *
	 * @param BalanceElement|array|string $set The target BalanceElement,
	 *   set (from the BalanceSets class), or string (HTML tag name).
	 * @return bool
	 */
	public function isA( $set ) {
		if ( $set instanceof BalanceElement ) {
			return $this === $set;
		} elseif ( is_array( $set ) ) {
			return isset( $set[$this->namespaceURI] ) &&
				isset( $set[$this->namespaceURI][$this->localName] );
		} else {
			# assume this is an HTML element name.
			return $this->isHtml() && $this->localName === $set;
		}
	}

	/**
	 * Determine if this element is an HTML element with the specified name
	 * @param string $tagName
	 * @return bool
	 */
	public function isHtmlNamed( $tagName ) {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE
			&& $this->localName === $tagName;
	}

	/**
	 * Determine if $this represents an element in the HTML namespace.
	 *
	 * @return bool
	 */
	public function isHtml() {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE;
	}

	/**
	 * Determine if $this represents a MathML text integration point,
	 * as defined in the HTML5 specification.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#mathml-text-integration-point
	 */
	public function isMathmlTextIntegrationPoint() {
		return $this->isA( BalanceSets::$mathmlTextIntegrationPointSet );
	}

	/**
	 * Determine if $this represents an HTML integration point,
	 * as defined in the HTML5 specification.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#html-integration-point
	 */
	public function isHtmlIntegrationPoint() {
		if ( $this->isA( BalanceSets::$htmlIntegrationPointSet ) ) {
			return true;
		}
		// A MathML annotation-xml element also qualifies when its
		// encoding attribute names an HTML media type (case-insensitive).
		if (
			$this->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
			$this->localName === 'annotation-xml' &&
			isset( $this->attribs['encoding'] ) &&
			( strcasecmp( $this->attribs['encoding'], 'text/html' ) === 0 ||
			strcasecmp( $this->attribs['encoding'], 'application/xhtml+xml' ) === 0 )
		) {
			return true;
		}
		return false;
	}

	/**
	 * Get a string key for the Noah's Ark algorithm. The key is built
	 * from the namespace, the local name and the attributes sorted by
	 * name, so elements differing only in attribute order share a key.
	 * The key is computed once and cached.
	 *
	 * @return string
	 */
	public function getNoahKey() {
		if ( $this->noahKey === null ) {
			$attribs = $this->attribs;
			ksort( $attribs );
			$this->noahKey = serialize( [ $this->namespaceURI, $this->localName, $attribs ] );
		}
		return $this->noahKey;
	}
}
606
607 /**
608 * The "stack of open elements" as defined in the HTML5 tree builder
609 * spec. This contains methods to ensure that content (start tags, text)
610 * are inserted at the correct place in the output string, and to
611 * flatten BalanceElements as they are closed to avoid holding onto
612 * a complete DOM tree for the document in memory.
613 *
614 * The stack defines a PHP iterator to traverse it in "reverse order",
615 * that is, the most-recently-added element is visited first in a
616 * foreach loop.
617 *
618 * @ingroup Parser
619 * @since 1.27
620 * @see https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
621 */
622 class BalanceStack implements IteratorAggregate {
623 /**
624 * Backing storage for the stack.
625 * @var array $elements
626 */
627 private $elements = [];
628 /**
629 * Foster parent mode determines how nodes are inserted into the
630 * stack.
631 * @var bool $fosterParentMode
632 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
633 */
634 public $fosterParentMode = false;
635 /**
636 * Tidy compatibility mode, determines behavior of body/blockquote
637 */
638 public $tidyCompat = false;
639 /**
640 * Reference to the current element
641 */
642 public $currentNode;
643
/**
 * Set up the stack so that it holds exactly one BalanceElement, the
 * root &lt;html&gt; node, which is also the initial current node.
 */
public function __construct() {
	# the root <html> element is always the bottom of the stack
	$root = new BalanceElement( BalanceSets::HTML_NAMESPACE, 'html', [] );
	$this->elements[] = $root;
	$this->currentNode = $this->elements[0];
}
656
/**
 * Produce the output of the tree builder: the serialization of every
 * child of the root &lt;html&gt; node, without the root element's
 * own tags. Children which are still elements are flattened (using
 * the stack's tidy compatibility setting) as they are visited.
 * @return string
 */
public function getOutput() {
	// The outer '<html>....</html>' is deliberately left out.
	$pieces = [];
	foreach ( $this->elements[0]->children as $child ) {
		if ( is_string( $child ) ) {
			$pieces[] = $child;
		} else {
			$pieces[] = $child->flatten( $this->tidyCompat );
		}
	}
	return implode( '', $pieces );
}
671
/**
 * Insert text at the appropriate place for inserting a node.
 *
 * - In foster parent mode, with a table, table section or row as the
 *   current node, the text is foster parented.
 * - In tidy compatibility mode, with the current node in the p-wrap
 *   set, an 'mw:p-wrap' element is opened first and the text is then
 *   inserted again, relative to the new current node.
 * - Otherwise the text is appended to the current node.
 *
 * @param string $value
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertText( $value ) {
	if (
		$this->fosterParentMode &&
		$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
	) {
		$this->fosterParent( $value );
		return;
	}
	if (
		$this->tidyCompat &&
		$this->currentNode->isA( BalanceSets::$tidyPWrapSet )
	) {
		$this->insertHTMLElement( 'mw:p-wrap', [] );
		// Retry now that the wrapper is the current node.
		$this->insertText( $value );
		return;
	}
	$this->currentNode->appendChild( $value );
}
693
/**
 * Create a BalanceElement in the given namespace and insert it at
 * the appropriate place, pushing it on to the open elements stack.
 * @param string $namespaceURI The element namespace
 * @param string $tag The tag name
 * @param array $attribs Attributes of the element, as accepted by
 *   the BalanceElement constructor.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-foreign-element
 */
public function insertForeignElement( $namespaceURI, $tag, $attribs ) {
	$created = new BalanceElement( $namespaceURI, $tag, $attribs );
	return $this->insertElement( $created );
}
708
/**
 * Create an element in the HTML namespace and insert it at the
 * appropriate place, pushing it on to the open elements stack.
 * @param string $tag The tag name
 * @param array $attribs Attributes of the element, as accepted by
 *   the BalanceElement constructor.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-an-html-element
 */
public function insertHTMLElement( $tag, $attribs ) {
	$htmlNamespace = BalanceSets::HTML_NAMESPACE;
	return $this->insertForeignElement( $htmlNamespace, $tag, $attribs );
}
722
/**
 * Insert an element at the appropriate place and push it on to the
 * open elements stack, making it the current node.
 *
 * If the current node is an 'mw:p-wrap' element and $elt is not in
 * the tidy inline set, the wrapper is popped first. In foster parent
 * mode, with a table, table section or row as the current node, the
 * element is foster parented; otherwise it is appended to the
 * current node.
 *
 * @param BalanceElement $elt
 * @return BalanceElement The element which was pushed on the stack.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertElement( BalanceElement $elt ) {
	if (
		$this->currentNode->isHtmlNamed( 'mw:p-wrap' ) &&
		!$elt->isA( BalanceSets::$tidyInlineSet )
	) {
		// Tidy compatibility.
		$this->pop();
	}
	if (
		$this->fosterParentMode &&
		$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
	) {
		$elt = $this->fosterParent( $elt );
	} else {
		$this->currentNode->appendChild( $elt );
	}
	// The failure messages interpolate $elt, which serializes the
	// element (including attribute encoding). Build them only when an
	// invariant is actually violated rather than on every insertion.
	if ( $elt->parent === null ) {
		Assert::invariant( false, "$elt must be in tree" );
	}
	if ( $elt->parent === 'flat' ) {
		Assert::invariant( false, "$elt must not have been previous flattened" );
	}
	array_push( $this->elements, $elt );
	$this->currentNode = $elt;
	return $elt;
}
752
/**
 * Check whether $tag is in scope on the stack, using the default
 * scope boundary set.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-scope
 */
public function inScope( $tag ) {
	$boundary = BalanceSets::$inScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
762
/**
 * Check whether $tag is in button scope on the stack.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-button-scope
 */
public function inButtonScope( $tag ) {
	$boundary = BalanceSets::inButtonScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
772
/**
 * Check whether $tag is in list item scope on the stack.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-list-item-scope
 */
public function inListItemScope( $tag ) {
	$boundary = BalanceSets::inListItemScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
782
/**
 * Check whether $tag is in table scope on the stack.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-table-scope
 */
public function inTableScope( $tag ) {
	$boundary = BalanceSets::$inTableScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
792
/**
 * Check whether $tag is in select scope on the stack. The stack is
 * walked from the current node downwards; the search continues only
 * past option and optgroup elements and fails at any other element
 * which does not match $tag.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-select-scope
 */
public function inSelectScope( $tag ) {
	// This scope is defined by the tags which do *not* end it, so
	// inSpecificScope() cannot be reused here.
	foreach ( $this as $node ) {
		if ( $node->isA( $tag ) ) {
			return true;
		}
		if ( $node->isA( BalanceSets::$inInvertedSelectScopeSet ) ) {
			continue;
		}
		return false;
	}
	return false;
}
812
/**
 * Check whether $tag is in the scope delimited by $set. The stack is
 * walked from the current node downwards; the search succeeds at the
 * first element matching $tag and fails at the first element
 * matching $set (or when the stack is exhausted).
 * @param BalanceElement|array|string $tag
 * @param BalanceElement|array|string $set
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-the-specific-scope
 */
public function inSpecificScope( $tag, $set ) {
	$found = false;
	foreach ( $this as $node ) {
		if ( $node->isA( $tag ) ) {
			$found = true;
			break;
		}
		if ( $node->isA( $set ) ) {
			break;
		}
	}
	return $found;
}
831
/**
 * Generate implied end tags: keep popping the current node for as
 * long as it belongs to the implied end tag set.
 * @param string|null $butnot Name of an HTML tag at which popping
 *   stops, or null for no such exception.
 * @param bool $thorough True if we should generate end tags thoroughly.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#generate-implied-end-tags
 */
public function generateImpliedEndTags( $butnot = null, $thorough = false ) {
	$endTagSet = BalanceSets::$impliedEndTagsSet;
	if ( $thorough ) {
		$endTagSet = BalanceSets::$thoroughImpliedEndTagsSet;
	}
	for ( ; ; ) {
		$node = $this->currentNode;
		if ( !$node ) {
			return;
		}
		if ( $butnot !== null && $node->isHtmlNamed( $butnot ) ) {
			return;
		}
		if ( !$node->isA( $endTagSet ) ) {
			return;
		}
		$this->pop();
	}
}
852
/**
 * Return the adjusted current node: the fragment context when one is
 * given and the stack holds only a single element, the current node
 * otherwise.
 * @param mixed $fragmentContext The fragment context, or a falsy
 *   value if there is none.
 * @return mixed $fragmentContext or the current node
 */
public function adjustedCurrentNode( $fragmentContext ) {
	if ( $fragmentContext && count( $this->elements ) === 1 ) {
		return $fragmentContext;
	}
	return $this->currentNode;
}
860
/**
 * Build an iterator which walks the stack from the current node
 * (visited first) down to the root node (visited last).
 * @return Iterator
 */
public function getIterator() {
	$topDown = new ReverseArrayIterator( $this->elements );
	return $topDown;
}
869
/**
 * Look up the BalanceElement stored at position $idx of the stack;
 * position 0 is the root element.
 * @param int $idx
 * @return BalanceElement
 */
public function node( $idx ) {
	$found = $this->elements[ $idx ];
	return $found;
}
879
/**
 * Put $elt at position $idx of the BalanceStack in place of the
 * element currently stored there. Neither element may already have
 * been flattened. When the top of the stack is replaced, $elt also
 * becomes the current node.
 * @param int $idx
 * @param BalanceElement $elt
 */
public function replaceAt( $idx, BalanceElement $elt ) {
	$previous = $this->elements[$idx];
	Assert::precondition(
		$previous->parent !== 'flat',
		'Replaced element should not have already been flattened.'
	);
	Assert::precondition(
		$elt->parent !== 'flat',
		'New element should not have already been flattened.'
	);
	$this->elements[$idx] = $elt;
	$top = count( $this->elements ) - 1;
	if ( $idx === $top ) {
		$this->currentNode = $elt;
	}
}
899
/**
 * Find the position in the BalanceStack of the topmost element
 * matching the given BalanceElement, set, or HTML tag name string.
 * @param BalanceElement|array|string $tag
 * @return int The position, or -1 if no element matches.
 */
public function indexOf( $tag ) {
	$i = count( $this->elements );
	while ( $i-- > 0 ) {
		if ( $this->elements[$i]->isA( $tag ) ) {
			return $i;
		}
	}
	return -1;
}
914
/**
 * Count the elements currently held in the BalanceStack.
 * @return int
 */
public function length() {
	$size = count( $this->elements );
	return $size;
}
922
/**
 * Take the current node off the BalanceStack and make the element
 * below it (or null, if none is left) the new current node. The
 * removed element is flattened, unless it is an 'mw:p-wrap' element.
 */
public function pop() {
	$popped = array_pop( $this->elements );
	$depth = count( $this->elements );
	$this->currentNode = $depth ? $this->elements[ $depth - 1 ] : null;
	if ( $popped->isHtmlNamed( 'mw:p-wrap' ) ) {
		return;
	}
	$popped->flatten( $this->tidyCompat );
}
938
/**
 * Pop nodes off the top of the BalanceStack, down to and including
 * the node at position $idx, flattening them in the process.
 * Afterwards the stack holds $idx elements (or is unchanged if it
 * already held no more than that).
 * @param int $idx
 */
public function popTo( $idx ) {
	for ( $length = count( $this->elements ); $length > $idx; $length-- ) {
		$this->pop();
	}
}
950
/**
 * Pop elements off the stack until the first element with the
 * specified HTML tagname (or matching the given set) has been
 * popped as well. If nothing matches, the stack is emptied.
 * @param BalanceElement|array|string $tag
 */
public function popTag( $tag ) {
	while ( $this->currentNode ) {
		$matched = $this->currentNode->isA( $tag );
		$this->pop();
		if ( $matched ) {
			break;
		}
	}
}
966
/**
 * Pop elements off the stack until the current node is a member of
 * the specified set; that element itself is *not* popped. The
 * bottom element of the stack is never popped.
 * @param BalanceElement|array|string $set
 */
public function clearToContext( $set ) {
	// Stop at one remaining element: the <html> elt always stays.
	$remaining = count( $this->elements );
	while ( $remaining > 1 && !$this->currentNode->isA( $set ) ) {
		$this->pop();
		$remaining--;
	}
}
981
/**
 * Remove the given $elt from the BalanceStack, optionally
 * flattening it in the process.
 *
 * If $elt was the current node, the element below it becomes the
 * current node (or null when the stack is left empty).
 *
 * @param BalanceElement $elt The element to remove.
 * @param bool $flatten Whether to flatten the removed element.
 */
public function removeElement( BalanceElement $elt, $flatten = true ) {
	Assert::parameter(
		$elt->parent !== 'flat',
		'$elt',
		'$elt should not already have been flattened.'
	);
	Assert::parameter(
		$elt->parent->parent !== 'flat',
		'$elt',
		'The parent of $elt should not already have been flattened.'
	);
	$idx = array_search( $elt, $this->elements, true );
	Assert::parameter( $idx !== false, '$elt', 'must be in stack' );
	array_splice( $this->elements, $idx, 1 );
	if ( $idx === count( $this->elements ) ) {
		// $elt was the last element, i.e. the current node. Mirror pop():
		// fall back to the new last element, or null if none is left,
		// rather than reading the undefined offset -1.
		$this->currentNode = $idx > 0 ? $this->elements[$idx - 1] : null;
	}
	if ( $flatten ) {
		// serialize $elt into its parent
		// otherwise, it will eventually serialize when the parent
		// is serialized, we just hold onto the memory for its
		// tree of objects a little longer.
		$elt->flatten( $this->tidyCompat );
	}
	Assert::postcondition(
		array_search( $elt, $this->elements, true ) === false,
		'$elt should no longer be in open elements stack'
	);
}
1017
/**
 * Find $a in the BalanceStack and insert $b after it.
 *
 * If $a is the last element of the stack, $b is pushed and becomes
 * the current node; otherwise $b is spliced in directly after $a and
 * the current node is left unchanged.
 *
 * @param BalanceElement $a Element already on the stack.
 * @param BalanceElement $b Element to insert.
 */
public function insertAfter( BalanceElement $a, BalanceElement $b ) {
	$idx = $this->indexOf( $a );
	// indexOf() reports "not found" as -1 (never false), so test the
	// sign; otherwise a missing $a would silently insert $b at index 0.
	Assert::parameter( $idx >= 0, '$a', 'must be in stack' );
	if ( $idx === count( $this->elements ) - 1 ) {
		array_push( $this->elements, $b );
		$this->currentNode = $b;
	} else {
		array_splice( $this->elements, $idx + 1, 0, [ $b ] );
	}
}
1033
1034 # Fostering and adoption.
1035
/**
 * Foster parent the given $elt in the stack of open elements.
 *
 * The foster parent is chosen from the last `template` or `table`
 * element on the stack (whichever is closer to the current node),
 * falling back to the `html` element at the bottom of the stack.
 * In tidy-compatibility mode, fostered text may be routed into a new
 * `mw:p-wrap` element, and a fostered `mw:p-wrap` element may be
 * dropped in favour of an existing adjacent one.
 *
 * @param BalanceElement|string $elt
 * @return BalanceElement|string $elt itself, or the already-present
 *   `mw:p-wrap` element that is re-used in its place.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
 */
private function fosterParent( $elt ) {
	$tableIdx = $this->indexOf( 'table' );
	$templateIdx = $this->indexOf( 'template' );
	// When set, $elt is inserted immediately before this node.
	$before = null;

	if ( $templateIdx >= 0 && ( $tableIdx < 0 || $templateIdx > $tableIdx ) ) {
		$parent = $this->elements[$templateIdx];
	} elseif ( $tableIdx >= 0 ) {
		$before = $this->elements[$tableIdx];
		$parent = $before->parent;
		# Assume all tables have parents, since we're not running scripts!
		Assert::invariant(
			$parent !== null, "All tables should have parents"
		);
	} else {
		// the `html` element.
		$parent = $this->elements[0];
	}

	if ( $this->tidyCompat && is_string( $elt ) ) {
		// We're fostering text: do we need a p-wrapper?
		if ( $parent->isA( BalanceSets::$tidyPWrapSet ) ) {
			$this->insertHTMLElement( 'mw:p-wrap', [] );
			$this->insertText( $elt );
			return $elt;
		}
	} elseif ( $this->tidyCompat && $elt->isHtmlNamed( 'mw:p-wrap' ) ) {
		// We're fostering a p-wrapper: look at the child that would
		// end up directly in front of it.
		if ( $before ) {
			$pos = array_search( $before, $parent->children, true );
		} else {
			$pos = count( $parent->children );
		}
		$prevSibling = $pos > 0 ? $parent->children[$pos - 1] : '';
		if (
			$prevSibling instanceof BalanceElement &&
			$prevSibling->isHtmlNamed( 'mw:p-wrap' )
		) {
			// Re-use existing p-wrapper.
			return $prevSibling;
		}
	}

	if ( $before ) {
		$parent->insertBefore( $before, $elt );
	} else {
		$parent->appendChild( $elt );
	}
	return $elt;
}
1092
1093 /**
1094 * Run the "adoption agency algorithm" (AAA) for the given subject
1095 * tag name.
*
* Operates on this stack of open elements and on the active
* formatting elements list $afe, re-parenting mis-nested formatting
* elements. The outer loop runs at most eight times.
*
1096 * @param string $tag The subject tag name.
1097 * @param BalanceActiveFormattingElements $afe The current
1098 * active formatting elements list.
1099 * @return bool True if the adoption agency algorithm "did something", false
1100 * if more processing is required by the caller.
1101 * @see https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
1102 */
1103 public function adoptionAgency( $tag, $afe ) {
1104 // If the current node is an HTML element whose tag name is subject,
1105 // and the current node is not in the list of active formatting
1106 // elements, then pop the current node off the stack of open
1107 // elements and abort these steps.
1108 if (
1109 $this->currentNode->isHtmlNamed( $tag ) &&
1110 !$afe->isInList( $this->currentNode )
1111 ) {
1112 $this->pop();
1113 return true; // no more handling required
1114 }
1115
1116 // Let outer loop counter be zero.
1117 $outer = 0;
1118
1119 // Outer loop: If outer loop counter is greater than or
1120 // equal to eight, then abort these steps.
1121 while ( $outer < 8 ) {
1122 // Increment outer loop counter by one.
1123 $outer++;
1124
1125 // Let the formatting element be the last element in the list
1126 // of active formatting elements that: is between the end of
1127 // the list and the last scope marker in the list, if any, or
1128 // the start of the list otherwise, and has the same tag name
1129 // as the token.
1130 $fmtelt = $afe->findElementByTag( $tag );
1131
1132 // If there is no such node, then abort these steps and instead
1133 // act as described in the "any other end tag" entry below.
1134 if ( !$fmtelt ) {
1135 return false; // false means handle by the default case
1136 }
1137
1138 // Otherwise, if there is such a node, but that node is not in
1139 // the stack of open elements, then this is a parse error;
1140 // remove the element from the list, and abort these steps.
1141 $index = $this->indexOf( $fmtelt );
1142 if ( $index < 0 ) {
1143 $afe->remove( $fmtelt );
1144 return true; // true means no more handling required
1145 }
1146
1147 // Otherwise, if there is such a node, and that node is also in
1148 // the stack of open elements, but the element is not in scope,
1149 // then this is a parse error; ignore the token, and abort
1150 // these steps.
1151 if ( !$this->inScope( $fmtelt ) ) {
1152 return true;
1153 }
1154
1155 // Let the furthest block be the topmost node in the stack of
1156 // open elements that is lower in the stack than the formatting
1157 // element, and is an element in the special category. There
1158 // might not be one.
1159 $furthestblock = null;
1160 $furthestblockindex = -1;
1161 $stacklen = $this->length();
1162 for ( $i = $index+1; $i < $stacklen; $i++ ) {
1163 if ( $this->node( $i )->isA( BalanceSets::$specialSet ) ) {
1164 $furthestblock = $this->node( $i );
1165 $furthestblockindex = $i;
1166 break;
1167 }
1168 }
1169
1170 // If there is no furthest block, then the UA must skip the
1171 // subsequent steps and instead just pop all the nodes from the
1172 // bottom of the stack of open elements, from the current node
1173 // up to and including the formatting element, and remove the
1174 // formatting element from the list of active formatting
1175 // elements.
1176 if ( !$furthestblock ) {
1177 $this->popTag( $fmtelt );
1178 $afe->remove( $fmtelt );
1179 return true;
1180 } else {
1181 // Let the common ancestor be the element immediately above
1182 // the formatting element in the stack of open elements.
1183 $ancestor = $this->node( $index-1 );
1184
1185 // Let a bookmark note the position of the formatting
1186 // element in the list of active formatting elements
1187 // relative to the elements on either side of it in the
1188 // list.
1189 $BOOKMARK = new BalanceElement( '[bookmark]', '[bookmark]', [] );
1190 $afe->insertAfter( $fmtelt, $BOOKMARK );
1191
1192 // Let node and last node be the furthest block.
1193 $node = $furthestblock;
1194 $lastnode = $furthestblock;
// $nodeindex tracks the stack position of $node, so that the
// element above it can still be located after $node itself has
// been removed from the stack inside the inner loop.
1195 $nodeindex = $furthestblockindex;
1196 $isAFE = false;
1197
1198 // Let inner loop counter be zero.
1199 $inner = 0;
1200
1201 while ( true ) {
1202
1203 // Increment inner loop counter by one.
1204 $inner++;
1205
1206 // Let node be the element immediately above node in
1207 // the stack of open elements, or if node is no longer
1208 // in the stack of open elements (e.g. because it got
1209 // removed by this algorithm), the element that was
1210 // immediately above node in the stack of open elements
1211 // before node was removed.
1212 $node = $this->node( --$nodeindex );
1213
1214 // If node is the formatting element, then go
1215 // to the next step in the overall algorithm.
1216 if ( $node === $fmtelt ) break;
1217
1218 // If the inner loop counter is greater than three and node
1219 // is in the list of active formatting elements, then remove
1220 // node from the list of active formatting elements.
1221 $isAFE = $afe->isInList( $node );
1222 if ( $inner > 3 && $isAFE ) {
1223 $afe->remove( $node );
1224 $isAFE = false;
1225 }
1226
1227 // If node is not in the list of active formatting
1228 // elements, then remove node from the stack of open
1229 // elements and then go back to the step labeled inner
1230 // loop.
1231 if ( !$isAFE ) {
1232 // Don't flatten here, since we're about to relocate
1233 // parts of this $node.
1234 $this->removeElement( $node, false );
1235 continue;
1236 }
1237
1238 // Create an element for the token for which the
1239 // element node was created with common ancestor as
1240 // the intended parent, replace the entry for node
1241 // in the list of active formatting elements with an
1242 // entry for the new element, replace the entry for
1243 // node in the stack of open elements with an entry for
1244 // the new element, and let node be the new element.
1245 $newelt = new BalanceElement(
1246 $node->namespaceURI, $node->localName, $node->attribs );
1247 $afe->replace( $node, $newelt );
1248 $this->replaceAt( $nodeindex, $newelt );
1249 $node = $newelt;
1250
1251 // If last node is the furthest block, then move the
1252 // aforementioned bookmark to be immediately after the
1253 // new node in the list of active formatting elements.
1254 if ( $lastnode === $furthestblock ) {
1255 $afe->remove( $BOOKMARK );
1256 $afe->insertAfter( $newelt, $BOOKMARK );
1257 }
1258
1259 // Insert last node into node, first removing it from
1260 // its previous parent node if any.
1261 $node->appendChild( $lastnode );
1262
1263 // Let last node be node.
1264 $lastnode = $node;
1265 }
1266
1267 // If the common ancestor node is a table, tbody, tfoot,
1268 // thead, or tr element, then, foster parent whatever last
1269 // node ended up being in the previous step, first removing
1270 // it from its previous parent node if any.
1271 if (
1272 $this->fosterParentMode &&
1273 $ancestor->isA( BalanceSets::$tableSectionRowSet )
1274 ) {
1275 $this->fosterParent( $lastnode );
1276 } else {
1277 // Otherwise, append whatever last node ended up being in
1278 // the previous step to the common ancestor node, first
1279 // removing it from its previous parent node if any.
1280 $ancestor->appendChild( $lastnode );
1281 }
1282
1283 // Create an element for the token for which the
1284 // formatting element was created, with furthest block
1285 // as the intended parent.
1286 $newelt2 = new BalanceElement(
1287 $fmtelt->namespaceURI, $fmtelt->localName, $fmtelt->attribs );
1288
1289 // Take all of the child nodes of the furthest block and
1290 // append them to the element created in the last step.
1291 $newelt2->adoptChildren( $furthestblock );
1292
1293 // Append that new element to the furthest block.
1294 $furthestblock->appendChild( $newelt2 );
1295
1296 // Remove the formatting element from the list of active
1297 // formatting elements, and insert the new element into the
1298 // list of active formatting elements at the position of
1299 // the aforementioned bookmark.
1300 $afe->remove( $fmtelt );
1301 $afe->replace( $BOOKMARK, $newelt2 );
1302
1303 // Remove the formatting element from the stack of open
1304 // elements, and insert the new element into the stack of
1305 // open elements immediately below the position of the
1306 // furthest block in that stack.
1307 $this->removeElement( $fmtelt );
1308 $this->insertAfter( $furthestblock, $newelt2 );
1309 }
1310 }
1311
// Reached only when the outer loop has run eight times without
// returning.
1312 return true;
1313 }
1314
/**
 * Return the contents of the open elements stack as a string for
 * debugging: the local name of every element, in stack order
 * (index 0 first), separated by single spaces.
 * @return string
 */
public function __toString() {
	$names = [];
	foreach ( $this->elements as $elt ) {
		$names[] = $elt->localName;
	}
	// Separator first: the legacy implode( array, separator ) order was
	// deprecated in PHP 7.4 and is a TypeError since PHP 8.0.
	return implode( ' ', $names );
}
1327 }
1328
/**
 * A pseudo-element used as a marker in the list of active formatting elements
 *
 * It carries only the two link fields used to chain entries of that
 * list together.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceMarker {
	/** Link to the following entry in the list, or null. */
	public $nextAFE = null;

	/** Link to the preceding entry in the list, or null. */
	public $prevAFE = null;
}
1339
/**
 * The list of active formatting elements, which is used to handle
 * mis-nested formatting element tags in the HTML5 tree builder
 * specification.
 *
 * The list is a doubly-linked list threaded through the `prevAFE` and
 * `nextAFE` properties of its entries (BalanceElement objects and
 * BalanceMarker objects). An entry is considered to be in the list when
 * it is the head or has a non-null `prevAFE`.
 *
 * @ingroup Parser
 * @since 1.27
 * @see https://html.spec.whatwg.org/multipage/syntax.html#list-of-active-formatting-elements
 */
class BalanceActiveFormattingElements {
	/** The last (most recent) element in the list */
	private $tail;

	/** The first (least recent) element in the list */
	private $head;

	/**
	 * An array of arrays representing the population of elements in each bucket
	 * according to the Noah's Ark clause. The outer array is stack-like, with each
	 * integer-indexed element representing a segment of the list, bounded by
	 * markers. The first element represents the segment of the list before the
	 * first marker.
	 *
	 * The inner arrays are indexed by "Noah key", which is a string which uniquely
	 * identifies each bucket according to the rules in the spec. The value in
	 * the inner array is the first (least recently inserted) element in the bucket,
	 * and subsequent members of the bucket can be found by iterating through the
	 * singly-linked list via $node->nextNoah.
	 *
	 * This is optimised for the most common case of inserting into a bucket
	 * with zero members, and deleting a bucket containing one member. In the
	 * worst case, iteration through the list is still O(1) in the document
	 * size, since each bucket can have at most 3 members.
	 */
	private $noahTableStack = [ [] ];

	/**
	 * Break all links between list entries so that the entries do not keep
	 * each other alive through reference cycles.
	 */
	public function __destruct() {
		for ( $node = $this->head; $node; $node = $next ) {
			$next = $node->nextAFE;
			$node->prevAFE = $node->nextAFE = null;
			// Markers declare no nextNoah property; assigning one would
			// create a dynamic property (deprecated since PHP 8.2).
			if ( !( $node instanceof BalanceMarker ) ) {
				$node->nextNoah = null;
			}
		}
		$this->head = $this->tail = $this->noahTableStack = null;
	}

	/**
	 * Append a marker to the end of the list and open a fresh, empty
	 * Noah table for the list segment that follows it.
	 */
	public function insertMarker() {
		$elt = new BalanceMarker;
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
		$this->noahTableStack[] = [];
	}

	/**
	 * Follow the steps required when the spec requires us to "push onto the
	 * list of active formatting elements".
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException If $elt is already in the list.
	 */
	public function push( BalanceElement $elt ) {
		// Must not be in the list already
		if ( $elt->prevAFE !== null || $this->head === $elt ) {
			throw new ParameterAssertionException( '$elt',
				'Cannot insert a node into the AFE list twice' );
		}

		// "Noah's Ark clause" -- if there are already three elements with
		// the same Noah key after the last marker, then drop the earliest
		// (least recently inserted) one before adding the new one.
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			$count = 1;
			$head = $tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
				$count++;
			}
			if ( $count >= 3 ) {
				$this->remove( $head );
			}
			$tail->nextNoah = $elt;
		}
		// Add to the main AFE list
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
	}

	/**
	 * Follow the steps required when the spec asks us to "clear the list of
	 * active formatting elements up to the last marker".
	 */
	public function clearToMarker() {
		// Iterate back through the list starting from the tail
		$tail = $this->tail;
		while ( $tail && !( $tail instanceof BalanceMarker ) ) {
			// Unlink the element
			$prev = $tail->prevAFE;
			$tail->prevAFE = null;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail->nextNoah = null;
			$tail = $prev;
		}
		// If we finished on a marker, unlink it and pop it off the Noah table stack
		if ( $tail ) {
			$prev = $tail->prevAFE;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail = $prev;
			array_pop( $this->noahTableStack );
		} else {
			// No marker: wipe the top-level Noah table (which is the only one)
			$this->noahTableStack[0] = [];
		}
		// If we removed all the elements, clear the head pointer
		if ( !$tail ) {
			$this->head = null;
		}
		$this->tail = $tail;
	}

	/**
	 * Find and return the last element with the specified tag between the
	 * end of the list and the last marker on the list.
	 * Used when parsing <a> "in body mode".
	 * @param string $tag Local name to look for.
	 * @return BalanceElement|null The matching element, or null if none.
	 */
	public function findElementByTag( $tag ) {
		$elt = $this->tail;
		while ( $elt && !( $elt instanceof BalanceMarker ) ) {
			if ( $elt->localName === $tag ) {
				return $elt;
			}
			$elt = $elt->prevAFE;
		}
		return null;
	}

	/**
	 * Determine whether an element is in the list of formatting elements.
	 * @param BalanceElement $elt
	 * @return bool
	 */
	public function isInList( BalanceElement $elt ) {
		return $this->head === $elt || $elt->prevAFE;
	}

	/**
	 * Find the element $elt in the list and remove it.
	 * Used when parsing <a> in body mode.
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException If $elt is not in the list.
	 */
	public function remove( BalanceElement $elt ) {
		if ( $this->head !== $elt && !$elt->prevAFE ) {
			throw new ParameterAssertionException( '$elt',
				"Attempted to remove an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $elt ) {
			$this->head = $elt->nextAFE;
		}
		if ( $this->tail === $elt ) {
			$this->tail = $elt->prevAFE;
		}
		// Update previous element
		if ( $elt->prevAFE ) {
			$elt->prevAFE->nextAFE = $elt->nextAFE;
		}
		// Update next element
		if ( $elt->nextAFE ) {
			$elt->nextAFE->prevAFE = $elt->prevAFE;
		}
		// Clear pointers so that isInList() etc. will work
		$elt->prevAFE = $elt->nextAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $elt );
	}

	/**
	 * Append $elt to the end of its bucket in the Noah table of the
	 * current (last) list segment.
	 * @param BalanceElement $elt
	 */
	private function addToNoahList( BalanceElement $elt ) {
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			$tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
			}
			$tail->nextNoah = $elt;
		}
	}

	/**
	 * Unlink $elt from its bucket in the Noah table of the current (last)
	 * list segment. Only that table is consulted; if it has no bucket for
	 * $elt's Noah key, nothing is changed.
	 * @param BalanceElement $elt
	 */
	private function removeFromNoahList( BalanceElement $elt ) {
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		$key = $elt->getNoahKey();
		if ( !isset( $table[$key] ) ) {
			// No bucket for this key in the current segment (for example
			// when $elt was inserted before the last marker). Without this
			// guard the code below would read an undefined index and then
			// a property of null.
			return;
		}
		$noahElt = $table[$key];
		if ( $noahElt === $elt ) {
			if ( $noahElt->nextNoah ) {
				$table[$key] = $noahElt->nextNoah;
				$noahElt->nextNoah = null;
			} else {
				unset( $table[$key] );
			}
		} else {
			do {
				$prevNoahElt = $noahElt;
				$noahElt = $prevNoahElt->nextNoah;
				if ( $noahElt === $elt ) {
					// Found it, unlink
					$prevNoahElt->nextNoah = $elt->nextNoah;
					$elt->nextNoah = null;
					break;
				}
			} while ( $noahElt );
		}
	}

	/**
	 * Find element $a in the list and replace it with element $b
	 * @param BalanceElement $a Element currently in the list.
	 * @param BalanceElement $b Element that takes its place.
	 * @throws ParameterAssertionException If $a is not in the list.
	 */
	public function replace( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to replace an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $a ) {
			$this->head = $b;
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		// Update previous element
		if ( $a->prevAFE ) {
			$a->prevAFE->nextAFE = $b;
		}
		// Update next element
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->prevAFE = $a->prevAFE;
		$b->nextAFE = $a->nextAFE;
		$a->nextAFE = $a->prevAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $a );
		$this->addToNoahList( $b );
	}

	/**
	 * Find $a in the list and insert $b after it.
	 * @param BalanceElement $a Element currently in the list.
	 * @param BalanceElement $b Element to insert.
	 * @throws ParameterAssertionException If $a is not in the list.
	 */
	public function insertAfter( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to insert after an element which is not in the AFE list" );
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->nextAFE = $a->nextAFE;
		$b->prevAFE = $a;
		$a->nextAFE = $b;
		$this->addToNoahList( $b );
	}

	// @codingStandardsIgnoreStart Generic.Files.LineLength.TooLong
	/**
	 * Reconstruct the active formatting elements.
	 * @param BalanceStack $stack The open elements stack
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#reconstruct-the-active-formatting-elements
	 */
	// @codingStandardsIgnoreEnd
	public function reconstruct( $stack ) {
		$entry = $this->tail;
		// If there are no entries in the list of active formatting elements,
		// then there is nothing to reconstruct
		if ( !$entry ) {
			return;
		}
		// If the last is a marker, do nothing.
		if ( $entry instanceof BalanceMarker ) {
			return;
		}
		// Or if it is an open element, do nothing.
		if ( $stack->indexOf( $entry ) >= 0 ) {
			return;
		}

		// Loop backward through the list until we find a marker or an
		// open element
		$foundit = false;
		while ( $entry->prevAFE ) {
			$entry = $entry->prevAFE;
			if ( $entry instanceof BalanceMarker || $stack->indexOf( $entry ) >= 0 ) {
				$foundit = true;
				break;
			}
		}

		// Now loop forward, starting from the element after the current one (or
		// the first element if we didn't find a marker or open element),
		// recreating formatting elements and pushing them back onto the list
		// of open elements.
		if ( $foundit ) {
			$entry = $entry->nextAFE;
		}
		do {
			$newElement = $stack->insertHTMLElement(
				$entry->localName,
				$entry->attribs );
			$this->replace( $entry, $newElement );
			$entry = $newElement->nextAFE;
		} while ( $entry );
	}

	/**
	 * Get a string representation of the AFE list, for debugging
	 * @return string One line per list entry, head first.
	 */
	public function __toString() {
		$prev = null;
		$s = '';
		for ( $node = $this->head; $node; $prev = $node, $node = $node->nextAFE ) {
			if ( $node instanceof BalanceMarker ) {
				$s .= "MARKER\n";
				continue;
			}
			$s .= $node->localName . '#' . substr( md5( spl_object_hash( $node ) ), 0, 8 );
			if ( $node->nextNoah ) {
				$s .= " (noah sibling: {$node->nextNoah->localName}#" .
					substr( md5( spl_object_hash( $node->nextNoah ) ), 0, 8 ) .
					')';
			}
			if ( $node->nextAFE && $node->nextAFE->prevAFE !== $node ) {
				$s .= " (reverse link is wrong!)";
			}
			$s .= "\n";
		}
		if ( $prev !== $this->tail ) {
			$s .= "(tail pointer is wrong!)\n";
		}
		return $s;
	}
}
1695
1696 /**
1697 * An implementation of the tree building portion of the HTML5 parsing
1698 * spec.
1699 *
1700 * This is used to balance and tidy output so that the result can
1701 * always be cleanly serialized/deserialized by an HTML5 parser. It
1702 * does *not* guarantee "conforming" output -- the HTML5 spec contains
1703 * a number of constraints which are not enforced by the HTML5 parsing
1704 * process. But the result will be free of gross errors: misnested or
1705 * unclosed tags, for example, and will be unchanged by spec-complient
1706 * parsing followed by serialization.
1707 *
1708 * The tree building stage is structured as a state machine.
1709 * When comparing the implementation to
1710 * https://www.w3.org/TR/html5/syntax.html#tree-construction
1711 * note that each state is implemented as a function with a
1712 * name ending in `Mode` (because the HTML spec refers to them
1713 * as insertion modes). The current insertion mode is held by
1714 * the $parseMode property.
1715 *
1716 * The following simplifications have been made:
1717 * - We handle body content only (ie, we start `in body`.)
1718 * - The document is never in "quirks mode".
1719 * - All occurrences of < and > have been entity escaped, so we
1720 * can parse tags by simply splitting on those two characters.
1721 * Similarly, all attributes have been "cleaned" and are double-quoted
1722 * and escaped.
1723 * - All comments and null characters are assumed to have been removed.
1724 * - We don't alter linefeeds after <pre>/<listing>.
1725 * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
1726 * <form>, <frame>, <plaintext>, <isindex>, <textarea>, <xmp>, <iframe>,
1727 * <noembed>, <noscript>, <script>, <title>. As a result,
1728 * further simplifications can be made:
1729 * - `frameset-ok` is not tracked.
1730 * - `form element pointer` is not tracked.
1731 * - `head element pointer` is not tracked (but presumed non-null)
1732 * - Tokenizer has only a single mode.
1733 *
1734 * We generally mark places where we omit cases from the spec due to
1735 * disallowed elements with a comment: `# OMITTED: <element-name>`.
1736 *
1737 * The HTML spec keeps a flag during the parsing process to track
1738 * whether or not a "parse error" has been encountered. We don't
1739 * bother to track that flag, we just implement the error-handling
1740 * process as specified.
1741 *
1742 * @ingroup Parser
1743 * @since 1.27
1744 * @see https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
1745 */
1746 class Balancer {
1747 private $parseMode;
1748 private $bitsIterator;
1749 private $allowedHtmlElements;
1750 private $afe;
1751 private $stack;
1752 private $strict;
1753 private $tidyCompat;
1754
1755 private $textIntegrationMode = false;
1756 private $pendingTableText;
1757 private $originalInsertionMode;
1758 private $fragmentContext;
1759
/**
 * Create a new Balancer.
 * @param array $config Balancer configuration.  Includes:
 *     'strict' : boolean, defaults to false.
 *         When true, enforces syntactic constraints on input:
 *         all non-tag '<' must be escaped, all attributes must be
 *         separated by a single space and double-quoted.  This is
 *         consistent with the output of the Sanitizer.
 *     'allowedHtmlElements' : array, defaults to null.
 *         When present, the keys of this associative array give
 *         the acceptable HTML tag names.  When not present, no
 *         tag sanitization is done.
 *     'tidyCompat' : boolean, defaults to false.
 *         When true, the serialization algorithm is tweaked to
 *         provide historical compatibility with the old "tidy"
 *         program: <p>-wrapping is done to the children of
 *         <body> and <blockquote> elements, and empty elements
 *         are removed.
 * @throws ParameterAssertionException If 'allowedHtmlElements' names
 *     an element listed in BalanceSets::$unsupportedSet.
 */
public function __construct( array $config = [] ) {
	$config = $config + [
		'strict' => false,
		'allowedHtmlElements' => null,
		'tidyCompat' => false,
	];
	$this->allowedHtmlElements = $config['allowedHtmlElements'];
	$this->strict = $config['strict'];
	$this->tidyCompat = $config['tidyCompat'];
	if ( $this->allowedHtmlElements !== null ) {
		# Sanity check!
		// Only the keys (tag names) matter here, so intersect by key.
		$bad = array_intersect_key(
			$this->allowedHtmlElements,
			BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE]
		);
		if ( count( $bad ) > 0 ) {
			// Separator first: the legacy implode( array, separator )
			// order is a TypeError since PHP 8.0.
			$badstr = implode( ',', array_keys( $bad ) );
			throw new ParameterAssertionException(
				'$config',
				'Balance attempted with sanitization including ' .
				"unsupported elements: {$badstr}"
			);
		}
	}
}
1809
/**
 * Balance the HTML fragment given by $text and return the resulting
 * markup, subject to the caveats listed in the class description.
 * The result will typically be idempotent -- that is, rebalancing
 * the output would result in no change.
 *
 * All per-run parser state is created at the start of the call and
 * released again before returning.
 *
 * @param string $text The markup to be balanced
 * @param callable|null $processingCallback Callback to do any variable or
 *   parameter replacements in HTML attributes values
 * @param array|bool $processingArgs Arguments for the processing callback
 * @return string The balanced markup
 */
public function balance( $text, $processingCallback = null, $processingArgs = [] ) {
	$this->parseMode = 'inBodyMode';
	$this->bitsIterator = new ExplodeIterator( '<', $text );
	$this->afe = new BalanceActiveFormattingElements();
	$this->stack = new BalanceStack();
	$this->stack->tidyCompat = $this->tidyCompat;
	$this->processingCallback = $processingCallback;
	$this->processingArgs = $processingArgs;

	# The stack is constructed with an <html> element already on it.
	# Set this up as a fragment parsed with <body> as the context.
	$bodyContext = new BalanceElement( BalanceSets::HTML_NAMESPACE, 'body', [] );
	$this->fragmentContext = $bodyContext;
	$this->resetInsertionMode();

	// Whatever precedes the first '<' is plain text, not a tag.
	$leadingText = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$this->insertToken( 'text', str_replace( '>', '&gt;', $leadingText ) );

	// Every remaining piece starts with a tag.
	while ( $this->bitsIterator->valid() ) {
		$this->advance();
	}
	$this->insertToken( 'eof', null );
	$balanced = $this->stack->getOutput();

	// Free memory before returning.
	$this->bitsIterator = null;
	$this->afe = null;
	$this->stack = null;
	$this->fragmentContext = null;

	return $balanced;
}
1854
/**
 * Pass a token to the tree builder.  The $token will be one of the
 * strings "tag", "endtag", "text", or "eof".
 *
 * Tags named in BalanceSets::$unsupportedSet are rejected (false is
 * returned, after asserting that strict mode is off) and empty text
 * is accepted without further processing (true is returned). Every
 * other token is dispatched either to insertForeignToken() or to the
 * method named by the current parse mode, whose result is returned.
 */
private function insertToken( $token, $value, $attribs = null, $selfclose = false ) {
	$isTag = ( $token === 'tag' );

	// validate tags against $unsupportedSet
	if ( $isTag || $token === 'endtag' ) {
		if ( isset( BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE][$value] ) ) {
			# As described in "simplifications" above, these tags are
			# not supported in the balancer.
			Assert::invariant(
				!$this->strict,
				"Unsupported $token <$value> found."
			);
			return false;
		}
	} elseif ( $token === 'text' && $value === '' ) {
		# Don't actually inject the empty string as a text token.
		return true;
	}

	// Some hoops we have to jump through
	$adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );

	// Decide whether the token is handled by the current insertion mode
	// or by the foreign-content rules. Only the first matching branch
	// is consulted.
	if (
		$this->stack->length() === 0 ||
		$adjusted->isHtml() ||
		$token === 'eof'
	) {
		$useInsertionMode = true;
	} elseif ( $adjusted->isMathmlTextIntegrationPoint() ) {
		$useInsertionMode = $token === 'text' ||
			( $isTag && $value !== 'mglyph' && $value !== 'malignmark' );
	} elseif (
		$adjusted->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
		$adjusted->localName === 'annotation-xml' &&
		$isTag && $value === 'svg'
	) {
		$useInsertionMode = true;
	} else {
		$useInsertionMode = $adjusted->isHtmlIntegrationPoint() &&
			( $isTag || $token === 'text' );
	}

	if ( !$useInsertionMode ) {
		return $this->insertForeignToken( $token, $value, $attribs, $selfclose );
	}
	// The parse mode holds the name of the method implementing it.
	$mode = $this->parseMode;
	return $this->$mode( $token, $value, $attribs, $selfclose );
}
1913
/**
 * Handle a token using the HTML5 "rules for parsing tokens in foreign
 * content" (the caller selects this path when the adjusted current
 * node is not an HTML element or a matching integration point).
 *
 * - Text is inserted unchanged.
 * - A start tag from the fixed list of HTML element names below (or a
 *   <font> tag carrying a color, face or size attribute) pops the
 *   stack until the current node is an HTML element or an integration
 *   point, and is then reprocessed through insertToken().  When a
 *   fragment context is set this "break out" is skipped and the tag is
 *   treated like any other start tag.
 * - Any other start tag is inserted as a foreign element in the
 *   namespace of the (adjusted) current node, and popped again
 *   immediately if it was self-closing.
 * - An end tag walks the stack from the current node downwards: a
 *   matching node is popped; reaching an HTML element (other than the
 *   current node) hands the token to the current parse mode; reaching
 *   stack index 0 ignores the token.
 *
 * @param string $token Token type
 * @param string|null $value Tag name, or the text for a text token
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfclose Whether the tag was self-closing
 * @return bool|null Result of the handling; null when no branch
 *   returns explicitly (e.g. a token type other than text/tag/endtag)
 */
private function insertForeignToken( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			case 'font':
				# Per the HTML spec, <font> only breaks out of foreign
				# content when it has a color, face or size attribute;
				# a bare <font> is an ordinary foreign element.
				if ( !isset( $attribs['color'] )
					&& !isset( $attribs['face'] )
					&& !isset( $attribs['size'] )
				) {
					break;
				}
				/* otherwise, fall through */
			case 'b':
			case 'big':
			case 'blockquote':
			case 'body':
			case 'br':
			case 'center':
			case 'code':
			case 'dd':
			case 'div':
			case 'dl':
			case 'dt':
			case 'em':
			case 'embed':
			case 'h1':
			case 'h2':
			case 'h3':
			case 'h4':
			case 'h5':
			case 'h6':
			case 'head':
			case 'hr':
			case 'i':
			case 'img':
			case 'li':
			case 'listing':
			case 'menu':
			case 'meta':
			case 'nobr':
			case 'ol':
			case 'p':
			case 'pre':
			case 'ruby':
			case 's':
			case 'small':
			case 'span':
			case 'strong':
			case 'strike':
			case 'sub':
			case 'sup':
			case 'table':
			case 'tt':
			case 'u':
			case 'ul':
			case 'var':
				if ( $this->fragmentContext ) {
					# Fragment parsing: treat as "any other start tag".
					break;
				}
				# Pop foreign elements until we are back in HTML (or at
				# an integration point), then reprocess the token.
				while ( true ) {
					$this->stack->pop();
					$node = $this->stack->currentNode;
					if (
						$node->isMathmlTextIntegrationPoint() ||
						$node->isHtmlIntegrationPoint() ||
						$node->isHtml()
					) {
						break;
					}
				}
				return $this->insertToken( $token, $value, $attribs, $selfclose );
		}
		// "Any other start tag"
		$adjusted = ( $this->fragmentContext && $this->stack->length() === 1 ) ?
			$this->fragmentContext : $this->stack->currentNode;
		$this->stack->insertForeignElement(
			$adjusted->namespaceURI, $value, $attribs
		);
		if ( $selfclose ) {
			$this->stack->pop();
		}
		return true;
	} elseif ( $token === 'endtag' ) {
		$first = true;
		foreach ( $this->stack as $i => $node ) {
			if ( $node->isHtml() && !$first ) {
				// process the end tag as HTML
				$func = $this->parseMode;
				return $this->$func( $token, $value, $attribs, $selfclose );
			} elseif ( $i === 0 ) {
				// Reached the bottom of the stack: ignore the token.
				return true;
			} elseif ( $node->localName === $value ) {
				$this->stack->popTag( $node );
				return true;
			}
			$first = false;
		}
	}
}
2015
/**
 * Grab the next "token" from $bitsIterator.  This is either a open/close
 * tag or text, depending on whether the Sanitizer approves.
 *
 * Each item from the iterator is the text that followed a '<' in the
 * input.  If it parses as an element (and, when an allowed-element list
 * is configured, passes the Sanitizer checks) it is inserted as a tag
 * token followed by a text token for the trailing text.  Otherwise the
 * whole item is emitted as escaped text, with the '<' restored as
 * '&lt;'.  Literal '>' characters in text are always escaped as '&gt;'.
 */
private function advance() {
	$x = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$regs = [];
	# $slash: Does the current element start with a '/'?
	# $t: Current element name
	# $attribStr: String between element name and >
	# $brace: Ending '>' or '/>'
	# $rest: Everything until the next element from the $bitsIterator
	if ( preg_match( Sanitizer::ELEMENT_BITS_REGEX, $x, $regs ) ) {
		list( /* $qbar */, $slash, $t, $attribStr, $brace, $rest ) = $regs;
		$t = strtolower( $t );
		if ( $this->strict ) {
			/* Verify that attributes are all properly double-quoted */
			Assert::invariant(
				preg_match(
					'/^( [:_A-Z0-9][-.:_A-Z0-9]*="[^"]*")*[ ]*$/i', $attribStr
				),
				"Bad attribute string found"
			);
		}
	} else {
		Assert::invariant(
			!$this->strict, "< found which does not start a valid tag"
		);
		$slash = $t = $attribStr = $brace = $rest = null;
	}
	$goodtag = $t;
	# Sanitizer checks are only applied when an allowed-element list
	# has been configured.
	$sanitize = $this->allowedHtmlElements !== null;
	if ( $sanitize ) {
		$goodtag = $t && isset( $this->allowedHtmlElements[$t] );
	}
	if ( $goodtag ) {
		if ( is_callable( $this->processingCallback ) ) {
			# $attribStr is passed by reference so the callback can
			# rewrite the attribute string in place.
			call_user_func_array( $this->processingCallback, [ &$attribStr, $this->processingArgs ] );
		}
		if ( $sanitize ) {
			$goodtag = Sanitizer::validateTag( $attribStr, $t );
		}
	}
	if ( $goodtag ) {
		# Attributes are decoded in both configurations; validation is
		# an extra step only when sanitizing.
		$attribs = Sanitizer::decodeTagAttributes( $attribStr );
		if ( $sanitize ) {
			$attribs = Sanitizer::validateTagAttributes( $attribs, $t );
		}
		$goodtag = $this->insertToken(
			$slash ? 'endtag' : 'tag', $t, $attribs, $brace === '/>'
		);
	}
	if ( $goodtag ) {
		# Escape the trailing text exactly once.
		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
	} else {
		# bad tag; serialize entire thing as text.
		$this->insertToken( 'text', '&lt;' . str_replace( '>', '&gt;', $x ) );
	}
}
2079
/**
 * Make $mode the current parse mode.
 *
 * @param string $mode Name of the mode method; must end in "Mode"
 * @return string The parse mode that was active before the switch
 */
private function switchMode( $mode ) {
	$hasModeSuffix = ( substr( $mode, -4 ) === 'Mode' );
	Assert::parameter( $hasModeSuffix, '$mode', 'should end in Mode' );

	$previous = $this->parseMode;
	$this->parseMode = $mode;
	return $previous;
}
2088
/**
 * Switch to $mode, then feed the given token back through
 * insertToken() so that it is handled under the new mode.
 *
 * @param string $mode Name of the mode method; must end in "Mode"
 * @param string $token Token type
 * @param string|null $value Tag name or text
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfclose Whether the tag was self-closing
 * @return bool|null Whatever insertToken() returns
 */
private function switchModeAndReprocess( $mode, $token, $value, $attribs, $selfclose ) {
	// The previous mode returned by switchMode() is not needed here.
	$this->switchMode( $mode );
	$outcome = $this->insertToken( $token, $value, $attribs, $selfclose );
	return $outcome;
}
2093
/**
 * Pick the parse mode that matches the current stack of open elements.
 *
 * The stack is walked with foreach; the entry with key 0 is treated as
 * the last one, and is replaced by the fragment context when one is
 * set.  The first HTML element with a known mapping decides the mode;
 * if the last entry is reached without a match, 'inBodyMode' is used.
 */
private function resetInsertionMode() {
	// HTML elements that map one-to-one onto a parse mode.
	$directModes = [
		'tr' => 'inRowMode',
		'tbody' => 'inTableBodyMode',
		'tfoot' => 'inTableBodyMode',
		'thead' => 'inTableBodyMode',
		'caption' => 'inCaptionMode',
		'colgroup' => 'inColumnGroupMode',
		'table' => 'inTableMode',
		'body' => 'inBodyMode',
		# OMITTED: <frameset>
		# OMITTED: <html>
		# OMITTED: <head>
	];

	foreach ( $this->stack as $i => $entry ) {
		$last = ( $i === 0 );
		$node = ( $last && $this->fragmentContext ) ? $this->fragmentContext : $entry;

		if ( $node->isHtml() ) {
			$name = $node->localName;
			if ( $name === 'select' ) {
				# A <table> among the inspected stack entries selects
				# the in-table flavour of select mode; a <template>
				# ends the search.
				# NOTE(review): the offset arithmetic combines the
				# foreach key $i with a reversed index; confirm against
				# BalanceStack::node() that this really visits the
				# ancestors of the <select>.
				$stacklen = $this->stack->length();
				for ( $j = $i + 1; $j < $stacklen - 1; $j++ ) {
					$ancestor = $this->stack->node( $stacklen - $j - 1 );
					if ( $ancestor->isHtmlNamed( 'template' ) ) {
						break;
					}
					if ( $ancestor->isHtmlNamed( 'table' ) ) {
						$this->switchMode( 'inSelectInTableMode' );
						return;
					}
				}
				$this->switchMode( 'inSelectMode' );
				return;
			} elseif ( $name === 'template' ) {
				// Most recently pushed template insertion mode.
				$this->switchMode(
					array_slice( $this->templateInsertionModes, -1 )[0]
				);
				return;
			} elseif ( isset( $directModes[$name] ) ) {
				$this->switchMode( $directModes[$name] );
				return;
			} elseif ( !$last && $node->isA( BalanceSets::$tableCellSet ) ) {
				# Table cells only count when they are not the last entry.
				$this->switchMode( 'inCellMode' );
				return;
			}
		}

		if ( $last ) {
			$this->switchMode( 'inBodyMode' );
			return;
		}
	}
}
2163
/**
 * End of input: unwind the stack of open elements.
 *
 * Of the spec's "stop parsing" steps only "pop all the nodes off the
 * stack of open elements" applies here, and even then the top-most
 * <html> element is left in place.
 */
private function stopParsing() {
	# Drop the list of active formatting elements before popping.  If it
	# were kept, the element objects would stay live while they are
	# serialized, potentially using O(N^2) memory.  Popping the stack
	# never needs to reconstruct the active formatting elements, so the
	# list is not required from here on.
	$this->afe = null;

	# Pop everything above the <html> element.
	$this->stack->popTo( 1 );
}
2176
/**
 * Open a raw-text element and switch to 'inTextMode', remembering the
 * mode to return to in $this->originalInsertionMode.
 *
 * @param string $value Tag name
 * @param array|null $attribs Attributes of the tag
 * @return bool Always true
 */
private function parseRawText( $value, $attribs = null ) {
	$this->stack->insertHTMLElement( $value, $attribs );
	// XXX: should the tokenizer be switched to its rawtext state here?
	$modeToRestore = $this->switchMode( 'inTextMode' );
	$this->originalInsertionMode = $modeToRestore;
	return true;
}
2183
/**
 * Parse mode used while inside a raw-text element opened by
 * parseRawText().
 *
 * Text is appended to the element.  An end tag or end of input pops the
 * element and restores the mode saved in $this->originalInsertionMode;
 * the eof token is additionally reprocessed under that mode.  Start
 * tags are ignored.
 *
 * @param string $token Token type
 * @param string|null $value Tag name or text
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfclose Whether the tag was self-closing
 * @return bool|null True, or the result of reprocessing the eof token
 */
private function inTextMode( $token, $value, $attribs = null, $selfclose = false ) {
	switch ( $token ) {
		case 'text':
			$this->stack->insertText( $value );
			return true;

		case 'eof':
			$this->stack->pop();
			return $this->switchModeAndReprocess(
				$this->originalInsertionMode, $token, $value, $attribs, $selfclose
			);

		case 'endtag':
			$this->stack->pop();
			$this->switchMode( $this->originalInsertionMode );
			return true;

		default:
			return true;
	}
}
2200
/**
 * Rules for tokens that belong to the document head.  Within this
 * part of the file it is called by other mode methods for head-type
 * tags (e.g. <style>, <template>, <meta>).
 *
 * @param string $token Token type
 * @param string|null $value Tag name or text
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfclose Whether the tag was self-closing
 * @return bool|null True when the token was consumed or ignored here,
 *   otherwise the result of reprocessing it through insertToken()
 */
private function inHeadMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		// Leading whitespace is inserted directly; anything after it
		// is dealt with at the bottom of the function.
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $matches ) ) {
			$leading = $matches[0];
			$this->stack->insertText( $leading );
			$value = substr( $value, strlen( $leading ) );
		}
		if ( strlen( $value ) === 0 ) {
			return true;
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			# OMITTED: <html>
			# OMITTED: in a full HTML parser, <meta> might change the
			# encoding.  Here it is just another void element.
			case 'meta':
			case 'base':
			case 'basefont':
			case 'bgsound':
			case 'link':
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->pop();
				return true;

			# OMITTED: <title>
			# OMITTED: <noscript>
			# OMITTED: <script>
			case 'noframes':
			case 'style':
				return $this->parseRawText( $value, $attribs );

			case 'template':
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->afe->insertMarker();
				# OMITTED: frameset_ok
				$this->switchMode( 'inTemplateMode' );
				$this->templateInsertionModes[] = $this->parseMode;
				return true;
			# OMITTED: <head>
		}
	} elseif ( $token === 'endtag' ) {
		# OMITTED: <head>
		# OMITTED: <body>
		# OMITTED: <html>
		if ( $value === 'template' ) {
			if ( $this->stack->indexOf( $value ) < 0 ) {
				// No open <template>: ignore the token.
				return true;
			}
			$this->stack->generateImpliedEndTags( null, true /* thorough */ );
			$this->stack->popTag( $value );
			$this->afe->clearToMarker();
			array_pop( $this->templateInsertionModes );
			$this->resetInsertionMode();
			return true;
		} elseif ( $value !== 'br' ) {
			// Every end tag other than </br> is ignored.
			return true;
		}
		// </br> continues to the common handling below.
	}

	// Not handled above: act as if a </head> had been seen ...
	$this->inHeadMode( 'endtag', 'head' );
	// ... and then process the token again.
	return $this->insertToken( $token, $value, $attribs, $selfclose );
}
2267
/**
 * Handle a token under the "in body" rules.
 *
 * Text reconstructs the active formatting elements and is inserted.
 * End of input is passed to inTemplateMode() while template insertion
 * modes are pending, otherwise parsing stops.  Start and end tags are
 * dispatched on the tag name; names without a dedicated case fall
 * through to the generic "any other start tag" / "any other end tag"
 * handling after the respective switch.  Any other token type fails
 * an assertion.
 *
 * Several cases call this method again with a synthetic token (for
 * example a synthetic </p> before a block-level start tag).
 *
 * @param string $token Token type: "text", "eof", "tag" or "endtag"
 * @param string|null $value Tag name, or the text for a text token
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfclose Whether the tag was self-closing
 * @return bool|null True in most cases; otherwise the result of the
 *   handler the token was delegated to
 */
2268 private function inBodyMode( $token, $value, $attribs = null, $selfclose = false ) {
2269 if ( $token === 'text' ) {
2270 $this->afe->reconstruct( $this->stack );
2271 $this->stack->insertText( $value );
2272 return true;
2273 } elseif ( $token === 'eof' ) {
2274 if ( !empty( $this->templateInsertionModes ) ) {
2275 return $this->inTemplateMode( $token, $value, $attribs, $selfclose );
2276 }
2277 $this->stopParsing();
2278 return true;
2279 } elseif ( $token === 'tag' ) {
2280 switch ( $value ) {
2281 # OMITTED: <html>
2282 case 'base':
2283 case 'basefont':
2284 case 'bgsound':
2285 case 'link':
2286 case 'meta':
2287 case 'noframes':
2288 # OMITTED: <script>
2289 case 'style':
2290 case 'template':
2291 # OMITTED: <title>
2292 return $this->inHeadMode( $token, $value, $attribs, $selfclose );
2293 # OMITTED: <body>
2294 # OMITTED: <frameset>
2295
2296 case 'address':
2297 case 'article':
2298 case 'aside':
2299 case 'blockquote':
2300 case 'center':
2301 case 'details':
2302 case 'dialog':
2303 case 'dir':
2304 case 'div':
2305 case 'dl':
2306 case 'fieldset':
2307 case 'figcaption':
2308 case 'figure':
2309 case 'footer':
2310 case 'header':
2311 case 'hgroup':
2312 case 'main':
2313 case 'menu':
2314 case 'nav':
2315 case 'ol':
2316 case 'p':
2317 case 'section':
2318 case 'summary':
2319 case 'ul':
# An open <p> in button scope is closed with a synthetic </p> first.
2320 if ( $this->stack->inButtonScope( 'p' ) ) {
2321 $this->inBodyMode( 'endtag', 'p' );
2322 }
2323 $this->stack->insertHTMLElement( $value, $attribs );
2324 return true;
2325
2326 case 'h1':
2327 case 'h2':
2328 case 'h3':
2329 case 'h4':
2330 case 'h5':
2331 case 'h6':
2332 if ( $this->stack->inButtonScope( 'p' ) ) {
2333 $this->inBodyMode( 'endtag', 'p' );
2334 }
# A heading directly inside another heading replaces it.
2335 if ( $this->stack->currentNode->isA( BalanceSets::$headingSet ) ) {
2336 $this->stack->pop();
2337 }
2338 $this->stack->insertHTMLElement( $value, $attribs );
2339 return true;
2340
2341 case 'pre':
2342 case 'listing':
2343 if ( $this->stack->inButtonScope( 'p' ) ) {
2344 $this->inBodyMode( 'endtag', 'p' );
2345 }
2346 $this->stack->insertHTMLElement( $value, $attribs );
2347 # As described in "simplifications" above:
2348 # 1. We don't touch the next token, even if it's a linefeed.
2349 # 2. OMITTED: frameset_ok
2350 return true;
2351
2352 # OMITTED: <form>
2353
2354 case 'li':
2355 # OMITTED: frameset_ok
# Close an open <li>, unless a special element other than
# address/div/p is found on the stack first.
2356 foreach ( $this->stack as $node ) {
2357 if ( $node->isHtmlNamed( 'li' ) ) {
2358 $this->inBodyMode( 'endtag', 'li' );
2359 break;
2360 }
2361 if (
2362 $node->isA( BalanceSets::$specialSet ) &&
2363 !$node->isA( BalanceSets::$addressDivPSet )
2364 ) {
2365 break;
2366 }
2367 }
2368 if ( $this->stack->inButtonScope( 'p' ) ) {
2369 $this->inBodyMode( 'endtag', 'p' );
2370 }
2371 $this->stack->insertHTMLElement( $value, $attribs );
2372 return true;
2373
2374 case 'dd':
2375 case 'dt':
2376 # OMITTED: frameset_ok
2377 foreach ( $this->stack as $node ) {
2378 if ( $node->isHtmlNamed( 'dd' ) ) {
2379 $this->inBodyMode( 'endtag', 'dd' );
2380 break;
2381 }
2382 if ( $node->isHtmlNamed( 'dt' ) ) {
2383 $this->inBodyMode( 'endtag', 'dt' );
2384 break;
2385 }
2386 if (
2387 $node->isA( BalanceSets::$specialSet ) &&
2388 !$node->isA( BalanceSets::$addressDivPSet )
2389 ) {
2390 break;
2391 }
2392 }
2393 if ( $this->stack->inButtonScope( 'p' ) ) {
2394 $this->inBodyMode( 'endtag', 'p' );
2395 }
2396 $this->stack->insertHTMLElement( $value, $attribs );
2397 return true;
2398
2399 # OMITTED: <plaintext>
2400
2401 case 'button':
# A <button> already in scope is closed first, then this start
# tag is processed again from the top.
2402 if ( $this->stack->inScope( 'button' ) ) {
2403 $this->inBodyMode( 'endtag', 'button' );
2404 return $this->insertToken( $token, $value, $attribs, $selfclose );
2405 }
2406 $this->afe->reconstruct( $this->stack );
2407 $this->stack->insertHTMLElement( $value, $attribs );
2408 return true;
2409
2410 case 'a':
2411 $activeElement = $this->afe->findElementByTag( 'a' );
2412 if ( $activeElement ) {
2413 $this->inBodyMode( 'endtag', 'a' );
2414 if ( $this->afe->isInList( $activeElement ) ) {
2415 $this->afe->remove( $activeElement );
2416 // Don't flatten here, since when we fall
2417 // through below we might foster parent
2418 // the new <a> tag inside this one.
2419 $this->stack->removeElement( $activeElement, false );
2420 }
2421 }
2422 /* Falls through */
2423 case 'b':
2424 case 'big':
2425 case 'code':
2426 case 'em':
2427 case 'font':
2428 case 'i':
2429 case 's':
2430 case 'small':
2431 case 'strike':
2432 case 'strong':
2433 case 'tt':
2434 case 'u':
# Formatting elements are also recorded in the list of active
# formatting elements.
2435 $this->afe->reconstruct( $this->stack );
2436 $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ), $attribs );
2437 return true;
2438
2439 case 'nobr':
2440 $this->afe->reconstruct( $this->stack );
2441 if ( $this->stack->inScope( 'nobr' ) ) {
2442 $this->inBodyMode( 'endtag', 'nobr' );
2443 $this->afe->reconstruct( $this->stack );
2444 }
2445 $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ), $attribs );
2446 return true;
2447
2448 case 'applet':
2449 case 'marquee':
2450 case 'object':
2451 $this->afe->reconstruct( $this->stack );
2452 $this->stack->insertHTMLElement( $value, $attribs );
2453 $this->afe->insertMarker();
2454 # OMITTED: frameset_ok
2455 return true;
2456
2457 case 'table':
2458 # The document is never in "quirks mode"; see simplifications
2459 # above.
2460 if ( $this->stack->inButtonScope( 'p' ) ) {
2461 $this->inBodyMode( 'endtag', 'p' );
2462 }
2463 $this->stack->insertHTMLElement( $value, $attribs );
2464 # OMITTED: frameset_ok
2465 $this->switchMode( 'inTableMode' );
2466 return true;
2467
2468 case 'area':
2469 case 'br':
2470 case 'embed':
2471 case 'img':
2472 case 'keygen':
2473 case 'wbr':
# Inserted and immediately popped: these never stay open.
2474 $this->afe->reconstruct( $this->stack );
2475 $this->stack->insertHTMLElement( $value, $attribs );
2476 $this->stack->pop();
2477 # OMITTED: frameset_ok
2478 return true;
2479
2480 case 'input':
2481 $this->afe->reconstruct( $this->stack );
2482 $this->stack->insertHTMLElement( $value, $attribs );
2483 $this->stack->pop();
2484 # OMITTED: frameset_ok
2485 # (hence we don't need to examine the tag's "type" attribute)
2486 return true;
2487
2488 case 'menuitem':
2489 case 'param':
2490 case 'source':
2491 case 'track':
2492 $this->stack->insertHTMLElement( $value, $attribs );
2493 $this->stack->pop();
2494 return true;
2495
2496 case 'hr':
2497 if ( $this->stack->inButtonScope( 'p' ) ) {
2498 $this->inBodyMode( 'endtag', 'p' );
2499 }
2500 $this->stack->insertHTMLElement( $value, $attribs );
2501 $this->stack->pop();
2502 return true;
2503
2504 case 'image':
2505 # warts!
2506 return $this->inBodyMode( $token, 'img', $attribs, $selfclose );
2507
2508 # OMITTED: <isindex>
2509 # OMITTED: <textarea>
2510 # OMITTED: <xmp>
2511 # OMITTED: <iframe>
2512 # OMITTED: <noembed>
2513 # OMITTED: <noscript>
2514
2515 case 'select':
2516 $this->afe->reconstruct( $this->stack );
2517 $this->stack->insertHTMLElement( $value, $attribs );
# The follow-up mode depends on whether a table-related mode is
# the current parse mode.
2518 switch ( $this->parseMode ) {
2519 case 'inTableMode':
2520 case 'inCaptionMode':
2521 case 'inTableBodyMode':
2522 case 'inRowMode':
2523 case 'inCellMode':
2524 $this->switchMode( 'inSelectInTableMode' );
2525 return true;
2526 default:
2527 $this->switchMode( 'inSelectMode' );
2528 return true;
2529 }
2530
2531 case 'optgroup':
2532 case 'option':
2533 if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
2534 $this->inBodyMode( 'endtag', 'option' );
2535 }
2536 $this->afe->reconstruct( $this->stack );
2537 $this->stack->insertHTMLElement( $value, $attribs );
2538 return true;
2539
2540 case 'rb':
2541 case 'rtc':
2542 if ( $this->stack->inScope( 'ruby' ) ) {
2543 $this->stack->generateImpliedEndTags();
2544 }
2545 $this->stack->insertHTMLElement( $value, $attribs );
2546 return true;
2547
2548 case 'rp':
2549 case 'rt':
2550 if ( $this->stack->inScope( 'ruby' ) ) {
2551 $this->stack->generateImpliedEndTags( 'rtc' );
2552 }
2553 $this->stack->insertHTMLElement( $value, $attribs );
2554 return true;
2555
2556 case 'math':
2557 $this->afe->reconstruct( $this->stack );
2558 # We skip the spec's "adjust MathML attributes" and
2559 # "adjust foreign attributes" steps, since the browser will
2560 # do this later when it parses the output and it doesn't affect
2561 # balancing.
2562 $this->stack->insertForeignElement(
2563 BalanceSets::MATHML_NAMESPACE, $value, $attribs
2564 );
2565 if ( $selfclose ) {
2566 # emit explicit </math> tag.
2567 $this->stack->pop();
2568 }
2569 return true;
2570
2571 case 'svg':
2572 $this->afe->reconstruct( $this->stack );
2573 # We skip the spec's "adjust SVG attributes" and
2574 # "adjust foreign attributes" steps, since the browser will
2575 # do this later when it parses the output and it doesn't affect
2576 # balancing.
2577 $this->stack->insertForeignElement(
2578 BalanceSets::SVG_NAMESPACE, $value, $attribs
2579 );
2580 if ( $selfclose ) {
2581 # emit explicit </svg> tag.
2582 $this->stack->pop();
2583 }
2584 return true;
2585
2586 case 'caption':
2587 case 'col':
2588 case 'colgroup':
2589 # OMITTED: <frame>
2590 case 'head':
2591 case 'tbody':
2592 case 'td':
2593 case 'tfoot':
2594 case 'th':
2595 case 'thead':
2596 case 'tr':
2597 // Ignore table tags if we're not inTableMode
2598 return true;
2599 }
2600
2601 // Handle any other start tag here
2602 $this->afe->reconstruct( $this->stack );
2603 $this->stack->insertHTMLElement( $value, $attribs );
2604 return true;
2605 } elseif ( $token === 'endtag' ) {
2606 switch ( $value ) {
2607 # </body>,</html> are unsupported.
2608
2609 case 'template':
2610 return $this->inHeadMode( $token, $value, $attribs, $selfclose );
2611
2612 case 'address':
2613 case 'article':
2614 case 'aside':
2615 case 'blockquote':
2616 case 'button':
2617 case 'center':
2618 case 'details':
2619 case 'dialog':
2620 case 'dir':
2621 case 'div':
2622 case 'dl':
2623 case 'fieldset':
2624 case 'figcaption':
2625 case 'figure':
2626 case 'footer':
2627 case 'header':
2628 case 'hgroup':
2629 case 'listing':
2630 case 'main':
2631 case 'menu':
2632 case 'nav':
2633 case 'ol':
2634 case 'pre':
2635 case 'section':
2636 case 'summary':
2637 case 'ul':
2638 // Ignore if there is not a matching open tag
2639 if ( !$this->stack->inScope( $value ) ) {
2640 return true;
2641 }
2642 $this->stack->generateImpliedEndTags();
2643 $this->stack->popTag( $value );
2644 return true;
2645
2646 # OMITTED: <form>
2647
2648 case 'p':
# A </p> with no <p> in button scope first opens an empty <p>,
# then the end tag is processed again to close it.
2649 if ( !$this->stack->inButtonScope( 'p' ) ) {
2650 $this->inBodyMode( 'tag', 'p', [] );
2651 return $this->insertToken( $token, $value, $attribs, $selfclose );
2652 }
2653 $this->stack->generateImpliedEndTags( $value );
2654 $this->stack->popTag( $value );
2655 return true;
2656
2657 case 'li':
2658 if ( !$this->stack->inListItemScope( $value ) ) {
2659 return true; # ignore
2660 }
2661 $this->stack->generateImpliedEndTags( $value );
2662 $this->stack->popTag( $value );
2663 return true;
2664
2665 case 'dd':
2666 case 'dt':
2667 if ( !$this->stack->inScope( $value ) ) {
2668 return true; # ignore
2669 }
2670 $this->stack->generateImpliedEndTags( $value );
2671 $this->stack->popTag( $value );
2672 return true;
2673
2674 case 'h1':
2675 case 'h2':
2676 case 'h3':
2677 case 'h4':
2678 case 'h5':
2679 case 'h6':
# Any heading in scope is accepted here, not only one with the
# same name as the end tag.
2680 if ( !$this->stack->inScope( BalanceSets::$headingSet ) ) {
2681 return true; # ignore
2682 }
2683 $this->stack->generateImpliedEndTags();
2684 $this->stack->popTag( BalanceSets::$headingSet );
2685 return true;
2686
2687 case 'sarcasm':
2688 # Take a deep breath, then:
2689 break;
2690
2691 case 'a':
2692 case 'b':
2693 case 'big':
2694 case 'code':
2695 case 'em':
2696 case 'font':
2697 case 'i':
2698 case 'nobr':
2699 case 's':
2700 case 'small':
2701 case 'strike':
2702 case 'strong':
2703 case 'tt':
2704 case 'u':
2705 if ( $this->stack->adoptionAgency( $value, $this->afe ) ) {
2706 return true; # If we did something, we're done.
2707 }
2708 break; # Go to the "any other end tag" case.
2709
2710 case 'applet':
2711 case 'marquee':
2712 case 'object':
2713 if ( !$this->stack->inScope( $value ) ) {
2714 return true; # ignore
2715 }
2716 $this->stack->generateImpliedEndTags();
2717 $this->stack->popTag( $value );
2718 $this->afe->clearToMarker();
2719 return true;
2720
2721 case 'br':
2722 # Turn </br> into <br>
2723 return $this->inBodyMode( 'tag', $value, [] );
2724 }
2725
2726 // Any other end tag goes here
2727 foreach ( $this->stack as $i => $node ) {
2728 if ( $node->isHtmlNamed( $value ) ) {
2729 $this->stack->generateImpliedEndTags( $value );
2730 $this->stack->popTo( $i ); # including $i
2731 break;
2732 } elseif ( $node->isA( BalanceSets::$specialSet ) ) {
2733 return true; // ignore this close token.
2734 }
2735 }
2736 return true;
2737 } else {
2738 Assert::invariant( false, "Bad token type: $token" );
2739 }
2740 }
2741
/**
 * Handle a token under the "in table" rules.
 *
 * Table-structure tags open the matching element (synthesizing a
 * <colgroup> or <tbody> where one is implied) and switch to the
 * corresponding mode.  Tokens without a dedicated rule are processed
 * by inBodyMode() with foster parenting enabled on the stack.
 *
 * @param string $token Token type
 * @param string|null $value Tag name or text
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfclose Whether the tag was self-closing
 * @return bool|null True, or the result of the handler the token was
 *   delegated to
 */
private function inTableMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		if ( $this->textIntegrationMode ) {
			return $this->inBodyMode( $token, $value, $attribs, $selfclose );
		}
		if ( $this->stack->currentNode->isA( BalanceSets::$tableSectionRowSet ) ) {
			// Collect the text in inTableTextMode before deciding
			// what to do with it.
			$this->pendingTableText = '';
			$this->originalInsertionMode = $this->parseMode;
			return $this->switchModeAndReprocess(
				'inTableTextMode', $token, $value, $attribs, $selfclose
			);
		}
		// Otherwise: "anything else" below.
	} elseif ( $token === 'eof' ) {
		$this->stopParsing();
		return true;
	} elseif ( $token === 'tag' ) {
		if ( $value === 'caption' ) {
			$this->afe->insertMarker();
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inCaptionMode' );
			return true;
		}
		if ( $value === 'colgroup' ) {
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inColumnGroupMode' );
			return true;
		}
		if ( $value === 'col' ) {
			// Implied <colgroup>, then process the <col> again.
			$this->inTableMode( 'tag', 'colgroup', [] );
			return $this->insertToken( $token, $value, $attribs, $selfclose );
		}
		if ( in_array( $value, [ 'tbody', 'tfoot', 'thead' ], true ) ) {
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inTableBodyMode' );
			return true;
		}
		if ( in_array( $value, [ 'td', 'th', 'tr' ], true ) ) {
			// Implied <tbody>, then process the tag again.
			$this->inTableMode( 'tag', 'tbody', [] );
			return $this->insertToken( $token, $value, $attribs, $selfclose );
		}
		if ( $value === 'table' ) {
			if ( !$this->stack->inTableScope( $value ) ) {
				// Ignore this tag.
				return true;
			}
			// Close the open table, then process the new <table>.
			$this->inTableMode( 'endtag', $value );
			return $this->insertToken( $token, $value, $attribs, $selfclose );
		}
		# OMITTED: <script>
		if ( $value === 'style' || $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
		if (
			$value === 'input' &&
			isset( $attribs['type'] ) &&
			strcasecmp( $attribs['type'], 'hidden' ) === 0
		) {
			// Only hidden inputs are inserted in place; any other
			// <input> is handled as "anything else".
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;
		}
		# OMITTED: <form>
	} elseif ( $token === 'endtag' ) {
		# OMITTED: <body>
		# OMITTED: <html>
		$ignored = [
			'caption', 'col', 'colgroup', 'tbody', 'td',
			'tfoot', 'th', 'thead', 'tr',
		];
		if ( $value === 'table' ) {
			if ( !$this->stack->inTableScope( $value ) ) {
				// Ignore.
				return true;
			}
			$this->stack->popTag( $value );
			$this->resetInsertionMode();
			return true;
		}
		if ( in_array( $value, $ignored, true ) ) {
			return true;
		}
		if ( $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
	}

	// "Anything else": use the body rules with foster parenting on.
	$this->stack->fosterParentMode = true;
	$this->inBodyMode( $token, $value, $attribs, $selfclose );
	$this->stack->fosterParentMode = false;
	return true;
}
2837
/**
 * Accumulate text seen inside a table until a non-text token arrives.
 *
 * On the first non-text token the buffered text is flushed: if it is
 * only whitespace it is inserted in place, otherwise it goes through
 * inBodyMode() with foster parenting enabled.  The token is then
 * reprocessed under the mode saved in $this->originalInsertionMode.
 *
 * @param string $token Token type
 * @param string|null $value Tag name or text
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfclose Whether the tag was self-closing
 * @return bool|null True for text, else the result of reprocessing
 */
private function inTableTextMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		$this->pendingTableText .= $value;
		return true;
	}

	// Non-text token: take the buffer and reset it.
	$buffered = $this->pendingTableText;
	$this->pendingTableText = '';

	$hasNonWhitespace = preg_match( '/[^\x09\x0A\x0C\x0D\x20]/', $buffered );
	if ( !$hasNonWhitespace ) {
		// Pending text is just whitespace.
		$this->stack->insertText( $buffered );
	} else {
		// Same treatment as the "anything else" case of inTableMode.
		$this->stack->fosterParentMode = true;
		$this->inBodyMode( 'text', $buffered );
		$this->stack->fosterParentMode = false;
	}

	return $this->switchModeAndReprocess(
		$this->originalInsertionMode, $token, $value, $attribs, $selfclose
	);
}
2859
/**
 * Helper for inCaptionMode: close the open <caption>, if there is one
 * in table scope, and go back to 'inTableMode'.
 *
 * @return bool False if no <caption> is in table scope (nothing is
 *   changed in that case), true once the caption has been closed
 */
private function endCaption() {
	$stack = $this->stack;
	if ( $stack->inTableScope( 'caption' ) ) {
		$stack->generateImpliedEndTags();
		$stack->popTag( 'caption' );
		$this->afe->clearToMarker();
		$this->switchMode( 'inTableMode' );
		return true;
	}
	return false;
}
2871
/**
 * Handle a token under the "in caption" rules.
 *
 * A table-structure start tag or a </table> first closes the caption
 * and, if that succeeded, is reprocessed.  </caption> closes the
 * caption.  A fixed set of other end tags is ignored.  Everything else
 * is handled by inBodyMode().
 *
 * @param string $token Token type
 * @param string|null $value Tag name or text
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfclose Whether the tag was self-closing
 * @return bool|null True, or the result of inBodyMode()
 */
private function inCaptionMode( $token, $value, $attribs = null, $selfclose = false ) {
	$closingStartTags = [
		'caption', 'col', 'colgroup', 'tbody', 'td',
		'tfoot', 'th', 'thead', 'tr',
	];
	# OMITTED: <html>
	$ignoredEndTags = [
		'body', 'col', 'colgroup', 'tbody', 'td',
		'tfoot', 'th', 'thead', 'tr',
	];

	$closesCaptionFirst =
		( $token === 'tag' && in_array( $value, $closingStartTags, true ) ) ||
		( $token === 'endtag' && $value === 'table' );
	if ( $closesCaptionFirst ) {
		if ( $this->endCaption() ) {
			$this->insertToken( $token, $value, $attribs, $selfclose );
		}
		return true;
	}

	if ( $token === 'endtag' ) {
		if ( $value === 'caption' ) {
			$this->endCaption();
			return true;
		}
		if ( in_array( $value, $ignoredEndTags, true ) ) {
			// Ignore the token
			return true;
		}
	}

	// The Anything Else case
	return $this->inBodyMode( $token, $value, $attribs, $selfclose );
}
2918
/**
 * Handle a token under the "in column group" rules.
 *
 * Leading whitespace, <col>, </colgroup> and template tags are handled
 * here, and end of input goes to inBodyMode().  For anything else the
 * <colgroup> is closed (if it is the current node) and the token is
 * processed again; if it is not the current node the token is ignored.
 *
 * @param string $token Token type
 * @param string|null $value Tag name or text
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfclose Whether the tag was self-closing
 * @return bool|null True, or the result of the handler the token was
 *   delegated to
 */
private function inColumnGroupMode( $token, $value, $attribs = null, $selfclose = false ) {
	$isTagOrEndTag = ( $token === 'tag' || $token === 'endtag' );

	if ( $token === 'text' ) {
		// Leading whitespace is inserted in place; the rest, if any,
		// falls to the "anything else" handling.
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $matches ) ) {
			$leading = $matches[0];
			$this->stack->insertText( $leading );
			$value = substr( $value, strlen( $leading ) );
		}
		if ( strlen( $value ) === 0 ) {
			return true;
		}
	} elseif ( $token === 'eof' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfclose );
	} elseif ( $isTagOrEndTag && $value === 'template' ) {
		return $this->inHeadMode( $token, $value, $attribs, $selfclose );
	} elseif ( $token === 'tag' ) {
		# OMITTED: <html>
		if ( $value === 'col' ) {
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'col' ) {
			// Ignore the token.
			return true;
		}
		if ( $value === 'colgroup' ) {
			if ( $this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
				$this->stack->pop();
				$this->switchMode( 'inTableMode' );
			}
			// Otherwise the token is simply ignored.
			return true;
		}
	}

	// Anything else
	if ( !$this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
		// Ignore the token.
		return true;
	}
	$this->inColumnGroupMode( 'endtag', 'colgroup' );
	return $this->insertToken( $token, $value, $attribs, $selfclose );
}
2966
/**
 * Helper for inTableBodyMode: close the open table section and go back
 * to 'inTableMode'.
 *
 * @return bool False if none of <tbody>, <thead>, <tfoot> is in table
 *   scope (nothing is changed in that case), true otherwise
 */
private function endSection() {
	$sectionOpen = false;
	foreach ( [ 'tbody', 'thead', 'tfoot' ] as $section ) {
		if ( $this->stack->inTableScope( $section ) ) {
			$sectionOpen = true;
			break;
		}
	}
	if ( !$sectionOpen ) {
		return false;
	}

	$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
	$this->stack->pop();
	$this->switchMode( 'inTableMode' );
	return true;
}
/**
 * Handle a token under the "in table body" rules.
 *
 * <tr> opens a row; <td>/<th> imply a <tr> first.  Tags that start a
 * different table part, and </table>, close the current section and
 * are then reprocessed.  A fixed set of end tags is ignored.  All
 * remaining tokens are passed to inTableMode().
 *
 * @param string $token Token type
 * @param string|null $value Tag name or text
 * @param array|null $attribs Attributes, for tag tokens
 * @param bool $selfclose Whether the tag was self-closing
 * @return bool|null True, or the result of inTableMode()
 */
private function inTableBodyMode( $token, $value, $attribs = null, $selfclose = false ) {
	$sectionTags = [ 'tbody', 'tfoot', 'thead' ];

	if ( $token === 'tag' ) {
		if ( $value === 'tr' ) {
			$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inRowMode' );
			return true;
		}
		if ( $value === 'th' || $value === 'td' ) {
			// Implied <tr>, then process the cell tag again.
			$this->inTableBodyMode( 'tag', 'tr', [] );
			$this->insertToken( $token, $value, $attribs, $selfclose );
			return true;
		}
		$endsSection = in_array( $value, [ 'caption', 'col', 'colgroup' ], true ) ||
			in_array( $value, $sectionTags, true );
		if ( $endsSection ) {
			if ( $this->endSection() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'table' ) {
			if ( $this->endSection() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}
		if ( in_array( $value, $sectionTags, true ) ) {
			if ( $this->stack->inTableScope( $value ) ) {
				$this->endSection();
			}
			return true;
		}
		# OMITTED: <body>
		# OMITTED: <html>
		if ( in_array( $value, [ 'caption', 'col', 'colgroup', 'td', 'th', 'tr' ], true ) ) {
			// Ignore the token.
			return true;
		}
	}

	// Anything else:
	return $this->inTableMode( $token, $value, $attribs, $selfclose );
}
3033
3034 // Helper function for inRowMode
/**
 * Close the currently open table row, if there is one.
 *
 * When a tr element is in table scope, the stack is cleared back to the
 * table row context, the current node is popped and the insertion mode
 * is switched to inTableBodyMode.
 *
 * @return bool True if a row was closed; false if no tr was in table
 *   scope (in which case nothing is changed).
 */
private function endRow() {
	if ( $this->stack->inTableScope( 'tr' ) ) {
		$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
		$this->stack->pop();
		$this->switchMode( 'inTableBodyMode' );
		return true;
	}
	return false;
}
/**
 * Handle a token while the insertion mode is inRowMode.
 *
 * @param string $token Token type; only 'tag' and 'endtag' get special
 *   treatment here, everything else is passed on to inTableMode().
 * @param string $value Tag name for 'tag' and 'endtag' tokens.
 * @param array|null $attribs Attributes of a start tag.
 * @param bool $selfclose Self-closing flag, passed through unchanged.
 * @return bool True when the token was handled or ignored here; otherwise
 *   whatever inTableMode() returns.
 */
private function inRowMode( $token, $value, $attribs = null, $selfclose = false ) {
	// Set for tokens which first close the open row and are then handed
	// back to insertToken().
	$closesRow = false;

	if ( $token === 'tag' ) {
		switch ( $value ) {
			case 'th':
			case 'td':
				$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->switchMode( 'inCellMode' );
				$this->afe->insertMarker();
				return true;

			case 'caption':
			case 'col':
			case 'colgroup':
			case 'tbody':
			case 'tfoot':
			case 'thead':
			case 'tr':
				$closesRow = true;
				break;
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'tr':
				$this->endRow();
				return true;

			case 'table':
				$closesRow = true;
				break;

			case 'tbody':
			case 'tfoot':
			case 'thead':
				// Ignore the end tag unless the named section is open.
				if ( !$this->stack->inTableScope( $value ) ) {
					return true;
				}
				$closesRow = true;
				break;

			# OMITTED: <body>
			case 'caption':
			case 'col':
			case 'colgroup':
			# OMITTED: <html>
			case 'td':
			case 'th':
				return true; // Ignore the token.
		}
	}

	if ( $closesRow ) {
		// If no row is open the token is simply dropped.
		if ( $this->endRow() ) {
			$this->insertToken( $token, $value, $attribs, $selfclose );
		}
		return true;
	}

	// Anything else:
	return $this->inTableMode( $token, $value, $attribs, $selfclose );
}
3099
3100 // Helper for inCellMode
/**
 * Close the currently open table cell, if there is one.
 *
 * Looks for a td and then a th in table scope; the first one found is
 * closed by feeding the matching end tag to inCellMode().
 *
 * @return bool True if a cell was in table scope and its end tag was
 *   processed; false if neither td nor th was in table scope.
 */
private function endCell() {
	foreach ( [ 'td', 'th' ] as $cell ) {
		if ( $this->stack->inTableScope( $cell ) ) {
			$this->inCellMode( 'endtag', $cell );
			return true;
		}
	}
	return false;
}
/**
 * Handle a token while the insertion mode is inCellMode.
 *
 * @param string $token Token type; only 'tag' and 'endtag' get special
 *   treatment here, everything else is passed on to inBodyMode().
 * @param string $value Tag name for 'tag' and 'endtag' tokens.
 * @param array|null $attribs Attributes of a start tag.
 * @param bool $selfclose Self-closing flag, passed through unchanged.
 * @return bool True when the token was handled or ignored here; otherwise
 *   whatever inBodyMode() returns.
 */
private function inCellMode( $token, $value, $attribs = null, $selfclose = false ) {
	// Describes a pending "close the cell" operation, if any.
	$closeCell = false;
	$popTarget = null;
	$reprocess = false;

	if ( $token === 'tag' ) {
		switch ( $value ) {
			case 'caption':
			case 'col':
			case 'colgroup':
			case 'tbody':
			case 'td':
			case 'tfoot':
			case 'th':
			case 'thead':
			case 'tr':
				// Close the open cell and resubmit the token; without an
				// open cell the token is dropped.
				if ( $this->endCell() ) {
					$this->insertToken( $token, $value, $attribs, $selfclose );
				}
				return true;
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'td':
			case 'th':
				if ( !$this->stack->inTableScope( $value ) ) {
					return true; // Ignore the token.
				}
				$closeCell = true;
				$popTarget = $value;
				break;

			# OMITTED: <body>
			case 'caption':
			case 'col':
			case 'colgroup':
			# OMITTED: <html>
				return true;

			case 'table':
			case 'tbody':
			case 'tfoot':
			case 'thead':
			case 'tr':
				if ( !$this->stack->inTableScope( $value ) ) {
					return true; // Ignore the token.
				}
				// Close whichever cell is open, then resubmit the end tag.
				$closeCell = true;
				$popTarget = BalanceSets::$tableCellSet;
				$reprocess = true;
				break;
		}
	}

	if ( $closeCell ) {
		$this->stack->generateImpliedEndTags();
		$this->stack->popTag( $popTarget );
		$this->afe->clearToMarker();
		$this->switchMode( 'inRowMode' );
		if ( $reprocess ) {
			$this->insertToken( $token, $value, $attribs, $selfclose );
		}
		return true;
	}

	// Anything else:
	return $this->inBodyMode( $token, $value, $attribs, $selfclose );
}
3165
/**
 * Handle a token while the insertion mode is inSelectMode.
 *
 * Text is inserted as-is, 'eof' is delegated to inBodyMode(), and a
 * small set of start and end tags is handled explicitly. Every other
 * token is ignored.
 *
 * @param string $token Token type: 'text', 'eof', 'tag' or 'endtag'.
 * @param string $value Text for 'text' tokens, tag name for 'tag' and
 *   'endtag' tokens.
 * @param array|null $attribs Attributes of a start tag.
 * @param bool $selfclose Self-closing flag, passed through unchanged.
 * @return bool True when the token was handled or ignored here; otherwise
 *   the result of the handler it was delegated to.
 */
private function inSelectMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	}

	if ( $token === 'eof' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfclose );
	}

	if ( $token === 'tag' ) {
		switch ( $value ) {
			# OMITTED: <html>
			case 'option':
				// A new option implicitly closes an open one.
				if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
					$this->stack->pop();
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			case 'optgroup':
				// A new optgroup implicitly closes an open option and
				// then an open optgroup.
				if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
					$this->stack->pop();
				}
				if ( $this->stack->currentNode->isHtmlNamed( 'optgroup' ) ) {
					$this->stack->pop();
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			case 'select':
				$this->inSelectMode( 'endtag', $value ); // treat it like endtag
				return true;

			case 'input':
			case 'keygen':
			case 'textarea':
				if ( $this->stack->inSelectScope( 'select' ) ) {
					// Close the select, then resubmit the token.
					$this->inSelectMode( 'endtag', 'select' );
					return $this->insertToken( $token, $value, $attribs, $selfclose );
				}
				return true; // ignore token (fragment case)

			case 'script':
			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'optgroup':
				// An option sitting directly inside an optgroup is closed
				// first, so that the optgroup becomes the current node.
				if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
					$depth = $this->stack->length();
					if (
						$depth >= 2 &&
						$this->stack->node( $depth - 2 )->isHtmlNamed( 'optgroup' )
					) {
						$this->stack->pop();
					}
				}
				if ( $this->stack->currentNode->isHtmlNamed( 'optgroup' ) ) {
					$this->stack->pop();
				}
				return true;

			case 'option':
				if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
					$this->stack->pop();
				}
				return true;

			case 'select':
				if ( $this->stack->inSelectScope( $value ) ) {
					$this->stack->popTag( $value );
					$this->resetInsertionMode();
				}
				// Otherwise: fragment case, ignore the token.
				return true;

			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
	}

	// anything else: just ignore the token
	return true;
}
3238
/**
 * Handle a token while the insertion mode is inSelectInTableMode.
 *
 * Table-structure start tags close the open select and are then
 * resubmitted; the matching end tags do the same, but only when the
 * named element is in table scope. Everything else is handled exactly
 * as in inSelectMode().
 *
 * @param string $token Token type.
 * @param string $value Tag name for 'tag' and 'endtag' tokens.
 * @param array|null $attribs Attributes of a start tag.
 * @param bool $selfclose Self-closing flag, passed through unchanged.
 * @return bool True when an end tag was ignored here; otherwise the
 *   result of insertToken() or inSelectMode().
 */
private function inSelectInTableMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'tag' || $token === 'endtag' ) {
		switch ( $value ) {
			case 'caption':
			case 'table':
			case 'tbody':
			case 'tfoot':
			case 'thead':
			case 'tr':
			case 'td':
			case 'th':
				if ( $token === 'endtag' && !$this->stack->inTableScope( $value ) ) {
					return true; // Ignore the token.
				}
				// Act as if a </select> had been seen, then resubmit
				// the original token.
				$this->inSelectInTableMode( 'endtag', 'select' );
				return $this->insertToken( $token, $value, $attribs, $selfclose );
		}
	}

	// anything else
	return $this->inSelectMode( $token, $value, $attribs, $selfclose );
}
3263
/**
 * Handle a token while the insertion mode is inTemplateMode.
 *
 * Start tags pick a more specific insertion mode based on the tag name
 * and are then reprocessed in it; text goes to inBodyMode(); end tags
 * other than </template> are ignored.
 *
 * @param string $token Token type: 'text', 'eof', 'tag' or 'endtag'.
 *   Any other value trips an invariant assertion.
 * @param string $value Tag name for 'tag' and 'endtag' tokens.
 * @param array|null $attribs Attributes of a start tag.
 * @param bool $selfclose Self-closing flag, passed through unchanged.
 * @return bool True when the token was handled or ignored here; otherwise
 *   the result of the handler it was delegated to.
 */
private function inTemplateMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfclose );
	}

	if ( $token === 'eof' ) {
		if ( $this->stack->indexOf( 'template' ) < 0 ) {
			// No template element is open.
			$this->stopParsing();
			return true;
		}
		// Close the open template, drop its insertion mode, and resubmit
		// the eof token.
		$this->stack->popTag( 'template' );
		$this->afe->clearToMarker();
		array_pop( $this->templateInsertionModes );
		$this->resetInsertionMode();
		$this->insertToken( $token, $value, $attribs, $selfclose );
		return true;
	}

	if ( $token === 'tag' ) {
		switch ( $value ) {
			case 'base':
			case 'basefont':
			case 'bgsound':
			case 'link':
			case 'meta':
			case 'noframes':
			# OMITTED: <script>
			case 'style':
			case 'template':
			# OMITTED: <title>
				return $this->inHeadMode( $token, $value, $attribs, $selfclose );

			case 'caption':
			case 'colgroup':
			case 'tbody':
			case 'tfoot':
			case 'thead':
				$mode = 'inTableMode';
				break;

			case 'col':
				$mode = 'inColumnGroupMode';
				break;

			case 'tr':
				$mode = 'inTableBodyMode';
				break;

			case 'td':
			case 'th':
				$mode = 'inRowMode';
				break;

			default:
				$mode = 'inBodyMode';
				break;
		}
		return $this->switchModeAndReprocess(
			$mode, $token, $value, $attribs, $selfclose
		);
	}

	if ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
		return true; // Ignore any other end tag.
	}

	Assert::invariant( false, "Bad token type: $token" );
}
3330 }