Minor bug fixes to Balancer.
[lhc/web/wiklou.git] / includes / tidy / Balancer.php
1 <?php
2 /**
3 * An implementation of the tree building portion of the HTML5 parsing
4 * spec.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
20 *
21 * @file
22 * @ingroup Parser
23 * @since 1.27
24 * @author C. Scott Ananian, 2016
25 */
26 namespace MediaWiki\Tidy;
27
28 use Wikimedia\Assert\Assert;
29 use Wikimedia\Assert\ParameterAssertionException;
30 use \ExplodeIterator;
31 use \IteratorAggregate;
32 use \ReverseArrayIterator;
33 use \Sanitizer;
34
35 # A note for future librarization[1] -- this file is a good candidate
36 # for splitting into an independent library, except that it is currently
37 # highly optimized for MediaWiki use. It only implements the portions
38 # of the HTML5 tree builder used by tags supported by MediaWiki, and
39 # does not contain a true tokenizer pass, instead relying on
40 # comment stripping, attribute normalization, and escaping done by
41 # the MediaWiki Sanitizer. It also deliberately avoids building
42 # a true DOM in memory, instead serializing elements to an output string
43 # as soon as possible (usually as soon as the tag is closed) to reduce
44 # its memory footprint.
45
46 # On the other hand, I've been pretty careful to note with comments in the
47 # code the places where this implementation omits features of the spec or
48 # depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
49 # implement the missing pieces and make this a standalone PHP HTML5 parser.
50 # In order to do so, some sort of MediaWiki-specific API will need
51 # to be added to (a) allow the Balancer to bypass the tokenizer,
52 # and (b) support on-the-fly flattening instead of DOM node creation.
53
54 # [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki
55
/**
 * Constants and lookup tables for the HTML5 tree building algorithm.
 *
 * Each "set" is an associative array keyed first by namespace URI and
 * then by lower-cased tag name.  Every stored value is `true`, so that
 * membership can be checked with isset() (see BalanceElement::isA()).
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceSets {
	const HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml';
	const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
	const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';

	/** HTML tags the balancer does not support. */
	public static $unsupportedSet = [
		self::HTML_NAMESPACE => [
			'html' => true,
			'head' => true,
			'body' => true,
			'frameset' => true,
			'form' => true,
			'frame' => true,
			'plaintext' => true,
			'isindex' => true,
			'textarea' => true,
			'xmp' => true,
			'iframe' => true,
			'noembed' => true,
			'noscript' => true,
			'select' => true,
			'script' => true,
			'title' => true
		]
	];

	/** Elements serialized as self-closing tags without children. */
	public static $emptyElementSet = [
		self::HTML_NAMESPACE => [
			'area' => true, 'base' => true, 'basefont' => true, 'bgsound' => true,
			'br' => true, 'col' => true, 'command' => true, 'embed' => true,
			'frame' => true, 'hr' => true, 'img' => true, 'input' => true,
			'keygen' => true, 'link' => true, 'meta' => true, 'param' => true,
			'source' => true, 'track' => true, 'wbr' => true
		]
	];

	/** The six HTML heading elements. */
	public static $headingSet = [
		self::HTML_NAMESPACE => [
			'h1' => true, 'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true, 'h6' => true
		]
	];

	/** Elements in the "special" category of the tree builder. */
	public static $specialSet = [
		self::HTML_NAMESPACE => [
			'address' => true, 'applet' => true, 'area' => true, 'article' => true,
			'aside' => true, 'base' => true, 'basefont' => true, 'bgsound' => true,
			'blockquote' => true, 'body' => true, 'br' => true, 'button' => true,
			'caption' => true, 'center' => true, 'col' => true, 'colgroup' => true,
			'dd' => true, 'details' => true, 'dir' => true, 'div' => true,
			'dl' => true, 'dt' => true, 'embed' => true, 'fieldset' => true,
			'figcaption' => true, 'figure' => true, 'footer' => true, 'form' => true,
			'frame' => true, 'frameset' => true, 'h1' => true, 'h2' => true,
			'h3' => true, 'h4' => true, 'h5' => true, 'h6' => true,
			'head' => true, 'header' => true, 'hgroup' => true, 'hr' => true,
			'html' => true, 'iframe' => true, 'img' => true, 'input' => true,
			'isindex' => true, 'li' => true, 'link' => true, 'listing' => true,
			'main' => true, 'marquee' => true, 'menu' => true, 'menuitem' => true,
			'meta' => true, 'nav' => true, 'noembed' => true, 'noframes' => true,
			'noscript' => true, 'object' => true, 'ol' => true, 'p' => true,
			'param' => true, 'plaintext' => true, 'pre' => true, 'script' => true,
			'section' => true, 'select' => true, 'source' => true, 'style' => true,
			'summary' => true, 'table' => true, 'tbody' => true, 'td' => true,
			'template' => true, 'textarea' => true, 'tfoot' => true, 'th' => true,
			'thead' => true, 'title' => true, 'tr' => true, 'track' => true,
			'ul' => true, 'wbr' => true, 'xmp' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true,
			'desc' => true,
			'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true,
			'ms' => true, 'mtext' => true, 'annotation-xml' => true
		]
	];

	public static $addressDivPSet = [
		self::HTML_NAMESPACE => [ 'address' => true, 'div' => true, 'p' => true ]
	];

	/**
	 * Table, table section and row elements.  In foster parent mode,
	 * content inserted while one of these is the current node is
	 * foster parented (see BalanceStack::insertText()/insertElement()).
	 */
	public static $tableSectionRowSet = [
		self::HTML_NAMESPACE => [
			'table' => true,
			'thead' => true,
			'tbody' => true,
			'tfoot' => true,
			'tr' => true
		]
	];

	/** Elements closed by "generate implied end tags". */
	public static $impliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'dd' => true, 'dt' => true, 'li' => true, 'optgroup' => true, 'option' => true,
			'p' => true, 'rb' => true, 'rp' => true, 'rt' => true, 'rtc' => true
		]
	];

	/** Elements closed by "generate all implied end tags thoroughly". */
	public static $thoroughImpliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'caption' => true, 'colgroup' => true, 'dd' => true,
			'dt' => true, 'li' => true, 'optgroup' => true,
			'option' => true, 'p' => true, 'rb' => true,
			'rp' => true, 'rt' => true, 'rtc' => true,
			'tbody' => true, 'td' => true, 'tfoot' => true,
			'th' => true, 'thead' => true, 'tr' => true
		]
	];

	public static $tableCellSet = [
		self::HTML_NAMESPACE => [ 'td' => true, 'th' => true ]
	];

	public static $tableContextSet = [
		self::HTML_NAMESPACE => [ 'table' => true, 'template' => true, 'html' => true ]
	];

	public static $tableBodyContextSet = [
		self::HTML_NAMESPACE => [
			'tbody' => true,
			'tfoot' => true,
			'thead' => true,
			'template' => true,
			'html' => true
		]
	];

	public static $tableRowContextSet = [
		self::HTML_NAMESPACE => [ 'tr' => true, 'template' => true, 'html' => true ]
	];

	# OMITTED: formAssociatedSet, since we don't allow <form>

	/** Boundary elements for the generic "has an element in scope" test. */
	public static $inScopeSet = [
		self::HTML_NAMESPACE => [
			'applet' => true, 'caption' => true, 'html' => true,
			'marquee' => true, 'object' => true, 'table' => true,
			'td' => true, 'template' => true, 'th' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true,
			'desc' => true,
			'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true,
			'ms' => true, 'mtext' => true, 'annotation-xml' => true
		]
	];

	/** Lazily built cache for inListItemScopeSet(). */
	private static $inListItemScopeSet = null;

	/**
	 * Boundary set for "list item scope": $inScopeSet plus the HTML
	 * `ol` and `ul` elements.  Built on first use and cached.
	 * @return array
	 */
	public static function inListItemScopeSet() {
		if ( self::$inListItemScopeSet !== null ) {
			return self::$inListItemScopeSet;
		}
		$scope = self::$inScopeSet;
		$scope[self::HTML_NAMESPACE]['ol'] = true;
		$scope[self::HTML_NAMESPACE]['ul'] = true;
		self::$inListItemScopeSet = $scope;
		return $scope;
	}

	/** Lazily built cache for inButtonScopeSet(). */
	private static $inButtonScopeSet = null;

	/**
	 * Boundary set for "button scope": $inScopeSet plus the HTML
	 * `button` element.  Built on first use and cached.
	 * @return array
	 */
	public static function inButtonScopeSet() {
		if ( self::$inButtonScopeSet !== null ) {
			return self::$inButtonScopeSet;
		}
		$scope = self::$inScopeSet;
		$scope[self::HTML_NAMESPACE]['button'] = true;
		self::$inButtonScopeSet = $scope;
		return $scope;
	}

	/** Boundary elements for the "has an element in table scope" test. */
	public static $inTableScopeSet = [
		self::HTML_NAMESPACE => [ 'html' => true, 'table' => true, 'template' => true ]
	];

	public static $mathmlTextIntegrationPointSet = [
		self::MATHML_NAMESPACE => [
			'mi' => true,
			'mo' => true,
			'mn' => true,
			'ms' => true,
			'mtext' => true
		]
	];

	public static $htmlIntegrationPointSet = [
		self::SVG_NAMESPACE => [ 'foreignobject' => true, 'desc' => true, 'title' => true ]
	];

	// For tidy compatibility.
	public static $tidyPWrapSet = [
		self::HTML_NAMESPACE => [
			'body' => true,
			'blockquote' => true,
			// We parse with <body> as the fragment context, but the top-level
			// element on the stack is actually <html>.  We could use the
			// "adjusted current node" everywhere to work around this, but it's
			// easier just to add <html> to the p-wrap set.
			'html' => true,
		],
	];

	// Elements that do not close an open mw:p-wrap wrapper when inserted
	// (see BalanceStack::insertElement()).
	public static $tidyInlineSet = [
		self::HTML_NAMESPACE => [
			'a' => true, 'abbr' => true, 'acronym' => true, 'applet' => true, 'b' => true,
			'basefont' => true, 'bdo' => true, 'big' => true, 'br' => true, 'button' => true,
			'cite' => true, 'code' => true, 'dfn' => true, 'em' => true, 'font' => true,
			'i' => true, 'iframe' => true, 'img' => true, 'input' => true, 'kbd' => true,
			'label' => true, 'legend' => true, 'map' => true, 'object' => true, 'param' => true,
			'q' => true, 'rb' => true, 'rbc' => true, 'rp' => true, 'rt' => true,
			'rtc' => true, 'ruby' => true, 's' => true, 'samp' => true, 'select' => true,
			'small' => true, 'span' => true, 'strike' => true, 'strong' => true, 'sub' => true,
			'sup' => true, 'textarea' => true, 'tt' => true, 'u' => true, 'var' => true,
		],
	];
}
274
/**
 * A BalanceElement is a simplified version of a DOM Node.  The main
 * difference is that we only keep BalanceElements around for nodes
 * currently on the BalanceStack of open elements.  As soon as an
 * element is closed, with some minor exceptions relating to the
 * tree builder "adoption agency algorithm", the element and all its
 * children are serialized to a string using the flatten() method.
 * This keeps our memory usage low.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceElement {
	/**
	 * The namespace of the element.
	 * @var string $namespaceURI
	 */
	public $namespaceURI;
	/**
	 * The lower-cased name of the element.
	 * @var string $localName
	 */
	public $localName;
	/**
	 * Attributes for the element, in array form
	 * @var array $attribs
	 */
	public $attribs;

	/**
	 * Initialized to the empty string by the constructor.  Declared
	 * explicitly so that the constructor no longer creates a dynamic
	 * property.  NOTE(review): nothing in this class reads it; confirm
	 * there are no outside users before removing it.
	 * @var string $contents
	 */
	public $contents;

	/**
	 * Parent of this element, null if the element is not attached to a
	 * parent, or the string "flat" if this element has already been
	 * flattened into its parent.
	 * @var BalanceElement|string|null $parent
	 */
	public $parent;

	/**
	 * An array of children of this element.  Typically only the last
	 * child will be an actual BalanceElement object; the rest will
	 * be strings, representing either text nodes or flattened
	 * BalanceElement objects.
	 * @var array $children
	 */
	public $children;

	/**
	 * A unique string identifier for Noah's Ark purposes, lazy initialized
	 * @var string|null $noahKey
	 */
	private $noahKey;

	/**
	 * The next active formatting element in the list, or null if this is the
	 * end of the AFE list or if the element is not in the AFE list.
	 */
	public $nextAFE;

	/**
	 * The previous active formatting element in the list, or null if this is
	 * the start of the list or if the element is not in the AFE list.
	 */
	public $prevAFE;

	/**
	 * The next element in the Noah's Ark species bucket.
	 */
	public $nextNoah;

	/**
	 * Make a new BalanceElement corresponding to the HTML DOM Element
	 * with the given localname, namespace, and attributes.
	 *
	 * @param string $namespaceURI The namespace of the element.
	 * @param string $localName The lowercased name of the tag.
	 * @param array $attribs Attributes of the element
	 */
	public function __construct( $namespaceURI, $localName, array $attribs ) {
		$this->localName = $localName;
		$this->namespaceURI = $namespaceURI;
		$this->attribs = $attribs;
		$this->contents = '';
		$this->parent = null;
		$this->children = [];
	}

	/**
	 * Remove the given child from this element.
	 * @param BalanceElement $elt
	 */
	private function removeChild( BalanceElement $elt ) {
		if ( $this->parent === 'flat' ) {
			// Only build the message on failure: interpolating $this calls
			// __toString(), which serializes this element and its children.
			Assert::precondition(
				false, "Can't removeChild after flattening $this"
			);
		}
		Assert::parameter(
			$elt->parent === $this, 'elt', 'must have $this as a parent'
		);
		$idx = array_search( $elt, $this->children, true );
		Assert::parameter( $idx !== false, '$elt', 'must be a child of $this' );
		$elt->parent = null;
		array_splice( $this->children, $idx, 1 );
	}

	/**
	 * Find $a in the list of children and insert $b before it.
	 *
	 * If $b is an element which currently has a parent, it is detached
	 * from that parent first.
	 *
	 * @param BalanceElement $a
	 * @param BalanceElement|string $b
	 */
	public function insertBefore( BalanceElement $a, $b ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't insertBefore after flattening."
		);
		$idx = array_search( $a, $this->children, true );
		Assert::parameter( $idx !== false, '$a', 'must be a child of $this' );
		if ( !is_string( $b ) ) {
			Assert::parameter( $b->parent !== 'flat', '$b', "Can't be flat" );
			if ( $b === $a ) {
				// Inserting a node before itself leaves the child list as is.
				return;
			}
			if ( $b->parent !== null ) {
				$wasSibling = ( $b->parent === $this );
				$b->parent->removeChild( $b );
				if ( $wasSibling ) {
					// Detaching $b from our own child list may have shifted
					// $a one slot to the left; look its position up again so
					// that $b really ends up immediately before $a.
					$idx = array_search( $a, $this->children, true );
				}
			}
			$b->parent = $this;
		}
		array_splice( $this->children, $idx, 0, [ $b ] );
	}

	/**
	 * Append $elt to the end of the list of children.
	 * @param BalanceElement|string $elt
	 */
	public function appendChild( $elt ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't appendChild after flattening."
		);
		if ( is_string( $elt ) ) {
			array_push( $this->children, $elt );
			return;
		}
		// Remove $elt from parent, if it had one.
		if ( $elt->parent !== null ) {
			$elt->parent->removeChild( $elt );
		}
		array_push( $this->children, $elt );
		$elt->parent = $this;
	}

	/**
	 * Transfer all of the children of $elt to $this.
	 * @param BalanceElement $elt
	 */
	public function adoptChildren( BalanceElement $elt ) {
		Assert::precondition(
			$elt->parent !== 'flat', "Can't adoptChildren after flattening."
		);
		foreach ( $elt->children as $child ) {
			if ( !is_string( $child ) ) {
				// This is an optimization which avoids an O(n^2) set of
				// array_splice operations.
				$child->parent = null;
			}
			$this->appendChild( $child );
		}
		$elt->children = [];
	}

	/**
	 * Flatten this node and all of its children into a string, as specified
	 * by the HTML serialization specification, and replace this node
	 * in its parent by that string.
	 *
	 * In tidy compatibility mode the children are flattened first, an
	 * `mw:p-wrap` element is renamed to `p` (and dropped entirely when it
	 * contains nothing but whitespace), and a blank attribute-less
	 * `tr` or `li` gets the `mw-empty-elt` class.
	 *
	 * @param bool $tidyCompat Whether to apply the tidy compatibility rules.
	 * @return string The string which replaced this node in its parent.
	 * @see __toString()
	 */
	public function flatten( $tidyCompat = false ) {
		Assert::parameter( $this->parent !== null, '$this', 'must be a child' );
		Assert::parameter( $this->parent !== 'flat', '$this', 'already flat' );
		$idx = array_search( $this, $this->parent->children, true );
		Assert::parameter(
			$idx !== false, '$this', 'must be a child of its parent'
		);
		if ( $tidyCompat ) {
			$blank = true;
			foreach ( $this->children as $elt ) {
				if ( !is_string( $elt ) ) {
					$elt = $elt->flatten( $tidyCompat );
				}
				if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) {
					$blank = false;
				}
			}
			if ( $this->isHtmlNamed( 'mw:p-wrap' ) ) {
				$this->localName = 'p';
			} elseif ( $blank ) {
				// Add 'mw-empty-elt' class so elements can be hidden via CSS
				// for compatibility with legacy tidy.
				if ( !count( $this->attribs ) &&
					( $this->localName === 'tr' || $this->localName === 'li' )
				) {
					$this->attribs = [ 'class' => "mw-empty-elt" ];
				}
				// Only blank p-wrap elements are suppressed from the output.
				$blank = false;
			}
			$flat = $blank ? '' : "{$this}";
		} else {
			$flat = "{$this}";
		}
		$this->parent->children[$idx] = $flat;
		$this->parent = 'flat'; # for assertion checking
		return $flat;
	}

	/**
	 * Serialize this node and all of its children to a string, as specified
	 * by the HTML serialization specification.
	 *
	 * @return string The serialization of the BalanceElement
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments
	 */
	public function __toString() {
		$encAttribs = '';
		foreach ( $this->attribs as $name => $value ) {
			$encValue = Sanitizer::encodeAttribute( $value );
			$encAttribs .= " $name=\"$encValue\"";
		}
		if ( !$this->isA( BalanceSets::$emptyElementSet ) ) {
			$out = "<{$this->localName}{$encAttribs}>";
			// flatten children
			foreach ( $this->children as $elt ) {
				$out .= "{$elt}";
			}
			$out .= "</{$this->localName}>";
		} else {
			$out = "<{$this->localName}{$encAttribs} />";
			Assert::invariant(
				count( $this->children ) === 0,
				"Empty elements shouldn't have children."
			);
		}
		return $out;
	}

	# Utility functions on BalanceElements.

	/**
	 * Determine if $this represents a specific HTML tag, is a member of
	 * a tag set, or is equal to another BalanceElement.
	 *
	 * @param BalanceElement|array|string $set The target BalanceElement,
	 *   set (from the BalanceSets class), or string (HTML tag name).
	 * @return bool
	 */
	public function isA( $set ) {
		if ( $set instanceof BalanceElement ) {
			return $this === $set;
		} elseif ( is_array( $set ) ) {
			return isset( $set[$this->namespaceURI] ) &&
				isset( $set[$this->namespaceURI][$this->localName] );
		} else {
			# assume this is an HTML element name.
			return $this->isHtml() && $this->localName === $set;
		}
	}

	/**
	 * Determine if this element is an HTML element with the specified name
	 * @param string $tagName
	 * @return bool
	 */
	public function isHtmlNamed( $tagName ) {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE
			&& $this->localName === $tagName;
	}

	/**
	 * Determine if $this represents an element in the HTML namespace.
	 *
	 * @return bool
	 */
	public function isHtml() {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE;
	}

	/**
	 * Determine if $this represents a MathML text integration point,
	 * as defined in the HTML5 specification.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#mathml-text-integration-point
	 */
	public function isMathmlTextIntegrationPoint() {
		return $this->isA( BalanceSets::$mathmlTextIntegrationPointSet );
	}

	/**
	 * Determine if $this represents an HTML integration point,
	 * as defined in the HTML5 specification.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#html-integration-point
	 */
	public function isHtmlIntegrationPoint() {
		if ( $this->isA( BalanceSets::$htmlIntegrationPointSet ) ) {
			return true;
		}
		if (
			$this->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
			$this->localName === 'annotation-xml' &&
			isset( $this->attribs['encoding'] )
		) {
			// The encoding attribute is matched case-insensitively.
			$encoding = $this->attribs['encoding'];
			return strcasecmp( $encoding, 'text/html' ) === 0 ||
				strcasecmp( $encoding, 'application/xhtml+xml' ) === 0;
		}
		return false;
	}

	/**
	 * Get a string key for the Noah's Ark algorithm.
	 *
	 * The key is the serialization of the namespace, the local name and
	 * the attributes sorted by name; it is computed once and cached.
	 *
	 * @return string
	 */
	public function getNoahKey() {
		if ( $this->noahKey === null ) {
			$attribs = $this->attribs;
			ksort( $attribs );
			$this->noahKey = serialize( [ $this->namespaceURI, $this->localName, $attribs ] );
		}
		return $this->noahKey;
	}
}
600
601 /**
602 * The "stack of open elements" as defined in the HTML5 tree builder
603 * spec. This contains methods to ensure that content (start tags, text)
604 * are inserted at the correct place in the output string, and to
605 * flatten BalanceElements as they are closed to avoid holding onto
606 * a complete DOM tree for the document in memory.
607 *
608 * The stack defines a PHP iterator to traverse it in "reverse order",
609 * that is, the most-recently-added element is visited first in a
610 * foreach loop.
611 *
612 * @ingroup Parser
613 * @since 1.27
614 * @see https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
615 */
616 class BalanceStack implements IteratorAggregate {
617 /**
618 * Backing storage for the stack.
619 * @var array $elements
620 */
621 private $elements = [];
622 /**
623 * Foster parent mode determines how nodes are inserted into the
624 * stack.
625 * @var bool $fosterParentMode
626 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
627 */
628 public $fosterParentMode = false;
629 /**
630 * Tidy compatibility mode, determines behavior of body/blockquote
631 */
632 public $tidyCompat = false;
633 /**
634 * Reference to the current element
635 */
636 public $currentNode;
637
/**
 * Set up the stack so that it holds exactly one BalanceElement, the
 * root &lt;html&gt; node, which is also the initial current node.
 */
public function __construct() {
	# the root <html> element is always at the bottom of the stack
	$root = new BalanceElement( BalanceSets::HTML_NAMESPACE, 'html', [] );
	$this->elements[] = $root;
	$this->currentNode = $root;
}
650
/**
 * Build the output of the tree builder by concatenating all the
 * children of the root &lt;html&gt; node.  Children which are still
 * BalanceElement objects are flattened on the way.
 * @return string
 */
public function getOutput() {
	// The enclosing '<html>....</html>' is deliberately left out.
	$result = '';
	foreach ( $this->elements[0]->children as $child ) {
		if ( !is_string( $child ) ) {
			$child = $child->flatten( $this->tidyCompat );
		}
		$result .= $child;
	}
	return $result;
}
665
/**
 * Insert text at the appropriate place for inserting a node.
 *
 * In foster parent mode, text arriving while the current node is a
 * table, table section or row is foster parented.  In tidy
 * compatibility mode, text arriving directly inside an element of the
 * p-wrap set is first wrapped in a new `mw:p-wrap` element.
 *
 * @param string $value
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertText( $value ) {
	if (
		$this->fosterParentMode &&
		$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
	) {
		$this->fosterParent( $value );
	} elseif (
		$this->tidyCompat &&
		$this->currentNode->isA( BalanceSets::$tidyPWrapSet )
	) {
		// Open the wrapper, then retry: the wrapper is now the current
		// node, so the text is routed against it.
		$this->insertHTMLElement( 'mw:p-wrap', [] );
		$this->insertText( $value );
	} else {
		$this->currentNode->appendChild( $value );
	}
}
687
/**
 * Create a BalanceElement in the given namespace, insert it at the
 * appropriate place, and push it on to the open elements stack.
 * @param string $namespaceURI The element namespace
 * @param string $tag The tag name
 * @param array $attribs Normalized attributes, in array form (they are
 *   passed straight to the BalanceElement constructor).
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-foreign-element
 */
public function insertForeignElement( $namespaceURI, $tag, $attribs ) {
	$created = new BalanceElement( $namespaceURI, $tag, $attribs );
	return $this->insertElement( $created );
}
702
/**
 * Create an element in the HTML namespace, insert it at the
 * appropriate place, and push it on to the open elements stack.
 * @param string $tag The tag name
 * @param array $attribs Normalized attributes, in array form.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-an-html-element
 */
public function insertHTMLElement( $tag, $attribs ) {
	$namespace = BalanceSets::HTML_NAMESPACE;
	return $this->insertForeignElement( $namespace, $tag, $attribs );
}
716
/**
 * Insert an element at the appropriate place and push it on to the
 * open elements stack.
 *
 * If the current node is an `mw:p-wrap` wrapper and $elt is not in the
 * tidy inline set, the wrapper is popped first.  In foster parent mode,
 * an element arriving while the current node is a table, table section
 * or row is handed to fosterParent(), and the value it returns is the
 * element that gets pushed.
 *
 * @param BalanceElement $elt
 * @return BalanceElement The element which was pushed on to the stack.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertElement( BalanceElement $elt ) {
	if (
		$this->currentNode->isHtmlNamed( 'mw:p-wrap' ) &&
		!$elt->isA( BalanceSets::$tidyInlineSet )
	) {
		// Tidy compatibility.
		$this->pop();
	}
	if (
		$this->fosterParentMode &&
		$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
	) {
		$elt = $this->fosterParent( $elt );
	} else {
		$this->currentNode->appendChild( $elt );
	}
	// The messages interpolate $elt, which serializes the element via
	// __toString(); build them only when an invariant is violated.
	if ( $elt->parent === null ) {
		Assert::invariant( false, "$elt must be in tree" );
	}
	if ( $elt->parent === 'flat' ) {
		Assert::invariant( false, "$elt must not have been previous flattened" );
	}
	array_push( $this->elements, $elt );
	$this->currentNode = $elt;
	return $elt;
}
746
/**
 * Check whether $tag is in scope on the stack, using the default
 * scope boundary set.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-scope
 */
public function inScope( $tag ) {
	$boundary = BalanceSets::$inScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
756
/**
 * Check whether $tag is in button scope on the stack.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-button-scope
 */
public function inButtonScope( $tag ) {
	$boundary = BalanceSets::inButtonScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
766
/**
 * Check whether $tag is in list item scope on the stack.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-list-item-scope
 */
public function inListItemScope( $tag ) {
	$boundary = BalanceSets::inListItemScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
776
/**
 * Check whether $tag is in table scope on the stack.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-table-scope
 */
public function inTableScope( $tag ) {
	$boundary = BalanceSets::$inTableScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
786
/**
 * Check whether $tag is in the scope delimited by $set.
 *
 * The stack is walked from the current node towards the root.  The
 * answer is true if an element matching $tag is met before (or at)
 * the first element matching $set, and false otherwise.
 *
 * @param BalanceElement|array|string $tag
 * @param BalanceElement|array|string $set
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-the-specific-scope
 */
public function inSpecificScope( $tag, $set ) {
	foreach ( $this as $node ) {
		// The target is tested first, so an element which is both the
		// target and a scope boundary counts as "in scope".
		$isTarget = $node->isA( $tag );
		if ( $isTarget || $node->isA( $set ) ) {
			return $isTarget;
		}
	}
	return false;
}
805
/**
 * Generate implied end tags: keep popping the current node for as long
 * as it belongs to the implied end tag set.
 * @param string $butnot If not null, stop as soon as the current node
 *   is the HTML element with this name.
 * @param bool $thorough True if we should generate end tags thoroughly.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#generate-implied-end-tags
 */
public function generateImpliedEndTags( $butnot = null, $thorough = false ) {
	if ( $thorough ) {
		$closable = BalanceSets::$thoroughImpliedEndTagsSet;
	} else {
		$closable = BalanceSets::$impliedEndTagsSet;
	}
	while ( $this->currentNode ) {
		$node = $this->currentNode;
		$excluded = $butnot !== null && $node->isHtmlNamed( $butnot );
		if ( $excluded || !$node->isA( $closable ) ) {
			return;
		}
		$this->pop();
	}
}
826
/**
 * Return the adjusted current node: the fragment context when one is
 * given and the stack holds a single element, the current node
 * otherwise.
 * @param BalanceElement|null $fragmentContext
 * @return BalanceElement|null
 */
public function adjustedCurrentNode( $fragmentContext ) {
	if ( $fragmentContext && count( $this->elements ) === 1 ) {
		return $fragmentContext;
	}
	return $this->currentNode;
}
834
/**
 * Get an iterator which walks the stack from the current node down
 * to the root node.
 * @return Iterator
 */
public function getIterator() {
	$reversed = new ReverseArrayIterator( $this->elements );
	return $reversed;
}
843
/**
 * Look up the BalanceElement stored at position $idx of the stack;
 * the root element lives at position 0.
 * @param int $idx
 * @return BalanceElement
 */
public function node( $idx ) {
	$found = $this->elements[ $idx ];
	return $found;
}
853
/**
 * Put $elt at position $idx of the BalanceStack, in place of the
 * element currently stored there.  Neither element may have been
 * flattened.  The current node is updated when the top of the stack
 * is replaced.
 * @param int $idx
 * @param BalanceElement $elt
 */
public function replaceAt( $idx, BalanceElement $elt ) {
	$previous = $this->elements[$idx];
	Assert::precondition(
		$previous->parent !== 'flat',
		'Replaced element should not have already been flattened.'
	);
	Assert::precondition(
		$elt->parent !== 'flat',
		'New element should not have already been flattened.'
	);
	$this->elements[$idx] = $elt;
	$top = count( $this->elements ) - 1;
	if ( $idx === $top ) {
		$this->currentNode = $elt;
	}
}
873
/**
 * Find the position in the BalanceStack of the topmost element which
 * matches the given BalanceElement, set, or HTML tag name string.
 * @param BalanceElement|array|string $tag
 * @return int The position (0 is the root element), or -1 if nothing
 *   on the stack matches.
 */
public function indexOf( $tag ) {
	// Search from the current node towards the root.
	$pos = count( $this->elements );
	while ( $pos-- > 0 ) {
		if ( $this->elements[$pos]->isA( $tag ) ) {
			return $pos;
		}
	}
	return -1;
}
888
/**
 * Count the elements currently on the BalanceStack.
 * @return int
 */
public function length() {
	$size = count( $this->elements );
	return $size;
}
896
/**
 * Take the current node off the BalanceStack and make the element
 * below it (or null, if the stack is now empty) the current node.
 * The removed element is flattened, unless it is an `mw:p-wrap`
 * wrapper, which is left unflattened here.
 */
public function pop() {
	$removed = array_pop( $this->elements );
	$remaining = count( $this->elements );
	$this->currentNode = $remaining ? $this->elements[ $remaining - 1 ] : null;
	if ( $removed->isHtmlNamed( 'mw:p-wrap' ) ) {
		return;
	}
	$removed->flatten( $this->tidyCompat );
}
912
/**
 * Pop nodes off the top of the BalanceStack until the node at
 * position $idx has itself been popped, i.e. until only $idx elements
 * remain.  Nodes are flattened as described for pop().
 * @param int $idx
 */
public function popTo( $idx ) {
	// pop() removes exactly one element, so $length tracks the stack size.
	for ( $length = count( $this->elements ); $length > $idx; $length-- ) {
		$this->pop();
	}
}
924
/**
 * Keep popping the stack until an element with the specified HTML
 * tagname (or matching the given set) has been popped; that element
 * is the last one removed.
 * @param BalanceElement|array|string $tag
 */
public function popTag( $tag ) {
	while ( $this->currentNode ) {
		// Test before popping: pop() changes the current node.
		$matched = $this->currentNode->isA( $tag );
		$this->pop();
		if ( $matched ) {
			break;
		}
	}
}
940
/**
 * Pop elements off the stack until the current node is a member of
 * the specified set; that element itself stays on the stack.
 * @param BalanceElement|array|string $set
 */
public function clearToContext( $set ) {
	// Stop while one element is left: the <html> elt is never popped off.
	while (
		count( $this->elements ) > 1 &&
		!$this->currentNode->isA( $set )
	) {
		$this->pop();
	}
}
955
/**
 * Remove the given $elt from the BalanceStack, optionally
 * flattening it in the process.
 *
 * When $elt was the top of the stack, the element below it becomes
 * the current node (or null, if the stack is left empty).
 *
 * @param BalanceElement $elt The element to remove.
 * @param bool $flatten Whether to flatten the removed element.
 */
public function removeElement( BalanceElement $elt, $flatten = true ) {
	Assert::parameter(
		$elt->parent !== 'flat',
		'$elt',
		'$elt should not already have been flattened.'
	);
	// $elt may be detached (parent === null); only look at the
	// grandparent link when there is a parent to ask.
	Assert::parameter(
		$elt->parent === null || $elt->parent->parent !== 'flat',
		'$elt',
		'The parent of $elt should not already have been flattened.'
	);
	$idx = array_search( $elt, $this->elements, true );
	Assert::parameter( $idx !== false, '$elt', 'must be in stack' );
	array_splice( $this->elements, $idx, 1 );
	if ( $idx === count( $this->elements ) ) {
		// The top of the stack was removed.  Mirror pop(): an empty
		// stack has no current node.
		$this->currentNode = $idx > 0 ? $this->elements[$idx - 1] : null;
	}
	if ( $flatten ) {
		// serialize $elt into its parent
		// otherwise, it will eventually serialize when the parent
		// is serialized, we just hold onto the memory for its
		// tree of objects a little longer.
		$elt->flatten( $this->tidyCompat );
	}
	Assert::postcondition(
		array_search( $elt, $this->elements, true ) === false,
		'$elt should no longer be in open elements stack'
	);
}
991
/**
 * Find $a in the BalanceStack and insert $b immediately after it
 * (that is, one position closer to the top of the stack).
 *
 * @param BalanceElement $a Element already on the stack.
 * @param BalanceElement $b Element to insert.
 */
public function insertAfter( BalanceElement $a, BalanceElement $b ) {
	$idx = $this->indexOf( $a );
	// Other callers in this class detect a missing element by testing
	// the result of indexOf() against 0, so "not found" is a negative
	// index. Checking only `!== false` would let a missing $a through
	// and $b would silently be spliced in at the bottom of the stack.
	Assert::parameter( $idx !== false && $idx >= 0, '$a', 'must be in stack' );
	if ( $idx === count( $this->elements ) - 1 ) {
		// $a is the top of the stack, so $b becomes the current node.
		$this->elements[] = $b;
		$this->currentNode = $b;
	} else {
		array_splice( $this->elements, $idx + 1, 0, [ $b ] );
	}
}
1007
1008 # Fostering and adoption.
1009
/**
 * Foster parent the given $elt in the stack of open elements.
 *
 * The foster parent is the last `template` on the stack if it is above
 * the last `table` (or there is no table), otherwise the parent of the
 * last `table` (inserting before that table), otherwise the bottom
 * (`html`) element.
 *
 * @param BalanceElement|string $elt Element or text to foster.
 * @return BalanceElement|string Normally $elt itself. In tidy
 *   compatibility mode, when $elt is an `mw:p-wrap` element and the
 *   node just before the insertion point already is one, that existing
 *   wrapper is returned instead and $elt is not inserted.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
 */
private function fosterParent( $elt ) {
	$tableIdx = $this->indexOf( 'table' );
	$templateIdx = $this->indexOf( 'template' );
	// Sibling to insert in front of; null means "append to $parent".
	$before = null;

	if ( $templateIdx >= 0 && ( $tableIdx < 0 || $templateIdx > $tableIdx ) ) {
		$parent = $this->elements[$templateIdx];
	} elseif ( $tableIdx >= 0 ) {
		$before = $this->elements[$tableIdx];
		$parent = $before->parent;
		# Assume all tables have parents, since we're not running scripts!
		Assert::invariant(
			$parent !== null, "All tables should have parents"
		);
	} else {
		// Neither a template nor a table: use the `html` element.
		$parent = $this->elements[0];
	}

	if ( $this->tidyCompat ) {
		if ( !is_string( $elt ) ) {
			// Fostering an element: an incoming p-wrapper is merged
			// with a p-wrapper that directly precedes the insertion
			// point, if there is one.
			if ( $elt->isHtmlNamed( 'mw:p-wrap' ) ) {
				if ( $before ) {
					$insertAt = array_search( $before, $parent->children, true );
				} else {
					$insertAt = count( $parent->children );
				}
				$preceding = $insertAt > 0 ? $parent->children[$insertAt - 1] : '';
				if (
					$preceding instanceof BalanceElement &&
					$preceding->isHtmlNamed( 'mw:p-wrap' )
				) {
					return $preceding; // Re-use existing p-wrapper.
				}
			}
		} elseif ( $parent->isA( BalanceSets::$tidyPWrapSet ) ) {
			// Fostering text into a parent that wants p-wrapping:
			// open a wrapper and put the text inside it.
			$this->insertHTMLElement( 'mw:p-wrap', [] );
			$this->insertText( $elt );
			return $elt;
		}
	}

	if ( $before ) {
		$parent->insertBefore( $before, $elt );
	} else {
		$parent->appendChild( $elt );
	}
	return $elt;
}
1066
/**
 * Run the "adoption agency algorithm" (AAA) for the given subject
 * tag name.
 *
 * The comments inside the method paraphrase the corresponding steps
 * of the HTML specification.
 *
 * @param string $tag The subject tag name.
 * @param BalanceActiveFormattingElements $afe The current
 *   active formatting elements list.
 * @return bool True if the adoption agency algorithm "did something",
 *   false if more processing is required by the caller.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
 */
public function adoptionAgency( $tag, $afe ) {
	// Shortcut: the current node is an HTML element named $tag and is
	// not an active formatting element. Pop it and stop.
	if (
		$this->currentNode->isHtmlNamed( $tag ) &&
		!$afe->isInList( $this->currentNode )
	) {
		$this->pop();
		return true; // no more handling required
	}

	// Outer loop: runs at most eight times.
	for ( $outer = 0; $outer < 8; $outer++ ) {
		// The formatting element is the last entry named $tag in the
		// list of active formatting elements, looking no further back
		// than the last marker.
		$fmtElt = $afe->findElementByTag( $tag );

		// No such entry: the caller must fall back to the
		// "any other end tag" handling.
		if ( !$fmtElt ) {
			return false; // false means handle by the default case
		}

		// Listed but not open (parse error): drop it from the list.
		$fmtIdx = $this->indexOf( $fmtElt );
		if ( $fmtIdx < 0 ) {
			$afe->remove( $fmtElt );
			return true; // true means no more handling required
		}

		// Open but not in scope (parse error): ignore the token.
		if ( !$this->inScope( $fmtElt ) ) {
			return true;
		}

		// The furthest block is the first "special" element found when
		// walking from just below the formatting element towards the
		// top of the stack. There might not be one.
		$furthest = null;
		$furthestIdx = -1;
		$depth = $this->length();
		for ( $i = $fmtIdx + 1; $i < $depth; $i++ ) {
			if ( $this->node( $i )->isA( BalanceSets::$specialSet ) ) {
				$furthest = $this->node( $i );
				$furthestIdx = $i;
				break;
			}
		}

		// Without a furthest block, pop everything from the current
		// node up to and including the formatting element, remove the
		// formatting element from the list, and stop.
		if ( !$furthest ) {
			$this->popTag( $fmtElt );
			$afe->remove( $fmtElt );
			return true;
		}

		// The common ancestor is the stack entry directly above the
		// formatting element.
		$ancestor = $this->node( $fmtIdx - 1 );

		// A placeholder entry remembers where the formatting element
		// sits in the list of active formatting elements.
		$bookmark = new BalanceElement( '[bookmark]', '[bookmark]', [] );
		$afe->insertAfter( $fmtElt, $bookmark );

		// "Last node" starts out as the furthest block, and the walk
		// up the stack starts from the furthest block's position.
		$lastNode = $furthest;
		$cursor = $furthestIdx;

		// Inner loop; $inner is the spec's inner loop counter.
		for ( $inner = 1; ; $inner++ ) {
			// Step to the entry directly above the previous one. When
			// the previous entry was removed from the stack, the same
			// decrement lands on what used to be above it.
			$cursor--;
			$node = $this->node( $cursor );

			// Reaching the formatting element ends the inner loop.
			if ( $node === $fmtElt ) {
				break;
			}

			// Past the third iteration, entries that are still active
			// formatting elements are removed from that list.
			$stillActive = $afe->isInList( $node );
			if ( $stillActive && $inner > 3 ) {
				$afe->remove( $node );
				$stillActive = false;
			}

			// Entries that are not active formatting elements are just
			// taken off the stack of open elements.
			if ( !$stillActive ) {
				// Don't flatten here, since we're about to relocate
				// parts of this $node.
				$this->removeElement( $node, false );
				continue;
			}

			// Replace the entry with a fresh element of the same
			// namespace, name and attributes, both in the list of
			// active formatting elements and on the stack.
			$clone = new BalanceElement(
				$node->namespaceURI, $node->localName, $node->attribs );
			$afe->replace( $node, $clone );
			$this->replaceAt( $cursor, $clone );

			// While last node is still the furthest block, the
			// bookmark moves to directly after the new element.
			if ( $lastNode === $furthest ) {
				$afe->remove( $bookmark );
				$afe->insertAfter( $clone, $bookmark );
			}

			// Re-parent last node under the new element, which then
			// becomes last node itself.
			$clone->appendChild( $lastNode );
			$lastNode = $clone;
		}

		// Attach whatever last node ended up being: foster-parented if
		// the common ancestor is a table, tbody, tfoot, thead or tr
		// (and foster parenting is on), else appended to the ancestor.
		if (
			$this->fosterParentMode &&
			$ancestor->isA( BalanceSets::$tableSectionRowSet )
		) {
			$this->fosterParent( $lastNode );
		} else {
			$ancestor->appendChild( $lastNode );
		}

		// Build a replacement for the formatting element, move all of
		// the furthest block's children into it, and make it the
		// furthest block's child.
		$replacement = new BalanceElement(
			$fmtElt->namespaceURI, $fmtElt->localName, $fmtElt->attribs );
		$replacement->adoptChildren( $furthest );
		$furthest->appendChild( $replacement );

		// In the list of active formatting elements the replacement
		// takes the bookmarked position.
		$afe->remove( $fmtElt );
		$afe->replace( $bookmark, $replacement );

		// On the stack of open elements the replacement goes directly
		// below the furthest block.
		$this->removeElement( $fmtElt );
		$this->insertAfter( $furthest, $replacement );
	}

	return true;
}
1288
/**
 * Return the contents of the open elements stack as a string for
 * debugging: the local names of all entries, bottom of the stack
 * first, separated by single spaces.
 *
 * @return string
 */
public function __toString() {
	$names = [];
	foreach ( $this->elements as $elt ) {
		$names[] = $elt->localName;
	}
	// Separator first: the reversed (array, separator) argument order
	// is a legacy form that PHP 8 no longer accepts.
	return implode( ' ', $names );
}
1301 }
1302
/**
 * A pseudo-element used as a marker in the list of active formatting elements.
 *
 * A marker only carries the two links that chain it into that
 * doubly-linked list; both are unset (null) on a fresh marker.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceMarker {
	/** @var object|null The following entry in the list, if any */
	public $nextAFE = null;

	/** @var object|null The preceding entry in the list, if any */
	public $prevAFE = null;
}
1313
/**
 * The list of active formatting elements, which is used to handle
 * mis-nested formatting element tags in the HTML5 tree builder
 * specification.
 *
 * The list is a doubly-linked list threaded through the `prevAFE` and
 * `nextAFE` properties of its entries, which are BalanceElement
 * objects and BalanceMarker objects.
 *
 * @ingroup Parser
 * @since 1.27
 * @see https://html.spec.whatwg.org/multipage/syntax.html#list-of-active-formatting-elements
 */
class BalanceActiveFormattingElements {
	/** The last (most recent) element in the list */
	private $tail;

	/** The first (least recent) element in the list */
	private $head;

	/**
	 * An array of arrays representing the population of elements in each bucket
	 * according to the Noah's Ark clause. The outer array is stack-like, with each
	 * integer-indexed element representing a segment of the list, bounded by
	 * markers. The first element represents the segment of the list before the
	 * first marker.
	 *
	 * The inner arrays are indexed by "Noah key", which is a string which uniquely
	 * identifies each bucket according to the rules in the spec. The value in
	 * the inner array is the first (least recently inserted) element in the bucket,
	 * and subsequent members of the bucket can be found by iterating through the
	 * singly-linked list via $node->nextNoah.
	 *
	 * This is optimised for the most common case of inserting into a bucket
	 * with zero members, and deleting a bucket containing one member. In the
	 * worst case, iteration through the list is still O(1) in the document
	 * size, since each bucket can have at most 3 members.
	 *
	 * Note that all bookkeeping methods of this class only ever look at the
	 * table of the newest segment (the top of this stack).
	 */
	private $noahTableStack = [ [] ];

	/**
	 * Unlink every entry of the list and drop all references held by
	 * this object (presumably so that the mutually-referencing list
	 * nodes do not keep each other alive).
	 */
	public function __destruct() {
		$next = null;
		for ( $node = $this->head; $node; $node = $next ) {
			$next = $node->nextAFE;
			$node->prevAFE = $node->nextAFE = null;
			// Markers do not declare a nextNoah property; writing it
			// would create a dynamic property on them.
			if ( !( $node instanceof BalanceMarker ) ) {
				$node->nextNoah = null;
			}
		}
		$this->head = $this->tail = $this->noahTableStack = null;
	}

	/**
	 * Append a new marker to the list and start a new, empty Noah
	 * table for the segment that follows it.
	 */
	public function insertMarker() {
		$this->linkAtTail( new BalanceMarker );
		$this->noahTableStack[] = [];
	}

	/**
	 * Follow the steps required when the spec requires us to "push onto the
	 * list of active formatting elements".
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException If $elt is already in the list.
	 */
	public function push( BalanceElement $elt ) {
		// Must not be in the list already
		if ( $elt->prevAFE !== null || $this->head === $elt ) {
			throw new ParameterAssertionException( '$elt',
				'Cannot insert a node into the AFE list twice' );
		}

		// "Noah's Ark clause" -- if the current segment (everything
		// after the last marker) already holds three copies of this
		// element, the earliest of them is dropped from the list.
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			$count = 1;
			$head = $tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
				$count++;
			}
			if ( $count >= 3 ) {
				$this->remove( $head );
			}
			$tail->nextNoah = $elt;
		}
		// Add to the main AFE list
		$this->linkAtTail( $elt );
	}

	/**
	 * Link $entry in as the new last entry of the main list.
	 * @param BalanceElement|BalanceMarker $entry
	 */
	private function linkAtTail( $entry ) {
		if ( $this->tail ) {
			$this->tail->nextAFE = $entry;
			$entry->prevAFE = $this->tail;
		} else {
			$this->head = $entry;
		}
		$this->tail = $entry;
	}

	/**
	 * Follow the steps required when the spec asks us to "clear the list of
	 * active formatting elements up to the last marker".
	 */
	public function clearToMarker() {
		// Iterate back through the list starting from the tail
		$tail = $this->tail;
		while ( $tail && !( $tail instanceof BalanceMarker ) ) {
			// Unlink the element
			$prev = $tail->prevAFE;
			$tail->prevAFE = null;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail->nextNoah = null;
			$tail = $prev;
		}
		// If we finished on a marker, unlink it and pop it off the Noah table stack
		if ( $tail ) {
			$prev = $tail->prevAFE;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail = $prev;
			array_pop( $this->noahTableStack );
		} else {
			// No marker: wipe the top-level Noah table (which is the only one)
			$this->noahTableStack[0] = [];
		}
		// If we removed all the elements, clear the head pointer
		if ( !$tail ) {
			$this->head = null;
		}
		$this->tail = $tail;
	}

	/**
	 * Find and return the last element with the specified tag between the
	 * end of the list and the last marker on the list.
	 * Used when parsing &lt;a&gt; "in body mode".
	 * @param string $tag Local name to look for.
	 * @return BalanceElement|null The element, or null if there is none.
	 */
	public function findElementByTag( $tag ) {
		$elt = $this->tail;
		while ( $elt && !( $elt instanceof BalanceMarker ) ) {
			if ( $elt->localName === $tag ) {
				return $elt;
			}
			$elt = $elt->prevAFE;
		}
		return null;
	}

	/**
	 * Determine whether an element is in the list of formatting elements.
	 * @param BalanceElement $elt
	 * @return bool
	 */
	public function isInList( BalanceElement $elt ) {
		// Every listed entry except the head has a predecessor.
		return $this->head === $elt || $elt->prevAFE;
	}

	/**
	 * Find the element $elt in the list and remove it.
	 * Used when parsing &lt;a&gt; in body mode.
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException If $elt is not in the list.
	 */
	public function remove( BalanceElement $elt ) {
		if ( $this->head !== $elt && !$elt->prevAFE ) {
			throw new ParameterAssertionException( '$elt',
				"Attempted to remove an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $elt ) {
			$this->head = $elt->nextAFE;
		}
		if ( $this->tail === $elt ) {
			$this->tail = $elt->prevAFE;
		}
		// Update previous element
		if ( $elt->prevAFE ) {
			$elt->prevAFE->nextAFE = $elt->nextAFE;
		}
		// Update next element
		if ( $elt->nextAFE ) {
			$elt->nextAFE->prevAFE = $elt->prevAFE;
		}
		// Clear pointers so that isInList() etc. will work
		$elt->prevAFE = $elt->nextAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $elt );
	}

	/**
	 * Append $elt to the end of its bucket in the Noah table of the
	 * newest segment, creating the bucket if necessary. Unlike push(),
	 * this does not enforce the three-copies limit.
	 * @param BalanceElement $elt
	 */
	private function addToNoahList( BalanceElement $elt ) {
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			$tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
			}
			$tail->nextNoah = $elt;
		}
	}

	/**
	 * Unlink $elt from its bucket in the Noah table of the newest
	 * segment. Does nothing if that table has no bucket for $elt's key
	 * or the bucket does not contain $elt.
	 * @param BalanceElement $elt
	 */
	private function removeFromNoahList( BalanceElement $elt ) {
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		$key = $elt->getNoahKey();
		if ( !isset( $table[$key] ) ) {
			// No bucket for this key: nothing to unlink. Bail out
			// instead of reading an undefined index and then
			// dereferencing the resulting null.
			return;
		}
		$noahElt = $table[$key];
		if ( $noahElt === $elt ) {
			// $elt heads the bucket: promote its successor, or drop the
			// bucket if it was the only member.
			if ( $noahElt->nextNoah ) {
				$table[$key] = $noahElt->nextNoah;
				$noahElt->nextNoah = null;
			} else {
				unset( $table[$key] );
			}
			return;
		}
		// Otherwise look for the member whose successor is $elt.
		while ( $noahElt->nextNoah ) {
			if ( $noahElt->nextNoah === $elt ) {
				// Found it, unlink
				$noahElt->nextNoah = $elt->nextNoah;
				$elt->nextNoah = null;
				return;
			}
			$noahElt = $noahElt->nextNoah;
		}
	}

	/**
	 * Find element $a in the list and replace it with element $b.
	 * $b takes over $a's position; $a ends up unlinked.
	 * @param BalanceElement $a
	 * @param BalanceElement $b
	 * @throws ParameterAssertionException If $a is not in the list.
	 */
	public function replace( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to replace an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $a ) {
			$this->head = $b;
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		// Update previous element
		if ( $a->prevAFE ) {
			$a->prevAFE->nextAFE = $b;
		}
		// Update next element
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->prevAFE = $a->prevAFE;
		$b->nextAFE = $a->nextAFE;
		$a->nextAFE = $a->prevAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $a );
		$this->addToNoahList( $b );
	}

	/**
	 * Find $a in the list and insert $b after it.
	 * @param BalanceElement $a
	 * @param BalanceElement $b
	 * @throws ParameterAssertionException If $a is not in the list.
	 */
	public function insertAfter( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to insert after an element which is not in the AFE list" );
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->nextAFE = $a->nextAFE;
		$b->prevAFE = $a;
		$a->nextAFE = $b;
		$this->addToNoahList( $b );
	}

	// @codingStandardsIgnoreStart Generic.Files.LineLength.TooLong
	/**
	 * Reconstruct the active formatting elements.
	 * @param BalanceStack $stack The open elements stack
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#reconstruct-the-active-formatting-elements
	 */
	// @codingStandardsIgnoreEnd
	public function reconstruct( $stack ) {
		$entry = $this->tail;
		// If there are no entries in the list of active formatting elements,
		// then there is nothing to reconstruct
		if ( !$entry ) {
			return;
		}
		// If the last is a marker, do nothing.
		if ( $entry instanceof BalanceMarker ) {
			return;
		}
		// Or if it is an open element, do nothing.
		if ( $stack->indexOf( $entry ) >= 0 ) {
			return;
		}

		// Loop backward through the list until we find a marker or an
		// open element
		$foundit = false;
		while ( $entry->prevAFE ) {
			$entry = $entry->prevAFE;
			if ( $entry instanceof BalanceMarker || $stack->indexOf( $entry ) >= 0 ) {
				$foundit = true;
				break;
			}
		}

		// Now loop forward, starting from the element after the current one (or
		// the first element if we didn't find a marker or open element),
		// recreating formatting elements and pushing them back onto the list
		// of open elements.
		if ( $foundit ) {
			$entry = $entry->nextAFE;
		}
		do {
			$newElement = $stack->insertHTMLElement(
				$entry->localName,
				$entry->attribs );
			// The recreated element takes the old entry's place in
			// the list, so its nextAFE is the next entry to recreate.
			$this->replace( $entry, $newElement );
			$entry = $newElement->nextAFE;
		} while ( $entry );
	}

	/**
	 * Get a string representation of the AFE list, for debugging.
	 * One line per entry, head first; inconsistent links are flagged.
	 * @return string
	 */
	public function __toString() {
		$prev = null;
		$s = '';
		for ( $node = $this->head; $node; $prev = $node, $node = $node->nextAFE ) {
			if ( $node instanceof BalanceMarker ) {
				$s .= "MARKER\n";
				continue;
			}
			$s .= $node->localName . '#' . substr( md5( spl_object_hash( $node ) ), 0, 8 );
			if ( $node->nextNoah ) {
				$s .= " (noah sibling: {$node->nextNoah->localName}#" .
					substr( md5( spl_object_hash( $node->nextNoah ) ), 0, 8 ) .
					')';
			}
			if ( $node->nextAFE && $node->nextAFE->prevAFE !== $node ) {
				$s .= " (reverse link is wrong!)";
			}
			$s .= "\n";
		}
		if ( $prev !== $this->tail ) {
			$s .= "(tail pointer is wrong!)\n";
		}
		return $s;
	}
}
1669
1670 /**
1671 * An implementation of the tree building portion of the HTML5 parsing
1672 * spec.
1673 *
1674 * This is used to balance and tidy output so that the result can
1675 * always be cleanly serialized/deserialized by an HTML5 parser. It
1676 * does *not* guarantee "conforming" output -- the HTML5 spec contains
1677 * a number of constraints which are not enforced by the HTML5 parsing
1678 * process. But the result will be free of gross errors: misnested or
1679 * unclosed tags, for example, and will be unchanged by spec-compliant
1680 * parsing followed by serialization.
1681 *
1682 * The tree building stage is structured as a state machine.
1683 * When comparing the implementation to
1684 * https://www.w3.org/TR/html5/syntax.html#tree-construction
1685 * note that each state is implemented as a function with a
1686 * name ending in `Mode` (because the HTML spec refers to them
1687 * as insertion modes). The current insertion mode is held by
1688 * the $parseMode property.
1689 *
1690 * The following simplifications have been made:
1691 * - We handle body content only (ie, we start `in body`.)
1692 * - The document is never in "quirks mode".
1693 * - All occurrences of < and > have been entity escaped, so we
1694 * can parse tags by simply splitting on those two characters.
1695 * Similarly, all attributes have been "cleaned" and are double-quoted
1696 * and escaped.
1697 * - All comments and null characters are assumed to have been removed.
1698 * - We don't alter linefeeds after <pre>/<listing>.
1699 * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
1700 * <form>, <frame>, <plaintext>, <isindex>, <textarea>, <xmp>, <iframe>,
1701 * <noembed>, <noscript>, <select>, <script>, <title>. As a result,
1702 * further simplifications can be made:
1703 * - `frameset-ok` is not tracked.
1704 * - `form element pointer` is not tracked.
1705 * - `head element pointer` is not tracked (but presumed non-null)
1706 * - Tokenizer has only a single mode.
1707 *
1708 * We generally mark places where we omit cases from the spec due to
1709 * disallowed elements with a comment: `# OMITTED: <element-name>`.
1710 *
1711 * The HTML spec keeps a flag during the parsing process to track
1712 * whether or not a "parse error" has been encountered. We don't
1713 * bother to track that flag, we just implement the error-handling
1714 * process as specified.
1715 *
1716 * @ingroup Parser
1717 * @since 1.27
1718 * @see https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
1719 */
1720 class Balancer {
/** Name of the method implementing the current insertion mode */
private $parseMode;
/** Iterator over the pieces of the input text, split on '<' */
private $bitsIterator;
/** The 'allowedHtmlElements' configuration value (array or null) */
private $allowedHtmlElements;
/** The list of active formatting elements for the current balance() run */
private $afe;
/** The stack of open elements for the current balance() run */
private $stack;
/** The 'strict' configuration value */
private $strict;
/** The 'tidyCompat' configuration value */
private $tidyCompat;
/**
 * Callback and arguments handed to balance(). Declared here so that
 * balance() does not create them as dynamic properties on assignment.
 */
private $processingCallback;
private $processingArgs;

private $textIntegrationMode = false;
private $pendingTableText;
private $originalInsertionMode;
/** The context element when parsing a fragment; balance() uses a <body> element */
private $fragmentContext;
1733
/**
 * Create a new Balancer.
 * @param array $config Balancer configuration. Includes:
 *     'strict' : boolean, defaults to false.
 *         When true, enforces syntactic constraints on input:
 *         all non-tag '<' must be escaped, all attributes must be
 *         separated by a single space and double-quoted. This is
 *         consistent with the output of the Sanitizer.
 *     'allowedHtmlElements' : array, defaults to null.
 *         When present, the keys of this associative array give
 *         the acceptable HTML tag names. When not present, no
 *         tag sanitization is done.
 *     'tidyCompat' : boolean, defaults to false.
 *         When true, the serialization algorithm is tweaked to
 *         provide historical compatibility with the old "tidy"
 *         program: <p>-wrapping is done to the children of
 *         <body> and <blockquote> elements, and empty elements
 *         are removed.
 * @throws ParameterAssertionException If 'allowedHtmlElements' names
 *     an element that the balancer does not support.
 */
public function __construct( array $config = [] ) {
	// Keys missing from $config fall back to these defaults.
	$config += [
		'strict' => false,
		'allowedHtmlElements' => null,
		'tidyCompat' => false,
	];
	$this->allowedHtmlElements = $config['allowedHtmlElements'];
	$this->strict = $config['strict'];
	$this->tidyCompat = $config['tidyCompat'];
	if ( $this->allowedHtmlElements !== null ) {
		# Sanity check!
		// Both arrays are keyed by tag name and only the keys matter,
		// so a plain key intersection finds the offending tags.
		$bad = array_intersect_key(
			$this->allowedHtmlElements,
			BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE]
		);
		if ( count( $bad ) > 0 ) {
			// Separator first: the reversed (array, separator)
			// argument order is a legacy form that PHP 8 rejects.
			$badstr = implode( ',', array_keys( $bad ) );
			throw new ParameterAssertionException(
				'$config',
				'Balance attempted with sanitization including ' .
				"unsupported elements: {$badstr}"
			);
		}
	}
}
1783
/**
 * Return a balanced HTML string for the HTML fragment given by $text,
 * subject to the caveats listed in the class description. The result
 * will typically be idempotent -- that is, rebalancing the output
 * would result in no change.
 *
 * All per-run parser state is created at the start of the call and
 * released again before returning.
 *
 * @param string $text The markup to be balanced
 * @param callable|null $processingCallback Callback to do any variable or
 *   parameter replacements in HTML attributes values
 * @param array|bool $processingArgs Arguments for the processing callback
 * @return string The balanced markup
 */
public function balance( $text, $processingCallback = null, $processingArgs = [] ) {
	$this->processingCallback = $processingCallback;
	$this->processingArgs = $processingArgs;
	$this->parseMode = 'inBodyMode';
	$this->bitsIterator = new ExplodeIterator( '<', $text );
	$this->afe = new BalanceActiveFormattingElements();
	$this->stack = new BalanceStack();
	$this->stack->tidyCompat = $this->tidyCompat;

	# The stack is constructed with an <html> element already on it.
	# Set this up as a fragment parsed with <body> as the context.
	$this->fragmentContext =
		new BalanceElement( BalanceSets::HTML_NAMESPACE, 'body', [] );
	$this->resetInsertionMode();

	// Whatever precedes the first '<' is text, not a tag.
	$leadingText = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$this->insertToken( 'text', str_replace( '>', '&gt;', $leadingText ) );

	// Every remaining piece starts with a tag.
	while ( $this->bitsIterator->valid() ) {
		$this->advance();
	}
	$this->insertToken( 'eof', null );
	$balanced = $this->stack->getOutput();

	// Free memory before returning.
	$this->bitsIterator = null;
	$this->afe = null;
	$this->stack = null;
	$this->fragmentContext = null;

	return $balanced;
}
1828
/**
 * Pass a token to the tree builder, routing it either to the foreign
 * content handler or to the method for the current insertion mode.
 *
 * @param string $token One of the strings "tag", "endtag", "text"
 *   or "eof".
 * @param string|null $value The tag name or the text; null for "eof".
 * @param array|null $attribs Attributes, passed through to the handler.
 * @param bool $selfclose Self-closing flag, passed through to the handler.
 * @return mixed False for an unsupported tag (when not in strict mode),
 *   true for an empty text token, otherwise whatever the selected
 *   handler returns.
 */
private function insertToken( $token, $value, $attribs = null, $selfclose = false ) {
	// validate tags against $unsupportedSet
	if ( $token === 'tag' || $token === 'endtag' ) {
		$unsupported = BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE];
		if ( isset( $unsupported[$value] ) ) {
			# As described in "simplifications" above, these tags are
			# not supported in the balancer.
			Assert::invariant(
				!$this->strict,
				"Unsupported $token <$value> found."
			);
			return false;
		}
	} elseif ( $token === 'text' && $value === '' ) {
		# Don't actually inject the empty string as a text token.
		return true;
	}

	// Decide whether the token follows the HTML insertion-mode rules
	// or the rules for foreign content. The branches are tested in
	// order, and the first branch whose condition holds decides.
	$adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );
	if (
		$this->stack->length() === 0 ||
		$adjusted->isHtml() ||
		$token === 'eof'
	) {
		$useHtmlRules = true;
	} elseif ( $adjusted->isMathmlTextIntegrationPoint() ) {
		// Text, and every start tag except mglyph and malignmark.
		$useHtmlRules = $token === 'text' ||
			( $token === 'tag' && $value !== 'mglyph' && $value !== 'malignmark' );
	} elseif (
		$adjusted->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
		$adjusted->localName === 'annotation-xml' &&
		$token === 'tag' && $value === 'svg'
	) {
		$useHtmlRules = true;
	} else {
		// Start tags and text at an HTML integration point.
		$useHtmlRules = $adjusted->isHtmlIntegrationPoint() &&
			( $token === 'tag' || $token === 'text' );
	}

	if ( !$useHtmlRules ) {
		return $this->insertForeignToken( $token, $value, $attribs, $selfclose );
	}
	// The insertion mode is stored as the name of the handling method.
	$mode = $this->parseMode;
	return $this->$mode( $token, $value, $attribs, $selfclose );
}
1887
/**
 * Process a token according to the "rules for parsing tokens in
 * foreign content" (the adjusted current node is in a non-HTML
 * namespace).
 *
 * - Text is appended to the current node.
 * - A start tag from the HTML "breakout" list, or a <font> start tag
 *   that carries a color, face or size attribute, pops the stack until
 *   a MathML text integration point, an HTML integration point or an
 *   HTML element is current, and is then reprocessed via insertToken().
 *   When a fragment context is set, such tags are instead treated like
 *   any other start tag.
 * - Any other start tag (including a <font> without those attributes)
 *   is inserted as a foreign element in the namespace of the adjusted
 *   current node, and popped again immediately if self-closing.
 * - An end tag walks the stack: a matching local name is popped; the
 *   first HTML element met after the starting node hands the token to
 *   the current insertion mode; reaching index 0 ignores the token.
 *
 * @param string $token Token type ('text', 'tag' or 'endtag')
 * @param string $value Text content, or the tag name
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfclose Whether a start tag was written as <foo/>
 * @return bool|null Result of the handler; null if the token type is
 *   not one of the three handled here.
 */
private function insertForeignToken( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	}

	if ( $token === 'tag' ) {
		// Start tags which force a return to HTML content.
		static $htmlBreakout = [
			'b' => true, 'big' => true, 'blockquote' => true, 'body' => true,
			'br' => true, 'center' => true, 'code' => true, 'dd' => true,
			'div' => true, 'dl' => true, 'dt' => true, 'em' => true,
			'embed' => true, 'h1' => true, 'h2' => true, 'h3' => true,
			'h4' => true, 'h5' => true, 'h6' => true, 'head' => true,
			'hr' => true, 'i' => true, 'img' => true, 'li' => true,
			'listing' => true, 'menu' => true, 'meta' => true, 'nobr' => true,
			'ol' => true, 'p' => true, 'pre' => true, 'ruby' => true,
			's' => true, 'small' => true, 'span' => true, 'strong' => true,
			'strike' => true, 'sub' => true, 'sup' => true, 'table' => true,
			'tt' => true, 'u' => true, 'ul' => true, 'var' => true,
		];

		if ( $value === 'font' ) {
			// Only the presentational HTML <font> (one with a color,
			// face or size attribute) breaks out of foreign content;
			// a bare <font> is a legitimate foreign (SVG) element.
			$breaksOut = isset( $attribs['color'] )
				|| isset( $attribs['face'] )
				|| isset( $attribs['size'] );
		} else {
			$breaksOut = isset( $htmlBreakout[$value] );
		}

		if ( $breaksOut && !$this->fragmentContext ) {
			// Pop at least one node, then keep popping until we are
			// back in a context where HTML parsing rules apply.
			do {
				$this->stack->pop();
				$node = $this->stack->currentNode;
			} while ( !(
				$node->isMathmlTextIntegrationPoint() ||
				$node->isHtmlIntegrationPoint() ||
				$node->isHtml()
			) );
			return $this->insertToken( $token, $value, $attribs, $selfclose );
		}

		// "Any other start tag"
		$adjusted = ( $this->fragmentContext && $this->stack->length() === 1 ) ?
			$this->fragmentContext : $this->stack->currentNode;
		$this->stack->insertForeignElement(
			$adjusted->namespaceURI, $value, $attribs
		);
		if ( $selfclose ) {
			$this->stack->pop();
		}
		return true;
	}

	if ( $token === 'endtag' ) {
		$first = true;
		foreach ( $this->stack as $i => $node ) {
			if ( $node->isHtml() && !$first ) {
				// process the end tag as HTML
				$func = $this->parseMode;
				return $this->$func( $token, $value, $attribs, $selfclose );
			} elseif ( $i === 0 ) {
				// Reached the bottom of the stack: ignore the token.
				return true;
			} elseif ( $node->localName === $value ) {
				$this->stack->popTag( $node );
				return true;
			}
			$first = false;
		}
	}

	// No other token type is handled here.
	return null;
}
1989
/**
 * Grab the next "token" from $bitsIterator.  This is either an
 * open/close tag followed by text, or plain text, depending on whether
 * the Sanitizer approves.
 *
 * The current chunk is matched against Sanitizer::ELEMENT_BITS_REGEX.
 * If it parses as a tag, passes the optional allow-list and
 * Sanitizer::validateTag() checks, and insertToken() accepts it, the
 * trailing text is then inserted as a 'text' token with '>' escaped.
 * Otherwise the whole chunk is emitted as escaped text, prefixed with
 * '&lt;'.
 *
 * In strict mode, malformed input raises via Assert::invariant().
 */
private function advance() {
	$x = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$regs = [];
	# $slash: Does the current element start with a '/'?
	# $t: Current element name
	# $attribStr: String between element name and >
	# $brace: Ending '>' or '/>'
	# $rest: Everything until the next element from the $bitsIterator
	if ( preg_match( Sanitizer::ELEMENT_BITS_REGEX, $x, $regs ) ) {
		list( /* $qbar */, $slash, $t, $attribStr, $brace, $rest ) = $regs;
		$t = strtolower( $t );
		if ( $this->strict ) {
			/* Verify that attributes are all properly double-quoted */
			Assert::invariant(
				preg_match(
					'/^( [:_A-Z0-9][-.:_A-Z0-9]*="[^"]*")*[ ]*$/i', $attribStr
				),
				"Bad attribute string found"
			);
		}
	} else {
		Assert::invariant(
			!$this->strict, "< found which does not start a valid tag"
		);
		$slash = $t = $attribStr = $brace = $rest = null;
	}

	$goodtag = $t;
	// Sanitization is only performed when an allow-list was supplied.
	$sanitize = $this->allowedHtmlElements !== null;
	if ( $sanitize ) {
		$goodtag = $t && isset( $this->allowedHtmlElements[$t] );
	}
	if ( $goodtag ) {
		if ( is_callable( $this->processingCallback ) ) {
			// The attribute string is handed over by reference so the
			// callback is able to rewrite it.
			call_user_func_array( $this->processingCallback, [ &$attribStr, $this->processingArgs ] );
		}
		if ( $sanitize ) {
			$goodtag = Sanitizer::validateTag( $attribStr, $t );
		}
	}
	if ( $goodtag ) {
		$attribs = Sanitizer::decodeTagAttributes( $attribStr );
		if ( $sanitize ) {
			$attribs = Sanitizer::validateTagAttributes( $attribs, $t );
		}
		$goodtag = $this->insertToken(
			$slash ? 'endtag' : 'tag', $t, $attribs, $brace === '/>'
		);
	}
	if ( $goodtag ) {
		// Escape stray '>' in the text which followed the tag.
		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
	} else {
		# bad tag; serialize entire thing as text.
		$this->insertToken( 'text', '&lt;' . str_replace( '>', '&gt;', $x ) );
	}
}
2053
/**
 * Change the current insertion mode.
 *
 * @param string $mode Name of the handler method for the new mode; it
 *   is asserted to end in "Mode".
 * @return string The insertion mode which was active before the call.
 */
private function switchMode( $mode ) {
	$looksLikeMode = ( substr( $mode, -4 ) === 'Mode' );
	Assert::parameter( $looksLikeMode, '$mode', 'should end in Mode' );

	$previous = $this->parseMode;
	$this->parseMode = $mode;
	return $previous;
}
2062
/**
 * Change the insertion mode, then feed the given token through
 * insertToken() again so that it is handled under the new mode.
 *
 * @param string $mode New insertion mode (see switchMode())
 * @param string $token
 * @param string $value
 * @param array|null $attribs
 * @param bool $selfclose
 * @return mixed Whatever insertToken() returns for the token.
 */
private function switchModeAndReprocess( $mode, $token, $value, $attribs, $selfclose ) {
	$this->switchMode( $mode );
	$result = $this->insertToken( $token, $value, $attribs, $selfclose );
	return $result;
}
2067
/**
 * Pick the insertion mode which matches the current contents of the
 * stack of open elements.
 *
 * The stack is walked until an HTML element with a dedicated mode is
 * found.  The entry at index 0 is treated as the last one; if a
 * fragment context is set it stands in for that entry.  If nothing
 * more specific matched by then, 'inBodyMode' is selected.
 */
private function resetInsertionMode() {
	// HTML element name => insertion mode selected by that element.
	# OMITTED: <select>
	# OMITTED: <frameset>
	# OMITTED: <html>
	# OMITTED: <head>
	static $modeByElement = [
		'tr' => 'inRowMode',
		'tbody' => 'inTableBodyMode',
		'tfoot' => 'inTableBodyMode',
		'thead' => 'inTableBodyMode',
		'caption' => 'inCaptionMode',
		'colgroup' => 'inColumnGroupMode',
		'table' => 'inTableMode',
		'body' => 'inBodyMode',
	];

	foreach ( $this->stack as $i => $node ) {
		$last = ( $i === 0 );
		if ( $last && $this->fragmentContext ) {
			$node = $this->fragmentContext;
		}

		if ( $node->isHtml() ) {
			$name = $node->localName;
			if ( isset( $modeByElement[$name] ) ) {
				$this->switchMode( $modeByElement[$name] );
				return;
			}
			if ( $name === 'template' ) {
				// Use the most recently pushed template insertion mode.
				$this->switchMode(
					array_slice( $this->templateInsertionModes, -1 )[0]
				);
				return;
			}
			// Table cells only count when they are not the last node.
			if ( !$last && $node->isA( BalanceSets::$tableCellSet ) ) {
				$this->switchMode( 'inCellMode' );
				return;
			}
		}

		if ( $last ) {
			$this->switchMode( 'inBodyMode' );
			return;
		}
	}
}
2140
/**
 * Finish parsing by unwinding the stack of open elements.
 *
 * Of the spec's "stop parsing" steps, only step 2 ("pop all the nodes
 * off the stack of open elements") is applicable here.  The top-most
 * <html> element is deliberately kept on the stack.
 */
private function stopParsing() {
	# Drop the list of active formatting elements before unwinding;
	# otherwise the element objects would stay live during
	# serialization, potentially using O(N^2) memory.  Popping the
	# stack never results in reconstructing the active formatting
	# elements, so the list is not needed any more.
	$this->afe = null;

	$this->stack->popTo( 1 );
}
2153
/**
 * Insert an HTML element whose content is handled as raw text, and
 * switch to 'inTextMode', remembering the insertion mode to return to.
 *
 * @param string $value Tag name
 * @param array|null $attribs Tag attributes
 * @return bool Always true.
 */
private function parseRawText( $value, $attribs = null ) {
	$this->stack->insertHTMLElement( $value, $attribs );
	// XXX switch tokenizer to rawtext state?
	$resumeMode = $this->switchMode( 'inTextMode' );
	$this->originalInsertionMode = $resumeMode;
	return true;
}
2160
/**
 * Insertion mode used while inside a raw text element.
 *
 * Text is appended to the current node.  Both an end tag and EOF pop
 * the current node and restore the original insertion mode; EOF is
 * additionally reprocessed under that mode.  Any other token is
 * silently accepted.
 *
 * @param string $token
 * @param string $value
 * @param array|null $attribs
 * @param bool $selfclose
 * @return mixed True, or for EOF the result of reprocessing the token.
 */
private function inTextMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'eof' || $token === 'endtag' ) {
		$this->stack->pop();
		if ( $token === 'eof' ) {
			return $this->switchModeAndReprocess(
				$this->originalInsertionMode, $token, $value, $attribs, $selfclose
			);
		}
		$this->switchMode( $this->originalInsertionMode );
		return true;
	}

	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
	}
	return true;
}
2177
/**
 * Handle a token using the "in head" rules.
 *
 * Leading whitespace of a text token is inserted directly.  A small
 * set of start tags (void metadata elements, raw text elements and
 * <template>) and the </template> end tag are handled here; all other
 * end tags except </br> are ignored.  Everything else falls through to
 * the bottom of the function and is reprocessed via insertToken().
 *
 * @param string $token
 * @param string $value
 * @param array|null $attribs
 * @param bool $selfclose
 * @return mixed True if handled here, else the result of reprocessing.
 */
private function inHeadMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		$leading = [];
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $leading ) ) {
			$this->stack->insertText( $leading[0] );
			$value = substr( $value, strlen( $leading[0] ) );
		}
		if ( strlen( $value ) === 0 ) {
			return true; // All text handled.
		}
		// Non-whitespace remainder is handled at the bottom.
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		# OMITTED: <html>
		# OMITTED: <title>
		# OMITTED: <noscript>
		# OMITTED: <script>
		# OMITTED: <head>
		case 'meta':
			# OMITTED: in a full HTML parser, this might change the encoding.
			/* falls through */
		case 'base':
		case 'basefont':
		case 'bgsound':
		case 'link':
			// Insert and close again straight away.
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		case 'noframes':
		case 'style':
			return $this->parseRawText( $value, $attribs );

		case 'template':
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->afe->insertMarker();
			# OMITTED: frameset_ok
			$this->switchMode( 'inTemplateMode' );
			// Remember the mode just selected for this template.
			$this->templateInsertionModes[] = $this->parseMode;
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		# OMITTED: <head>
		# OMITTED: <body>
		# OMITTED: <html>
		if ( $value === 'template' ) {
			if ( $this->stack->indexOf( $value ) < 0 ) {
				return true; // Ignore the token.
			}
			$this->stack->generateImpliedEndTags( null, true /* thorough */ );
			$this->stack->popTag( $value );
			$this->afe->clearToMarker();
			array_pop( $this->templateInsertionModes );
			$this->resetInsertionMode();
			return true;
		}
		if ( $value !== 'br' ) {
			// ignore any other end tag
			return true;
		}
		// </br> is handled at the bottom of the function.
	}

	// If not handled above: synthetic </head>.  Note that since </head>
	// handling is omitted here, this call lands in the "ignore any
	// other end tag" branch above and has no effect of its own.
	$this->inHeadMode( 'endtag', 'head' );
	// Then redo this one
	return $this->insertToken( $token, $value, $attribs, $selfclose );
}
2244
/**
 * Handle a token using the "in body" rules.
 *
 * Text reconstructs the active formatting elements and is appended to
 * the current node.  EOF stops parsing (or is delegated to
 * inTemplateMode() while template insertion modes are pending).  Start
 * and end tags are dispatched on the tag name; tags without a
 * dedicated case use the generic "any other start/end tag" handling at
 * the end of the corresponding section.
 *
 * @param string $token One of 'text', 'eof', 'tag', 'endtag'; anything
 *   else trips an Assert::invariant().
 * @param string $value Text content, or the tag name
 * @param array|null $attribs
 * @param bool $selfclose
 * @return mixed True when handled here, or the result of whichever
 *   handler the token was delegated to.
 */
private function inBodyMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		$this->afe->reconstruct( $this->stack );
		$this->stack->insertText( $value );
		return true;
	}

	if ( $token === 'eof' ) {
		if ( !empty( $this->templateInsertionModes ) ) {
			return $this->inTemplateMode( $token, $value, $attribs, $selfclose );
		}
		$this->stopParsing();
		return true;
	}

	if ( $token === 'tag' ) {
		switch ( $value ) {
		# OMITTED: <html>
		# OMITTED: <script>
		# OMITTED: <title>
		# OMITTED: <body>
		# OMITTED: <frameset>
		case 'base':
		case 'basefont':
		case 'bgsound':
		case 'link':
		case 'meta':
		case 'noframes':
		case 'style':
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );

		// Block-level elements: close an open <p> first.
		case 'address':
		case 'article':
		case 'aside':
		case 'blockquote':
		case 'center':
		case 'details':
		case 'dialog':
		case 'dir':
		case 'div':
		case 'dl':
		case 'fieldset':
		case 'figcaption':
		case 'figure':
		case 'footer':
		case 'header':
		case 'hgroup':
		case 'main':
		case 'menu':
		case 'nav':
		case 'ol':
		case 'p':
		case 'section':
		case 'summary':
		case 'ul':
		# For <pre> and <listing>, as described in "simplifications":
		# 1. We don't touch the next token, even if it's a linefeed.
		# 2. OMITTED: frameset_ok
		case 'pre':
		case 'listing':
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'h1':
		case 'h2':
		case 'h3':
		case 'h4':
		case 'h5':
		case 'h6':
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			// Headings do not nest: drop a heading which is current.
			if ( $this->stack->currentNode->isA( BalanceSets::$headingSet ) ) {
				$this->stack->pop();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		# OMITTED: <form>
		# OMITTED: <plaintext>

		case 'li':
		case 'dd':
		case 'dt':
			# OMITTED: frameset_ok
			// An <li> implicitly closes an open <li>; a <dd>/<dt>
			// implicitly closes an open <dd> or <dt>.
			$implicitlyClosed = ( $value === 'li' ) ? [ 'li' ] : [ 'dd', 'dt' ];
			foreach ( $this->stack as $node ) {
				foreach ( $implicitlyClosed as $name ) {
					if ( $node->isHtmlNamed( $name ) ) {
						$this->inBodyMode( 'endtag', $name );
						break 2;
					}
				}
				if (
					$node->isA( BalanceSets::$specialSet ) &&
					!$node->isA( BalanceSets::$addressDivPSet )
				) {
					break;
				}
			}
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'button':
			if ( $this->stack->inScope( 'button' ) ) {
				// Close the open button, then start over with this one.
				$this->inBodyMode( 'endtag', 'button' );
				return $this->insertToken( $token, $value, $attribs, $selfclose );
			}
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'a':
			$openAnchor = $this->afe->findElementByTag( 'a' );
			if ( $openAnchor ) {
				$this->inBodyMode( 'endtag', 'a' );
				if ( $this->afe->isInList( $openAnchor ) ) {
					$this->afe->remove( $openAnchor );
					// Don't flatten here, since when we fall
					// through below we might foster parent
					// the new <a> tag inside this one.
					$this->stack->removeElement( $openAnchor, false );
				}
			}
			/* Falls through */
		case 'b':
		case 'big':
		case 'code':
		case 'em':
		case 'font':
		case 'i':
		case 's':
		case 'small':
		case 'strike':
		case 'strong':
		case 'tt':
		case 'u':
			$this->afe->reconstruct( $this->stack );
			$formatting = $this->stack->insertHTMLElement( $value, $attribs );
			$this->afe->push( $formatting, $attribs );
			return true;

		case 'nobr':
			$this->afe->reconstruct( $this->stack );
			if ( $this->stack->inScope( 'nobr' ) ) {
				$this->inBodyMode( 'endtag', 'nobr' );
				$this->afe->reconstruct( $this->stack );
			}
			$formatting = $this->stack->insertHTMLElement( $value, $attribs );
			$this->afe->push( $formatting, $attribs );
			return true;

		case 'applet':
		case 'marquee':
		case 'object':
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->afe->insertMarker();
			# OMITTED: frameset_ok
			return true;

		case 'table':
			# The document is never in "quirks mode"; see simplifications
			# above.
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			# OMITTED: frameset_ok
			$this->switchMode( 'inTableMode' );
			return true;

		// Elements which are inserted and closed again straight away,
		// after reconstructing the active formatting elements.
		case 'area':
		case 'br':
		case 'embed':
		case 'img':
		case 'keygen':
		case 'wbr':
		# For <input>, frameset_ok is OMITTED (hence we don't need to
		# examine the tag's "type" attribute).
		case 'input':
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			# OMITTED: frameset_ok
			return true;

		case 'menuitem':
		case 'param':
		case 'source':
		case 'track':
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		case 'hr':
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		case 'image':
			# warts!
			return $this->inBodyMode( $token, 'img', $attribs, $selfclose );

		# OMITTED: <isindex>
		# OMITTED: <textarea>
		# OMITTED: <xmp>
		# OMITTED: <iframe>
		# OMITTED: <noembed>
		# OMITTED: <noscript>
		# OMITTED: <select>

		case 'optgroup':
		case 'option':
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$this->inBodyMode( 'endtag', 'option' );
			}
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'rb':
		case 'rtc':
		case 'rp':
		case 'rt':
			if ( $this->stack->inScope( 'ruby' ) ) {
				if ( $value === 'rb' || $value === 'rtc' ) {
					$this->stack->generateImpliedEndTags();
				} else {
					// <rp> and <rt> leave an open <rtc> alone.
					$this->stack->generateImpliedEndTags( 'rtc' );
				}
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'math':
		case 'svg':
			$this->afe->reconstruct( $this->stack );
			# We skip the spec's "adjust MathML/SVG attributes" and
			# "adjust foreign attributes" steps, since the browser will
			# do this later when it parses the output and it doesn't affect
			# balancing.
			$namespace = ( $value === 'math' ) ?
				BalanceSets::MATHML_NAMESPACE : BalanceSets::SVG_NAMESPACE;
			$this->stack->insertForeignElement( $namespace, $value, $attribs );
			if ( $selfclose ) {
				# emit explicit closing tag.
				$this->stack->pop();
			}
			return true;

		case 'caption':
		case 'col':
		case 'colgroup':
		# OMITTED: <frame>
		case 'head':
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			// Ignore table tags if we're not inTableMode
			return true;
		}

		// Handle any other start tag here
		$this->afe->reconstruct( $this->stack );
		$this->stack->insertHTMLElement( $value, $attribs );
		return true;
	}

	if ( $token === 'endtag' ) {
		switch ( $value ) {
		# </body>,</html> are unsupported.
		# OMITTED: <form>

		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );

		case 'address':
		case 'article':
		case 'aside':
		case 'blockquote':
		case 'button':
		case 'center':
		case 'details':
		case 'dialog':
		case 'dir':
		case 'div':
		case 'dl':
		case 'fieldset':
		case 'figcaption':
		case 'figure':
		case 'footer':
		case 'header':
		case 'hgroup':
		case 'listing':
		case 'main':
		case 'menu':
		case 'nav':
		case 'ol':
		case 'pre':
		case 'section':
		case 'summary':
		case 'ul':
			// Ignore if there is not a matching open tag
			if ( !$this->stack->inScope( $value ) ) {
				return true;
			}
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( $value );
			return true;

		case 'p':
			if ( !$this->stack->inButtonScope( 'p' ) ) {
				// No open <p>: synthesize one, then close it.
				$this->inBodyMode( 'tag', 'p', [] );
				return $this->insertToken( $token, $value, $attribs, $selfclose );
			}
			$this->stack->generateImpliedEndTags( $value );
			$this->stack->popTag( $value );
			return true;

		case 'li':
		case 'dd':
		case 'dt':
			// </li> is checked against list item scope, </dd> and
			// </dt> against the regular scope.
			$open = ( $value === 'li' ) ?
				$this->stack->inListItemScope( $value ) :
				$this->stack->inScope( $value );
			if ( !$open ) {
				return true; # ignore
			}
			$this->stack->generateImpliedEndTags( $value );
			$this->stack->popTag( $value );
			return true;

		case 'h1':
		case 'h2':
		case 'h3':
		case 'h4':
		case 'h5':
		case 'h6':
			// Any open heading is closed, whatever its level.
			if ( !$this->stack->inScope( BalanceSets::$headingSet ) ) {
				return true; # ignore
			}
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( BalanceSets::$headingSet );
			return true;

		case 'sarcasm':
			# Take a deep breath, then:
			break;

		case 'a':
		case 'b':
		case 'big':
		case 'code':
		case 'em':
		case 'font':
		case 'i':
		case 'nobr':
		case 's':
		case 'small':
		case 'strike':
		case 'strong':
		case 'tt':
		case 'u':
			if ( $this->stack->adoptionAgency( $value, $this->afe ) ) {
				return true; # If we did something, we're done.
			}
			break; # Go to the "any other end tag" case.

		case 'applet':
		case 'marquee':
		case 'object':
			if ( !$this->stack->inScope( $value ) ) {
				return true; # ignore
			}
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( $value );
			$this->afe->clearToMarker();
			return true;

		case 'br':
			# Turn </br> into <br>
			return $this->inBodyMode( 'tag', $value, [] );
		}

		// Any other end tag goes here
		foreach ( $this->stack as $i => $node ) {
			if ( $node->isHtmlNamed( $value ) ) {
				$this->stack->generateImpliedEndTags( $value );
				$this->stack->popTo( $i ); # including $i
				break;
			}
			if ( $node->isA( BalanceSets::$specialSet ) ) {
				return true; // ignore this close token.
			}
		}
		return true;
	}

	Assert::invariant( false, "Bad token type: $token" );
}
2721
/**
 * Handle a token using the "in table" rules.
 *
 * Table structure tags are handled here (inserting implied <colgroup>
 * and <tbody> elements where needed).  Text arriving while a table,
 * section or row is current is collected via 'inTableTextMode'.
 * Everything without a dedicated case is processed by inBodyMode()
 * with foster parenting enabled on the stack.
 *
 * @param string $token
 * @param string $value
 * @param array|null $attribs
 * @param bool $selfclose
 * @return mixed True when handled here, or the result of whichever
 *   handler the token was delegated to.
 */
private function inTableMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		if ( $this->textIntegrationMode ) {
			return $this->inBodyMode( $token, $value, $attribs, $selfclose );
		} elseif ( $this->stack->currentNode->isA( BalanceSets::$tableSectionRowSet ) ) {
			// Start collecting table text; remember where to return to.
			$this->pendingTableText = '';
			$this->originalInsertionMode = $this->parseMode;
			return $this->switchModeAndReprocess( 'inTableTextMode', $token, $value, $attribs, $selfclose );
		}
		// fall through to default case.
	} elseif ( $token === 'eof' ) {
		$this->stopParsing();
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'caption':
			// The spec requires clearing the stack back to a table
			// context first (as for <colgroup> and <tbody> below), so
			// that the caption is not nested inside a foster-parented
			// element which is still open on the stack.
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->afe->insertMarker();
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inCaptionMode' );
			return true;
		case 'colgroup':
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inColumnGroupMode' );
			return true;
		case 'col':
			// Imply a <colgroup>, then reprocess the <col>.
			$this->inTableMode( 'tag', 'colgroup', [] );
			return $this->insertToken( $token, $value, $attribs, $selfclose );
		case 'tbody':
		case 'tfoot':
		case 'thead':
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inTableBodyMode' );
			return true;
		case 'td':
		case 'th':
		case 'tr':
			// Imply a <tbody>, then reprocess the tag.
			$this->inTableMode( 'tag', 'tbody', [] );
			return $this->insertToken( $token, $value, $attribs, $selfclose );
		case 'table':
			if ( !$this->stack->inTableScope( $value ) ) {
				return true; // Ignore this tag.
			}
			// A nested <table> closes the open one, then is reprocessed.
			$this->inTableMode( 'endtag', $value );
			return $this->insertToken( $token, $value, $attribs, $selfclose );

		case 'style':
		# OMITTED: <script>
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );

		case 'input':
			if ( !isset( $attribs['type'] ) || strcasecmp( $attribs['type'], 'hidden' ) !== 0 ) {
				break; // Handle this as "everything else"
			}
			// Hidden inputs are inserted in place, not foster parented.
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		# OMITTED: <form>
		}
		// Fall through for "anything else" clause.
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'table':
			if ( !$this->stack->inTableScope( $value ) ) {
				return true; // Ignore.
			}
			$this->stack->popTag( $value );
			$this->resetInsertionMode();
			return true;
		# OMITTED: <body>
		case 'caption':
		case 'col':
		case 'colgroup':
		# OMITTED: <html>
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			return true; // Ignore the token.
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
		// Fall through for "anything else" clause.
	}
	// This is the "anything else" case:
	$this->stack->fosterParentMode = true;
	$this->inBodyMode( $token, $value, $attribs, $selfclose );
	$this->stack->fosterParentMode = false;
	return true;
}
2817
/**
 * Collect text found directly inside table structure.
 *
 * Text tokens are buffered in $pendingTableText.  The first non-text
 * token flushes the buffer, restores the original insertion mode and
 * is reprocessed there.  Buffered text containing anything other than
 * whitespace is flushed through inBodyMode() with foster parenting
 * enabled; pure whitespace is inserted in place.
 *
 * @param string $token
 * @param string $value
 * @param array|null $attribs
 * @param bool $selfclose
 * @return mixed True for text, otherwise the result of reprocessing.
 */
private function inTableTextMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		$this->pendingTableText .= $value;
		return true;
	}

	// Non-text token: flush what has been collected so far.
	$buffered = $this->pendingTableText;
	$this->pendingTableText = '';
	$hasNonWhitespace = preg_match( '/[^\x09\x0A\x0C\x0D\x20]/', $buffered );
	if ( $hasNonWhitespace ) {
		// This should match the "anything else" case inTableMode
		$this->stack->fosterParentMode = true;
		$this->inBodyMode( 'text', $buffered );
		$this->stack->fosterParentMode = false;
	} else {
		// Pending text is just whitespace.
		$this->stack->insertText( $buffered );
	}

	return $this->switchModeAndReprocess(
		$this->originalInsertionMode, $token, $value, $attribs, $selfclose
	);
}
2839
/**
 * Helper for inCaptionMode: close the open <caption>, if any, and
 * switch back to 'inTableMode'.
 *
 * @return bool False if no <caption> is in table scope (nothing is
 *   changed in that case), true once the caption has been closed.
 */
private function endCaption() {
	$captionOpen = $this->stack->inTableScope( 'caption' );
	if ( $captionOpen ) {
		$this->stack->generateImpliedEndTags();
		$this->stack->popTag( 'caption' );
		$this->afe->clearToMarker();
		$this->switchMode( 'inTableMode' );
		return true;
	}
	return false;
}
2851
/**
 * Handle a token using the "in caption" rules.
 *
 * Table structure start tags and </table> close the caption (when one
 * is open) and are then reprocessed.  </caption> closes the caption.
 * A fixed list of other end tags is ignored.  Everything else is
 * handled by inBodyMode().
 *
 * @param string $token
 * @param string $value
 * @param array|null $attribs
 * @param bool $selfclose
 * @return mixed True, or the result of inBodyMode().
 */
private function inCaptionMode( $token, $value, $attribs = null, $selfclose = false ) {
	// Start tags which implicitly end the caption.
	static $closingStartTags = [
		'caption' => true, 'col' => true, 'colgroup' => true,
		'tbody' => true, 'td' => true, 'tfoot' => true,
		'th' => true, 'thead' => true, 'tr' => true,
	];
	// End tags which are dropped while in a caption.
	# OMITTED: <html>
	static $ignoredEndTags = [
		'body' => true, 'col' => true, 'colgroup' => true,
		'tbody' => true, 'td' => true, 'tfoot' => true,
		'th' => true, 'thead' => true, 'tr' => true,
	];

	if ( $token === 'tag' ) {
		if ( isset( $closingStartTags[$value] ) ) {
			if ( $this->endCaption() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'caption' ) {
			$this->endCaption();
			return true;
		}
		if ( $value === 'table' ) {
			if ( $this->endCaption() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}
		if ( isset( $ignoredEndTags[$value] ) ) {
			// Ignore the token
			return true;
		}
	}

	// The Anything Else case
	return $this->inBodyMode( $token, $value, $attribs, $selfclose );
}
2898
/**
 * Handle a token using the "in column group" rules.
 *
 * Leading whitespace of a text token is inserted directly.  <col> is
 * inserted and closed again immediately; </colgroup> returns to
 * 'inTableMode'; <template>/</template> go to inHeadMode(); EOF goes to
 * inBodyMode().  Anything else closes the column group, provided a
 * <colgroup> is the current node, and is then reprocessed.
 *
 * @param string $token
 * @param string $value
 * @param array|null $attribs
 * @param bool $selfclose
 * @return mixed True, or the result of the handler delegated to.
 */
private function inColumnGroupMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'eof' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfclose );
	}

	if ( $token === 'text' ) {
		$leading = [];
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $leading ) ) {
			$this->stack->insertText( $leading[0] );
			$value = substr( $value, strlen( $leading[0] ) );
		}
		if ( strlen( $value ) === 0 ) {
			return true; // All text handled.
		}
		// Non-whitespace remainder is handled as "anything else".
	} elseif ( $token === 'tag' ) {
		# OMITTED: <html>
		if ( $value === 'col' ) {
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;
		}
		if ( $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'colgroup' ) {
			if ( $this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
				$this->stack->pop();
				$this->switchMode( 'inTableMode' );
			}
			// Otherwise the token is simply ignored.
			return true;
		}
		if ( $value === 'col' ) {
			return true; // Ignore the token.
		}
		if ( $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
	}

	// Anything else
	if ( !$this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
		return true; // Ignore the token.
	}
	// Close the column group, then reprocess the token.
	$this->inColumnGroupMode( 'endtag', 'colgroup' );
	return $this->insertToken( $token, $value, $attribs, $selfclose );
}
2946
/**
 * Helper function for inTableBodyMode: close the open table section
 * and switch back to 'inTableMode'.
 *
 * @return bool False if none of <tbody>, <thead>, <tfoot> is in table
 *   scope (nothing is changed in that case), true otherwise.
 */
private function endSection() {
	$sectionOpen =
		$this->stack->inTableScope( 'tbody' ) ||
		$this->stack->inTableScope( 'thead' ) ||
		$this->stack->inTableScope( 'tfoot' );
	if ( !$sectionOpen ) {
		return false;
	}

	$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
	$this->stack->pop();
	$this->switchMode( 'inTableMode' );
	return true;
}
/**
 * Handle a token using the "in table body" rules.
 *
 * <tr> opens a row; <td>/<th> imply a <tr> first.  Tags which start
 * another table part, and </table>, close the current section (when
 * one is open) and are reprocessed.  A fixed list of end tags is
 * ignored.  Everything else is handled by inTableMode().
 *
 * @param string $token
 * @param string $value
 * @param array|null $attribs
 * @param bool $selfclose
 * @return mixed True, or the result of inTableMode().
 */
private function inTableBodyMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'tag' ) {
		if ( $value === 'tr' ) {
			$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inRowMode' );
			return true;
		}
		if ( $value === 'th' || $value === 'td' ) {
			// Imply a <tr>, then reprocess the cell.
			$this->inTableBodyMode( 'tag', 'tr', [] );
			$this->insertToken( $token, $value, $attribs, $selfclose );
			return true;
		}
		$endsSection = in_array(
			$value,
			[ 'caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead' ],
			true
		);
		if ( $endsSection ) {
			if ( $this->endSection() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'table':
			if ( $this->endSection() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;

		case 'tbody':
		case 'tfoot':
		case 'thead':
			// Only honoured when that very section is open.
			if ( $this->stack->inTableScope( $value ) ) {
				$this->endSection();
			}
			return true;

		# OMITTED: <body>
		# OMITTED: <html>
		case 'caption':
		case 'col':
		case 'colgroup':
		case 'td':
		case 'th':
		case 'tr':
			return true; // Ignore the token.
		}
	}

	// Anything else:
	return $this->inTableMode( $token, $value, $attribs, $selfclose );
}
3013
/**
 * Helper function for inRowMode: close the open <tr> and switch back
 * to 'inTableBodyMode'.
 *
 * @return bool False if no <tr> is in table scope (nothing is changed
 *   in that case), true once the row has been closed.
 */
private function endRow() {
	$rowOpen = $this->stack->inTableScope( 'tr' );
	if ( $rowOpen ) {
		$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
		$this->stack->pop();
		$this->switchMode( 'inTableBodyMode' );
		return true;
	}
	return false;
}
/**
 * Insertion-mode handler used while a <tr> is being built.
 *
 * Cell start tags open a new cell; other table-structure tags close
 * the row via endRow(). Every token not listed here is delegated to
 * inTableMode().
 *
 * @param string $token Token type; only 'tag' and 'endtag' get special
 *   treatment in this mode
 * @param string $value Tag name when $token is 'tag' or 'endtag';
 *   forwarded unchanged otherwise
 * @param array|null $attribs Attributes, forwarded unchanged
 * @param bool $selfclose Self-closing flag, forwarded unchanged
 * @return bool True when the token was consumed or ignored here,
 *   otherwise whatever inTableMode() returns
 */
private function inRowMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'tr':
				$this->endRow();
				return true;

			case 'table':
				// Close the row, then submit </table> again.
				if ( !$this->endRow() ) {
					return true;
				}
				$this->insertToken( $token, $value, $attribs, $selfclose );
				return true;

			case 'tbody':
			case 'tfoot':
			case 'thead':
				// The named section must be open before the row is
				// touched; the scope test therefore comes first.
				if ( !$this->stack->inTableScope( $value ) ) {
					return true;
				}
				if ( !$this->endRow() ) {
					return true;
				}
				$this->insertToken( $token, $value, $attribs, $selfclose );
				return true;

			# OMITTED: <body>
			# OMITTED: <html>
			case 'caption':
			case 'col':
			case 'colgroup':
			case 'td':
			case 'th':
				// Ignore the token.
				return true;
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			case 'th':
			case 'td':
				$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->switchMode( 'inCellMode' );
				// Counterpart of the clearToMarker() call made when
				// the cell is closed in inCellMode().
				$this->afe->insertMarker();
				return true;

			case 'caption':
			case 'col':
			case 'colgroup':
			case 'tbody':
			case 'tfoot':
			case 'thead':
			case 'tr':
				// These implicitly end the row; the token is submitted
				// again only if a row was actually closed.
				if ( !$this->endRow() ) {
					return true;
				}
				$this->insertToken( $token, $value, $attribs, $selfclose );
				return true;
		}
	}

	// Anything else is handled exactly as in table mode.
	return $this->inTableMode( $token, $value, $attribs, $selfclose );
}
3079
/**
 * Helper for inCellMode(): closes the open table cell, if there is one.
 *
 * <td> is looked for before <th>; for the first of the two found in
 * table scope a matching end tag is fed to inCellMode().
 *
 * @return bool True if an end tag was issued for a cell, false if
 *   neither <td> nor <th> was in table scope
 */
private function endCell() {
	foreach ( [ 'td', 'th' ] as $cellTag ) {
		if ( $this->stack->inTableScope( $cellTag ) ) {
			$this->inCellMode( 'endtag', $cellTag );
			return true;
		}
	}
	return false;
}
/**
 * Insertion-mode handler used while a <td> or <th> is being built.
 *
 * Table-structure tags close the cell (and are usually submitted
 * again afterwards); every token not listed here is delegated to
 * inBodyMode().
 *
 * @param string $token Token type; only 'tag' and 'endtag' get special
 *   treatment in this mode
 * @param string $value Tag name when $token is 'tag' or 'endtag';
 *   forwarded unchanged otherwise
 * @param array|null $attribs Attributes, forwarded unchanged
 * @param bool $selfclose Self-closing flag, forwarded unchanged
 * @return bool True when the token was consumed or ignored here,
 *   otherwise whatever inBodyMode() returns
 */
private function inCellMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'td':
			case 'th':
				// Ignore an end tag for a cell that is not open.
				if ( !$this->stack->inTableScope( $value ) ) {
					return true;
				}
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( $value );
				$this->afe->clearToMarker();
				$this->switchMode( 'inRowMode' );
				return true;

			case 'table':
			case 'tbody':
			case 'tfoot':
			case 'thead':
			case 'tr':
				// Ignore an end tag for a structure that is not open.
				if ( !$this->stack->inTableScope( $value ) ) {
					return true;
				}
				// Close whichever kind of cell is open, then submit
				// the end tag again.
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( BalanceSets::$tableCellSet );
				$this->afe->clearToMarker();
				$this->switchMode( 'inRowMode' );
				$this->insertToken( $token, $value, $attribs, $selfclose );
				return true;

			# OMITTED: <body>
			# OMITTED: <html>
			case 'caption':
			case 'col':
			case 'colgroup':
				// Ignore the token.
				return true;
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			case 'caption':
			case 'col':
			case 'colgroup':
			case 'tbody':
			case 'td':
			case 'tfoot':
			case 'th':
			case 'thead':
			case 'tr':
				// These implicitly end the cell; the token is submitted
				// again only if a cell was actually closed.
				if ( !$this->endCell() ) {
					return true;
				}
				$this->insertToken( $token, $value, $attribs, $selfclose );
				return true;
		}
	}

	// Anything else is handled exactly as in body mode.
	return $this->inBodyMode( $token, $value, $attribs, $selfclose );
}
3145
3146 # OMITTED: <select>
3147 /*
3148 private function inSelectMode( $token, $value, $attribs = null, $selfclose = false ) {
3149 Assert::invariant( false, 'Unimplemented' );
3150 }
3151
3152 private function inSelectInTableMode( $token, $value, $attribs = null, $selfclose = false ) {
3153 Assert::invariant( false, 'Unimplemented' );
3154 }
3155 */
3156
/**
 * Insertion-mode handler used while the contents of a <template> are
 * being built.
 *
 * Text goes to inBodyMode(), head-type tags and </template> go to
 * inHeadMode(), and any other start tag picks a more specific mode
 * and is reprocessed there. Other end tags are ignored.
 *
 * @param string $token Token type: 'text', 'eof', 'tag' or 'endtag';
 *   any other value trips an invariant assertion
 * @param string $value Tag name when $token is 'tag' or 'endtag';
 *   forwarded unchanged otherwise
 * @param array|null $attribs Attributes, forwarded unchanged
 * @param bool $selfclose Self-closing flag, forwarded unchanged
 * @return bool|null True when the token was consumed or ignored here,
 *   otherwise the result of the handler it was delegated to
 */
private function inTemplateMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfclose );
	}

	if ( $token === 'eof' ) {
		if ( $this->stack->indexOf( 'template' ) < 0 ) {
			// No <template> on the stack: nothing left to unwind.
			$this->stopParsing();
			return true;
		}
		// Unwind the open <template>, then submit the eof token again
		// under the recomputed insertion mode.
		$this->stack->popTag( 'template' );
		$this->afe->clearToMarker();
		array_pop( $this->templateInsertionModes );
		$this->resetInsertionMode();
		$this->insertToken( $token, $value, $attribs, $selfclose );
		return true;
	}

	if ( $token === 'tag' ) {
		// Choose the mode this start tag belongs to; head-type tags
		// are passed to inHeadMode() without a mode switch.
		switch ( $value ) {
			case 'base':
			case 'basefont':
			case 'bgsound':
			case 'link':
			case 'meta':
			case 'noframes':
			# OMITTED: <script>
			case 'style':
			case 'template':
			# OMITTED: <title>
				return $this->inHeadMode( $token, $value, $attribs, $selfclose );

			case 'caption':
			case 'colgroup':
			case 'tbody':
			case 'tfoot':
			case 'thead':
				$nextMode = 'inTableMode';
				break;

			case 'col':
				$nextMode = 'inColumnGroupMode';
				break;

			case 'tr':
				$nextMode = 'inTableBodyMode';
				break;

			case 'td':
			case 'th':
				$nextMode = 'inRowMode';
				break;

			default:
				$nextMode = 'inBodyMode';
				break;
		}
		return $this->switchModeAndReprocess(
			$nextMode, $token, $value, $attribs, $selfclose
		);
	}

	if ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
		// Every other end tag is ignored.
		return true;
	}

	Assert::invariant( false, "Bad token type: $token" );
}
3223 }