Balancer: cache BalanceStack::currentNode()
[lhc/web/wiklou.git] / includes / tidy / Balancer.php
1 <?php
2 /**
3 * An implementation of the tree building portion of the HTML5 parsing
4 * spec.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
20 *
21 * @file
22 * @ingroup Parser
23 * @since 1.27
24 * @author C. Scott Ananian, 2016
25 */
26 namespace MediaWiki\Tidy;
27
28 use Wikimedia\Assert\Assert;
29 use Wikimedia\Assert\ParameterAssertionException;
30 use \ExplodeIterator;
31 use \IteratorAggregate;
32 use \ReverseArrayIterator;
33 use \Sanitizer;
34
35 # A note for future librarization[1] -- this file is a good candidate
36 # for splitting into an independent library, except that it is currently
37 # highly optimized for MediaWiki use. It only implements the portions
38 # of the HTML5 tree builder used by tags supported by MediaWiki, and
39 # does not contain a true tokenizer pass, instead relying on
40 # comment stripping, attribute normalization, and escaping done by
41 # the MediaWiki Sanitizer. It also deliberately avoids building
42 # a true DOM in memory, instead serializing elements to an output string
43 # as soon as possible (usually as soon as the tag is closed) to reduce
44 # its memory footprint.
45
46 # On the other hand, I've been pretty careful to note with comments in the
47 # code the places where this implementation omits features of the spec or
48 # depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
49 # implement the missing pieces and make this a standalone PHP HTML5 parser.
50 # In order to do so, some sort of MediaWiki-specific API will need
51 # to be added to (a) allow the Balancer to bypass the tokenizer,
52 # and (b) support on-the-fly flattening instead of DOM node creation.
53
54 # [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki
55
/**
 * Constants and lookup tables for the HTML5 tree building algorithm.
 *
 * Each "set" is a two-level associative array: the outer key is a
 * namespace URI and the inner key is a lower-cased tag name, so that
 * membership can be tested with a pair of isset() lookups.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceSets {
	const HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml';
	const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
	const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';

	/** HTML tags listed as unsupported. */
	public static $unsupportedSet = [
		self::HTML_NAMESPACE => [
			'html' => true, 'head' => true, 'body' => true, 'frameset' => true,
			'form' => true, 'frame' => true,
			'plaintext' => true, 'isindex' => true, 'textarea' => true,
			'xmp' => true, 'iframe' => true, 'noembed' => true,
			'noscript' => true, 'select' => true, 'script' => true,
			'title' => true
		]
	];

	/** HTML tags which never have children and have no end tag. */
	public static $emptyElementSet = [
		self::HTML_NAMESPACE => [
			'area' => true, 'base' => true, 'basefont' => true,
			'bgsound' => true, 'br' => true, 'col' => true, 'command' => true,
			'embed' => true, 'frame' => true, 'hr' => true, 'img' => true,
			'input' => true, 'keygen' => true, 'link' => true, 'meta' => true,
			'param' => true, 'source' => true, 'track' => true, 'wbr' => true
		]
	];

	/** The heading tags, h1 through h6. */
	public static $headingSet = [
		self::HTML_NAMESPACE => [
			'h1' => true, 'h2' => true, 'h3' => true,
			'h4' => true, 'h5' => true, 'h6' => true
		]
	];

	/** Tags in the "special" category, across all three namespaces. */
	public static $specialSet = [
		self::HTML_NAMESPACE => [
			'address' => true, 'applet' => true, 'area' => true,
			'article' => true, 'aside' => true, 'base' => true,
			'basefont' => true, 'bgsound' => true, 'blockquote' => true,
			'body' => true, 'br' => true, 'button' => true, 'caption' => true,
			'center' => true, 'col' => true, 'colgroup' => true, 'dd' => true,
			'details' => true, 'dir' => true, 'div' => true, 'dl' => true,
			'dt' => true, 'embed' => true, 'fieldset' => true,
			'figcaption' => true, 'figure' => true, 'footer' => true,
			'form' => true, 'frame' => true, 'frameset' => true, 'h1' => true,
			'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true,
			'h6' => true, 'head' => true, 'header' => true, 'hgroup' => true,
			'hr' => true, 'html' => true, 'iframe' => true, 'img' => true,
			'input' => true, 'isindex' => true, 'li' => true, 'link' => true,
			'listing' => true, 'main' => true, 'marquee' => true,
			'menu' => true, 'menuitem' => true, 'meta' => true, 'nav' => true,
			'noembed' => true, 'noframes' => true, 'noscript' => true,
			'object' => true, 'ol' => true, 'p' => true, 'param' => true,
			'plaintext' => true, 'pre' => true, 'script' => true,
			'section' => true, 'select' => true, 'source' => true,
			'style' => true, 'summary' => true, 'table' => true,
			'tbody' => true, 'td' => true, 'template' => true,
			'textarea' => true, 'tfoot' => true, 'th' => true, 'thead' => true,
			'title' => true, 'tr' => true, 'track' => true, 'ul' => true,
			'wbr' => true, 'xmp' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true, 'desc' => true, 'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
			'mtext' => true, 'annotation-xml' => true
		]
	];

	/** Just address, div and p. */
	public static $addressDivPSet = [
		self::HTML_NAMESPACE => [
			'address' => true, 'div' => true, 'p' => true
		]
	];

	/** The table tag plus its section and row tags. */
	public static $tableSectionRowSet = [
		self::HTML_NAMESPACE => [
			'table' => true, 'thead' => true, 'tbody' => true,
			'tfoot' => true, 'tr' => true
		]
	];

	/** Tags closed when generating implied end tags. */
	public static $impliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'dd' => true, 'dt' => true, 'li' => true, 'optgroup' => true,
			'option' => true, 'p' => true, 'rb' => true, 'rp' => true,
			'rt' => true, 'rtc' => true
		]
	];

	/** Tags closed when generating implied end tags "thoroughly". */
	public static $thoroughImpliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'caption' => true, 'colgroup' => true, 'dd' => true, 'dt' => true,
			'li' => true, 'optgroup' => true, 'option' => true, 'p' => true,
			'rb' => true, 'rp' => true, 'rt' => true, 'rtc' => true,
			'tbody' => true, 'td' => true, 'tfoot' => true, 'th' => true,
			'thead' => true, 'tr' => true
		]
	];

	/** The table cell tags, td and th. */
	public static $tableCellSet = [
		self::HTML_NAMESPACE => [
			'td' => true, 'th' => true
		]
	];

	/** Tags which delimit a table context. */
	public static $tableContextSet = [
		self::HTML_NAMESPACE => [
			'table' => true, 'template' => true, 'html' => true
		]
	];

	/** Tags which delimit a table body context. */
	public static $tableBodyContextSet = [
		self::HTML_NAMESPACE => [
			'tbody' => true, 'tfoot' => true, 'thead' => true,
			'template' => true, 'html' => true
		]
	];

	/** Tags which delimit a table row context. */
	public static $tableRowContextSet = [
		self::HTML_NAMESPACE => [
			'tr' => true, 'template' => true, 'html' => true
		]
	];

	# OMITTED: formAssociatedSet, since we don't allow <form>

	/** Boundary tags for the plain "in scope" test. */
	public static $inScopeSet = [
		self::HTML_NAMESPACE => [
			'applet' => true, 'caption' => true, 'html' => true,
			'marquee' => true, 'object' => true,
			'table' => true, 'td' => true, 'template' => true,
			'th' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true, 'desc' => true, 'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
			'mtext' => true, 'annotation-xml' => true
		]
	];

	/** Cache for inListItemScopeSet(); null until first requested. */
	private static $inListItemScopeSet = null;

	/**
	 * The "in list item scope" boundary set: everything in $inScopeSet
	 * plus the HTML ol and ul tags. Built on first use and then cached.
	 * @return array
	 */
	public static function inListItemScopeSet() {
		if ( self::$inListItemScopeSet !== null ) {
			return self::$inListItemScopeSet;
		}
		$listScope = self::$inScopeSet;
		$listScope[self::HTML_NAMESPACE]['ol'] = true;
		$listScope[self::HTML_NAMESPACE]['ul'] = true;
		self::$inListItemScopeSet = $listScope;
		return $listScope;
	}

	/** Cache for inButtonScopeSet(); null until first requested. */
	private static $inButtonScopeSet = null;

	/**
	 * The "in button scope" boundary set: everything in $inScopeSet plus
	 * the HTML button tag. Built on first use and then cached.
	 * @return array
	 */
	public static function inButtonScopeSet() {
		if ( self::$inButtonScopeSet !== null ) {
			return self::$inButtonScopeSet;
		}
		$buttonScope = self::$inScopeSet;
		$buttonScope[self::HTML_NAMESPACE]['button'] = true;
		self::$inButtonScopeSet = $buttonScope;
		return $buttonScope;
	}

	/** Boundary tags for the "in table scope" test. */
	public static $inTableScopeSet = [
		self::HTML_NAMESPACE => [
			'html' => true, 'table' => true, 'template' => true
		]
	];

	/** MathML tags which are text integration points. */
	public static $mathmlTextIntegrationPointSet = [
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
			'mtext' => true
		]
	];

	/** SVG tags which are HTML integration points. */
	public static $htmlIntegrationPointSet = [
		self::SVG_NAMESPACE => [
			'foreignobject' => true,
			'desc' => true,
			'title' => true
		]
	];

	// For tidy compatibility.
	public static $tidyPWrapSet = [
		self::HTML_NAMESPACE => [
			'body' => true, 'blockquote' => true,
			// We parse with <body> as the fragment context, but the top-level
			// element on the stack is actually <html>. We could use the
			// "adjusted current node" everywhere to work around this, but it's
			// easier just to add <html> to the p-wrap set.
			'html' => true,
		],
	];

	/** Tags treated as inline for tidy compatibility. */
	public static $tidyInlineSet = [
		self::HTML_NAMESPACE => [
			'a' => true, 'abbr' => true, 'acronym' => true, 'applet' => true,
			'b' => true, 'basefont' => true, 'bdo' => true, 'big' => true,
			'br' => true, 'button' => true, 'cite' => true, 'code' => true,
			'dfn' => true, 'em' => true, 'font' => true, 'i' => true,
			'iframe' => true, 'img' => true, 'input' => true, 'kbd' => true,
			'label' => true, 'legend' => true, 'map' => true, 'object' => true,
			'param' => true, 'q' => true, 'rb' => true, 'rbc' => true,
			'rp' => true, 'rt' => true, 'rtc' => true, 'ruby' => true,
			's' => true, 'samp' => true, 'select' => true, 'small' => true,
			'span' => true, 'strike' => true, 'strong' => true, 'sub' => true,
			'sup' => true, 'textarea' => true, 'tt' => true, 'u' => true,
			'var' => true,
		],
	];
}
274
/**
 * A BalanceElement is a simplified version of a DOM Node. The main
 * difference is that we only keep BalanceElements around for nodes
 * currently on the BalanceStack of open elements. As soon as an
 * element is closed, with some minor exceptions relating to the
 * tree builder "adoption agency algorithm", the element and all its
 * children are serialized to a string using the flatten() method.
 * This keeps our memory usage low.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceElement {
	/**
	 * The namespace of the element.
	 * @var string $namespaceURI
	 */
	public $namespaceURI;
	/**
	 * The lower-cased name of the element.
	 * @var string $localName
	 */
	public $localName;
	/**
	 * Attributes for the element, in array form
	 * @var array $attribs
	 */
	public $attribs;

	/**
	 * Parent of this element: a BalanceElement while attached, null while
	 * detached, or the string "flat" if this element has already been
	 * flattened into its parent.
	 * @var BalanceElement|string|null $parent
	 */
	public $parent;

	/**
	 * An array of children of this element. Typically only the last
	 * child will be an actual BalanceElement object; the rest will
	 * be strings, representing either text nodes or flattened
	 * BalanceElement objects.
	 * @var array $children
	 */
	public $children;

	/**
	 * Always the empty string. The constructor used to create this as an
	 * undeclared (dynamic) property; it is declared here so that existing
	 * readers of ->contents keep working without relying on dynamic
	 * properties.
	 * NOTE(review): nothing in this class reads it -- confirm whether any
	 * other code does before removing.
	 * @var string $contents
	 */
	public $contents = '';

	/**
	 * A unique string identifier for Noah's Ark purposes, lazy initialized
	 */
	private $noahKey;

	/**
	 * The next active formatting element in the list, or null if this is the
	 * end of the AFE list or if the element is not in the AFE list.
	 */
	public $nextAFE;

	/**
	 * The previous active formatting element in the list, or null if this is
	 * the start of the list or if the element is not in the AFE list.
	 */
	public $prevAFE;

	/**
	 * The next element in the Noah's Ark species bucket.
	 */
	public $nextNoah;

	/**
	 * Make a new BalanceElement corresponding to the HTML DOM Element
	 * with the given localname, namespace, and attributes. The new element
	 * starts detached (null parent) and without children.
	 *
	 * @param string $namespaceURI The namespace of the element.
	 * @param string $localName The lowercased name of the tag.
	 * @param array $attribs Attributes of the element
	 */
	public function __construct( $namespaceURI, $localName, array $attribs ) {
		Assert::parameterType( 'string', $namespaceURI, '$namespaceURI' );
		Assert::parameterType( 'string', $localName, '$localName' );

		$this->localName = $localName;
		$this->namespaceURI = $namespaceURI;
		$this->attribs = $attribs;
		$this->parent = null;
		$this->children = [];
	}

	/**
	 * Remove the given child from this element, leaving it detached.
	 * @param BalanceElement $elt
	 */
	private function removeChild( $elt ) {
		if ( $this->parent === 'flat' ) {
			// The message serializes $this (and its whole subtree), so
			// only build it when the precondition has actually failed.
			Assert::precondition(
				false, "Can't removeChild after flattening $this"
			);
		}
		Assert::parameterType( 'MediaWiki\Tidy\BalanceElement', $elt, '$elt' );
		Assert::parameter(
			$elt->parent === $this, 'elt', 'must have $this as a parent'
		);
		$idx = array_search( $elt, $this->children, true );
		Assert::parameter( $idx !== false, '$elt', 'must be a child of $this' );
		$elt->parent = null;
		array_splice( $this->children, $idx, 1 );
	}

	/**
	 * Find $a in the list of children and insert $b before it. If $b is
	 * an element which already has a parent, it is removed from that
	 * parent first.
	 * @param BalanceElement $a
	 * @param BalanceElement|string $b
	 */
	public function insertBefore( $a, $b ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't insertBefore after flattening."
		);
		Assert::parameterType( 'MediaWiki\Tidy\BalanceElement', $a, '$a' );
		Assert::parameterType( 'MediaWiki\Tidy\BalanceElement|string', $b, '$b' );
		$idx = array_search( $a, $this->children, true );
		Assert::parameter( $idx !== false, '$a', 'must be a child of $this' );
		if ( !is_string( $b ) ) {
			Assert::parameter( $b->parent !== 'flat', '$b', "Can't be flat" );
			if ( $b->parent !== null ) {
				$oldParent = $b->parent;
				$oldParent->removeChild( $b );
				if ( $oldParent === $this ) {
					// Detaching $b from our own child list may have
					// shifted $a, so its position must be looked up again.
					$idx = array_search( $a, $this->children, true );
				}
			}
			$b->parent = $this;
		}
		array_splice( $this->children, $idx, 0, [ $b ] );
	}

	/**
	 * Append $elt to the end of the list of children. If $elt is an
	 * element which already has a parent, it is removed from that parent
	 * first.
	 * @param BalanceElement|string $elt
	 */
	public function appendChild( $elt ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't appendChild after flattening."
		);
		Assert::parameterType( 'MediaWiki\Tidy\BalanceElement|string', $elt, '$elt' );
		if ( is_string( $elt ) ) {
			array_push( $this->children, $elt );
			return;
		}
		// Remove $elt from parent, if it had one.
		if ( $elt->parent !== null ) {
			$elt->parent->removeChild( $elt );
		}
		array_push( $this->children, $elt );
		$elt->parent = $this;
	}

	/**
	 * Transfer all of the children of $elt to $this, leaving $elt with
	 * no children.
	 * @param BalanceElement $elt
	 */
	public function adoptChildren( $elt ) {
		Assert::parameterType( 'MediaWiki\Tidy\BalanceElement', $elt, '$elt' );
		Assert::precondition(
			$elt->parent !== 'flat', "Can't adoptChildren after flattening."
		);
		foreach ( $elt->children as $child ) {
			if ( !is_string( $child ) ) {
				// This is an optimization which avoids an O(n^2) set of
				// array_splice operations.
				$child->parent = null;
			}
			$this->appendChild( $child );
		}
		$elt->children = [];
	}

	/**
	 * Flatten this node and all of its children into a string, as specified
	 * by the HTML serialization specification, and replace this node
	 * in its parent by that string. Afterwards the parent of this node
	 * is the marker string "flat".
	 *
	 * In tidy compatibility mode, child elements are flattened first; a
	 * mw:p-wrap element is renamed to p and, if it holds nothing but
	 * whitespace, flattens to the empty string. Other whitespace-only
	 * elements are still serialized, and a tr or li among them which has
	 * no attributes is given the class mw-empty-elt.
	 *
	 * @param bool $tidyCompat Whether to apply the tidy compatibility rules.
	 * @return string The string which replaced this node in its parent.
	 * @see __toString()
	 */
	public function flatten( $tidyCompat = false ) {
		Assert::parameter( $this->parent !== null, '$this', 'must be a child' );
		Assert::parameter( $this->parent !== 'flat', '$this', 'already flat' );
		$idx = array_search( $this, $this->parent->children, true );
		Assert::parameter(
			$idx !== false, '$this', 'must be a child of its parent'
		);
		if ( $tidyCompat ) {
			$blank = true;
			foreach ( $this->children as $elt ) {
				if ( !is_string( $elt ) ) {
					// The child replaces itself in $this->children as a
					// side effect; we only need the text to test it.
					$elt = $elt->flatten( $tidyCompat );
				}
				if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) {
					$blank = false;
				}
			}
			if ( $this->isA( 'mw:p-wrap' ) ) {
				$this->localName = 'p';
			} elseif ( $blank ) {
				// Add 'mw-empty-elt' class so elements can be hidden via CSS
				// for compatibility with legacy tidy.
				if ( !count( $this->attribs ) &&
					( $this->localName === 'tr' || $this->localName === 'li' )
				) {
					$this->attribs = [ 'class' => "mw-empty-elt" ];
				}
				// Only a blank p-wrap is dropped; anything else is kept.
				$blank = false;
			}
			$flat = $blank ? '' : "{$this}";
		} else {
			$flat = "{$this}";
		}
		$this->parent->children[$idx] = $flat;
		$this->parent = 'flat'; # for assertion checking
		return $flat;
	}

	/**
	 * Serialize this node and all of its children to a string, as specified
	 * by the HTML serialization specification.
	 *
	 * @return string The serialization of the BalanceElement
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments
	 */
	public function __toString() {
		$encAttribs = '';
		foreach ( $this->attribs as $name => $value ) {
			$encValue = Sanitizer::encodeAttribute( $value );
			$encAttribs .= " $name=\"$encValue\"";
		}
		if ( !$this->isA( BalanceSets::$emptyElementSet ) ) {
			$out = "<{$this->localName}{$encAttribs}>";
			// flatten children
			foreach ( $this->children as $elt ) {
				$out .= "{$elt}";
			}
			$out .= "</{$this->localName}>";
		} else {
			$out = "<{$this->localName}{$encAttribs} />";
			Assert::invariant(
				count( $this->children ) === 0,
				"Empty elements shouldn't have children."
			);
		}
		return $out;
	}

	# Utility functions on BalanceElements.

	/**
	 * Determine if $this represents a specific HTML tag, is a member of
	 * a tag set, or is equal to another BalanceElement.
	 *
	 * @param BalanceElement|array|string $set The target BalanceElement,
	 *   set (from the BalanceSets class), or string (HTML tag name).
	 * @return bool
	 */
	public function isA( $set ) {
		Assert::parameterType( 'MediaWiki\Tidy\BalanceElement|array|string', $set, '$set' );
		if ( $set instanceof BalanceElement ) {
			return $this === $set;
		} elseif ( is_array( $set ) ) {
			return isset( $set[$this->namespaceURI] ) &&
				isset( $set[$this->namespaceURI][$this->localName] );
		} else {
			# assume this is an HTML element name.
			return $this->isHtml() && $this->localName === $set;
		}
	}

	/**
	 * Determine if $this represents an element in the HTML namespace.
	 *
	 * @return bool
	 */
	public function isHtml() {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE;
	}

	/**
	 * Determine if $this represents a MathML text integration point,
	 * as defined in the HTML5 specification.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#mathml-text-integration-point
	 */
	public function isMathmlTextIntegrationPoint() {
		return $this->isA( BalanceSets::$mathmlTextIntegrationPointSet );
	}

	/**
	 * Determine if $this represents an HTML integration point,
	 * as defined in the HTML5 specification.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#html-integration-point
	 */
	public function isHtmlIntegrationPoint() {
		if ( $this->isA( BalanceSets::$htmlIntegrationPointSet ) ) {
			return true;
		}
		if (
			$this->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
			$this->localName === 'annotation-xml' &&
			isset( $this->attribs['encoding'] ) &&
			( strcasecmp( $this->attribs['encoding'], 'text/html' ) == 0 ||
			strcasecmp( $this->attribs['encoding'], 'application/xhtml+xml' ) == 0 )
		) {
			return true;
		}
		return false;
	}

	/**
	 * Get a string key for the Noah's Ark algorithm. The key is built
	 * from the namespace, the local name and the attributes sorted by
	 * name, and is cached after the first call.
	 *
	 * @return string
	 */
	public function getNoahKey() {
		if ( $this->noahKey === null ) {
			$attribs = $this->attribs;
			ksort( $attribs );
			$this->noahKey = serialize( [ $this->namespaceURI, $this->localName, $attribs ] );
		}
		return $this->noahKey;
	}
}
599
600 /**
601 * The "stack of open elements" as defined in the HTML5 tree builder
602 * spec. This contains methods to ensure that content (start tags, text)
603 * are inserted at the correct place in the output string, and to
604 * flatten BalanceElements as they are closed to avoid holding onto
605 * a complete DOM tree for the document in memory.
606 *
607 * The stack defines a PHP iterator to traverse it in "reverse order",
608 * that is, the most-recently-added element is visited first in a
609 * foreach loop.
610 *
611 * @ingroup Parser
612 * @since 1.27
613 * @see https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
614 */
615 class BalanceStack implements IteratorAggregate {
616 /**
617 * Backing storage for the stack.
618 * @var array $elements
619 */
620 private $elements = [];
621 /**
622 * Foster parent mode determines how nodes are inserted into the
623 * stack.
624 * @var bool $fosterParentMode
625 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
626 */
627 public $fosterParentMode = false;
628 /**
629 * Tidy compatibility mode, determines behavior of body/blockquote
630 */
631 public $tidyCompat = false;
632 /**
633 * Reference to the current element
634 */
635 public $currentNode;
636
/**
 * Create a new BalanceStack holding a single BalanceElement, the root
 * &lt;html&gt; node, which is also the initial current node.
 */
public function __construct() {
	# always a root <html> element on the stack
	$root = new BalanceElement( BalanceSets::HTML_NAMESPACE, 'html', [] );
	$this->elements[] = $root;
	$this->currentNode = $root;
}
649
/**
 * Return the output of the tree builder as a string: the
 * concatenation of all the children of the root &lt;html&gt; node.
 * Children which are still elements are flattened on the way.
 * @return string
 */
public function getOutput() {
	// Don't include the outer '<html>....</html>'
	$pieces = [];
	foreach ( $this->elements[0]->children as $child ) {
		if ( is_string( $child ) ) {
			$pieces[] = $child;
		} else {
			$pieces[] = $child->flatten( $this->tidyCompat );
		}
	}
	return implode( '', $pieces );
}
664
/**
 * Insert text at the appropriate place for inserting a node.
 *
 * The text is foster parented when foster parent mode is on and the
 * current node is a table, table section or row. Otherwise, in tidy
 * compatibility mode, text arriving directly inside a member of
 * BalanceSets::$tidyPWrapSet first opens a mw:p-wrap element and is
 * inserted into that. In all other cases the text is appended to the
 * current node.
 *
 * @param string $value
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertText( $value ) {
	Assert::parameterType( 'string', $value, '$value' );
	if (
		$this->fosterParentMode &&
		$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
	) {
		$this->fosterParent( $value );
	} elseif (
		$this->tidyCompat &&
		$this->currentNode->isA( BalanceSets::$tidyPWrapSet )
	) {
		$this->insertHTMLElement( 'mw:p-wrap', [] );
		// The new mw:p-wrap is now the current node, so this second
		// pass takes one of the other branches.
		$this->insertText( $value );
	} else {
		$this->currentNode->appendChild( $value );
	}
}
687
/**
 * Create a BalanceElement in the given namespace, insert it at the
 * appropriate place, and push it on to the open elements stack.
 * @param string $namespaceURI The element namespace
 * @param string $tag The tag name
 * @param array $attribs Normalized attributes, in array form (they
 *   are handed to the BalanceElement constructor unchanged).
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-foreign-element
 */
public function insertForeignElement( $namespaceURI, $tag, $attribs ) {
	$created = new BalanceElement( $namespaceURI, $tag, $attribs );
	return $this->insertElement( $created );
}
702
/**
 * Create an element in the HTML namespace, insert it at the
 * appropriate place, and push it on to the open elements stack.
 * @param string $tag The tag name
 * @param array $attribs Normalized attributes, in array form.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-an-html-element
 */
public function insertHTMLElement( $tag, $attribs ) {
	$htmlNs = BalanceSets::HTML_NAMESPACE;
	return $this->insertForeignElement( $htmlNs, $tag, $attribs );
}
716
/**
 * Insert an element at the appropriate place and push it on to the
 * open elements stack, making it the current node.
 *
 * If the current node is a mw:p-wrap and $elt is not a member of
 * BalanceSets::$tidyInlineSet, the mw:p-wrap is popped first. The
 * element is then foster parented when foster parent mode is on and
 * the current node is a table, table section or row; otherwise it is
 * appended to the current node.
 *
 * @param BalanceElement $elt
 * @return BalanceElement The element which was pushed on to the stack.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertElement( $elt ) {
	Assert::parameterType( 'MediaWiki\Tidy\BalanceElement', $elt, '$elt' );
	if (
		$this->currentNode->isA( 'mw:p-wrap' ) &&
		!$elt->isA( BalanceSets::$tidyInlineSet )
	) {
		// Tidy compatibility.
		$this->pop();
	}
	if (
		$this->fosterParentMode &&
		$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
	) {
		$elt = $this->fosterParent( $elt );
	} else {
		$this->currentNode->appendChild( $elt );
	}
	// The messages below serialize $elt, so they are only built once
	// an invariant is known to have failed rather than on every insert.
	if ( $elt->parent === null ) {
		Assert::invariant( false, "$elt must be in tree" );
	}
	if ( $elt->parent === 'flat' ) {
		Assert::invariant( false, "$elt must not have been previous flattened" );
	}
	array_push( $this->elements, $elt );
	$this->currentNode = $elt;
	return $elt;
}
747
/**
 * Determine if the stack has $tag in scope, using
 * BalanceSets::$inScopeSet as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-scope
 */
public function inScope( $tag ) {
	$boundary = BalanceSets::$inScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
757
/**
 * Determine if the stack has $tag in button scope, using
 * BalanceSets::inButtonScopeSet() as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-button-scope
 */
public function inButtonScope( $tag ) {
	$boundary = BalanceSets::inButtonScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
767
/**
 * Determine if the stack has $tag in list item scope, using
 * BalanceSets::inListItemScopeSet() as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-list-item-scope
 */
public function inListItemScope( $tag ) {
	$boundary = BalanceSets::inListItemScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
777
/**
 * Determine if the stack has $tag in table scope, using
 * BalanceSets::$inTableScopeSet as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-table-scope
 */
public function inTableScope( $tag ) {
	$boundary = BalanceSets::$inTableScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
787
/**
 * Determine if the stack has $tag in a specific scope, $set.
 *
 * The stack is walked from the current node towards the root. The
 * answer is true as soon as a node matching $tag is met, and false
 * as soon as a node matching $set is met first (or the stack is
 * exhausted). A node matching both counts as a match for $tag.
 *
 * @param BalanceElement|array|string $tag
 * @param BalanceElement|array|string $set
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-the-specific-scope
 */
public function inSpecificScope( $tag, $set ) {
	foreach ( $this as $node ) {
		if ( $node->isA( $tag ) ) {
			return true;
		}
		if ( $node->isA( $set ) ) {
			// Hit the scope boundary before finding $tag.
			break;
		}
	}
	return false;
}
806
/**
 * Generate implied end tags: keep popping the current node for as
 * long as it belongs to the implied end tags set and does not
 * match $butnot.
 * @param BalanceElement|array|string|null $butnot A node (or tag, or
 *   set) at which to stop, or null for no such restriction.
 * @param bool $thorough True if we should generate end tags thoroughly,
 *   i.e. use BalanceSets::$thoroughImpliedEndTagsSet.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#generate-implied-end-tags
 */
public function generateImpliedEndTags( $butnot = null, $thorough = false ) {
	$closable = BalanceSets::$impliedEndTagsSet;
	if ( $thorough ) {
		$closable = BalanceSets::$thoroughImpliedEndTagsSet;
	}
	while ( $this->length() > 0 ) {
		$top = $this->currentNode;
		$excluded = $butnot !== null && $top->isA( $butnot );
		if ( $excluded || !$top->isA( $closable ) ) {
			return;
		}
		$this->pop();
	}
}
827
/**
 * Return the adjusted current node: the fragment context when one is
 * given and only a single element is on the stack, and the current
 * node otherwise.
 * @param mixed $fragmentContext The fragment context, or a falsy
 *   value if there is none.
 * @return mixed Either $fragmentContext or the current node.
 */
public function adjustedCurrentNode( $fragmentContext ) {
	if ( $fragmentContext && $this->length() === 1 ) {
		return $fragmentContext;
	}
	return $this->currentNode;
}
835
/**
 * Return an iterator over this stack which visits the current node
 * first, and the root node last. This is what a foreach loop over
 * the stack uses.
 * @return Iterator
 */
public function getIterator() {
	$topDown = new ReverseArrayIterator( $this->elements );
	return $topDown;
}
844
/**
 * Look up the BalanceElement stored at position $idx of the stack;
 * position 0 is the root element.
 * @param int $idx
 * @return BalanceElement
 */
public function node( $idx ) {
	$found = $this->elements[ $idx ];
	return $found;
}
854
/**
 * Replace the element at position $idx in the BalanceStack with $elt.
 * Neither the replaced element nor $elt may already have been
 * flattened. If the replaced position is the top of the stack, $elt
 * also becomes the current node.
 * @param int $idx
 * @param BalanceElement $elt
 */
public function replaceAt( $idx, $elt ) {
	Assert::parameterType( 'MediaWiki\Tidy\BalanceElement', $elt, '$elt' );
	$outgoing = $this->elements[$idx];
	Assert::precondition(
		$outgoing->parent !== 'flat',
		'Replaced element should not have already been flattened.'
	);
	Assert::precondition(
		$elt->parent !== 'flat',
		'New element should not have already been flattened.'
	);
	$this->elements[$idx] = $elt;
	$topIdx = count( $this->elements ) - 1;
	if ( $idx === $topIdx ) {
		// Keep the cached current node in step with the stack.
		$this->currentNode = $elt;
	}
}
875
/**
 * Return the position of the given BalanceElement, set, or
 * HTML tag name string in the BalanceStack. The search starts at the
 * current node, so the match nearest the top of the stack wins.
 * @param BalanceElement|array|string $tag
 * @return int The position of the match, or -1 if nothing matches.
 */
public function indexOf( $tag ) {
	$pos = count( $this->elements );
	while ( --$pos >= 0 ) {
		if ( $this->elements[$pos]->isA( $tag ) ) {
			return $pos;
		}
	}
	return -1;
}
890
/**
 * Count the elements currently on the BalanceStack.
 * @return int
 */
public function length() {
	$size = count( $this->elements );
	return $size;
}
898
/**
 * Remove the current node from the BalanceStack and make the element
 * below it (or null, if the stack is now empty) the current node.
 * The removed element is flattened in the process, unless it is a
 * mw:p-wrap element, which is left unflattened here.
 */
public function pop() {
	$popped = array_pop( $this->elements );
	$remaining = count( $this->elements );
	$this->currentNode = $remaining > 0 ?
		$this->elements[ $remaining - 1 ] : null;
	if ( $popped->isA( 'mw:p-wrap' ) ) {
		return;
	}
	$popped->flatten( $this->tidyCompat );
}
914
/**
 * Remove all nodes up to and including position $idx from the
 * BalanceStack, flattening them in the process. Afterwards the stack
 * holds exactly $idx elements.
 * @param int $idx
 */
public function popTo( $idx ) {
	while ( true ) {
		if ( $this->length() <= $idx ) {
			return;
		}
		$this->pop();
	}
}
925
/**
 * Pop elements off the stack up to and including the first
 * element with the specified HTML tagname (or matching the given
 * set). If nothing matches, the whole stack is popped.
 * @param BalanceElement|array|string $tag
 */
public function popTag( $tag ) {
	while ( $this->length() > 0 ) {
		// Test before popping: pop() changes the current node.
		$matched = $this->currentNode->isA( $tag );
		$this->pop();
		if ( $matched ) {
			return;
		}
	}
}
941
/**
 * Pop elements off the stack *not including* the first element
 * in the specified set. The bottom element of the stack is never
 * popped, whether or not it is in the set.
 * @param BalanceElement|array|string $set
 */
public function clearToContext( $set ) {
	// Note that we don't loop to 0. Never pop the <html> elt off.
	while ( $this->length() > 1 && !$this->currentNode->isA( $set ) ) {
		$this->pop();
	}
}
956
/**
 * Remove the given $elt from the BalanceStack, optionally
 * flattening it in the process. If $elt was the current node, the
 * element below it (or null, if the stack is now empty) becomes the
 * current node.
 * @param BalanceElement $elt The element to remove.
 * @param bool $flatten Whether to flatten the removed element.
 */
public function removeElement( $elt, $flatten = true ) {
	Assert::parameterType( 'MediaWiki\Tidy\BalanceElement', $elt, '$elt' );
	Assert::parameter(
		$elt->parent !== 'flat',
		'$elt',
		'$elt should not already have been flattened.'
	);
	Assert::parameter(
		// A detached element has no parent to inspect.
		$elt->parent === null || $elt->parent->parent !== 'flat',
		'$elt',
		'The parent of $elt should not already have been flattened.'
	);
	$idx = array_search( $elt, $this->elements, true );
	Assert::parameter( $idx !== false, '$elt', 'must be in stack' );
	array_splice( $this->elements, $idx, 1 );
	$remaining = count( $this->elements );
	if ( $idx === $remaining ) {
		// $elt was on top of the stack; mirror what pop() does,
		// including the case where the stack is now empty.
		$this->currentNode = $remaining > 0 ?
			$this->elements[$remaining - 1] : null;
	}
	if ( $flatten ) {
		// serialize $elt into its parent
		// otherwise, it will eventually serialize when the parent
		// is serialized, we just hold onto the memory for its
		// tree of objects a little longer.
		$elt->flatten( $this->tidyCompat );
	}
	Assert::postcondition(
		array_search( $elt, $this->elements, true ) === false,
		'$elt should no longer be in open elements stack'
	);
}
993
/**
 * Find $a in the BalanceStack and insert $b after it.
 *
 * If $a is the last entry, $b becomes the new current node.
 *
 * @param BalanceElement $a The element to search for; must be in the
 *   stack.
 * @param BalanceElement $b The element to insert.
 */
public function insertAfter( $a, $b ) {
	Assert::parameterType( 'MediaWiki\Tidy\BalanceElement', $a, '$a' );
	Assert::parameterType( 'MediaWiki\Tidy\BalanceElement', $b, '$b' );
	$idx = $this->indexOf( $a );
	// indexOf() reports "not found" with a negative index (callers in
	// this class test `< 0` / `>= 0`), so comparing against false alone
	// would let a missing $a through and insert $b at the front.
	Assert::parameter( $idx !== false && $idx >= 0, '$a', 'must be in stack' );
	if ( $idx === count( $this->elements ) - 1 ) {
		// $a is the top of the stack: $b becomes the current node.
		$this->elements[] = $b;
		$this->currentNode = $b;
	} else {
		array_splice( $this->elements, $idx + 1, 0, [ $b ] );
	}
}
1011
1012 # Fostering and adoption.
1013
/**
 * Foster parent the given $elt in the stack of open elements.
 *
 * The foster parent is chosen from the last `template` and last
 * `table` on the stack: the template itself if it is nearer the top
 * than any table; otherwise the table's parent (inserting just before
 * the table); otherwise the first entry on the stack.
 *
 * In tidy-compatibility mode, fostered text may instead be wrapped in
 * a new `mw:p-wrap` element, and a fostered `mw:p-wrap` element may be
 * dropped in favour of an existing adjacent one.
 *
 * @param BalanceElement|string $elt
 * @return BalanceElement|string The fostered $elt, or the existing
 *   p-wrapper that was re-used in its place.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
 */
private function fosterParent( $elt ) {
	Assert::parameterType( 'MediaWiki\Tidy\BalanceElement|string', $elt, '$elt' );
	$tableIdx = $this->indexOf( 'table' );
	$templateIdx = $this->indexOf( 'template' );
	// When set, $elt is inserted before this sibling rather than appended.
	$sibling = null;

	$templateWins = $templateIdx >= 0 &&
		( $tableIdx < 0 || $templateIdx > $tableIdx );
	if ( $templateWins ) {
		$target = $this->elements[$templateIdx];
	} elseif ( $tableIdx >= 0 ) {
		$sibling = $this->elements[$tableIdx];
		$target = $sibling->parent;
		# Assume all tables have parents, since we're not running scripts!
		Assert::invariant(
			$target !== null, "All tables should have parents"
		);
	} else {
		$target = $this->elements[0]; // the `html` element.
	}

	if ( $this->tidyCompat ) {
		if ( is_string( $elt ) ) {
			// Fostered text: wrap it in a p-wrapper if the foster
			// parent calls for one.
			if ( $target->isA( BalanceSets::$tidyPWrapSet ) ) {
				$this->insertHTMLElement( 'mw:p-wrap', [] );
				$this->insertText( $elt );
				return $elt;
			}
		} elseif ( $elt->isA( 'mw:p-wrap' ) ) {
			// Fostered p-wrapper: if the node which would end up
			// immediately before it is already a p-wrapper, merge.
			$pos = $sibling ?
				array_search( $sibling, $target->children, true ) :
				count( $target->children );
			$preceding = $pos > 0 ? $target->children[$pos - 1] : '';
			if (
				$preceding instanceof BalanceElement &&
				$preceding->isA( 'mw:p-wrap' )
			) {
				return $preceding; // Re-use existing p-wrapper.
			}
		}
	}

	if ( $sibling ) {
		$target->insertBefore( $sibling, $elt );
	} else {
		$target->appendChild( $elt );
	}
	return $elt;
}
1071
/**
 * Run the "adoption agency algorithm" (AAA) for the given subject
 * tag name.
 * @param string $tag The subject tag name.
 * @param BalanceActiveFormattingElements $afe The current
 *   active formatting elements list.
 * @return bool True if the adoption agency algorithm "did something",
 *   false if more processing is required by the caller.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
 */
public function adoptionAgency( $tag, $afe ) {
	// Shortcut from the spec: if the current node has the subject tag
	// name and is not in the list of active formatting elements, just
	// pop it off the stack of open elements and stop.
	$top = $this->currentNode;
	if ( $top->isA( $tag ) && !$afe->isInList( $top ) ) {
		$this->pop();
		return true; // no more handling required
	}

	// Outer loop: the spec allows at most eight iterations.
	for ( $outer = 0; $outer < 8; $outer++ ) {
		// The formatting element is the last entry in the list of
		// active formatting elements, after the last marker (if any),
		// which has the same tag name as the token.
		$fmtElt = $afe->findElementByTag( $tag );

		// No such entry: the caller must act as described in the
		// "any other end tag" case instead.
		if ( !$fmtElt ) {
			return false; // false means handle by the default case
		}

		// The entry exists but is not in the stack of open elements:
		// parse error; drop it from the list and stop.
		$fmtIndex = $this->indexOf( $fmtElt );
		if ( $fmtIndex < 0 ) {
			$afe->remove( $fmtElt );
			return true; // true means no more handling required
		}

		// The entry is open but not in scope: parse error; ignore the
		// token and stop.
		if ( !$this->inScope( $fmtElt ) ) {
			return true;
		}

		// The furthest block is the topmost node in the stack of open
		// elements that is lower in the stack than the formatting
		// element and is in the special category. There might not
		// be one.
		$furthest = null;
		$furthestIndex = -1;
		$depth = $this->length();
		for ( $i = $fmtIndex + 1; $i < $depth; $i++ ) {
			if ( $this->node( $i )->isA( BalanceSets::$specialSet ) ) {
				$furthest = $this->node( $i );
				$furthestIndex = $i;
				break;
			}
		}

		// Without a furthest block, skip the remaining steps: pop
		// everything from the current node up to and including the
		// formatting element, and remove the formatting element from
		// the list of active formatting elements.
		if ( !$furthest ) {
			$this->popTag( $fmtElt );
			$afe->remove( $fmtElt );
			return true;
		}

		// The common ancestor is the element immediately above the
		// formatting element in the stack of open elements.
		$ancestor = $this->node( $fmtIndex - 1 );

		// The bookmark records the formatting element's position in
		// the list of active formatting elements relative to its
		// neighbours in that list.
		$bookmark = new BalanceElement( '[bookmark]', '[bookmark]', [] );
		$afe->insertAfter( $fmtElt, $bookmark );

		// Node and last node both start out at the furthest block.
		$lastNode = $furthest;
		$nodeIndex = $furthestIndex;

		// Inner loop; $inner is the spec's inner loop counter.
		for ( $inner = 1; ; $inner++ ) {
			// Move node to the element immediately above it in the
			// stack of open elements. If node was removed from the
			// stack on the previous pass, this lands on the element
			// that was immediately above it before the removal.
			$nodeIndex--;
			$node = $this->node( $nodeIndex );

			// Reaching the formatting element ends the inner loop.
			if ( $node === $fmtElt ) {
				break;
			}

			// After more than three passes, a node which is in the
			// list of active formatting elements is removed from it.
			$inAfe = $afe->isInList( $node );
			if ( $inAfe && $inner > 3 ) {
				$afe->remove( $node );
				$inAfe = false;
			}

			// A node which is not in the list of active formatting
			// elements is removed from the stack of open elements,
			// and the inner loop starts over.
			if ( !$inAfe ) {
				// Don't flatten here, since we're about to relocate
				// parts of this $node.
				$this->removeElement( $node, false );
				continue;
			}

			// Create a fresh element for the token node was created
			// for, and substitute it for node both in the list of
			// active formatting elements and in the stack of open
			// elements. From here on it plays the role of node.
			$replacement = new BalanceElement(
				$node->namespaceURI, $node->localName, $node->attribs );
			$afe->replace( $node, $replacement );
			$this->replaceAt( $nodeIndex, $replacement );

			// If last node is still the furthest block, move the
			// bookmark to sit immediately after the new node in the
			// list of active formatting elements.
			if ( $lastNode === $furthest ) {
				$afe->remove( $bookmark );
				$afe->insertAfter( $replacement, $bookmark );
			}

			// Insert last node into node, first removing it from
			// its previous parent node if any.
			$replacement->appendChild( $lastNode );

			// Let last node be node.
			$lastNode = $replacement;
		}

		// If the common ancestor is a table, tbody, tfoot, thead, or
		// tr element, foster parent whatever last node ended up
		// being; otherwise append it to the common ancestor. Either
		// way it is first removed from its previous parent, if any.
		if (
			$this->fosterParentMode &&
			$ancestor->isA( BalanceSets::$tableSectionRowSet )
		) {
			$this->fosterParent( $lastNode );
		} else {
			$ancestor->appendChild( $lastNode );
		}

		// Create an element for the token the formatting element was
		// created for, with the furthest block as intended parent.
		$fmtClone = new BalanceElement(
			$fmtElt->namespaceURI, $fmtElt->localName, $fmtElt->attribs );

		// Move all the furthest block's children into the new
		// element, then append the new element to the furthest block.
		$fmtClone->adoptChildren( $furthest );
		$furthest->appendChild( $fmtClone );

		// In the list of active formatting elements, remove the
		// formatting element and put the new element at the
		// bookmarked position.
		$afe->remove( $fmtElt );
		$afe->replace( $bookmark, $fmtClone );

		// In the stack of open elements, remove the formatting
		// element and insert the new element immediately below the
		// furthest block.
		$this->removeElement( $fmtElt );
		$this->insertAfter( $furthest, $fmtClone );
	}

	return true;
}
1293
/**
 * Return the contents of the open elements stack as a string for
 * debugging: the local names of the elements, in stack order,
 * separated by single spaces.
 * @return string
 */
public function __toString() {
	$names = [];
	foreach ( $this->elements as $elt ) {
		$names[] = $elt->localName;
	}
	// Separator first: the legacy (array, separator) argument order
	// is deprecated in PHP 7.4 and rejected by PHP 8.
	return implode( ' ', $names );
}
1306 }
1307
/**
 * A pseudo-element used as a marker in the list of active formatting
 * elements. It carries only the two link fields that thread it into
 * that list.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceMarker {
	/** Next entry in the list of active formatting elements, if any */
	public $nextAFE = null;

	/** Previous entry in the list of active formatting elements, if any */
	public $prevAFE = null;
}
1318
/**
 * The list of active formatting elements, which is used to handle
 * mis-nested formatting element tags in the HTML5 tree builder
 * specification.
 *
 * The list is a doubly-linked list threaded through the `prevAFE` and
 * `nextAFE` properties of its entries, which are BalanceElement
 * objects and BalanceMarker objects.
 *
 * @ingroup Parser
 * @since 1.27
 * @see https://html.spec.whatwg.org/multipage/syntax.html#list-of-active-formatting-elements
 */
class BalanceActiveFormattingElements {
	/** The last (most recent) element in the list */
	private $tail;

	/** The first (least recent) element in the list */
	private $head;

	/**
	 * An array of arrays representing the population of elements in each bucket
	 * according to the Noah's Ark clause. The outer array is stack-like, with each
	 * integer-indexed element representing a segment of the list, bounded by
	 * markers. The first element represents the segment of the list before the
	 * first marker.
	 *
	 * The inner arrays are indexed by "Noah key", which is a string which uniquely
	 * identifies each bucket according to the rules in the spec. The value in
	 * the inner array is the first (least recently inserted) element in the bucket,
	 * and subsequent members of the bucket can be found by iterating through the
	 * singly-linked list via $node->nextNoah.
	 *
	 * This is optimised for the most common case of inserting into a bucket
	 * with zero members, and deleting a bucket containing one member. In the
	 * worst case, iteration through the list is still O(1) in the document
	 * size, since each bucket can have at most 3 members.
	 */
	private $noahTableStack = [ [] ];

	/**
	 * Unlink every entry from its neighbours and drop our own
	 * references to the list.
	 */
	public function __destruct() {
		for ( $node = $this->head; $node; $node = $next ) {
			$next = $node->nextAFE;
			$node->prevAFE = $node->nextAFE = null;
			// Markers are never placed in a Noah bucket and declare no
			// nextNoah property, so only clear it on real elements.
			if ( !( $node instanceof BalanceMarker ) ) {
				$node->nextNoah = null;
			}
		}
		$this->head = $this->tail = $this->noahTableStack = null;
	}

	/**
	 * Append a marker to the list and start a new, empty Noah table
	 * for the segment of the list which follows it.
	 */
	public function insertMarker() {
		$elt = new BalanceMarker;
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
		$this->noahTableStack[] = [];
	}

	/**
	 * Follow the steps required when the spec requires us to "push onto the
	 * list of active formatting elements".
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException if $elt is already in the list
	 */
	public function push( BalanceElement $elt ) {
		// Must not be in the list already
		if ( $elt->prevAFE !== null || $this->head === $elt ) {
			throw new ParameterAssertionException( '$elt',
				'Cannot insert a node into the AFE list twice' );
		}

		// "Noah's Ark clause" -- if there are already three copies of
		// this element before we encounter a marker, then drop the
		// earliest one.
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			// Walk the bucket to find its last member and its size.
			$count = 1;
			$head = $tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
				$count++;
			}
			if ( $count >= 3 ) {
				$this->remove( $head );
			}
			$tail->nextNoah = $elt;
		}
		// Add to the main AFE list
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
	}

	/**
	 * Follow the steps required when the spec asks us to "clear the list of
	 * active formatting elements up to the last marker".
	 */
	public function clearToMarker() {
		// Iterate back through the list starting from the tail
		$tail = $this->tail;
		while ( $tail && !( $tail instanceof BalanceMarker ) ) {
			// Unlink the element
			$prev = $tail->prevAFE;
			$tail->prevAFE = null;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail->nextNoah = null;
			$tail = $prev;
		}
		// If we finished on a marker, unlink it and pop it off the Noah table stack
		if ( $tail ) {
			$prev = $tail->prevAFE;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail = $prev;
			array_pop( $this->noahTableStack );
		} else {
			// No marker: wipe the top-level Noah table (which is the only one)
			$this->noahTableStack[0] = [];
		}
		// If we removed all the elements, clear the head pointer
		if ( !$tail ) {
			$this->head = null;
		}
		$this->tail = $tail;
	}

	/**
	 * Find and return the last element with the specified tag between the
	 * end of the list and the last marker on the list.
	 * Used when parsing &lt;a&gt; "in body mode".
	 * @param string $tag The local name to look for.
	 * @return BalanceElement|null The matching element, or null if there
	 *   is none after the last marker.
	 */
	public function findElementByTag( $tag ) {
		$elt = $this->tail;
		while ( $elt && !( $elt instanceof BalanceMarker ) ) {
			if ( $elt->localName === $tag ) {
				return $elt;
			}
			$elt = $elt->prevAFE;
		}
		return null;
	}

	/**
	 * Determine whether an element is in the list of formatting elements.
	 * @param BalanceElement $elt
	 * @return bool
	 */
	public function isInList( BalanceElement $elt ) {
		// Every entry except the head has a predecessor link.
		return $this->head === $elt || $elt->prevAFE;
	}

	/**
	 * Find the element $elt in the list and remove it.
	 * Used when parsing &lt;a&gt; in body mode.
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException if $elt is not in the list
	 */
	public function remove( BalanceElement $elt ) {
		if ( $this->head !== $elt && !$elt->prevAFE ) {
			throw new ParameterAssertionException( '$elt',
				"Attempted to remove an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $elt ) {
			$this->head = $elt->nextAFE;
		}
		if ( $this->tail === $elt ) {
			$this->tail = $elt->prevAFE;
		}
		// Update previous element
		if ( $elt->prevAFE ) {
			$elt->prevAFE->nextAFE = $elt->nextAFE;
		}
		// Update next element
		if ( $elt->nextAFE ) {
			$elt->nextAFE->prevAFE = $elt->prevAFE;
		}
		// Clear pointers so that isInList() etc. will work
		$elt->prevAFE = $elt->nextAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $elt );
	}

	/**
	 * Append $elt to its bucket in the current (topmost) Noah table.
	 * @param BalanceElement $elt
	 */
	private function addToNoahList( BalanceElement $elt ) {
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			$tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
			}
			$tail->nextNoah = $elt;
		}
	}

	/**
	 * Unlink $elt from its bucket in the current (topmost) Noah table.
	 * Does nothing if the bucket does not exist or does not contain
	 * $elt.
	 * @param BalanceElement $elt
	 */
	private function removeFromNoahList( BalanceElement $elt ) {
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		$key = $elt->getNoahKey();
		if ( !isset( $table[$key] ) ) {
			// No such bucket in the current table: nothing to unlink,
			// and walking the chain would dereference null.
			return;
		}
		$noahElt = $table[$key];
		if ( $noahElt === $elt ) {
			// $elt is the bucket head: promote its successor, or drop
			// the bucket if it was the only member.
			if ( $noahElt->nextNoah ) {
				$table[$key] = $noahElt->nextNoah;
				$noahElt->nextNoah = null;
			} else {
				unset( $table[$key] );
			}
		} else {
			do {
				$prevNoahElt = $noahElt;
				$noahElt = $prevNoahElt->nextNoah;
				if ( $noahElt === $elt ) {
					// Found it, unlink
					$prevNoahElt->nextNoah = $elt->nextNoah;
					$elt->nextNoah = null;
					break;
				}
			} while ( $noahElt );
		}
	}

	/**
	 * Find element $a in the list and replace it with element $b
	 * @param BalanceElement $a The element to replace; must be in the list.
	 * @param BalanceElement $b The element which takes its place.
	 * @throws ParameterAssertionException if $a is not in the list
	 */
	public function replace( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to replace an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $a ) {
			$this->head = $b;
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		// Update previous element
		if ( $a->prevAFE ) {
			$a->prevAFE->nextAFE = $b;
		}
		// Update next element
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->prevAFE = $a->prevAFE;
		$b->nextAFE = $a->nextAFE;
		$a->nextAFE = $a->prevAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $a );
		$this->addToNoahList( $b );
	}

	/**
	 * Find $a in the list and insert $b after it.
	 * @param BalanceElement $a The element to insert after; must be in
	 *   the list.
	 * @param BalanceElement $b The element to insert.
	 * @throws ParameterAssertionException if $a is not in the list
	 */
	public function insertAfter( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to insert after an element which is not in the AFE list" );
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->nextAFE = $a->nextAFE;
		$b->prevAFE = $a;
		$a->nextAFE = $b;
		$this->addToNoahList( $b );
	}

	// @codingStandardsIgnoreStart Generic.Files.LineLength.TooLong
	/**
	 * Reconstruct the active formatting elements.
	 * @param BalanceStack $stack The open elements stack
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#reconstruct-the-active-formatting-elements
	 */
	// @codingStandardsIgnoreEnd
	public function reconstruct( $stack ) {
		$entry = $this->tail;
		// If there are no entries in the list of active formatting elements,
		// then there is nothing to reconstruct
		if ( !$entry ) {
			return;
		}
		// If the last is a marker, do nothing.
		if ( $entry instanceof BalanceMarker ) {
			return;
		}
		// Or if it is an open element, do nothing.
		if ( $stack->indexOf( $entry ) >= 0 ) {
			return;
		}

		// Loop backward through the list until we find a marker or an
		// open element. Record explicitly whether we found one: the
		// boundary entry may itself be the head of the list, so "has a
		// predecessor" is not a reliable indication.
		$foundBoundary = false;
		while ( $entry->prevAFE ) {
			$entry = $entry->prevAFE;
			if ( $entry instanceof BalanceMarker || $stack->indexOf( $entry ) >= 0 ) {
				$foundBoundary = true;
				break;
			}
		}

		// Now loop forward, starting from the element after the current one (or
		// the first element if we didn't find a marker or open element),
		// recreating formatting elements and pushing them back onto the list
		// of open elements.
		if ( $foundBoundary ) {
			$entry = $entry->nextAFE;
		}
		do {
			$newElement = $stack->insertHTMLElement(
				$entry->localName,
				$entry->attribs );
			$this->replace( $entry, $newElement );
			$entry = $newElement->nextAFE;
		} while ( $entry );
	}

	/**
	 * Get a string representation of the AFE list, for debugging
	 * @return string One line per entry, from head to tail, with
	 *   diagnostics appended where the list links are inconsistent.
	 */
	public function __toString() {
		$prev = null;
		$s = '';
		for ( $node = $this->head; $node; $prev = $node, $node = $node->nextAFE ) {
			if ( $node instanceof BalanceMarker ) {
				$s .= "MARKER\n";
				continue;
			}
			$s .= $node->localName . '#' . substr( md5( spl_object_hash( $node ) ), 0, 8 );
			if ( $node->nextNoah ) {
				$s .= " (noah sibling: {$node->nextNoah->localName}#" .
					substr( md5( spl_object_hash( $node->nextNoah ) ), 0, 8 ) .
					')';
			}
			if ( $node->nextAFE && $node->nextAFE->prevAFE !== $node ) {
				$s .= " (reverse link is wrong!)";
			}
			$s .= "\n";
		}
		if ( $prev !== $this->tail ) {
			$s .= "(tail pointer is wrong!)\n";
		}
		return $s;
	}
}
1672
1673 /**
1674 * An implementation of the tree building portion of the HTML5 parsing
1675 * spec.
1676 *
1677 * This is used to balance and tidy output so that the result can
1678 * always be cleanly serialized/deserialized by an HTML5 parser. It
1679 * does *not* guarantee "conforming" output -- the HTML5 spec contains
1680 * a number of constraints which are not enforced by the HTML5 parsing
1681 * process. But the result will be free of gross errors: misnested or
1682 * unclosed tags, for example, and will be unchanged by spec-compliant
1683 * parsing followed by serialization.
1684 *
1685 * The tree building stage is structured as a state machine.
1686 * When comparing the implementation to
1687 * https://www.w3.org/TR/html5/syntax.html#tree-construction
1688 * note that each state is implemented as a function with a
1689 * name ending in `Mode` (because the HTML spec refers to them
1690 * as insertion modes). The current insertion mode is held by
1691 * the $parseMode property.
1692 *
1693 * The following simplifications have been made:
1694 * - We handle body content only (ie, we start `in body`.)
1695 * - The document is never in "quirks mode".
1696 * - All occurrences of < and > have been entity escaped, so we
1697 * can parse tags by simply splitting on those two characters.
1698 * Similarly, all attributes have been "cleaned" and are double-quoted
1699 * and escaped.
1700 * - All comments and null characters are assumed to have been removed.
1701 * - We don't alter linefeeds after <pre>/<listing>.
1702 * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
1703 * <form>, <frame>, <plaintext>, <isindex>, <textarea>, <xmp>, <iframe>,
1704 * <noembed>, <noscript>, <select>, <script>, <title>. As a result,
1705 * further simplifications can be made:
1706 * - `frameset-ok` is not tracked.
1707 * - `form element pointer` is not tracked.
1708 * - `head element pointer` is not tracked (but presumed non-null)
1709 * - Tokenizer has only a single mode.
1710 *
1711 * We generally mark places where we omit cases from the spec due to
1712 * disallowed elements with a comment: `# OMITTED: <element-name>`.
1713 *
1714 * The HTML spec keeps a flag during the parsing process to track
1715 * whether or not a "parse error" has been encountered. We don't
1716 * bother to track that flag, we just implement the error-handling
1717 * process as specified.
1718 *
1719 * @ingroup Parser
1720 * @since 1.27
1721 * @see https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
1722 */
1723 class Balancer {
1724 private $parseMode;
1725 private $bitsIterator;
1726 private $allowedHtmlElements;
1727 private $afe;
1728 private $stack;
1729 private $strict;
1730 private $tidyCompat;
1731
1732 private $textIntegrationMode = false;
1733 private $pendingTableText;
1734 private $originalInsertionMode;
1735 private $fragmentContext;
1736
/**
 * Create a new Balancer.
 * @param array $config Balancer configuration.  Includes:
 *     'strict' : boolean, defaults to false.
 *         When true, enforces syntactic constraints on input:
 *         all non-tag '<' must be escaped, all attributes must be
 *         separated by a single space and double-quoted.  This is
 *         consistent with the output of the Sanitizer.
 *     'allowedHtmlElements' : array, defaults to null.
 *         When present, the keys of this associative array give
 *         the acceptable HTML tag names.  When not present, no
 *         tag sanitization is done.
 *     'tidyCompat' : boolean, defaults to false.
 *         When true, the serialization algorithm is tweaked to
 *         provide historical compatibility with the old "tidy"
 *         program: <p>-wrapping is done to the children of
 *         <body> and <blockquote> elements, and empty elements
 *         are removed.
 * @throws ParameterAssertionException if 'allowedHtmlElements' names
 *   an element which the balancer does not support.
 */
public function __construct( array $config = [] ) {
	$config = $config + [
		'strict' => false,
		'allowedHtmlElements' => null,
		'tidyCompat' => false,
	];
	$this->allowedHtmlElements = $config['allowedHtmlElements'];
	$this->strict = $config['strict'];
	$this->tidyCompat = $config['tidyCompat'];
	if ( $this->allowedHtmlElements !== null ) {
		# Sanity check!
		// Only the keys (tag names) matter on either side, so
		// intersect by key and ignore the values.
		$bad = array_intersect_key(
			$this->allowedHtmlElements,
			BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE]
		);
		if ( count( $bad ) > 0 ) {
			// Separator first: the legacy (array, separator) argument
			// order is deprecated in PHP 7.4 and rejected by PHP 8.
			$badstr = implode( ',', array_keys( $bad ) );
			throw new ParameterAssertionException(
				'$config',
				'Balance attempted with sanitization including ' .
				"unsupported elements: {$badstr}"
			);
		}
	}
}
1786
/**
 * Return a balanced HTML string for the HTML fragment given by $text,
 * subject to the caveats listed in the class description.  The result
 * will typically be idempotent -- that is, rebalancing the output
 * would result in no change.
 *
 * The input is split on '<'; the leading piece is fed to the tree
 * builder as text, each following piece is handled by advance(), and
 * an 'eof' token is sent once the input is exhausted.
 *
 * @param string $text The markup to be balanced
 * @param callable|null $processingCallback Callback to do any variable or
 *   parameter replacements in HTML attributes values
 * @param array|bool $processingArgs Arguments for the processing callback
 * @return string The balanced markup
 */
public function balance( $text, $processingCallback = null, $processingArgs = [] ) {
	// Per-run parser state.
	$this->parseMode = 'inBodyMode';
	$this->bitsIterator = new ExplodeIterator( '<', $text );
	$this->afe = new BalanceActiveFormattingElements();
	$openElements = new BalanceStack();
	$this->stack = $openElements;
	$openElements->tidyCompat = $this->tidyCompat;
	$this->processingCallback = $processingCallback;
	$this->processingArgs = $processingArgs;

	# The stack is constructed with an <html> element already on it.
	# Set this up as a fragment parsed with <body> as the context.
	$this->fragmentContext =
		new BalanceElement( BalanceSets::HTML_NAMESPACE, 'body', [] );
	$this->resetInsertionMode();

	// Whatever precedes the first '<' is text, not a tag. Escape any
	// stray '>' in it before handing it to the tree builder.
	$leadingText = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$this->insertToken( 'text', str_replace( '>', '&gt;', $leadingText ) );

	// Every remaining piece starts with a tag.
	while ( $this->bitsIterator->valid() ) {
		$this->advance();
	}
	$this->insertToken( 'eof', null );
	$balanced = $this->stack->getOutput();

	// Release the per-run state before returning.
	$this->bitsIterator = null;
	$this->afe = null;
	$this->stack = null;
	$this->fragmentContext = null;
	return $balanced;
}
1831
/**
 * Pass a token to the tree builder.  The $token will be one of the
 * strings "tag", "endtag", "text", or "eof".
 *
 * Tags in the unsupported set are dropped (after an invariant check
 * that fails in strict mode) and empty text tokens are ignored.
 * Everything else goes either to the foreign-content handler or to
 * the handler for the current insertion mode, depending on the
 * adjusted current node.
 *
 * @param string $token
 * @param string|null $value The tag name, or the text.
 * @param array|null $attribs
 * @param bool $selfclose
 * @return bool False if the token was an unsupported tag; true for an
 *   ignored empty text token; otherwise whatever the selected handler
 *   returns.
 */
private function insertToken( $token, $value, $attribs = null, $selfclose = false ) {
	$isTagToken = ( $token === 'tag' || $token === 'endtag' );
	if ( $isTagToken ) {
		// validate tags against $unsupportedSet
		if ( isset( BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE][$value] ) ) {
			# As described in "simplifications" above, these tags are
			# not supported in the balancer.
			Assert::invariant(
				!$this->strict,
				"Unsupported $token <$value> found."
			);
			return false;
		}
	} elseif ( $token === 'text' && $value === '' ) {
		# Don't actually inject the empty string as a text token.
		return true;
	}

	// Decide between the HTML insertion-mode rules and the
	// foreign-content rules, based on the adjusted current node.
	$adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );

	if (
		$this->stack->length() === 0 ||
		$adjusted->isHtml() ||
		$token === 'eof'
	) {
		$useHtmlRules = true;
	} elseif ( $adjusted->isMathmlTextIntegrationPoint() ) {
		// Text, and any start tag other than mglyph / malignmark.
		$useHtmlRules = $token === 'text' || (
			$token === 'tag' &&
			$value !== 'mglyph' && $value !== 'malignmark'
		);
	} elseif (
		$adjusted->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
		$adjusted->localName === 'annotation-xml' &&
		$token === 'tag' && $value === 'svg'
	) {
		$useHtmlRules = true;
	} else {
		$useHtmlRules = $adjusted->isHtmlIntegrationPoint() &&
			( $token === 'tag' || $token === 'text' );
	}

	if ( !$useHtmlRules ) {
		return $this->insertForeignToken( $token, $value, $attribs, $selfclose );
	}
	// The current insertion mode names the method which handles it.
	$handler = $this->parseMode;
	return $this->$handler( $token, $value, $attribs, $selfclose );
}
1890
/**
 * Handle a token according to the rules for foreign (MathML/SVG)
 * content.
 *
 * - Text is inserted as-is.
 * - A start tag from the HTML "breakout" list pops the stack back to an
 *   integration point or HTML element and is then reprocessed through
 *   insertToken(), unless a fragment context is set, in which case it is
 *   treated like any other start tag.
 * - Any other start tag becomes a foreign element in the namespace of
 *   the adjusted current node.
 * - An end tag closes the nearest foreign element with a matching name,
 *   or is handed to the current parse mode once an HTML element is
 *   reached while walking the stack.
 *
 * @param string $token "text", "tag" or "endtag"
 * @param string $value Tag name, or the content of a text token
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfclose Whether a start tag should be popped right away
 * @return bool|null true when handled here, otherwise the result of the
 *   reprocessing call; null if no branch applies
 */
private function insertForeignToken( $token, $value, $attribs = null, $selfclose = false ) {
    if ( $token === 'text' ) {
        $this->stack->insertText( $value );
        return true;
    } elseif ( $token === 'tag' ) {
        switch ( $value ) {
            case 'font':
                // Per the HTML spec, <font> only breaks out of foreign
                // content when it carries a color, face or size attribute;
                // a bare <font> is an ordinary foreign element.
                if (
                    !isset( $attribs['color'] ) &&
                    !isset( $attribs['face'] ) &&
                    !isset( $attribs['size'] )
                ) {
                    break;
                }
                /* otherwise, fall through */
            case 'b':
            case 'big':
            case 'blockquote':
            case 'body':
            case 'br':
            case 'center':
            case 'code':
            case 'dd':
            case 'div':
            case 'dl':
            case 'dt':
            case 'em':
            case 'embed':
            case 'h1':
            case 'h2':
            case 'h3':
            case 'h4':
            case 'h5':
            case 'h6':
            case 'head':
            case 'hr':
            case 'i':
            case 'img':
            case 'li':
            case 'listing':
            case 'menu':
            case 'meta':
            case 'nobr':
            case 'ol':
            case 'p':
            case 'pre':
            case 'ruby':
            case 's':
            case 'small':
            case 'span':
            case 'strong':
            case 'strike':
            case 'sub':
            case 'sup':
            case 'table':
            case 'tt':
            case 'u':
            case 'ul':
            case 'var':
                if ( $this->fragmentContext ) {
                    // Fragment case: treat as "any other start tag".
                    break;
                }
                // Pop until the current node is an integration point
                // or an HTML element, then reprocess the token.
                while ( true ) {
                    $this->stack->pop();
                    $node = $this->stack->currentNode;
                    if (
                        $node->isMathmlTextIntegrationPoint() ||
                        $node->isHtmlIntegrationPoint() ||
                        $node->isHtml()
                    ) {
                        break;
                    }
                }
                return $this->insertToken( $token, $value, $attribs, $selfclose );
        }
        // "Any other start tag"
        $adjusted = ( $this->fragmentContext && $this->stack->length() === 1 ) ?
            $this->fragmentContext : $this->stack->currentNode;
        $this->stack->insertForeignElement(
            $adjusted->namespaceURI, $value, $attribs
        );
        if ( $selfclose ) {
            $this->stack->pop();
        }
        return true;
    } elseif ( $token === 'endtag' ) {
        $first = true;
        foreach ( $this->stack as $i => $node ) {
            if ( $node->isHtml() && !$first ) {
                // process the end tag as HTML
                $func = $this->parseMode;
                return $this->$func( $token, $value, $attribs, $selfclose );
            } elseif ( $i === 0 ) {
                // Reached index 0 of the stack: ignore the end tag.
                return true;
            } elseif ( $node->localName === $value ) {
                $this->stack->popTag( $node );
                return true;
            }
            $first = false;
        }
    }
}
1992
/**
 * Consume the next chunk from $bitsIterator and feed it to the tree
 * builder. A chunk that parses as a tag (and, when sanitizing, passes
 * the Sanitizer checks) is inserted as a tag token followed by its
 * trailing text; anything else is inserted as escaped text.
 */
private function advance() {
    $chunk = $this->bitsIterator->current();
    $this->bitsIterator->next();

    # Pieces extracted from the chunk:
    #   $slash:     leading '/' if this is a close tag
    #   $t:         element name
    #   $attribStr: text between the element name and '>'
    #   $brace:     the closing '>' or '/>'
    #   $rest:      everything up to the next chunk of $bitsIterator
    $parts = [];
    if ( preg_match( Sanitizer::ELEMENT_BITS_REGEX, $chunk, $parts ) ) {
        list( /* whole match */, $slash, $t, $attribStr, $brace, $rest ) = $parts;
        $t = strtolower( $t );
        if ( $this->strict ) {
            /* Every attribute must be properly double-quoted */
            Assert::invariant(
                preg_match(
                    '/^( [:_A-Z0-9][-.:_A-Z0-9]*="[^"]*")*[ ]*$/i', $attribStr
                ),
                "Bad attribute string found"
            );
        }
    } else {
        Assert::invariant(
            !$this->strict, "< found which does not start a valid tag"
        );
        $slash = $t = $attribStr = $brace = $rest = null;
    }

    $sanitize = $this->allowedHtmlElements !== null;
    $goodtag = $sanitize
        ? ( $t && isset( $this->allowedHtmlElements[$t] ) )
        : $t;

    if ( $goodtag ) {
        if ( is_callable( $this->processingCallback ) ) {
            call_user_func_array( $this->processingCallback, [ &$attribStr, $this->processingArgs ] );
        }
        if ( $sanitize ) {
            $goodtag = Sanitizer::validateTag( $attribStr, $t );
        }
    }

    if ( $goodtag ) {
        $attribs = Sanitizer::decodeTagAttributes( $attribStr );
        if ( $sanitize ) {
            $attribs = Sanitizer::validateTagAttributes( $attribs, $t );
        }
        $goodtag = $this->insertToken(
            $slash ? 'endtag' : 'tag', $t, $attribs, $brace === '/>'
        );
    }

    if ( $goodtag ) {
        // Tag accepted: the trailing text follows as a text token.
        $this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
    } else {
        # bad tag; serialize entire thing as text.
        $this->insertToken( 'text', '&lt;' . str_replace( '>', '&gt;', $chunk ) );
    }
}
2056
/**
 * Change the current parse mode.
 *
 * @param string $mode Name of the new mode method; must end in "Mode"
 * @return string The parse mode that was active before the switch
 */
private function switchMode( $mode ) {
    Assert::parameter(
        substr( $mode, -4 )==='Mode', '$mode', 'should end in Mode'
    );
    $previous = $this->parseMode;
    $this->parseMode = $mode;
    return $previous;
}
2065
/**
 * Switch to $mode, then run the same token through insertToken() again.
 *
 * @return bool|null Result of insertToken()
 */
private function switchModeAndReprocess( $mode, $token, $value, $attribs, $selfclose ) {
    $this->switchMode( $mode );
    $result = $this->insertToken( $token, $value, $attribs, $selfclose );
    return $result;
}
2070
/**
 * Pick the parse mode that matches the current stack of open elements.
 *
 * Walks the stack and switches mode on the first HTML element whose
 * name determines one. At index 0 the fragment context (if any) stands
 * in for the node, and 'inBodyMode' is the fallback.
 */
private function resetInsertionMode() {
    // Element names that map directly onto a parse mode.
    # OMITTED: <select>, <frameset>, <html>, <head>
    static $modeByName = [
        'tr' => 'inRowMode',
        'tbody' => 'inTableBodyMode',
        'tfoot' => 'inTableBodyMode',
        'thead' => 'inTableBodyMode',
        'caption' => 'inCaptionMode',
        'colgroup' => 'inColumnGroupMode',
        'table' => 'inTableMode',
        'body' => 'inBodyMode',
    ];

    foreach ( $this->stack as $idx => $node ) {
        $isLast = ( $idx === 0 );
        if ( $isLast && $this->fragmentContext ) {
            $node = $this->fragmentContext;
        }

        if ( $node->isHtml() ) {
            $name = $node->localName;
            if ( isset( $modeByName[$name] ) ) {
                $this->switchMode( $modeByName[$name] );
                return;
            }
            if ( $name === 'template' ) {
                // Use the most recently pushed template insertion mode.
                $this->switchMode(
                    array_slice( $this->templateInsertionModes, -1 )[0]
                );
                return;
            }
            # OMITTED: <head>
            if ( !$isLast && $node->isA( BalanceSets::$tableCellSet ) ) {
                $this->switchMode( 'inCellMode' );
                return;
            }
        }

        if ( $isLast ) {
            $this->switchMode( 'inBodyMode' );
            return;
        }
    }
}
2143
/**
 * End of input: drop the active formatting elements list and pop the
 * stack of open elements down to a single entry.
 */
private function stopParsing() {
    # Of the spec's "stop parsing" steps only step 2 applies here:
    # "pop all the nodes off the stack of open elements".
    # The top-most <html> element is deliberately left on the stack.

    # The AFE list is released before popping; otherwise its element
    # objects would stay live during serialization, potentially using
    # O(N^2) memory. Popping the stack never triggers reconstruction of
    # the active formatting elements, so this is safe.
    $this->afe = null;
    $this->stack->popTo( 1 );
}
2156
/**
 * Insert an HTML element and switch to 'inTextMode', remembering the
 * previous parse mode in $this->originalInsertionMode.
 *
 * @param string $value Tag name
 * @param array|null $attribs Attributes for the element
 * @return bool Always true
 */
private function parseRawText( $value, $attribs = null ) {
    $this->stack->insertHTMLElement( $value, $attribs );
    // XXX switch tokenizer to rawtext state?
    $previousMode = $this->switchMode( 'inTextMode' );
    $this->originalInsertionMode = $previousMode;
    return true;
}
2163
/**
 * Parse mode used after parseRawText(): text is inserted verbatim, and
 * an end tag or eof pops the element and restores the original mode
 * (eof is additionally reprocessed in that mode).
 *
 * @return bool|null true, or the result of reprocessing an eof token
 */
private function inTextMode( $token, $value, $attribs = null, $selfclose = false ) {
    if ( $token === 'text' ) {
        $this->stack->insertText( $value );
        return true;
    }
    if ( $token === 'eof' ) {
        $this->stack->pop();
        return $this->switchModeAndReprocess(
            $this->originalInsertionMode, $token, $value, $attribs, $selfclose
        );
    }
    if ( $token === 'endtag' ) {
        $this->stack->pop();
        $this->switchMode( $this->originalInsertionMode );
    }
    // Any remaining token type is accepted without action.
    return true;
}
2180
/**
 * Token handler for the "in head" rules. The other mode handlers
 * delegate tags such as <style>, <template>, <meta> and <link> here.
 *
 * @param string $token "text", "tag" or "endtag"
 * @param string $value Tag name, or the content of a text token
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfclose Only forwarded when the token is reprocessed
 * @return bool|null true when handled, otherwise the result of
 *   reprocessing the token
 */
private function inHeadMode( $token, $value, $attribs = null, $selfclose = false ) {
    if ( $token === 'text' ) {
        // Leading whitespace is inserted directly; the remainder (if
        // any) is handled at the bottom of the function.
        if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $leading ) ) {
            $this->stack->insertText( $leading[0] );
            $value = substr( $value, strlen( $leading[0] ) );
        }
        if ( strlen( $value ) === 0 ) {
            return true; // Nothing but whitespace.
        }
    } elseif ( $token === 'tag' ) {
        switch ( $value ) {
            # OMITTED: <html>
            case 'meta':
                # OMITTED: in a full HTML parser, this might change the encoding.
            case 'base':
            case 'basefont':
            case 'bgsound':
            case 'link':
                // Insert and pop immediately.
                $this->stack->insertHTMLElement( $value, $attribs );
                $this->stack->pop();
                return true;

            # OMITTED: <title>
            # OMITTED: <noscript>
            # OMITTED: <script>
            case 'noframes':
            case 'style':
                return $this->parseRawText( $value, $attribs );

            case 'template':
                $this->stack->insertHTMLElement( $value, $attribs );
                $this->afe->insertMarker();
                # OMITTED: frameset_ok
                $this->switchMode( 'inTemplateMode' );
                $this->templateInsertionModes[] = $this->parseMode;
                return true;

            # OMITTED: <head>
        }
    } elseif ( $token === 'endtag' ) {
        # OMITTED: <head>
        # OMITTED: <body>
        # OMITTED: <html>
        if ( $value === 'template' ) {
            if ( $this->stack->indexOf( $value ) < 0 ) {
                return true; // No open <template>: ignore.
            }
            $this->stack->generateImpliedEndTags( null, true /* thorough */ );
            $this->stack->popTag( $value );
            $this->afe->clearToMarker();
            array_pop( $this->templateInsertionModes );
            $this->resetInsertionMode();
            return true;
        }
        if ( $value !== 'br' ) {
            // Every other end tag is ignored.
            return true;
        }
        // </br> is handled below.
    }

    // Not handled above: act as if </head> had been seen ...
    $this->inHeadMode( 'endtag', 'head' ); // synthetic </head>
    // ... and then reprocess the current token.
    return $this->insertToken( $token, $value, $attribs, $selfclose );
}
2247
/**
 * Token handler for the "in body" insertion mode.
 *
 * The return value travels back through insertToken() to advance(),
 * which treats a falsy result as "bad tag" and serializes the tag as
 * escaped text. Every path that handles or deliberately ignores a token
 * must therefore return true.
 *
 * @param string $token "text", "eof", "tag" or "endtag"
 * @param string $value Tag name, or the content of a text token
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfclose Whether a start tag was self-closed; only
 *   consulted for <math> and <svg>, otherwise forwarded
 * @return bool|null true when the token was handled or ignored, or the
 *   result of the handler the token was delegated to
 */
private function inBodyMode( $token, $value, $attribs = null, $selfclose = false ) {
    if ( $token === 'text' ) {
        $this->afe->reconstruct( $this->stack );
        $this->stack->insertText( $value );
        return true;
    } elseif ( $token === 'eof' ) {
        if ( !empty( $this->templateInsertionModes ) ) {
            return $this->inTemplateMode( $token, $value, $attribs, $selfclose );
        }
        $this->stopParsing();
        return true;
    } elseif ( $token === 'tag' ) {
        switch ( $value ) {
            # OMITTED: <html>
            case 'base':
            case 'basefont':
            case 'bgsound':
            case 'link':
            case 'meta':
            case 'noframes':
            # OMITTED: <script>
            case 'style':
            case 'template':
            # OMITTED: <title>
                return $this->inHeadMode( $token, $value, $attribs, $selfclose );
            # OMITTED: <body>
            # OMITTED: <frameset>

            case 'address':
            case 'article':
            case 'aside':
            case 'blockquote':
            case 'center':
            case 'details':
            case 'dialog':
            case 'dir':
            case 'div':
            case 'dl':
            case 'fieldset':
            case 'figcaption':
            case 'figure':
            case 'footer':
            case 'header':
            case 'hgroup':
            case 'main':
            case 'menu':
            case 'nav':
            case 'ol':
            case 'p':
            case 'section':
            case 'summary':
            case 'ul':
                // Close an open <p> first.
                if ( $this->stack->inButtonScope( 'p' ) ) {
                    $this->inBodyMode( 'endtag', 'p' );
                }
                $this->stack->insertHTMLElement( $value, $attribs );
                return true;

            case 'h1':
            case 'h2':
            case 'h3':
            case 'h4':
            case 'h5':
            case 'h6':
                if ( $this->stack->inButtonScope( 'p' ) ) {
                    $this->inBodyMode( 'endtag', 'p' );
                }
                // Headings do not nest: drop a heading that is the
                // current node before opening the new one.
                if ( $this->stack->currentNode->isA( BalanceSets::$headingSet ) ) {
                    $this->stack->pop();
                }
                $this->stack->insertHTMLElement( $value, $attribs );
                return true;

            case 'pre':
            case 'listing':
                if ( $this->stack->inButtonScope( 'p' ) ) {
                    $this->inBodyMode( 'endtag', 'p' );
                }
                $this->stack->insertHTMLElement( $value, $attribs );
                # As described in "simplifications" above:
                # 1. We don't touch the next token, even if it's a linefeed.
                # 2. OMITTED: frameset_ok
                return true;

            # OMITTED: <form>

            case 'li':
                # OMITTED: frameset_ok
                // Close an open <li>, stopping the search at the first
                // special element other than address/div/p.
                foreach ( $this->stack as $node ) {
                    if ( $node->isA( 'li' ) ) {
                        $this->inBodyMode( 'endtag', 'li' );
                        break;
                    }
                    if (
                        $node->isA( BalanceSets::$specialSet ) &&
                        !$node->isA( BalanceSets::$addressDivPSet )
                    ) {
                        break;
                    }
                }
                if ( $this->stack->inButtonScope( 'p' ) ) {
                    $this->inBodyMode( 'endtag', 'p' );
                }
                $this->stack->insertHTMLElement( $value, $attribs );
                return true;

            case 'dd':
            case 'dt':
                # OMITTED: frameset_ok
                // Same as <li>, but for an open <dd> or <dt>.
                foreach ( $this->stack as $node ) {
                    if ( $node->isA( 'dd' ) ) {
                        $this->inBodyMode( 'endtag', 'dd' );
                        break;
                    }
                    if ( $node->isA( 'dt' ) ) {
                        $this->inBodyMode( 'endtag', 'dt' );
                        break;
                    }
                    if (
                        $node->isA( BalanceSets::$specialSet ) &&
                        !$node->isA( BalanceSets::$addressDivPSet )
                    ) {
                        break;
                    }
                }
                if ( $this->stack->inButtonScope( 'p' ) ) {
                    $this->inBodyMode( 'endtag', 'p' );
                }
                $this->stack->insertHTMLElement( $value, $attribs );
                return true;

            # OMITTED: <plaintext>

            case 'button':
                if ( $this->stack->inScope( 'button' ) ) {
                    // Close the open <button>, then reprocess this one.
                    $this->inBodyMode( 'endtag', 'button' );
                    return $this->insertToken( $token, $value, $attribs, $selfclose );
                }
                $this->afe->reconstruct( $this->stack );
                $this->stack->insertHTMLElement( $value, $attribs );
                return true;

            case 'a':
                $activeElement = $this->afe->findElementByTag( 'a' );
                if ( $activeElement ) {
                    $this->inBodyMode( 'endtag', 'a' );
                    if ( $this->afe->isInList( $activeElement ) ) {
                        $this->afe->remove( $activeElement );
                        // Don't flatten here, since when we fall
                        // through below we might foster parent
                        // the new <a> tag inside this one.
                        $this->stack->removeElement( $activeElement, false );
                    }
                }
                /* Falls through */
            case 'b':
            case 'big':
            case 'code':
            case 'em':
            case 'font':
            case 'i':
            case 's':
            case 'small':
            case 'strike':
            case 'strong':
            case 'tt':
            case 'u':
                $this->afe->reconstruct( $this->stack );
                $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ), $attribs );
                return true;

            case 'nobr':
                $this->afe->reconstruct( $this->stack );
                if ( $this->stack->inScope( 'nobr' ) ) {
                    $this->inBodyMode( 'endtag', 'nobr' );
                    $this->afe->reconstruct( $this->stack );
                }
                $this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ), $attribs );
                return true;

            case 'applet':
            case 'marquee':
            case 'object':
                $this->afe->reconstruct( $this->stack );
                $this->stack->insertHTMLElement( $value, $attribs );
                $this->afe->insertMarker();
                # OMITTED: frameset_ok
                return true;

            case 'table':
                # The document is never in "quirks mode"; see simplifications
                # above.
                if ( $this->stack->inButtonScope( 'p' ) ) {
                    $this->inBodyMode( 'endtag', 'p' );
                }
                $this->stack->insertHTMLElement( $value, $attribs );
                # OMITTED: frameset_ok
                $this->switchMode( 'inTableMode' );
                return true;

            case 'area':
            case 'br':
            case 'embed':
            case 'img':
            case 'keygen':
            case 'wbr':
                // Insert and pop immediately.
                $this->afe->reconstruct( $this->stack );
                $this->stack->insertHTMLElement( $value, $attribs );
                $this->stack->pop();
                # OMITTED: frameset_ok
                return true;

            case 'input':
                $this->afe->reconstruct( $this->stack );
                $this->stack->insertHTMLElement( $value, $attribs );
                $this->stack->pop();
                # OMITTED: frameset_ok
                # (hence we don't need to examine the tag's "type" attribute)
                return true;

            case 'menuitem':
            case 'param':
            case 'source':
            case 'track':
                $this->stack->insertHTMLElement( $value, $attribs );
                $this->stack->pop();
                return true;

            case 'hr':
                if ( $this->stack->inButtonScope( 'p' ) ) {
                    $this->inBodyMode( 'endtag', 'p' );
                }
                $this->stack->insertHTMLElement( $value, $attribs );
                $this->stack->pop();
                return true;

            case 'image':
                # warts!
                // <image> is handled as if it were <img>.
                return $this->inBodyMode( $token, 'img', $attribs, $selfclose );

            # OMITTED: <isindex>
            # OMITTED: <textarea>
            # OMITTED: <xmp>
            # OMITTED: <iframe>
            # OMITTED: <noembed>
            # OMITTED: <noscript>

            # OMITTED: <select>
            /*
            case 'select':
                $this->afe->reconstruct( $this->stack );
                $this->stack->insertHTMLElement( $value, $attribs );
                switch ( $this->parseMode ) {
                case 'inTableMode':
                case 'inCaptionMode':
                case 'inTableBodyMode':
                case 'inRowMode':
                case 'inCellMode':
                    $this->switchMode( 'inSelectInTableMode' );
                    return true;
                default:
                    $this->switchMode( 'inSelectMode' );
                    return true;
                }
            */

            case 'optgroup':
            case 'option':
                if ( $this->stack->currentNode->isA( 'option' ) ) {
                    $this->inBodyMode( 'endtag', 'option' );
                }
                $this->afe->reconstruct( $this->stack );
                $this->stack->insertHTMLElement( $value, $attribs );
                return true;

            case 'rb':
            case 'rtc':
                if ( $this->stack->inScope( 'ruby' ) ) {
                    $this->stack->generateImpliedEndTags();
                }
                $this->stack->insertHTMLElement( $value, $attribs );
                return true;

            case 'rp':
            case 'rt':
                if ( $this->stack->inScope( 'ruby' ) ) {
                    $this->stack->generateImpliedEndTags( 'rtc' );
                }
                $this->stack->insertHTMLElement( $value, $attribs );
                return true;

            case 'math':
                $this->afe->reconstruct( $this->stack );
                # We skip the spec's "adjust MathML attributes" and
                # "adjust foreign attributes" steps, since the browser will
                # do this later when it parses the output and it doesn't affect
                # balancing.
                $this->stack->insertForeignElement(
                    BalanceSets::MATHML_NAMESPACE, $value, $attribs
                );
                if ( $selfclose ) {
                    # emit explicit </math> tag.
                    $this->stack->pop();
                }
                return true;

            case 'svg':
                $this->afe->reconstruct( $this->stack );
                # We skip the spec's "adjust SVG attributes" and
                # "adjust foreign attributes" steps, since the browser will
                # do this later when it parses the output and it doesn't affect
                # balancing.
                $this->stack->insertForeignElement(
                    BalanceSets::SVG_NAMESPACE, $value, $attribs
                );
                if ( $selfclose ) {
                    # emit explicit </svg> tag.
                    $this->stack->pop();
                }
                return true;

            case 'caption':
            case 'col':
            case 'colgroup':
            # OMITTED: <frame>
            case 'head':
            case 'tbody':
            case 'td':
            case 'tfoot':
            case 'th':
            case 'thead':
            case 'tr':
                // Ignore table tags if we're not inTableMode
                return true;
        }

        // Handle any other start tag here
        $this->afe->reconstruct( $this->stack );
        $this->stack->insertHTMLElement( $value, $attribs );
        return true;
    } elseif ( $token === 'endtag' ) {
        switch ( $value ) {
            # </body>,</html> are unsupported.

            case 'template':
                return $this->inHeadMode( $token, $value, $attribs, $selfclose );

            case 'address':
            case 'article':
            case 'aside':
            case 'blockquote':
            case 'button':
            case 'center':
            case 'details':
            case 'dialog':
            case 'dir':
            case 'div':
            case 'dl':
            case 'fieldset':
            case 'figcaption':
            case 'figure':
            case 'footer':
            case 'header':
            case 'hgroup':
            case 'listing':
            case 'main':
            case 'menu':
            case 'nav':
            case 'ol':
            case 'pre':
            case 'section':
            case 'summary':
            case 'ul':
                // Ignore if there is not a matching open tag
                if ( !$this->stack->inScope( $value ) ) {
                    return true;
                }
                $this->stack->generateImpliedEndTags();
                $this->stack->popTag( $value );
                return true;

            # OMITTED: <form>

            case 'p':
                if ( !$this->stack->inButtonScope( 'p' ) ) {
                    // No open <p>: synthesize one, then reprocess the
                    // end tag so that it closes it.
                    $this->inBodyMode( 'tag', 'p', [] );
                    return $this->insertToken( $token, $value, $attribs, $selfclose );
                }
                $this->stack->generateImpliedEndTags( $value );
                $this->stack->popTag( $value );
                return true;

            case 'li':
                if ( !$this->stack->inListItemScope( $value ) ) {
                    return true; # ignore
                }
                $this->stack->generateImpliedEndTags( $value );
                $this->stack->popTag( $value );
                return true;

            case 'dd':
            case 'dt':
                if ( !$this->stack->inScope( $value ) ) {
                    return true; # ignore
                }
                $this->stack->generateImpliedEndTags( $value );
                $this->stack->popTag( $value );
                return true;

            case 'h1':
            case 'h2':
            case 'h3':
            case 'h4':
            case 'h5':
            case 'h6':
                if ( !$this->stack->inScope( BalanceSets::$headingSet ) ) {
                    // Must be true, not null: a falsy result makes
                    // advance() serialize the end tag as text instead
                    // of ignoring it.
                    return true; # ignore
                }
                $this->stack->generateImpliedEndTags();
                $this->stack->popTag( BalanceSets::$headingSet );
                return true;

            case 'sarcasm':
                # Take a deep breath, then:
                break;

            case 'a':
            case 'b':
            case 'big':
            case 'code':
            case 'em':
            case 'font':
            case 'i':
            case 'nobr':
            case 's':
            case 'small':
            case 'strike':
            case 'strong':
            case 'tt':
            case 'u':
                if ( $this->stack->adoptionAgency( $value, $this->afe ) ) {
                    return true; # If we did something, we're done.
                }
                break; # Go to the "any other end tag" case.

            case 'applet':
            case 'marquee':
            case 'object':
                if ( !$this->stack->inScope( $value ) ) {
                    return true; # ignore
                }
                $this->stack->generateImpliedEndTags();
                $this->stack->popTag( $value );
                $this->afe->clearToMarker();
                return true;

            case 'br':
                # Turn </br> into <br>
                return $this->inBodyMode( 'tag', $value, [] );
        }

        // Any other end tag goes here
        foreach ( $this->stack as $i => $node ) {
            if ( $node->isA( $value ) ) {
                $this->stack->generateImpliedEndTags( $value );
                $this->stack->popTo( $i ); # including $i
                break;
            } elseif ( $node->isA( BalanceSets::$specialSet ) ) {
                return true; // ignore this close token.
            }
        }
        return true;
    } else {
        Assert::invariant( false, "Bad token type: $token" );
    }
}
2724
/**
 * Token handler for the "in table" insertion mode. Tokens with no
 * table-specific rule are processed by inBodyMode() with foster
 * parenting enabled on the stack.
 *
 * @param string $token "text", "eof", "tag" or "endtag"
 * @param string $value Tag name, or the content of a text token
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfclose Forwarded to delegated handlers
 * @return bool|null true when handled or ignored, or the result of the
 *   handler the token was delegated to
 */
private function inTableMode( $token, $value, $attribs = null, $selfclose = false ) {
    if ( $token === 'eof' ) {
        $this->stopParsing();
        return true;
    }

    if ( $token === 'text' ) {
        if ( $this->textIntegrationMode ) {
            return $this->inBodyMode( $token, $value, $attribs, $selfclose );
        }
        if ( $this->stack->currentNode->isA( BalanceSets::$tableSectionRowSet ) ) {
            // Collect the text via inTableTextMode, remembering the
            // mode to come back to.
            $this->pendingTableText = '';
            $this->originalInsertionMode = $this->parseMode;
            return $this->switchModeAndReprocess( 'inTableTextMode', $token, $value, $attribs, $selfclose );
        }
        // Otherwise: "anything else" below.
    } elseif ( $token === 'tag' ) {
        switch ( $value ) {
            case 'caption':
                $this->afe->insertMarker();
                $this->stack->insertHTMLElement( $value, $attribs );
                $this->switchMode( 'inCaptionMode' );
                return true;

            case 'colgroup':
                $this->stack->clearToContext( BalanceSets::$tableContextSet );
                $this->stack->insertHTMLElement( $value, $attribs );
                $this->switchMode( 'inColumnGroupMode' );
                return true;

            case 'col':
                // Imply a <colgroup>, then reprocess.
                $this->inTableMode( 'tag', 'colgroup', [] );
                return $this->insertToken( $token, $value, $attribs, $selfclose );

            case 'tbody':
            case 'tfoot':
            case 'thead':
                $this->stack->clearToContext( BalanceSets::$tableContextSet );
                $this->stack->insertHTMLElement( $value, $attribs );
                $this->switchMode( 'inTableBodyMode' );
                return true;

            case 'td':
            case 'th':
            case 'tr':
                // Imply a <tbody>, then reprocess.
                $this->inTableMode( 'tag', 'tbody', [] );
                return $this->insertToken( $token, $value, $attribs, $selfclose );

            case 'table':
                if ( !$this->stack->inTableScope( $value ) ) {
                    return true; // Ignore this tag.
                }
                // Close the open table first, then reprocess.
                $this->inTableMode( 'endtag', $value );
                return $this->insertToken( $token, $value, $attribs, $selfclose );

            case 'style':
            # OMITTED: <script>
            case 'template':
                return $this->inHeadMode( $token, $value, $attribs, $selfclose );

            case 'input':
                $isHidden = isset( $attribs['type'] ) &&
                    strcasecmp( $attribs['type'], 'hidden' ) === 0;
                if ( !$isHidden ) {
                    break; // Handle this as "everything else"
                }
                $this->stack->insertHTMLElement( $value, $attribs );
                $this->stack->pop();
                return true;

            # OMITTED: <form>
        }
        // Unlisted start tags: "anything else" below.
    } elseif ( $token === 'endtag' ) {
        switch ( $value ) {
            case 'table':
                if ( !$this->stack->inTableScope( $value ) ) {
                    return true; // Ignore.
                }
                $this->stack->popTag( $value );
                $this->resetInsertionMode();
                return true;

            # OMITTED: <body>
            # OMITTED: <html>
            case 'caption':
            case 'col':
            case 'colgroup':
            case 'tbody':
            case 'td':
            case 'tfoot':
            case 'th':
            case 'thead':
            case 'tr':
                return true; // Ignore the token.

            case 'template':
                return $this->inHeadMode( $token, $value, $attribs, $selfclose );
        }
        // Unlisted end tags: "anything else" below.
    }

    // "Anything else": in-body rules with foster parenting switched on.
    $this->stack->fosterParentMode = true;
    $this->inBodyMode( $token, $value, $attribs, $selfclose );
    $this->stack->fosterParentMode = false;
    return true;
}
2820
/**
 * Accumulates text tokens in $this->pendingTableText. The first
 * non-text token flushes the buffer, restores the original insertion
 * mode and is then reprocessed.
 *
 * @return bool|null true for text, otherwise the reprocessing result
 */
private function inTableTextMode( $token, $value, $attribs = null, $selfclose = false ) {
    if ( $token === 'text' ) {
        $this->pendingTableText .= $value;
        return true;
    }

    // Non-text token: flush what has been collected so far.
    $buffered = $this->pendingTableText;
    $this->pendingTableText = '';
    $hasNonWhitespace = preg_match( '/[^\x09\x0A\x0C\x0D\x20]/', $buffered );
    if ( $hasNonWhitespace ) {
        // Mirrors the "anything else" case of inTableMode.
        $this->stack->fosterParentMode = true;
        $this->inBodyMode( 'text', $buffered );
        $this->stack->fosterParentMode = false;
    } else {
        // Whitespace only: insert it where we are.
        $this->stack->insertText( $buffered );
    }

    return $this->switchModeAndReprocess(
        $this->originalInsertionMode, $token, $value, $attribs, $selfclose
    );
}
2842
/**
 * Helper for inCaptionMode: close the open <caption> and go back to
 * 'inTableMode'.
 *
 * @return bool false if no <caption> is in table scope, true once the
 *   caption has been closed
 */
private function endCaption() {
    $captionOpen = $this->stack->inTableScope( 'caption' );
    if ( !$captionOpen ) {
        return false;
    }
    $this->stack->generateImpliedEndTags();
    $this->stack->popTag( 'caption' );
    $this->afe->clearToMarker();
    $this->switchMode( 'inTableMode' );
    return true;
}
2854
/**
 * Token handler for the "in caption" insertion mode. Table-structure
 * tags close the caption (and are reprocessed if that succeeded);
 * everything else follows the in-body rules.
 *
 * @return bool|null true, or the result of inBodyMode()
 */
private function inCaptionMode( $token, $value, $attribs = null, $selfclose = false ) {
    if ( $token === 'tag' ) {
        switch ( $value ) {
            case 'caption':
            case 'col':
            case 'colgroup':
            case 'tbody':
            case 'td':
            case 'tfoot':
            case 'th':
            case 'thead':
            case 'tr':
                // Close the caption; reprocess only if one was open.
                if ( $this->endCaption() ) {
                    $this->insertToken( $token, $value, $attribs, $selfclose );
                }
                return true;
        }
    } elseif ( $token === 'endtag' ) {
        switch ( $value ) {
            case 'caption':
                $this->endCaption();
                return true;

            case 'table':
                if ( $this->endCaption() ) {
                    $this->insertToken( $token, $value, $attribs, $selfclose );
                }
                return true;

            # OMITTED: <html>
            case 'body':
            case 'col':
            case 'colgroup':
            case 'tbody':
            case 'td':
            case 'tfoot':
            case 'th':
            case 'thead':
            case 'tr':
                // These end tags are ignored.
                return true;
        }
    }

    // Anything else is processed using the in-body rules.
    return $this->inBodyMode( $token, $value, $attribs, $selfclose );
}
2901
/**
 * Token handler for the "in column group" insertion mode.
 *
 * @param string $token "text", "tag", "endtag" or "eof"
 * @param string $value Tag name, or the content of a text token
 * @param array|null $attribs Attributes of a start tag
 * @param bool $selfclose Forwarded to delegated handlers
 * @return bool|null true when handled or ignored, or the result of the
 *   handler the token was delegated to
 */
private function inColumnGroupMode( $token, $value, $attribs = null, $selfclose = false ) {
    if ( $token === 'eof' ) {
        return $this->inBodyMode( $token, $value, $attribs, $selfclose );
    }

    if ( $token === 'text' ) {
        // Leading whitespace goes straight in; the rest is "anything else".
        if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $leading ) ) {
            $this->stack->insertText( $leading[0] );
            $value = substr( $value, strlen( $leading[0] ) );
        }
        if ( strlen( $value ) === 0 ) {
            return true; // Nothing but whitespace.
        }
    } elseif ( $token === 'tag' ) {
        # OMITTED: <html>
        switch ( $value ) {
            case 'col':
                $this->stack->insertHTMLElement( $value, $attribs );
                $this->stack->pop();
                return true;
            case 'template':
                return $this->inHeadMode( $token, $value, $attribs, $selfclose );
        }
    } elseif ( $token === 'endtag' ) {
        switch ( $value ) {
            case 'colgroup':
                if ( $this->stack->currentNode->isA( 'colgroup' ) ) {
                    $this->stack->pop();
                    $this->switchMode( 'inTableMode' );
                }
                // If the current node is not a <colgroup>, the token is ignored.
                return true;
            case 'col':
                return true; // Ignore the token.
            case 'template':
                return $this->inHeadMode( $token, $value, $attribs, $selfclose );
        }
    }

    // Anything else: close the <colgroup> (if it is the current node)
    // and reprocess the token.
    if ( !$this->stack->currentNode->isA( 'colgroup' ) ) {
        return true; // Ignore the token.
    }
    $this->inColumnGroupMode( 'endtag', 'colgroup' );
    return $this->insertToken( $token, $value, $attribs, $selfclose );
}
2949
/**
 * Helper for inTableBodyMode: close the open table section and go back
 * to 'inTableMode'.
 *
 * @return bool false if none of <tbody>, <thead>, <tfoot> is in table
 *   scope, true once the section has been closed
 */
private function endSection() {
    $sectionOpen =
        $this->stack->inTableScope( 'tbody' ) ||
        $this->stack->inTableScope( 'thead' ) ||
        $this->stack->inTableScope( 'tfoot' );
    if ( !$sectionOpen ) {
        return false;
    }
    $this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
    $this->stack->pop();
    $this->switchMode( 'inTableMode' );
    return true;
}
/**
 * Token handler for the "in table body" insertion mode. Tokens without
 * a rule here are passed on to inTableMode().
 *
 * @return bool|null true, or the result of inTableMode()
 */
private function inTableBodyMode( $token, $value, $attribs = null, $selfclose = false ) {
    if ( $token === 'tag' ) {
        switch ( $value ) {
            case 'tr':
                $this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
                $this->stack->insertHTMLElement( $value, $attribs );
                $this->switchMode( 'inRowMode' );
                return true;

            case 'th':
            case 'td':
                // Imply a <tr>, then reprocess the cell tag.
                $this->inTableBodyMode( 'tag', 'tr', [] );
                $this->insertToken( $token, $value, $attribs, $selfclose );
                return true;

            case 'caption':
            case 'col':
            case 'colgroup':
            case 'tbody':
            case 'tfoot':
            case 'thead':
                // Close the section; reprocess only if one was open.
                if ( $this->endSection() ) {
                    $this->insertToken( $token, $value, $attribs, $selfclose );
                }
                return true;
        }
    } elseif ( $token === 'endtag' ) {
        switch ( $value ) {
            case 'table':
                if ( $this->endSection() ) {
                    $this->insertToken( $token, $value, $attribs, $selfclose );
                }
                return true;

            case 'tbody':
            case 'tfoot':
            case 'thead':
                if ( $this->stack->inTableScope( $value ) ) {
                    $this->endSection();
                }
                return true;

            # OMITTED: <body>
            # OMITTED: <html>
            case 'caption':
            case 'col':
            case 'colgroup':
            case 'td':
            case 'th':
            case 'tr':
                return true; // Ignore the token.
        }
    }

    // Anything else is handled by the in-table rules.
    return $this->inTableMode( $token, $value, $attribs, $selfclose );
}
3016
/**
 * Helper for inRowMode(): close the table row that is currently
 * open, if there is one.
 *
 * When a <tr> is in table scope the stack is cleared back to the
 * table row context, the current node is popped and the parser
 * switches to 'inTableBodyMode'.
 *
 * @return bool True if a row was closed, false if no <tr> was in
 *   table scope.
 */
private function endRow() {
	if ( $this->stack->inTableScope( 'tr' ) ) {
		$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
		$this->stack->pop();
		$this->switchMode( 'inTableBodyMode' );
		return true;
	}
	return false;
}
/**
 * Token handler for the "in row" insertion mode.
 *
 * @param string $token Token type; only 'tag' and 'endtag' are handled
 *   here, everything else is delegated to inTableMode().
 * @param string $value Tag name for 'tag' and 'endtag' tokens.
 * @param array|null $attribs Attributes forwarded with the token.
 * @param bool $selfclose Self-closing flag forwarded with the token.
 * @return mixed True when the token was handled or ignored here;
 *   otherwise whatever inTableMode() returns.
 */
private function inRowMode( $token, $value, $attribs = null, $selfclose = false ) {
	// Set for tokens which require the open row to be closed first,
	// after which the same token is re-submitted.
	$closeRowFirst = false;

	if ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'tr':
			$this->endRow();
			return true;
		case 'table':
			$closeRowFirst = true;
			break;
		case 'tbody':
		case 'tfoot':
		case 'thead':
			// Ignored unless a section of that name is in table scope.
			if ( !$this->stack->inTableScope( $value ) ) {
				return true;
			}
			$closeRowFirst = true;
			break;
		# OMITTED: <body>
		case 'caption':
		case 'col':
		case 'colgroup':
		# OMITTED: <html>
		case 'td':
		case 'th':
			// Ignore the token.
			return true;
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'th':
		case 'td':
			$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inCellMode' );
			$this->afe->insertMarker();
			return true;
		case 'caption':
		case 'col':
		case 'colgroup':
		case 'tbody':
		case 'tfoot':
		case 'thead':
		case 'tr':
			$closeRowFirst = true;
			break;
		}
	}

	if ( $closeRowFirst ) {
		// If no row could be closed the token is simply dropped.
		if ( $this->endRow() ) {
			$this->insertToken( $token, $value, $attribs, $selfclose );
		}
		return true;
	}

	// Anything else:
	return $this->inTableMode( $token, $value, $attribs, $selfclose );
}
3082
/**
 * Helper for inCellMode(): close the table cell that is currently
 * open by feeding the matching end tag to inCellMode().
 *
 * <td> is probed before <th>; only the first one found in table
 * scope is closed.
 *
 * @return bool True if an end tag was issued for an open cell, false
 *   if neither <td> nor <th> was in table scope.
 */
private function endCell() {
	foreach ( [ 'td', 'th' ] as $cellName ) {
		if ( $this->stack->inTableScope( $cellName ) ) {
			$this->inCellMode( 'endtag', $cellName );
			return true;
		}
	}
	return false;
}
/**
 * Token handler for the "in cell" insertion mode.
 *
 * @param string $token Token type; only 'tag' and 'endtag' are handled
 *   here, everything else is delegated to inBodyMode().
 * @param string $value Tag name for 'tag' and 'endtag' tokens.
 * @param array|null $attribs Attributes forwarded with the token.
 * @param bool $selfclose Self-closing flag forwarded with the token.
 * @return mixed True when the token was handled or ignored here;
 *   otherwise whatever inBodyMode() returns.
 */
private function inCellMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'endtag' ) {
		// null:  this end tag does not close a cell
		// false: close the cell named by the token, nothing more
		// true:  close whichever cell is open, then re-submit the token
		$resubmit = null;
		switch ( $value ) {
		case 'td':
		case 'th':
			$resubmit = false;
			break;
		# OMITTED: <body>
		case 'caption':
		case 'col':
		case 'colgroup':
		# OMITTED: <html>
			// Ignore the token.
			return true;
		case 'table':
		case 'tbody':
		case 'tfoot':
		case 'thead':
		case 'tr':
			$resubmit = true;
			break;
		}

		if ( $resubmit !== null ) {
			// The token is dropped unless its element is in table scope.
			if ( $this->stack->inTableScope( $value ) ) {
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag(
					$resubmit ? BalanceSets::$tableCellSet : $value
				);
				$this->afe->clearToMarker();
				$this->switchMode( 'inRowMode' );
				if ( $resubmit ) {
					$this->insertToken( $token, $value, $attribs, $selfclose );
				}
			}
			return true;
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'caption':
		case 'col':
		case 'colgroup':
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			// Close the open cell first; without an open cell the
			// token is dropped.
			$cellClosed = $this->endCell();
			if ( $cellClosed ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}
	}

	// Anything else:
	return $this->inBodyMode( $token, $value, $attribs, $selfclose );
}
3148
3149 # OMITTED: <select>
3150 /*
3151 private function inSelectMode( $token, $value, $attribs = null, $selfclose = false ) {
3152 Assert::invariant( false, 'Unimplemented' );
3153 }
3154
3155 private function inSelectInTableMode( $token, $value, $attribs = null, $selfclose = false ) {
3156 Assert::invariant( false, 'Unimplemented' );
3157 }
3158 */
3159
/**
 * Token handler for the "in template" insertion mode.
 *
 * @param string $token Token type: 'text', 'eof', 'tag' or 'endtag'.
 *   Any other value trips an invariant assertion.
 * @param string $value Tag name for 'tag' and 'endtag' tokens.
 * @param array|null $attribs Attributes forwarded with the token.
 * @param bool $selfclose Self-closing flag forwarded with the token.
 * @return mixed True when the token was handled or ignored here;
 *   otherwise the result of the handler the token was delegated to.
 */
private function inTemplateMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfclose );
	}

	if ( $token === 'eof' ) {
		if ( $this->stack->indexOf( 'template' ) < 0 ) {
			// No <template> left open: parsing is finished.
			$this->stopParsing();
			return true;
		}
		// Close the open <template>, drop its insertion mode and
		// re-submit the EOF token in the mode that results.
		$this->stack->popTag( 'template' );
		$this->afe->clearToMarker();
		array_pop( $this->templateInsertionModes );
		$this->resetInsertionMode();
		$this->insertToken( $token, $value, $attribs, $selfclose );
		return true;
	}

	if ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
		// Every other end tag is ignored.
		return true;
	}

	if ( $token === 'tag' ) {
		// Choose the mode in which this start tag has to be processed.
		switch ( $value ) {
		case 'base':
		case 'basefont':
		case 'bgsound':
		case 'link':
		case 'meta':
		case 'noframes':
		# OMITTED: <script>
		case 'style':
		case 'template':
		# OMITTED: <title>
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );

		case 'caption':
		case 'colgroup':
		case 'tbody':
		case 'tfoot':
		case 'thead':
			$nextMode = 'inTableMode';
			break;

		case 'col':
			$nextMode = 'inColumnGroupMode';
			break;

		case 'tr':
			$nextMode = 'inTableBodyMode';
			break;

		case 'td':
		case 'th':
			$nextMode = 'inRowMode';
			break;

		default:
			$nextMode = 'inBodyMode';
		}
		return $this->switchModeAndReprocess(
			$nextMode, $token, $value, $attribs, $selfclose
		);
	}

	Assert::invariant( false, "Bad token type: $token" );
}
3226 }