00f6e4b4cc6083d37c7a0d1b69daebcfd8ff4ba2
[lhc/web/wiklou.git] / includes / tidy / Balancer.php
1 <?php
2 /**
3 * An implementation of the tree building portion of the HTML5 parsing
4 * spec.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
20 *
21 * @file
22 * @ingroup Parser
23 * @since 1.27
24 * @author C. Scott Ananian, 2016
25 */
26 namespace MediaWiki\Tidy;
27
28 use Wikimedia\Assert\Assert;
29 use Wikimedia\Assert\ParameterAssertionException;
30 use \ExplodeIterator;
31 use \IteratorAggregate;
32 use \ReverseArrayIterator;
33 use \Sanitizer;
34
35 # A note for future librarization[1] -- this file is a good candidate
36 # for splitting into an independent library, except that it is currently
37 # highly optimized for MediaWiki use. It only implements the portions
38 # of the HTML5 tree builder used by tags supported by MediaWiki, and
39 # does not contain a true tokenizer pass, instead relying on
40 # comment stripping, attribute normalization, and escaping done by
41 # the MediaWiki Sanitizer. It also deliberately avoids building
42 # a true DOM in memory, instead serializing elements to an output string
43 # as soon as possible (usually as soon as the tag is closed) to reduce
44 # its memory footprint.
45
46 # On the other hand, I've been pretty careful to note with comments in the
47 # code the places where this implementation omits features of the spec or
48 # depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
49 # implement the missing pieces and make this a standalone PHP HTML5 parser.
50 # In order to do so, some sort of MediaWiki-specific API will need
51 # to be added to (a) allow the Balancer to bypass the tokenizer,
52 # and (b) support on-the-fly flattening instead of DOM node creation.
53
54 # [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki
55
/**
 * Namespace constants and tag-name lookup tables for the HTML5 tree
 * building algorithm.
 *
 * Every table is a two-level associative array: the outer key is a
 * namespace URI, the inner key is a lower-cased tag name. Only the
 * presence of a key matters (BalanceElement::isA() tests it with isset());
 * the stored value is always true.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceSets {
	const HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml';
	const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
	const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';

	/**
	 * HTML tags named as unsupported. Not referenced in this part of the
	 * file; presumably consulted by the tree builder -- TODO confirm.
	 */
	public static $unsupportedSet = [
		self::HTML_NAMESPACE => [
			'html' => true, 'head' => true, 'body' => true,
			'frameset' => true, 'form' => true, 'frame' => true,
			'plaintext' => true, 'isindex' => true,
			'textarea' => true, 'xmp' => true,
			'iframe' => true, 'noembed' => true, 'noscript' => true,
			'select' => true, 'script' => true, 'title' => true
		]
	];

	/**
	 * Elements that BalanceElement::__toString() serializes as a single
	 * self-closing tag and that must not have children.
	 */
	public static $emptyElementSet = [
		self::HTML_NAMESPACE => [
			'area' => true, 'base' => true, 'basefont' => true, 'bgsound' => true,
			'br' => true, 'col' => true, 'command' => true, 'embed' => true,
			'frame' => true, 'hr' => true, 'img' => true, 'input' => true,
			'keygen' => true, 'link' => true, 'meta' => true, 'param' => true,
			'source' => true, 'track' => true, 'wbr' => true
		]
	];

	/** The HTML heading elements h1 through h6. */
	public static $headingSet = [
		self::HTML_NAMESPACE => [
			'h1' => true, 'h2' => true, 'h3' => true, 'h4' => true,
			'h5' => true, 'h6' => true
		]
	];

	/** The "special" elements, across the HTML, SVG and MathML namespaces. */
	public static $specialSet = [
		self::HTML_NAMESPACE => [
			'address' => true, 'applet' => true, 'area' => true, 'article' => true,
			'aside' => true, 'base' => true, 'basefont' => true, 'bgsound' => true,
			'blockquote' => true, 'body' => true, 'br' => true, 'button' => true,
			'caption' => true, 'center' => true, 'col' => true, 'colgroup' => true,
			'dd' => true, 'details' => true, 'dir' => true, 'div' => true,
			'dl' => true, 'dt' => true, 'embed' => true, 'fieldset' => true,
			'figcaption' => true, 'figure' => true, 'footer' => true, 'form' => true,
			'frame' => true, 'frameset' => true, 'h1' => true, 'h2' => true,
			'h3' => true, 'h4' => true, 'h5' => true, 'h6' => true,
			'head' => true, 'header' => true, 'hgroup' => true, 'hr' => true,
			'html' => true, 'iframe' => true, 'img' => true, 'input' => true,
			'isindex' => true, 'li' => true, 'link' => true, 'listing' => true,
			'main' => true, 'marquee' => true, 'menu' => true, 'menuitem' => true,
			'meta' => true, 'nav' => true, 'noembed' => true, 'noframes' => true,
			'noscript' => true, 'object' => true, 'ol' => true, 'p' => true,
			'param' => true, 'plaintext' => true, 'pre' => true, 'script' => true,
			'section' => true, 'select' => true, 'source' => true, 'style' => true,
			'summary' => true, 'table' => true, 'tbody' => true, 'td' => true,
			'template' => true, 'textarea' => true, 'tfoot' => true, 'th' => true,
			'thead' => true, 'title' => true, 'tr' => true, 'track' => true,
			'ul' => true, 'wbr' => true, 'xmp' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true, 'desc' => true, 'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
			'mtext' => true, 'annotation-xml' => true
		]
	];

	/** The HTML elements address, div and p. */
	public static $addressDivPSet = [
		self::HTML_NAMESPACE => [
			'address' => true,
			'div' => true,
			'p' => true
		]
	];

	/**
	 * Table, table-section and row elements. BalanceStack checks the
	 * current node against this set when foster parent mode is on.
	 */
	public static $tableSectionRowSet = [
		self::HTML_NAMESPACE => [
			'table' => true,
			'thead' => true, 'tbody' => true, 'tfoot' => true,
			'tr' => true
		]
	];

	/**
	 * Elements popped by BalanceStack::generateImpliedEndTags() in its
	 * default (non-thorough) mode.
	 */
	public static $impliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'dd' => true, 'dt' => true, 'li' => true,
			'optgroup' => true, 'option' => true, 'p' => true,
			'rb' => true, 'rp' => true, 'rt' => true, 'rtc' => true
		]
	];

	/**
	 * Elements popped by BalanceStack::generateImpliedEndTags() in
	 * thorough mode.
	 */
	public static $thoroughImpliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'caption' => true, 'colgroup' => true,
			'dd' => true, 'dt' => true, 'li' => true,
			'optgroup' => true, 'option' => true, 'p' => true,
			'rb' => true, 'rp' => true, 'rt' => true, 'rtc' => true,
			'tbody' => true, 'td' => true, 'tfoot' => true,
			'th' => true, 'thead' => true, 'tr' => true
		]
	];

	/** The table cell elements td and th. */
	public static $tableCellSet = [
		self::HTML_NAMESPACE => [
			'td' => true,
			'th' => true
		]
	];

	/** The elements table, template and html. */
	public static $tableContextSet = [
		self::HTML_NAMESPACE => [
			'table' => true,
			'template' => true,
			'html' => true
		]
	];

	/** The table section elements plus template and html. */
	public static $tableBodyContextSet = [
		self::HTML_NAMESPACE => [
			'tbody' => true, 'tfoot' => true, 'thead' => true,
			'template' => true,
			'html' => true
		]
	];

	/** The elements tr, template and html. */
	public static $tableRowContextSet = [
		self::HTML_NAMESPACE => [
			'tr' => true,
			'template' => true,
			'html' => true
		]
	];

	# OMITTED: formAssociatedSet, since we don't allow <form>

	/**
	 * Boundary elements used by BalanceStack::inScope(); also the base
	 * from which the list item and button scope sets are derived.
	 */
	public static $inScopeSet = [
		self::HTML_NAMESPACE => [
			'applet' => true, 'caption' => true, 'html' => true,
			'marquee' => true, 'object' => true, 'table' => true,
			'td' => true, 'template' => true, 'th' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true, 'desc' => true, 'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
			'mtext' => true, 'annotation-xml' => true
		]
	];

	/** Cache for inListItemScopeSet(); null until first requested. */
	private static $inListItemScopeSet = null;

	/**
	 * Return $inScopeSet extended with the HTML elements ol and ul.
	 * The set is built on the first call and cached afterwards.
	 *
	 * @return array
	 */
	public static function inListItemScopeSet() {
		if ( self::$inListItemScopeSet !== null ) {
			return self::$inListItemScopeSet;
		}
		$listItemScope = self::$inScopeSet;
		$listItemScope[self::HTML_NAMESPACE]['ol'] = true;
		$listItemScope[self::HTML_NAMESPACE]['ul'] = true;
		self::$inListItemScopeSet = $listItemScope;
		return self::$inListItemScopeSet;
	}

	/** Cache for inButtonScopeSet(); null until first requested. */
	private static $inButtonScopeSet = null;

	/**
	 * Return $inScopeSet extended with the HTML element button.
	 * The set is built on the first call and cached afterwards.
	 *
	 * @return array
	 */
	public static function inButtonScopeSet() {
		if ( self::$inButtonScopeSet !== null ) {
			return self::$inButtonScopeSet;
		}
		$buttonScope = self::$inScopeSet;
		$buttonScope[self::HTML_NAMESPACE]['button'] = true;
		self::$inButtonScopeSet = $buttonScope;
		return self::$inButtonScopeSet;
	}

	/** Boundary elements used by BalanceStack::inTableScope(). */
	public static $inTableScopeSet = [
		self::HTML_NAMESPACE => [
			'html' => true,
			'table' => true,
			'template' => true
		]
	];

	/** Tested by BalanceElement::isMathmlTextIntegrationPoint(). */
	public static $mathmlTextIntegrationPointSet = [
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true,
			'ms' => true, 'mtext' => true
		]
	];

	/** Tested first by BalanceElement::isHtmlIntegrationPoint(). */
	public static $htmlIntegrationPointSet = [
		self::SVG_NAMESPACE => [
			'foreignobject' => true, 'desc' => true, 'title' => true
		]
	];

	// For tidy compatibility: when the current node is in this set,
	// BalanceStack::insertText() opens an 'mw:p-wrap' element first.
	public static $tidyPWrapSet = [
		self::HTML_NAMESPACE => [
			'body' => true,
			'blockquote' => true,
			// We parse with <body> as the fragment context, but the top-level
			// element on the stack is actually <html>. We could use the
			// "adjusted current node" everywhere to work around this, but it's
			// easier just to add <html> to the p-wrap set.
			'html' => true,
		],
	];

	// For tidy compatibility: inserting an element that is NOT in this set
	// while an 'mw:p-wrap' is the current node pops that wrapper first.
	public static $tidyInlineSet = [
		self::HTML_NAMESPACE => [
			'a' => true, 'abbr' => true, 'acronym' => true,
			'applet' => true, 'b' => true, 'basefont' => true,
			'bdo' => true, 'big' => true, 'br' => true,
			'button' => true, 'cite' => true, 'code' => true,
			'dfn' => true, 'em' => true, 'font' => true,
			'i' => true, 'iframe' => true, 'img' => true,
			'input' => true, 'kbd' => true, 'label' => true,
			'legend' => true, 'map' => true, 'object' => true,
			'param' => true, 'q' => true, 'rb' => true,
			'rbc' => true, 'rp' => true, 'rt' => true,
			'rtc' => true, 'ruby' => true, 's' => true,
			'samp' => true, 'select' => true, 'small' => true,
			'span' => true, 'strike' => true, 'strong' => true,
			'sub' => true, 'sup' => true, 'textarea' => true,
			'tt' => true, 'u' => true, 'var' => true,
		],
	];
}
274
/**
 * A BalanceElement is a simplified version of a DOM Node. The main
 * difference is that we only keep BalanceElements around for nodes
 * currently on the BalanceStack of open elements. As soon as an
 * element is closed, with some minor exceptions relating to the
 * tree builder "adoption agency algorithm", the element and all its
 * children are serialized to a string using the flatten() method.
 * This keeps our memory usage low.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceElement {
	/**
	 * The namespace of the element.
	 * @var string $namespaceURI
	 */
	public $namespaceURI;
	/**
	 * The lower-cased name of the element.
	 * @var string $localName
	 */
	public $localName;
	/**
	 * Attributes for the element, in array form (name => value).
	 * @var array $attribs
	 */
	public $attribs;

	/**
	 * Set to the empty string by the constructor and not read anywhere
	 * else in this class. Declared explicitly so that the constructor does
	 * not create a dynamic property.
	 * @var string $contents
	 */
	public $contents;

	/**
	 * Parent of this element: a BalanceElement while attached, null while
	 * detached, or the string "flat" if this element has already been
	 * flattened into its parent.
	 * @var BalanceElement|string|null $parent
	 */
	public $parent;

	/**
	 * An array of children of this element. Typically only the last
	 * child will be an actual BalanceElement object; the rest will
	 * be strings, representing either text nodes or flattened
	 * BalanceElement objects.
	 * @var array $children
	 */
	public $children;

	/**
	 * A unique string identifier for Noah's Ark purposes, lazy initialized
	 * by getNoahKey().
	 * @var string|null $noahKey
	 */
	private $noahKey;

	/**
	 * The next active formatting element in the list, or null if this is the
	 * end of the AFE list or if the element is not in the AFE list.
	 */
	public $nextAFE;

	/**
	 * The previous active formatting element in the list, or null if this is
	 * the start of the list or if the element is not in the AFE list.
	 */
	public $prevAFE;

	/**
	 * The next element in the Noah's Ark species bucket.
	 */
	public $nextNoah;

	/**
	 * Make a new BalanceElement corresponding to the HTML DOM Element
	 * with the given localname, namespace, and attributes. The new element
	 * starts out detached (no parent) and without children.
	 *
	 * @param string $namespaceURI The namespace of the element.
	 * @param string $localName The lowercased name of the tag.
	 * @param array $attribs Attributes of the element
	 */
	public function __construct( $namespaceURI, $localName, array $attribs ) {
		Assert::parameterType( 'string', $namespaceURI, '$namespaceURI' );
		Assert::parameterType( 'string', $localName, '$localName' );

		$this->localName = $localName;
		$this->namespaceURI = $namespaceURI;
		$this->attribs = $attribs;
		$this->contents = '';
		$this->parent = null;
		$this->children = [];
	}

	/**
	 * Remove the given child from this element, leaving it detached.
	 * @param BalanceElement $elt
	 */
	private function removeChild( $elt ) {
		// The message must not interpolate $this: PHP builds the string
		// before calling Assert, so that would serialize this element and
		// its whole subtree on every call, even when the check passes.
		Assert::precondition(
			$this->parent !== 'flat', "Can't removeChild after flattening."
		);
		Assert::parameterType( 'MediaWiki\Tidy\BalanceElement', $elt, '$elt' );
		Assert::parameter(
			$elt->parent === $this, 'elt', 'must have $this as a parent'
		);
		$idx = array_search( $elt, $this->children, true );
		Assert::parameter( $idx !== false, '$elt', 'must be a child of $this' );
		$elt->parent = null;
		array_splice( $this->children, $idx, 1 );
	}

	/**
	 * Find $a in the list of children and insert $b before it.
	 * If $b is an element that currently has a parent, it is removed from
	 * that parent first.
	 *
	 * @param BalanceElement $a
	 * @param BalanceElement|string $b
	 */
	public function insertBefore( $a, $b ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't insertBefore after flattening."
		);
		Assert::parameterType( 'MediaWiki\Tidy\BalanceElement', $a, '$a' );
		Assert::parameterType( 'MediaWiki\Tidy\BalanceElement|string', $b, '$b' );
		$idx = array_search( $a, $this->children, true );
		Assert::parameter( $idx !== false, '$a', 'must be a child of $this' );
		if ( is_string( $b ) ) {
			array_splice( $this->children, $idx, 0, [ $b ] );
			return;
		}
		Assert::parameter( $b->parent !== 'flat', '$b', "Can't be flat" );
		if ( $b === $a ) {
			// Inserting a node before itself leaves the tree unchanged.
			return;
		}
		if ( $b->parent !== null ) {
			$wasSibling = ( $b->parent === $this );
			$b->parent->removeChild( $b );
			if ( $wasSibling ) {
				// Detaching $b from our own child list may have shifted
				// $a one slot to the left, so the index has to be
				// looked up again.
				$idx = array_search( $a, $this->children, true );
			}
		}
		array_splice( $this->children, $idx, 0, [ $b ] );
		$b->parent = $this;
	}

	/**
	 * Append $elt to the end of the list of children.
	 * If $elt is an element that currently has a parent, it is removed
	 * from that parent first.
	 *
	 * @param BalanceElement|string $elt
	 */
	public function appendChild( $elt ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't appendChild after flattening."
		);
		Assert::parameterType( 'MediaWiki\Tidy\BalanceElement|string', $elt, '$elt' );
		if ( is_string( $elt ) ) {
			array_push( $this->children, $elt );
			return;
		}
		// Same guard as insertBefore(): without it a flattened element
		// would fail below with a method call on the string 'flat'.
		Assert::parameter( $elt->parent !== 'flat', '$elt', "Can't be flat" );
		// Remove $elt from parent, if it had one.
		if ( $elt->parent !== null ) {
			$elt->parent->removeChild( $elt );
		}
		array_push( $this->children, $elt );
		$elt->parent = $this;
	}

	/**
	 * Transfer all of the children of $elt to $this.
	 * Afterwards $elt has no children.
	 *
	 * @param BalanceElement $elt
	 */
	public function adoptChildren( $elt ) {
		Assert::parameterType( 'MediaWiki\Tidy\BalanceElement', $elt, '$elt' );
		Assert::precondition(
			$elt->parent !== 'flat', "Can't adoptChildren after flattening."
		);
		foreach ( $elt->children as $child ) {
			if ( !is_string( $child ) ) {
				// This is an optimization which avoids an O(n^2) set of
				// array_splice operations.
				$child->parent = null;
			}
			$this->appendChild( $child );
		}
		$elt->children = [];
	}

	/**
	 * Flatten this node and all of its children into a string, as specified
	 * by the HTML serialization specification, and replace this node
	 * in its parent by that string.
	 *
	 * In tidy compatibility mode child elements are flattened first, an
	 * 'mw:p-wrap' element is renamed to 'p' (and becomes the empty string
	 * when it contains nothing but whitespace), and a blank 'tr' or 'li'
	 * without attributes is given the class 'mw-empty-elt'.
	 *
	 * @param bool $tidyCompat Whether to apply the tidy compatibility rules.
	 * @return string The string which replaced this node in its parent.
	 * @see __toString()
	 */
	public function flatten( $tidyCompat = false ) {
		Assert::parameter( $this->parent !== null, '$this', 'must be a child' );
		Assert::parameter( $this->parent !== 'flat', '$this', 'already flat' );
		$idx = array_search( $this, $this->parent->children, true );
		Assert::parameter(
			$idx !== false, '$this', 'must be a child of its parent'
		);
		if ( $tidyCompat ) {
			// $blank stays true only while every child is whitespace.
			$blank = true;
			foreach ( $this->children as $elt ) {
				if ( !is_string( $elt ) ) {
					$elt = $elt->flatten( $tidyCompat );
				}
				if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) {
					$blank = false;
				}
			}
			if ( $this->isHtmlNamed( 'mw:p-wrap' ) ) {
				$this->localName = 'p';
			} elseif ( $blank ) {
				// Add 'mw-empty-elt' class so elements can be hidden via CSS
				// for compatibility with legacy tidy.
				if ( !count( $this->attribs ) &&
					( $this->localName === 'tr' || $this->localName === 'li' )
				) {
					$this->attribs = [ 'class' => "mw-empty-elt" ];
				}
				// Only a blank p-wrap is dropped; everything else is
				// still serialized.
				$blank = false;
			}
			$flat = $blank ? '' : "{$this}";
		} else {
			$flat = "{$this}";
		}
		$this->parent->children[$idx] = $flat;
		$this->parent = 'flat'; # for assertion checking
		return $flat;
	}

	/**
	 * Serialize this node and all of its children to a string, as specified
	 * by the HTML serialization specification.
	 *
	 * @return string The serialization of the BalanceElement
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments
	 */
	public function __toString() {
		$encAttribs = '';
		foreach ( $this->attribs as $name => $value ) {
			$encValue = Sanitizer::encodeAttribute( $value );
			$encAttribs .= " $name=\"$encValue\"";
		}
		if ( !$this->isA( BalanceSets::$emptyElementSet ) ) {
			$out = "<{$this->localName}{$encAttribs}>";
			// flatten children
			foreach ( $this->children as $elt ) {
				$out .= "{$elt}";
			}
			$out .= "</{$this->localName}>";
		} else {
			$out = "<{$this->localName}{$encAttribs} />";
			Assert::invariant(
				count( $this->children ) === 0,
				"Empty elements shouldn't have children."
			);
		}
		return $out;
	}

	# Utility functions on BalanceElements.

	/**
	 * Determine if $this represents a specific HTML tag, is a member of
	 * a tag set, or is equal to another BalanceElement.
	 *
	 * @param BalanceElement|array|string $set The target BalanceElement,
	 *   set (from the BalanceSets class), or string (HTML tag name).
	 * @return bool
	 */
	public function isA( $set ) {
		Assert::parameterType( 'MediaWiki\Tidy\BalanceElement|array|string', $set, '$set' );
		if ( $set instanceof BalanceElement ) {
			return $this === $set;
		} elseif ( is_array( $set ) ) {
			return isset( $set[$this->namespaceURI] ) &&
				isset( $set[$this->namespaceURI][$this->localName] );
		} else {
			# assume this is an HTML element name.
			return $this->isHtml() && $this->localName === $set;
		}
	}

	/**
	 * Determine if this element is an HTML element with the specified name
	 * @param string $tagName
	 * @return bool
	 */
	public function isHtmlNamed( $tagName ) {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE
			&& $this->localName === $tagName;
	}

	/**
	 * Determine if $this represents an element in the HTML namespace.
	 *
	 * @return bool
	 */
	public function isHtml() {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE;
	}

	/**
	 * Determine if $this represents a MathML text integration point,
	 * as defined in the HTML5 specification.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#mathml-text-integration-point
	 */
	public function isMathmlTextIntegrationPoint() {
		return $this->isA( BalanceSets::$mathmlTextIntegrationPointSet );
	}

	/**
	 * Determine if $this represents an HTML integration point,
	 * as defined in the HTML5 specification.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#html-integration-point
	 */
	public function isHtmlIntegrationPoint() {
		if ( $this->isA( BalanceSets::$htmlIntegrationPointSet ) ) {
			return true;
		}
		// A MathML annotation-xml element also counts when its encoding
		// attribute names an HTML media type (compared case-insensitively).
		if (
			$this->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
			$this->localName === 'annotation-xml' &&
			isset( $this->attribs['encoding'] ) &&
			( strcasecmp( $this->attribs['encoding'], 'text/html' ) === 0 ||
			strcasecmp( $this->attribs['encoding'], 'application/xhtml+xml' ) === 0 )
		) {
			return true;
		}
		return false;
	}

	/**
	 * Get a string key for the Noah's Ark algorithm. The key is the
	 * serialization of the namespace, the local name and the attributes
	 * sorted by name; it is computed once and then cached.
	 *
	 * @return string
	 */
	public function getNoahKey() {
		if ( $this->noahKey === null ) {
			$attribs = $this->attribs;
			ksort( $attribs );
			$this->noahKey = serialize( [ $this->namespaceURI, $this->localName, $attribs ] );
		}
		return $this->noahKey;
	}
}
609
610 /**
611 * The "stack of open elements" as defined in the HTML5 tree builder
612 * spec. This contains methods to ensure that content (start tags, text)
613 * are inserted at the correct place in the output string, and to
614 * flatten BalanceElements as they are closed to avoid holding onto
615 * a complete DOM tree for the document in memory.
616 *
617 * The stack defines a PHP iterator to traverse it in "reverse order",
618 * that is, the most-recently-added element is visited first in a
619 * foreach loop.
620 *
621 * @ingroup Parser
622 * @since 1.27
623 * @see https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
624 */
625 class BalanceStack implements IteratorAggregate {
626 /**
627 * Backing storage for the stack.
628 * @var array $elements
629 */
630 private $elements = [];
631 /**
632 * Foster parent mode determines how nodes are inserted into the
633 * stack.
634 * @var bool $fosterParentMode
635 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
636 */
637 public $fosterParentMode = false;
638 /**
639 * Tidy compatibility mode, determines behavior of body/blockquote
640 */
641 public $tidyCompat = false;
642 /**
643 * Reference to the current element
644 */
645 public $currentNode;
646
/**
 * Create a new BalanceStack whose only entry is a BalanceElement
 * representing the root html node; that element is also the initial
 * current node.
 */
public function __construct() {
	# the stack always starts out with a root <html> element on it
	$root = new BalanceElement( BalanceSets::HTML_NAMESPACE, 'html', [] );
	$this->elements[] = $root;
	$this->currentNode = $root;
}
659
/**
 * Return the output of the tree builder as a string: the concatenation
 * of all the children of the root html node, flattening any child that
 * is still a BalanceElement.
 * @return string
 */
public function getOutput() {
	// The outer '<html>....</html>' is deliberately left out.
	$pieces = [];
	foreach ( $this->elements[0]->children as $child ) {
		if ( !is_string( $child ) ) {
			$child = $child->flatten( $this->tidyCompat );
		}
		$pieces[] = $child;
	}
	return implode( '', $pieces );
}
674
/**
 * Insert text at the appropriate place for inserting a node.
 *
 * The text is foster parented when foster parent mode is on and the
 * current node is a table, table section or row. In tidy compatibility
 * mode, text arriving while the current node is in the p-wrap set is
 * placed inside a newly opened 'mw:p-wrap' element. Otherwise it is
 * appended to the current node.
 *
 * @param string $value
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertText( $value ) {
	Assert::parameterType( 'string', $value, '$value' );
	if (
		$this->fosterParentMode &&
		$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
	) {
		$this->fosterParent( $value );
	} elseif (
		$this->tidyCompat &&
		$this->currentNode->isA( BalanceSets::$tidyPWrapSet )
	) {
		// Open the wrapper, then run the insertion again now that the
		// wrapper is the current node.
		$this->insertHTMLElement( 'mw:p-wrap', [] );
		$this->insertText( $value );
	} else {
		$this->currentNode->appendChild( $value );
	}
}
697
/**
 * Create a BalanceElement in the given namespace, insert it at the
 * appropriate place and push it on to the open elements stack.
 * @param string $namespaceURI The element namespace
 * @param string $tag The tag name
 * @param array $attribs Normalized attributes, in array form.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-foreign-element
 */
public function insertForeignElement( $namespaceURI, $tag, $attribs ) {
	$element = new BalanceElement( $namespaceURI, $tag, $attribs );
	return $this->insertElement( $element );
}
712
/**
 * Create an element in the HTML namespace, insert it at the appropriate
 * place and push it on to the open elements stack.
 * @param string $tag The tag name
 * @param array $attribs Normalized attributes, in array form.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-an-html-element
 */
public function insertHTMLElement( $tag, $attribs ) {
	$htmlNamespace = BalanceSets::HTML_NAMESPACE;
	return $this->insertForeignElement( $htmlNamespace, $tag, $attribs );
}
726
/**
 * Insert an element at the appropriate place and push it on to the
 * open elements stack, making it the current node.
 *
 * If the current node is an 'mw:p-wrap' element and $elt is not in the
 * tidy inline set, the wrapper is popped before $elt is inserted.
 *
 * @param BalanceElement $elt
 * @return BalanceElement The element that was pushed on to the stack.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertElement( $elt ) {
	Assert::parameterType( 'MediaWiki\Tidy\BalanceElement', $elt, '$elt' );
	if (
		$this->currentNode->isHtmlNamed( 'mw:p-wrap' ) &&
		!$elt->isA( BalanceSets::$tidyInlineSet )
	) {
		// Tidy compatibility.
		$this->pop();
	}
	if (
		$this->fosterParentMode &&
		$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
	) {
		$elt = $this->fosterParent( $elt );
	} else {
		$this->currentNode->appendChild( $elt );
	}
	// The messages are single-quoted on purpose: interpolating $elt would
	// serialize the element for every insertion, even though the text is
	// only needed when an assertion fails.
	Assert::invariant( $elt->parent !== null, '$elt must be in tree' );
	Assert::invariant(
		$elt->parent !== 'flat',
		'$elt must not have been previously flattened'
	);
	array_push( $this->elements, $elt );
	$this->currentNode = $elt;
	return $elt;
}
757
/**
 * Determine if the stack has $tag in scope, using
 * BalanceSets::$inScopeSet as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-scope
 */
public function inScope( $tag ) {
	$boundary = BalanceSets::$inScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
767
/**
 * Determine if the stack has $tag in button scope, using
 * BalanceSets::inButtonScopeSet() as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-button-scope
 */
public function inButtonScope( $tag ) {
	$boundary = BalanceSets::inButtonScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
777
/**
 * Determine if the stack has $tag in list item scope, using
 * BalanceSets::inListItemScopeSet() as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-list-item-scope
 */
public function inListItemScope( $tag ) {
	$boundary = BalanceSets::inListItemScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
787
/**
 * Determine if the stack has $tag in table scope, using
 * BalanceSets::$inTableScopeSet as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-table-scope
 */
public function inTableScope( $tag ) {
	$boundary = BalanceSets::$inTableScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
797
/**
 * Determine if the stack has $tag in a specific scope, $set.
 *
 * The stack is walked from the current node towards the root. The
 * answer is true if a node matching $tag is reached first, and false if
 * a node matching $set is reached first or neither is found.
 *
 * @param BalanceElement|array|string $tag
 * @param BalanceElement|array|string $set
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-the-specific-scope
 */
public function inSpecificScope( $tag, $set ) {
	foreach ( $this as $node ) {
		$isTarget = $node->isA( $tag );
		if ( $isTarget || $node->isA( $set ) ) {
			// Either the target was found, or a scope boundary was hit
			// before reaching it.
			return $isTarget;
		}
	}
	return false;
}
816
/**
 * Generate implied end tags: keep popping the current node for as long
 * as it belongs to the implied end tag set.
 * @param string|null $butnot Name of an HTML element at which popping
 *   stops even though it is in the set, or null for no such exception.
 * @param bool $thorough True if we should generate end tags thoroughly.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#generate-implied-end-tags
 */
public function generateImpliedEndTags( $butnot = null, $thorough = false ) {
	if ( $thorough ) {
		$impliedSet = BalanceSets::$thoroughImpliedEndTagsSet;
	} else {
		$impliedSet = BalanceSets::$impliedEndTagsSet;
	}
	while ( $this->length() > 0 ) {
		$node = $this->currentNode;
		$isException = $butnot !== null && $node->isHtmlNamed( $butnot );
		if ( $isException || !$node->isA( $impliedSet ) ) {
			return;
		}
		$this->pop();
	}
}
837
/**
 * Return the adjusted current node: $fragmentContext when one is given
 * and the stack holds exactly one element, otherwise the current node.
 * @param BalanceElement|null $fragmentContext Presumably the fragment
 *   context element; any falsy value means "none" -- TODO confirm type.
 * @return BalanceElement|null
 */
public function adjustedCurrentNode( $fragmentContext ) {
	if ( $fragmentContext && $this->length() === 1 ) {
		return $fragmentContext;
	}
	return $this->currentNode;
}
845
/**
 * Return an iterator over this stack which visits the current node
 * first, and the root node last.
 * @return Iterator
 */
public function getIterator() {
	$topDown = new ReverseArrayIterator( $this->elements );
	return $topDown;
}
854
/**
 * Return the BalanceElement stored at stack position $idx; position 0
 * is the root element.
 * @param int $idx
 * @return BalanceElement
 */
public function node( $idx ) {
	$element = $this->elements[ $idx ];
	return $element;
}
864
/**
 * Replace the element at position $idx in the BalanceStack with $elt.
 * If that position is the top of the stack, $elt also becomes the
 * current node. Neither element may have been flattened already.
 * @param int $idx
 * @param BalanceElement $elt
 */
public function replaceAt( $idx, $elt ) {
	Assert::parameterType( 'MediaWiki\Tidy\BalanceElement', $elt, '$elt' );
	$previous = $this->elements[$idx];
	Assert::precondition(
		$previous->parent !== 'flat',
		'Replaced element should not have already been flattened.'
	);
	Assert::precondition(
		$elt->parent !== 'flat',
		'New element should not have already been flattened.'
	);
	$this->elements[$idx] = $elt;
	$topIdx = count( $this->elements ) - 1;
	if ( $topIdx === $idx ) {
		$this->currentNode = $elt;
	}
}
885
/**
 * Return the position of the given BalanceElement, set, or
 * HTML tag name string in the BalanceStack. The search starts at the
 * current node, so the match closest to the top of the stack wins.
 * @param BalanceElement|array|string $tag
 * @return int The position, or -1 if nothing on the stack matches.
 */
public function indexOf( $tag ) {
	$pos = count( $this->elements );
	while ( $pos-- > 0 ) {
		if ( $this->elements[$pos]->isA( $tag ) ) {
			return $pos;
		}
	}
	return -1;
}
900
/**
 * Return how many elements are currently on the BalanceStack.
 * @return int
 */
public function length() {
	$depth = count( $this->elements );
	return $depth;
}
908
/**
 * Remove the current node from the BalanceStack and make the element
 * below it (or null, if the stack is now empty) the current node.
 * The removed element is flattened, unless it is an 'mw:p-wrap'
 * element, which is left unflattened here.
 */
public function pop() {
	$popped = array_pop( $this->elements );
	$depth = count( $this->elements );
	$this->currentNode = $depth ? $this->elements[ $depth - 1 ] : null;
	if ( $popped->isHtmlNamed( 'mw:p-wrap' ) ) {
		return;
	}
	$popped->flatten( $this->tidyCompat );
}
924
/**
 * Pop nodes off the BalanceStack, flattening them in the process, until
 * no more than $idx elements remain; the node at position $idx is
 * therefore removed as well.
 * @param int $idx
 */
public function popTo( $idx ) {
	while ( count( $this->elements ) > $idx ) {
		$this->pop();
	}
}
935
/**
 * Pop elements off the stack up to and including the first
 * element with the specified HTML tagname (or matching the given
 * set). If nothing matches, the whole stack is popped.
 * @param BalanceElement|array|string $tag
 */
public function popTag( $tag ) {
	while ( $this->length() > 0 ) {
		// Test before popping: pop() changes the current node.
		$reachedTarget = $this->currentNode->isA( $tag );
		$this->pop();
		if ( $reachedTarget ) {
			return;
		}
	}
}
951
/**
 * Pop elements off the stack until the current node is in the
 * specified set; the matching element itself is *not* popped.
 * @param BalanceElement|array|string $set
 */
public function clearToContext( $set ) {
	// Stop at length 1 rather than 0, so that the <html> element at the
	// bottom of the stack is never popped off.
	while ( $this->length() > 1 && !$this->currentNode->isA( $set ) ) {
		$this->pop();
	}
}
966
967 /**
968 * Remove the given $elt from the BalanceStack, optionally
969 * flattening it in the process.
970 * @param BalanceElement $elt The element to remove.
971 * @param bool $flatten Whether to flatten the removed element.
972 */
public function removeElement( $elt, $flatten = true ) {
	// Preconditions: $elt is a live (unflattened) element whose parent
	// has not been flattened either.
	Assert::parameterType( 'MediaWiki\Tidy\BalanceElement', $elt, '$elt' );
	Assert::parameter(
		$elt->parent !== 'flat',
		'$elt',
		'$elt should not already have been flattened.'
	);
	Assert::parameter(
		$elt->parent->parent !== 'flat',
		'$elt',
		'The parent of $elt should not already have been flattened.'
	);

	// Locate $elt by identity and cut it out of the stack.
	$idx = array_search( $elt, $this->elements, true );
	Assert::parameter( $idx !== false, '$elt', 'must be in stack' );
	array_splice( $this->elements, $idx, 1 );

	// If $elt was the last entry it was also the current node, so the
	// current node must move to the new last entry. When nothing is
	// left, use null (as pop() does) rather than reading offset -1.
	$remaining = count( $this->elements );
	if ( $idx === $remaining ) {
		$this->currentNode = $remaining > 0
			? $this->elements[$remaining - 1]
			: null;
	}

	if ( $flatten ) {
		// serialize $elt into its parent
		// otherwise, it will eventually serialize when the parent
		// is serialized, we just hold onto the memory for its
		// tree of objects a little longer.
		$elt->flatten( $this->tidyCompat );
	}
	Assert::postcondition(
		array_search( $elt, $this->elements, true ) === false,
		'$elt should no longer be in open elements stack'
	);
}
1003
1004 /**
1005 * Find $a in the BalanceStack and insert $b after it.
1006 * @param BalanceElement $a
1007 * @param BalanceElement $b
1008 */
public function insertAfter( $a, $b ) {
	Assert::parameterType( 'MediaWiki\Tidy\BalanceElement', $a, '$a' );
	Assert::parameterType( 'MediaWiki\Tidy\BalanceElement', $b, '$b' );
	$idx = $this->indexOf( $a );
	// indexOf() signals "not found" with -1 (never false), so that is
	// the value to guard against; otherwise a missing $a would silently
	// insert $b at the bottom of the stack.
	Assert::parameter( $idx >= 0, '$a', 'must be in stack' );
	if ( $idx === count( $this->elements ) - 1 ) {
		// $a is the current node: $b goes on top and becomes current.
		array_push( $this->elements, $b );
		$this->currentNode = $b;
	} else {
		// $a is somewhere in the middle: splice $b in right after it.
		// The current node is unaffected.
		array_splice( $this->elements, $idx + 1, 0, [ $b ] );
	}
}
1021
1022 # Fostering and adoption.
1023
1024 /**
1025 * Foster parent the given $elt in the stack of open elements.
1026 * @param BalanceElement|string $elt
1027 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
1028 */
private function fosterParent( $elt ) {
	Assert::parameterType( 'MediaWiki\Tidy\BalanceElement|string', $elt, '$elt' );
	$tableIdx = $this->indexOf( 'table' );
	$templateIdx = $this->indexOf( 'template' );

	// Choose the foster parent ($host) and, when fostering next to a
	// table, the node that the new content must be placed before
	// ($sibling).
	$sibling = null;
	$templateWins = $templateIdx >= 0 &&
		( $tableIdx < 0 || $templateIdx > $tableIdx );
	if ( $templateWins ) {
		// A <template> at or above the last table takes the content.
		$host = $this->elements[$templateIdx];
	} elseif ( $tableIdx >= 0 ) {
		// Otherwise the content goes into the table's parent, just
		// in front of the table itself.
		$sibling = $this->elements[$tableIdx];
		$host = $sibling->parent;
		# Assume all tables have parents, since we're not running scripts!
		Assert::invariant(
			$host !== null, "All tables should have parents"
		);
	} else {
		// No table and no template: fall back to the `html` element.
		$host = $this->elements[0];
	}

	if ( $this->tidyCompat ) {
		if ( is_string( $elt ) ) {
			// Fostered text inside a p-wrapping parent is routed
			// through a fresh mw:p-wrap instead of being placed
			// directly.
			if ( $host->isA( BalanceSets::$tidyPWrapSet ) ) {
				$this->insertHTMLElement( 'mw:p-wrap', [] );
				$this->insertText( $elt );
				return $elt;
			}
		} elseif ( $elt->isHtmlNamed( 'mw:p-wrap' ) ) {
			// Fostering a p-wrapper: if the node that would end up
			// immediately before it is already a p-wrapper, hand
			// that one back instead of inserting a second one.
			if ( $sibling ) {
				$slot = array_search( $sibling, $host->children, true );
			} else {
				$slot = count( $host->children );
			}
			$preceding = $slot > 0 ? $host->children[$slot - 1] : '';
			if (
				$preceding instanceof BalanceElement &&
				$preceding->isHtmlNamed( 'mw:p-wrap' )
			) {
				return $preceding;
			}
		}
	}

	// Default placement: before the table if there is one, otherwise
	// at the end of the foster parent.
	if ( $sibling ) {
		$host->insertBefore( $sibling, $elt );
	} else {
		$host->appendChild( $elt );
	}
	return $elt;
}
1081
1082 /**
1083 * Run the "adoption agency algorithm" (AAA) for the given subject
1084 * tag name.
1085 * @param string $tag The subject tag name.
1086 * @param BalanceActiveFormattingElements $afe The current
1087 * active formatting elements list.
1088 * @return true if the adoption agency algorithm "did something", false
1089 * if more processing is required by the caller.
1090 * @see https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
1091 */
public function adoptionAgency( $tag, $afe ) {
	// Shortcut: the current node is the subject element and is not an
	// active formatting element -- just pop it and stop.
	$current = $this->currentNode;
	if ( $current->isHtmlNamed( $tag ) && !$afe->isInList( $current ) ) {
		$this->pop();
		return true; // no more handling required
	}

	// Outer loop: the spec allows at most eight passes.
	for ( $outer = 1; $outer <= 8; $outer++ ) {
		// The formatting element is the last entry with the subject
		// tag name between the end of the active formatting elements
		// list and its last marker (or the list start).
		$formatting = $afe->findElementByTag( $tag );

		// None: the caller must fall back to "any other end tag".
		if ( !$formatting ) {
			return false; // false means handle by the default case
		}

		// Listed but no longer open: parse error; drop it from the
		// list and stop.
		$fmtIndex = $this->indexOf( $formatting );
		if ( $fmtIndex < 0 ) {
			$afe->remove( $formatting );
			return true; // true means no more handling required
		}

		// Open but not in scope: parse error; ignore the token.
		if ( !$this->inScope( $formatting ) ) {
			return true;
		}

		// The furthest block is the first "special" element found
		// when scanning the stack from just after the formatting
		// element toward the current node. There may be none.
		$furthest = null;
		$furthestIndex = -1;
		$depth = $this->length();
		for ( $i = $fmtIndex + 1; $i < $depth; $i++ ) {
			$candidate = $this->node( $i );
			if ( $candidate->isA( BalanceSets::$specialSet ) ) {
				$furthest = $candidate;
				$furthestIndex = $i;
				break;
			}
		}

		// No furthest block: pop everything from the current node up
		// to and including the formatting element, remove it from the
		// active formatting elements, and finish.
		if ( !$furthest ) {
			$this->popTag( $formatting );
			$afe->remove( $formatting );
			return true;
		}

		// The common ancestor sits immediately above the formatting
		// element in the stack of open elements.
		$ancestor = $this->node( $fmtIndex - 1 );

		// A placeholder entry remembers where the formatting element
		// sits in the list of active formatting elements.
		$bookmark = new BalanceElement( '[bookmark]', '[bookmark]', [] );
		$afe->insertAfter( $formatting, $bookmark );

		// Node and last node both start out as the furthest block.
		$node = $furthest;
		$last = $furthest;
		$nodeIndex = $furthestIndex;

		// Inner loop; $inner counts passes starting from one, and is
		// advanced on `continue` as well as on normal completion.
		for ( $inner = 1; ; $inner++ ) {
			// Step to the element immediately above node. If node was
			// removed on the previous pass, the same index arithmetic
			// still lands on the element that used to be above it.
			$nodeIndex--;
			$node = $this->node( $nodeIndex );

			// Reached the formatting element: the inner loop is done.
			if ( $node === $formatting ) {
				break;
			}

			// After three passes, listed nodes are evicted from the
			// list of active formatting elements.
			$listed = $afe->isInList( $node );
			if ( $listed && $inner > 3 ) {
				$afe->remove( $node );
				$listed = false;
			}

			// Unlisted nodes are simply dropped from the stack of open
			// elements.
			if ( !$listed ) {
				// Don't flatten here, since we're about to relocate
				// parts of this $node.
				$this->removeElement( $node, false );
				continue;
			}

			// Replace node, in both the active formatting list and
			// the stack of open elements, with a fresh element built
			// from the same namespace, name and attributes.
			$clone = new BalanceElement(
				$node->namespaceURI, $node->localName, $node->attribs );
			$afe->replace( $node, $clone );
			$this->replaceAt( $nodeIndex, $clone );
			$node = $clone;

			// While last node is still the furthest block, the
			// bookmark moves to just after the new node.
			if ( $last === $furthest ) {
				$afe->remove( $bookmark );
				$afe->insertAfter( $clone, $bookmark );
			}

			// Reparent last node under node, then continue upward
			// with node as the new last node.
			$node->appendChild( $last );
			$last = $node;
		}

		// Attach whatever last node ended up being: foster-parent it
		// when the common ancestor is a table, tbody, tfoot, thead or
		// tr element (and foster parenting is enabled), otherwise
		// append it to the common ancestor.
		if (
			$this->fosterParentMode &&
			$ancestor->isA( BalanceSets::$tableSectionRowSet )
		) {
			$this->fosterParent( $last );
		} else {
			$ancestor->appendChild( $last );
		}

		// Build a replacement for the formatting element, move all of
		// the furthest block's children into it, and make it the
		// furthest block's child.
		$replacement = new BalanceElement(
			$formatting->namespaceURI, $formatting->localName, $formatting->attribs );
		$replacement->adoptChildren( $furthest );
		$furthest->appendChild( $replacement );

		// In the list of active formatting elements, the replacement
		// takes the bookmarked position and the old entry goes away.
		$afe->remove( $formatting );
		$afe->replace( $bookmark, $replacement );

		// In the stack of open elements, the old formatting element is
		// removed and the replacement goes immediately below the
		// furthest block.
		$this->removeElement( $formatting );
		$this->insertAfter( $furthest, $replacement );
	}

	return true;
}
1303
1304 /**
1305 * Return the contents of the open elements stack as a string for
1306 * debugging.
1307 * @return string
1308 */
public function __toString() {
	// Collect the local name of every open element, bottom of the
	// stack first.
	$names = [];
	foreach ( $this->elements as $elt ) {
		$names[] = $elt->localName;
	}
	// Separator first, then the pieces: the reversed (array, separator)
	// argument order is deprecated in PHP 7.4 and rejected by PHP 8.
	return implode( ' ', $names );
}
1316 }
1317
1318 /**
1319 * A pseudo-element used as a marker in the list of active formatting elements
1320 *
1321 * @ingroup Parser
1322 * @since 1.27
1323 */
class BalanceMarker {
	/** Next entry in the list of active formatting elements, or null */
	public $nextAFE;

	/** Previous entry in the list of active formatting elements, or null */
	public $prevAFE;

	/**
	 * Declared only so that list-wide cleanup, which nulls this field on
	 * every node, does not create an undeclared (dynamic) property on
	 * markers. Markers are never linked into a Noah bucket, so this
	 * stays null.
	 */
	public $nextNoah;
}
1328
1329 /**
1330 * The list of active formatting elements, which is used to handle
1331 * mis-nested formatting element tags in the HTML5 tree builder
1332 * specification.
1333 *
1334 * @ingroup Parser
1335 * @since 1.27
1336 * @see https://html.spec.whatwg.org/multipage/syntax.html#list-of-active-formatting-elements
1337 */
class BalanceActiveFormattingElements {
	/** The last (most recent) element in the list */
	private $tail;

	/** The first (least recent) element in the list */
	private $head;

	/**
	 * An array of arrays representing the population of elements in each bucket
	 * according to the Noah's Ark clause. The outer array is stack-like, with each
	 * integer-indexed element representing a segment of the list, bounded by
	 * markers. The first element represents the segment of the list before the
	 * first marker.
	 *
	 * The inner arrays are indexed by "Noah key", which is a string which uniquely
	 * identifies each bucket according to the rules in the spec. The value in
	 * the inner array is the first (least recently inserted) element in the bucket,
	 * and subsequent members of the bucket can be found by iterating through the
	 * singly-linked list via $node->nextNoah.
	 *
	 * This is optimised for the most common case of inserting into a bucket
	 * with zero members, and deleting a bucket containing one member. In the
	 * worst case, iteration through the list is still O(1) in the document
	 * size, since each bucket can have at most 3 members.
	 */
	private $noahTableStack = [ [] ];

	/**
	 * Unlink every entry of the list from its neighbours and drop the
	 * list's own head/tail/Noah-table references.
	 */
	public function __destruct() {
		for ( $node = $this->head; $node; $node = $next ) {
			$next = $node->nextAFE;
			$node->prevAFE = $node->nextAFE = null;
			// Markers are never linked into a Noah bucket; writing
			// nextNoah on one would only create a stray dynamic property.
			if ( !( $node instanceof BalanceMarker ) ) {
				$node->nextNoah = null;
			}
		}
		$this->head = $this->tail = $this->noahTableStack = null;
	}

	/**
	 * Append a marker to the end of the list and open a new, empty Noah
	 * table for the list segment that follows it.
	 */
	public function insertMarker() {
		$elt = new BalanceMarker;
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
		$this->noahTableStack[] = [];
	}

	/**
	 * Follow the steps required when the spec requires us to "push onto the
	 * list of active formatting elements".
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException if $elt is already in the list
	 */
	public function push( BalanceElement $elt ) {
		// Must not be in the list already
		if ( $elt->prevAFE !== null || $this->head === $elt ) {
			throw new ParameterAssertionException( '$elt',
				'Cannot insert a node into the AFE list twice' );
		}

		// "Noah's Ark clause" -- if there are already three copies of
		// this element before we encounter a marker, then drop the last
		// one.
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			// Walk the bucket to find its last member and its size.
			$count = 1;
			$head = $tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
				$count++;
			}
			if ( $count >= 3 ) {
				// Evict the least recently inserted member of the bucket.
				$this->remove( $head );
			}
			$tail->nextNoah = $elt;
		}
		// Add to the main AFE list
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
	}

	/**
	 * Follow the steps required when the spec asks us to "clear the list of
	 * active formatting elements up to the last marker".
	 */
	public function clearToMarker() {
		// Iterate back through the list starting from the tail
		$tail = $this->tail;
		while ( $tail && !( $tail instanceof BalanceMarker ) ) {
			// Unlink the element
			$prev = $tail->prevAFE;
			$tail->prevAFE = null;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail->nextNoah = null;
			$tail = $prev;
		}
		// If we finished on a marker, unlink it and pop it off the Noah table stack
		if ( $tail ) {
			$prev = $tail->prevAFE;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail = $prev;
			array_pop( $this->noahTableStack );
		} else {
			// No marker: wipe the top-level Noah table (which is the only one)
			$this->noahTableStack[0] = [];
		}
		// If we removed all the elements, clear the head pointer
		if ( !$tail ) {
			$this->head = null;
		}
		$this->tail = $tail;
	}

	/**
	 * Find and return the last element with the specified tag between the
	 * end of the list and the last marker on the list.
	 * Used when parsing &lt;a&gt; "in body mode".
	 * @param string $tag Local name to look for
	 * @return BalanceElement|null The matching element, or null if none
	 */
	public function findElementByTag( $tag ) {
		$elt = $this->tail;
		while ( $elt && !( $elt instanceof BalanceMarker ) ) {
			if ( $elt->localName === $tag ) {
				return $elt;
			}
			$elt = $elt->prevAFE;
		}
		return null;
	}

	/**
	 * Determine whether an element is in the list of formatting elements.
	 * An element is in the list if it is the head or has a predecessor.
	 * @param BalanceElement $elt
	 * @return boolean
	 */
	public function isInList( BalanceElement $elt ) {
		return $this->head === $elt || $elt->prevAFE;
	}

	/**
	 * Find the element $elt in the list and remove it.
	 * Used when parsing &lt;a&gt; in body mode.
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException if $elt is not in the list
	 */
	public function remove( BalanceElement $elt ) {
		if ( $this->head !== $elt && !$elt->prevAFE ) {
			throw new ParameterAssertionException( '$elt',
				"Attempted to remove an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $elt ) {
			$this->head = $elt->nextAFE;
		}
		if ( $this->tail === $elt ) {
			$this->tail = $elt->prevAFE;
		}
		// Update previous element
		if ( $elt->prevAFE ) {
			$elt->prevAFE->nextAFE = $elt->nextAFE;
		}
		// Update next element
		if ( $elt->nextAFE ) {
			$elt->nextAFE->prevAFE = $elt->prevAFE;
		}
		// Clear pointers so that isInList() etc. will work
		$elt->prevAFE = $elt->nextAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $elt );
	}

	/**
	 * Append $elt to its bucket in the Noah table of the current (last)
	 * list segment, creating the bucket if it does not exist yet.
	 * Unlike push(), this does not enforce the three-member limit.
	 * @param BalanceElement $elt
	 */
	private function addToNoahList( BalanceElement $elt ) {
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			$tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
			}
			$tail->nextNoah = $elt;
		}
	}

	/**
	 * Unlink $elt from its bucket in the Noah table of the current (last)
	 * list segment. Does nothing if that table has no such bucket or the
	 * bucket does not contain $elt.
	 * @param BalanceElement $elt
	 */
	private function removeFromNoahList( BalanceElement $elt ) {
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		$key = $elt->getNoahKey();
		if ( !isset( $table[$key] ) ) {
			// No bucket for this key in the current segment: there is
			// nothing to unlink, and nothing to dereference below.
			return;
		}
		$noahElt = $table[$key];
		if ( $noahElt === $elt ) {
			// $elt is the first member: promote its successor, or drop
			// the bucket entirely if it was the only member.
			if ( $noahElt->nextNoah ) {
				$table[$key] = $noahElt->nextNoah;
				$noahElt->nextNoah = null;
			} else {
				unset( $table[$key] );
			}
		} else {
			do {
				$prevNoahElt = $noahElt;
				$noahElt = $prevNoahElt->nextNoah;
				if ( $noahElt === $elt ) {
					// Found it, unlink
					$prevNoahElt->nextNoah = $elt->nextNoah;
					$elt->nextNoah = null;
					break;
				}
			} while ( $noahElt );
		}
	}

	/**
	 * Find element $a in the list and replace it with element $b
	 * @param BalanceElement $a Element currently in the list
	 * @param BalanceElement $b Element to put in its place
	 * @throws ParameterAssertionException if $a is not in the list
	 */
	public function replace( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to replace an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $a ) {
			$this->head = $b;
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		// Update previous element
		if ( $a->prevAFE ) {
			$a->prevAFE->nextAFE = $b;
		}
		// Update next element
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->prevAFE = $a->prevAFE;
		$b->nextAFE = $a->nextAFE;
		$a->nextAFE = $a->prevAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $a );
		$this->addToNoahList( $b );
	}

	/**
	 * Find $a in the list and insert $b after it.
	 * @param BalanceElement $a Element currently in the list
	 * @param BalanceElement $b Element to insert
	 * @throws ParameterAssertionException if $a is not in the list
	 */
	public function insertAfter( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to insert after an element which is not in the AFE list" );
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->nextAFE = $a->nextAFE;
		$b->prevAFE = $a;
		$a->nextAFE = $b;
		$this->addToNoahList( $b );
	}

	// @codingStandardsIgnoreStart Generic.Files.LineLength.TooLong
	/**
	 * Reconstruct the active formatting elements.
	 * @param BalanceStack $stack The open elements stack
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#reconstruct-the-active-formatting-elements
	 */
	// @codingStandardsIgnoreEnd
	public function reconstruct( $stack ) {
		$entry = $this->tail;
		// If there are no entries in the list of active formatting elements,
		// then there is nothing to reconstruct
		if ( !$entry ) {
			return;
		}
		// If the last is a marker, do nothing.
		if ( $entry instanceof BalanceMarker ) {
			return;
		}
		// Or if it is an open element, do nothing.
		if ( $stack->indexOf( $entry ) >= 0 ) {
			return;
		}

		// Loop backward through the list until we find a marker or an
		// open element. Whether one was found has to be tracked
		// explicitly: the boundary entry may be the head of the list, in
		// which case its prevAFE is null and cannot be used to tell
		// "stopped on a boundary" apart from "ran off the start".
		$foundBoundary = false;
		while ( $entry->prevAFE ) {
			$entry = $entry->prevAFE;
			if ( $entry instanceof BalanceMarker || $stack->indexOf( $entry ) >= 0 ) {
				$foundBoundary = true;
				break;
			}
		}

		// Now loop forward, starting from the element after the boundary (or
		// the first element if we didn't find a marker or open element),
		// recreating formatting elements and pushing them back onto the list
		// of open elements.
		if ( $foundBoundary ) {
			$entry = $entry->nextAFE;
		}
		do {
			$newElement = $stack->insertHTMLElement(
				$entry->localName,
				$entry->attribs );
			$this->replace( $entry, $newElement );
			$entry = $newElement->nextAFE;
		} while ( $entry );
	}

	/**
	 * Get a string representation of the AFE list, for debugging
	 * @return string One line per entry, head first; also reports broken
	 *   back-links and a wrong tail pointer.
	 */
	public function __toString() {
		$prev = null;
		$s = '';
		for ( $node = $this->head; $node; $prev = $node, $node = $node->nextAFE ) {
			if ( $node instanceof BalanceMarker ) {
				$s .= "MARKER\n";
				continue;
			}
			$s .= $node->localName . '#' . substr( md5( spl_object_hash( $node ) ), 0, 8 );
			if ( $node->nextNoah ) {
				$s .= " (noah sibling: {$node->nextNoah->localName}#" .
					substr( md5( spl_object_hash( $node->nextNoah ) ), 0, 8 ) .
					')';
			}
			if ( $node->nextAFE && $node->nextAFE->prevAFE !== $node ) {
				$s .= " (reverse link is wrong!)";
			}
			$s .= "\n";
		}
		if ( $prev !== $this->tail ) {
			$s .= "(tail pointer is wrong!)\n";
		}
		return $s;
	}
}
1682
1683 /**
1684 * An implementation of the tree building portion of the HTML5 parsing
1685 * spec.
1686 *
1687 * This is used to balance and tidy output so that the result can
1688 * always be cleanly serialized/deserialized by an HTML5 parser. It
1689 * does *not* guarantee "conforming" output -- the HTML5 spec contains
1690 * a number of constraints which are not enforced by the HTML5 parsing
1691 * process. But the result will be free of gross errors: misnested or
1692 * unclosed tags, for example, and will be unchanged by spec-compliant
1693 * parsing followed by serialization.
1694 *
1695 * The tree building stage is structured as a state machine.
1696 * When comparing the implementation to
1697 * https://www.w3.org/TR/html5/syntax.html#tree-construction
1698 * note that each state is implemented as a function with a
1699 * name ending in `Mode` (because the HTML spec refers to them
1700 * as insertion modes). The current insertion mode is held by
1701 * the $parseMode property.
1702 *
1703 * The following simplifications have been made:
1704 * - We handle body content only (ie, we start `in body`.)
1705 * - The document is never in "quirks mode".
1706 * - All occurrences of < and > have been entity escaped, so we
1707 * can parse tags by simply splitting on those two characters.
1708 * Similarly, all attributes have been "cleaned" and are double-quoted
1709 * and escaped.
1710 * - All comments and null characters are assumed to have been removed.
1711 * - We don't alter linefeeds after <pre>/<listing>.
1712 * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
1713 * <form>, <frame>, <plaintext>, <isindex>, <textarea>, <xmp>, <iframe>,
1714 * <noembed>, <noscript>, <select>, <script>, <title>. As a result,
1715 * further simplifications can be made:
1716 * - `frameset-ok` is not tracked.
1717 * - `form element pointer` is not tracked.
1718 * - `head element pointer` is not tracked (but presumed non-null)
1719 * - Tokenizer has only a single mode.
1720 *
1721 * We generally mark places where we omit cases from the spec due to
1722 * disallowed elements with a comment: `# OMITTED: <element-name>`.
1723 *
1724 * The HTML spec keeps a flag during the parsing process to track
1725 * whether or not a "parse error" has been encountered. We don't
1726 * bother to track that flag, we just implement the error-handling
1727 * process as specified.
1728 *
1729 * @ingroup Parser
1730 * @since 1.27
1731 * @see https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
1732 */
1733 class Balancer {
/** Name of the `...Mode` method implementing the current insertion mode */
private $parseMode;
/** Iterator over the input text split on '<' */
private $bitsIterator;
/** Associative array whose keys are the acceptable tag names, or null */
private $allowedHtmlElements;
/** The list of active formatting elements for the current balance() run */
private $afe;
/** The stack of open elements for the current balance() run */
private $stack;
/** Whether syntactic constraints on the input are enforced */
private $strict;
/** Whether serialization is tweaked for compatibility with "tidy" */
private $tidyCompat;

private $textIntegrationMode = false;
private $pendingTableText;
private $originalInsertionMode;
/** Context element for fragment parsing; balance() sets it to a <body> */
private $fragmentContext;

// The next two are assigned by balance(); they are declared here so that
// the assignment does not create undeclared (dynamic) properties.
/** Callback for replacements in HTML attribute values, or null */
private $processingCallback;
/** Arguments for the processing callback */
private $processingArgs;
1746
1747 /**
1748 * Create a new Balancer.
1749 * @param array $config Balancer configuration. Includes:
1750 * 'strict' : boolean, defaults to false.
1751 * When true, enforces syntactic constraints on input:
1752 * all non-tag '<' must be escaped, all attributes must be
1753 * separated by a single space and double-quoted. This is
1754 * consistent with the output of the Sanitizer.
1755 * 'allowedHtmlElements' : array, defaults to null.
1756 * When present, the keys of this associative array give
1757 * the acceptable HTML tag names. When not present, no
1758 * tag sanitization is done.
1759 * 'tidyCompat' : boolean, defaults to false.
1760 * When true, the serialization algorithm is tweaked to
1761 * provide historical compatibility with the old "tidy"
1762 * program: <p>-wrapping is done to the children of
1763 * <body> and <blockquote> elements, and empty elements
1764 * are removed.
1765 */
public function __construct( array $config = [] ) {
	// Fill in defaults for any option the caller left out.
	$config = $config + [
		'strict' => false,
		'allowedHtmlElements' => null,
		'tidyCompat' => false,
	];
	$this->allowedHtmlElements = $config['allowedHtmlElements'];
	$this->strict = $config['strict'];
	$this->tidyCompat = $config['tidyCompat'];
	if ( $this->allowedHtmlElements !== null ) {
		# Sanity check!
		// Both arrays are keyed by tag name, so a plain key intersection
		// yields the whitelisted tags the balancer cannot handle.
		$bad = array_intersect_key(
			$this->allowedHtmlElements,
			BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE]
		);
		if ( count( $bad ) > 0 ) {
			// Separator first: the reversed (array, separator) argument
			// order is deprecated in PHP 7.4 and rejected by PHP 8.
			$badstr = implode( ',', array_keys( $bad ) );
			throw new ParameterAssertionException(
				'$config',
				'Balance attempted with sanitization including ' .
				"unsupported elements: {$badstr}"
			);
		}
	}
}
1796
1797 /**
1798 * Return a balanced HTML string for the HTML fragment given by $text,
1799 * subject to the caveats listed in the class description. The result
1800 * will typically be idempotent -- that is, rebalancing the output
1801 * would result in no change.
1802 *
1803 * @param string $text The markup to be balanced
1804 * @param callable $processingCallback Callback to do any variable or
1805 * parameter replacements in HTML attributes values
1806 * @param array|bool $processingArgs Arguments for the processing callback
1807 * @return string The balanced markup
1808 */
public function balance( $text, $processingCallback = null, $processingArgs = [] ) {
	// Remember the attribute-processing hook for this run.
	$this->processingCallback = $processingCallback;
	$this->processingArgs = $processingArgs;

	// Fresh parser state: start "in body", with the input split on '<',
	// an empty active formatting elements list and a new open elements
	// stack that inherits our tidy-compatibility setting.
	$this->parseMode = 'inBodyMode';
	$this->bitsIterator = new ExplodeIterator( '<', $text );
	$this->afe = new BalanceActiveFormattingElements();
	$openElements = new BalanceStack();
	$openElements->tidyCompat = $this->tidyCompat;
	$this->stack = $openElements;

	# The stack is constructed with an <html> element already on it.
	# Set this up as a fragment parsed with <body> as the context.
	$this->fragmentContext =
		new BalanceElement( BalanceSets::HTML_NAMESPACE, 'body', [] );
	$this->resetInsertionMode();

	// Everything before the first '<' is plain text, not a tag.
	$leadingText = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$this->insertToken( 'text', str_replace( '>', '&gt;', $leadingText ) );

	// Each remaining piece starts with a tag; consume them one by one.
	while ( $this->bitsIterator->valid() ) {
		$this->advance();
	}
	$this->insertToken( 'eof', null );
	$balanced = $this->stack->getOutput();

	// Release the per-run state before handing back the result.
	$this->bitsIterator = null;
	$this->afe = null;
	$this->stack = null;
	$this->fragmentContext = null;
	return $balanced;
}
1841
1842 /**
1843 * Pass a token to the tree builder. The $token will be one of the
1844 * strings "tag", "endtag", "text", or "eof".
1845 */
private function insertToken( $token, $value, $attribs = null, $selfclose = false ) {
	// Reject tags from the unsupported set up front, and silently
	// swallow empty text tokens.
	$isTagToken = ( $token === 'tag' || $token === 'endtag' );
	if ( $isTagToken ) {
		if ( isset( BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE][$value] ) ) {
			# As described in "simplifications" above, these tags are
			# not supported in the balancer.
			Assert::invariant(
				!$this->strict,
				"Unsupported $token <$value> found."
			);
			return false;
		}
	} elseif ( $token === 'text' && $value === '' ) {
		# Don't actually inject the empty string as a text token.
		return true;
	}

	// Decide whether the token is handled by the current insertion mode
	// (HTML rules) or by the foreign-content rules. The branches are
	// mutually exclusive and checked in this order; a branch that is
	// entered but does not match leaves the token as foreign content.
	$adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );
	if (
		$this->stack->length() === 0 ||
		$adjusted->isHtml() ||
		$token === 'eof'
	) {
		$useHtmlRules = true;
	} elseif ( $adjusted->isMathmlTextIntegrationPoint() ) {
		// Text, and any start tag other than mglyph/malignmark.
		$useHtmlRules = $token === 'text' || (
			$token === 'tag' &&
			$value !== 'mglyph' && $value !== 'malignmark'
		);
	} elseif (
		$adjusted->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
		$adjusted->localName === 'annotation-xml' &&
		$token === 'tag' && $value === 'svg'
	) {
		// An <svg> start tag directly inside MathML annotation-xml.
		$useHtmlRules = true;
	} else {
		// Start tags and text at an HTML integration point.
		$useHtmlRules = $adjusted->isHtmlIntegrationPoint() &&
			( $token === 'tag' || $token === 'text' );
	}

	if ( !$useHtmlRules ) {
		return $this->insertForeignToken( $token, $value, $attribs, $selfclose );
	}
	// The insertion mode is stored as a method name; dispatch to it.
	$handler = $this->parseMode;
	return $this->$handler( $token, $value, $attribs, $selfclose );
}
1900
/**
 * Handle a token according to the rules for foreign (non-HTML) content.
 *
 * Text is inserted as-is. Certain HTML start tags "break out" of the
 * foreign content: open elements are popped until an integration point
 * or HTML element is current, and the token is then reprocessed. Other
 * start tags create an element in the namespace of the adjusted current
 * node. End tags are matched against the open elements by local name.
 *
 * @param string $token "text", "tag" or "endtag".
 * @param string|null $value Tag name or text content.
 * @param array|null $attribs Attributes accompanying a tag token.
 * @param bool $selfclose Whether the tag was written in self-closing form.
 * @return bool|null True when handled here, otherwise the result of the
 *   handler the token was forwarded to; null if no branch applies.
 */
private function insertForeignToken( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'font':
			// Per the HTML spec, <font> only breaks out of foreign
			// content when it carries a "color", "face" or "size"
			// attribute; a bare <font> is an ordinary foreign element.
			if ( !(
				isset( $attribs['color'] )
				|| isset( $attribs['face'] )
				|| isset( $attribs['size'] )
			) ) {
				break;
			}
			/* otherwise, fall through */
		case 'b':
		case 'big':
		case 'blockquote':
		case 'body':
		case 'br':
		case 'center':
		case 'code':
		case 'dd':
		case 'div':
		case 'dl':
		case 'dt':
		case 'em':
		case 'embed':
		case 'h1':
		case 'h2':
		case 'h3':
		case 'h4':
		case 'h5':
		case 'h6':
		case 'head':
		case 'hr':
		case 'i':
		case 'img':
		case 'li':
		case 'listing':
		case 'menu':
		case 'meta':
		case 'nobr':
		case 'ol':
		case 'p':
		case 'pre':
		case 'ruby':
		case 's':
		case 'small':
		case 'span':
		case 'strong':
		case 'strike':
		case 'sub':
		case 'sup':
		case 'table':
		case 'tt':
		case 'u':
		case 'ul':
		case 'var':
			if ( $this->fragmentContext ) {
				// Fragment case: treat as "any other start tag" below.
				break;
			}
			// Pop at least one element, then keep popping until the
			// current node is an integration point or an HTML element.
			while ( true ) {
				$this->stack->pop();
				$node = $this->stack->currentNode;
				if (
					$node->isMathmlTextIntegrationPoint() ||
					$node->isHtmlIntegrationPoint() ||
					$node->isHtml()
				) {
					break;
				}
			}
			// Reprocess the token now that we have left foreign content.
			return $this->insertToken( $token, $value, $attribs, $selfclose );
		}
		// "Any other start tag"
		$adjusted = ( $this->fragmentContext && $this->stack->length() === 1 ) ?
			$this->fragmentContext : $this->stack->currentNode;
		$this->stack->insertForeignElement(
			$adjusted->namespaceURI, $value, $attribs
		);
		if ( $selfclose ) {
			$this->stack->pop();
		}
		return true;
	} elseif ( $token === 'endtag' ) {
		$first = true;
		foreach ( $this->stack as $i => $node ) {
			if ( $node->isHtml() && !$first ) {
				// process the end tag as HTML
				$func = $this->parseMode;
				return $this->$func( $token, $value, $attribs, $selfclose );
			} elseif ( $i === 0 ) {
				// Reached the entry with index 0 without a match: ignore.
				return true;
			} elseif ( $node->localName === $value ) {
				$this->stack->popTag( $node );
				return true;
			}
			$first = false;
		}
	}
	// Falls off the end (null) for any token not handled above.
}
2002
/**
 * Grab the next "token" from $bitsIterator. This is either a open/close
 * tag or text, depending on whether the Sanitizer approves.
 *
 * The current item of $bitsIterator is matched against
 * Sanitizer::ELEMENT_BITS_REGEX. If it is an acceptable tag, the tag is
 * passed to insertToken() followed by the text trailing it; otherwise
 * the whole item is emitted as text behind an escaped "<".
 */
private function advance() {
	$x = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$regs = [];
	# $slash: Does the current element start with a '/'?
	# $t: Current element name
	# $attribStr: String between element name and >
	# $brace: Ending '>' or '/>'
	# $rest: Everything until the next element from the $bitsIterator
	if ( preg_match( Sanitizer::ELEMENT_BITS_REGEX, $x, $regs ) ) {
		list( /* $qbar */, $slash, $t, $attribStr, $brace, $rest ) = $regs;
		$t = strtolower( $t );
		if ( $this->strict ) {
			/* Verify that attributes are all properly double-quoted */
			Assert::invariant(
				preg_match(
					'/^( [:_A-Z0-9][-.:_A-Z0-9]*="[^"]*")*[ ]*$/i', $attribStr
				),
				"Bad attribute string found"
			);
		}
	} else {
		Assert::invariant(
			!$this->strict, "< found which does not start a valid tag"
		);
		$slash = $t = $attribStr = $brace = $rest = null;
	}
	$goodtag = $t;
	// A non-null list of allowed elements switches on sanitization.
	$sanitize = $this->allowedHtmlElements !== null;
	if ( $sanitize ) {
		$goodtag = $t && isset( $this->allowedHtmlElements[$t] );
	}
	if ( $goodtag ) {
		if ( is_callable( $this->processingCallback ) ) {
			// The callback receives the attribute string by reference.
			call_user_func_array( $this->processingCallback, [ &$attribStr, $this->processingArgs ] );
		}
		if ( $sanitize ) {
			$goodtag = Sanitizer::validateTag( $attribStr, $t );
		}
	}
	if ( $goodtag ) {
		// Decoding is needed whether or not we sanitize; only the
		// validation step is conditional.
		$attribs = Sanitizer::decodeTagAttributes( $attribStr );
		if ( $sanitize ) {
			$attribs = Sanitizer::validateTagAttributes( $attribs, $t );
		}
		$goodtag = $this->insertToken(
			$slash ? 'endtag' : 'tag', $t, $attribs, $brace === '/>'
		);
	}
	if ( $goodtag ) {
		// Tag accepted: emit the text that followed it, escaping '>' once.
		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
	} else {
		# bad tag; serialize entire thing as text.
		$this->insertToken( 'text', '&lt;' . str_replace( '>', '&gt;', $x ) );
	}
}
2066
/**
 * Change the current insertion mode.
 *
 * @param string $mode Name of the handler method for the new mode; it
 *   must end in "Mode".
 * @return string The insertion mode that was active before the switch.
 */
private function switchMode( $mode ) {
	$hasModeSuffix = ( substr( $mode, -4 ) === 'Mode' );
	Assert::parameter( $hasModeSuffix, '$mode', 'should end in Mode' );

	$previous = $this->parseMode;
	$this->parseMode = $mode;
	return $previous;
}
2075
/**
 * Switch to another insertion mode and feed the same token through
 * insertToken() again so that the new mode gets to handle it.
 *
 * @param string $mode Insertion mode to switch to.
 * @param string $token Token type.
 * @param string|null $value Tag name or text content.
 * @param array|null $attribs Attributes accompanying a tag token.
 * @param bool $selfclose Whether the tag was written in self-closing form.
 * @return bool|null Whatever insertToken() returns for the token.
 */
private function switchModeAndReprocess( $mode, $token, $value, $attribs, $selfclose ) {
	$this->switchMode( $mode );
	$result = $this->insertToken( $token, $value, $attribs, $selfclose );
	return $result;
}
2080
/**
 * Choose the insertion mode that matches the current stack of open
 * elements.
 *
 * The stack is walked until an HTML element that determines the mode is
 * found. The entry with index 0 is treated as the last one; for it the
 * fragment context element (if any) is examined instead, and if it does
 * not determine a mode either, "inBodyMode" is selected.
 */
private function resetInsertionMode() {
	// Insertion mode implied by an HTML element of the given name.
	static $modeByName = [
		'tr' => 'inRowMode',
		'tbody' => 'inTableBodyMode',
		'tfoot' => 'inTableBodyMode',
		'thead' => 'inTableBodyMode',
		'caption' => 'inCaptionMode',
		'colgroup' => 'inColumnGroupMode',
		'table' => 'inTableMode',
		'body' => 'inBodyMode',
		# OMITTED: <frameset>
		# OMITTED: <html>
		# OMITTED: <head>
	];

	$last = false;
	foreach ( $this->stack as $i => $node ) {
		if ( $i === 0 ) {
			$last = true;
			if ( $this->fragmentContext ) {
				$node = $this->fragmentContext;
			}
		}
		if ( $node->isHtml() ) {
			$name = $node->localName;
			# OMITTED: <select>
			/*
			if ( $name === 'select' ) {
				$stacklen = $this->stack->length();
				for ( $j = $i + 1; $j < $stacklen-1; $j++ ) {
					$ancestor = $this->stack->node( $stacklen-$j-1 );
					if ( $ancestor->isHtmlNamed( 'template' ) ) {
						break;
					}
					if ( $ancestor->isHtmlNamed( 'table' ) ) {
						$this->switchMode( 'inSelectInTableMode' );
						return;
					}
				}
				$this->switchMode( 'inSelectMode' );
				return;
			}
			*/
			if ( $name === 'template' ) {
				// Use the most recently pushed template insertion mode.
				$newest = array_slice( $this->templateInsertionModes, -1 );
				$this->switchMode( $newest[0] );
				return;
			}
			if ( isset( $modeByName[$name] ) ) {
				$this->switchMode( $modeByName[$name] );
				return;
			}
			# OMITTED: <head>
			if ( !$last && $node->isA( BalanceSets::$tableCellSet ) ) {
				$this->switchMode( 'inCellMode' );
				return;
			}
		}
		if ( $last ) {
			$this->switchMode( 'inBodyMode' );
			return;
		}
	}
}
2153
/**
 * Stop parsing.
 *
 * Of the spec's steps only step 2 applies here: "pop all the nodes off
 * the stack of open elements". The top-most <html> element is kept on
 * the stack, though.
 */
private function stopParsing() {
	# The AFE list has to be dropped before the stack is popped: otherwise
	# the element objects would stay live during serialization, potentially
	# using O(N^2) memory. Popping the stack never triggers reconstruction
	# of the active formatting elements, so nothing needs the list anymore.
	$this->afe = null;

	$this->stack->popTo( 1 );
}
2166
/**
 * Insert an HTML element whose content is handled by inTextMode(),
 * remembering the insertion mode to go back to afterwards.
 *
 * @param string $value Tag name of the element to insert.
 * @param array|null $attribs Attributes of the element.
 * @return bool Always true.
 */
private function parseRawText( $value, $attribs = null ) {
	$this->stack->insertHTMLElement( $value, $attribs );
	// XXX switch tokenizer to rawtext state?
	$modeBefore = $this->switchMode( 'inTextMode' );
	$this->originalInsertionMode = $modeBefore;
	return true;
}
2173
/**
 * Insertion mode handler for the "text" mode entered by parseRawText().
 *
 * @param string $token Token type.
 * @param string|null $value Tag name or text content.
 * @param array|null $attribs Attributes accompanying a tag token.
 * @param bool $selfclose Whether the tag was written in self-closing form.
 * @return bool|null True, or for "eof" the result of reprocessing the
 *   token in the original insertion mode.
 */
private function inTextMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'eof' ) {
		// Close the open element, then let the original mode see the EOF.
		$this->stack->pop();
		return $this->switchModeAndReprocess(
			$this->originalInsertionMode, $token, $value, $attribs, $selfclose
		);
	}
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	}
	if ( $token === 'endtag' ) {
		// Any end tag closes the element and restores the original mode.
		$this->stack->pop();
		$this->switchMode( $this->originalInsertionMode );
	}
	// Everything else is silently accepted.
	return true;
}
2190
/**
 * Insertion mode handler implementing the supported parts of the
 * "in head" rules. Other handlers in this class delegate to it for head
 * content such as <style> or <template>.
 *
 * @param string $token Token type.
 * @param string|null $value Tag name or text content.
 * @param array|null $attribs Attributes accompanying a tag token.
 * @param bool $selfclose Whether the tag was written in self-closing form.
 * @return bool|null True when the token was handled or ignored here,
 *   otherwise the result of reprocessing it through insertToken().
 */
private function inHeadMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		// Leading whitespace is inserted directly; whatever follows it is
		// handled by the fallback at the bottom.
		$leading = [];
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $leading ) ) {
			$this->stack->insertText( $leading[0] );
			$value = substr( $value, strlen( $leading[0] ) );
		}
		if ( strlen( $value ) === 0 ) {
			return true; // All text handled.
		}
	} elseif ( $token === 'tag' ) {
		# OMITTED: <html>, <title>, <noscript>, <script>, <head>
		# OMITTED: in a full HTML parser, <meta> might change the encoding.
		$emptyElements = [ 'meta', 'base', 'basefont', 'bgsound', 'link' ];
		if ( in_array( $value, $emptyElements, true ) ) {
			// Insert the element and close it again immediately.
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;
		}
		if ( $value === 'noframes' || $value === 'style' ) {
			return $this->parseRawText( $value, $attribs );
		}
		if ( $value === 'template' ) {
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->afe->insertMarker();
			# OMITTED: frameset_ok
			$this->switchMode( 'inTemplateMode' );
			$this->templateInsertionModes[] = $this->parseMode;
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		# OMITTED: <head>, <body>, <html>
		if ( $value === 'template' ) {
			if ( $this->stack->indexOf( $value ) < 0 ) {
				return true; // Ignore the token.
			}
			$this->stack->generateImpliedEndTags( null, true /* thorough */ );
			$this->stack->popTag( $value );
			$this->afe->clearToMarker();
			array_pop( $this->templateInsertionModes );
			$this->resetInsertionMode();
			return true;
		}
		if ( $value !== 'br' ) {
			// ignore any other end tag
			return true;
		}
		// </br> is handled at the bottom of the function
	}

	// If not handled above
	$this->inHeadMode( 'endtag', 'head' ); // synthetic </head>
	// Then redo this one
	return $this->insertToken( $token, $value, $attribs, $selfclose );
}
2257
/**
 * Insertion mode handler implementing the supported parts of the
 * "in body" rules.
 *
 * @param string $token One of "text", "eof", "tag" or "endtag"; any other
 *   value fails an invariant assertion.
 * @param string|null $value Tag name or text content.
 * @param array|null $attribs Attributes accompanying a tag token.
 * @param bool $selfclose Whether the tag was written in self-closing form.
 * @return bool|null True when the token was handled or deliberately
 *   ignored here; otherwise the result of the handler it was delegated to.
 */
private function inBodyMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		$this->afe->reconstruct( $this->stack );
		$this->stack->insertText( $value );
		return true;
	} elseif ( $token === 'eof' ) {
		if ( !empty( $this->templateInsertionModes ) ) {
			return $this->inTemplateMode( $token, $value, $attribs, $selfclose );
		}
		$this->stopParsing();
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		# OMITTED: <html>
		case 'base':
		case 'basefont':
		case 'bgsound':
		case 'link':
		case 'meta':
		case 'noframes':
		# OMITTED: <script>
		case 'style':
		case 'template':
		# OMITTED: <title>
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		# OMITTED: <body>
		# OMITTED: <frameset>

		case 'address':
		case 'article':
		case 'aside':
		case 'blockquote':
		case 'center':
		case 'details':
		case 'dialog':
		case 'dir':
		case 'div':
		case 'dl':
		case 'fieldset':
		case 'figcaption':
		case 'figure':
		case 'footer':
		case 'header':
		case 'hgroup':
		case 'main':
		case 'menu':
		case 'nav':
		case 'ol':
		case 'p':
		case 'section':
		case 'summary':
		case 'ul':
			// Block-level elements implicitly close an open <p>.
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'h1':
		case 'h2':
		case 'h3':
		case 'h4':
		case 'h5':
		case 'h6':
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			// Headings do not nest: drop a heading that is still current.
			if ( $this->stack->currentNode->isA( BalanceSets::$headingSet ) ) {
				$this->stack->pop();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'pre':
		case 'listing':
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			# As described in "simplifications" above:
			# 1. We don't touch the next token, even if it's a linefeed.
			# 2. OMITTED: frameset_ok
			return true;

		# OMITTED: <form>

		case 'li':
			# OMITTED: frameset_ok
			// Close an open <li>, unless a special element (other than
			// those in $addressDivPSet) is encountered first.
			foreach ( $this->stack as $node ) {
				if ( $node->isHtmlNamed( 'li' ) ) {
					$this->inBodyMode( 'endtag', 'li' );
					break;
				}
				if (
					$node->isA( BalanceSets::$specialSet ) &&
					!$node->isA( BalanceSets::$addressDivPSet )
				) {
					break;
				}
			}
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'dd':
		case 'dt':
			# OMITTED: frameset_ok
			// Same as for <li>, but an open <dd> or <dt> gets closed.
			foreach ( $this->stack as $node ) {
				if ( $node->isHtmlNamed( 'dd' ) ) {
					$this->inBodyMode( 'endtag', 'dd' );
					break;
				}
				if ( $node->isHtmlNamed( 'dt' ) ) {
					$this->inBodyMode( 'endtag', 'dt' );
					break;
				}
				if (
					$node->isA( BalanceSets::$specialSet ) &&
					!$node->isA( BalanceSets::$addressDivPSet )
				) {
					break;
				}
			}
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		# OMITTED: <plaintext>

		case 'button':
			if ( $this->stack->inScope( 'button' ) ) {
				// Close the open <button>, then reprocess this tag.
				$this->inBodyMode( 'endtag', 'button' );
				return $this->insertToken( $token, $value, $attribs, $selfclose );
			}
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'a':
			$activeElement = $this->afe->findElementByTag( 'a' );
			if ( $activeElement ) {
				$this->inBodyMode( 'endtag', 'a' );
				if ( $this->afe->isInList( $activeElement ) ) {
					$this->afe->remove( $activeElement );
					// Don't flatten here, since when we fall
					// through below we might foster parent
					// the new <a> tag inside this one.
					$this->stack->removeElement( $activeElement, false );
				}
			}
			/* Falls through */
		case 'b':
		case 'big':
		case 'code':
		case 'em':
		case 'font':
		case 'i':
		case 's':
		case 'small':
		case 'strike':
		case 'strong':
		case 'tt':
		case 'u':
			// Formatting elements are also recorded in the AFE list.
			$this->afe->reconstruct( $this->stack );
			$this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ), $attribs );
			return true;

		case 'nobr':
			$this->afe->reconstruct( $this->stack );
			if ( $this->stack->inScope( 'nobr' ) ) {
				$this->inBodyMode( 'endtag', 'nobr' );
				$this->afe->reconstruct( $this->stack );
			}
			$this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ), $attribs );
			return true;

		case 'applet':
		case 'marquee':
		case 'object':
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->afe->insertMarker();
			# OMITTED: frameset_ok
			return true;

		case 'table':
			# The document is never in "quirks mode"; see simplifications
			# above.
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			# OMITTED: frameset_ok
			$this->switchMode( 'inTableMode' );
			return true;

		case 'area':
		case 'br':
		case 'embed':
		case 'img':
		case 'keygen':
		case 'wbr':
			// Inserted and closed again immediately.
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			# OMITTED: frameset_ok
			return true;

		case 'input':
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			# OMITTED: frameset_ok
			# (hence we don't need to examine the tag's "type" attribute)
			return true;

		case 'menuitem':
		case 'param':
		case 'source':
		case 'track':
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		case 'hr':
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		case 'image':
			# warts!
			return $this->inBodyMode( $token, 'img', $attribs, $selfclose );

		# OMITTED: <isindex>
		# OMITTED: <textarea>
		# OMITTED: <xmp>
		# OMITTED: <iframe>
		# OMITTED: <noembed>
		# OMITTED: <noscript>

		# OMITTED: <select>
		/*
		case 'select':
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			switch ( $this->parseMode ) {
			case 'inTableMode':
			case 'inCaptionMode':
			case 'inTableBodyMode':
			case 'inRowMode':
			case 'inCellMode':
				$this->switchMode( 'inSelectInTableMode' );
				return true;
			default:
				$this->switchMode( 'inSelectMode' );
				return true;
			}
		*/

		case 'optgroup':
		case 'option':
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$this->inBodyMode( 'endtag', 'option' );
			}
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'rb':
		case 'rtc':
			if ( $this->stack->inScope( 'ruby' ) ) {
				$this->stack->generateImpliedEndTags();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'rp':
		case 'rt':
			if ( $this->stack->inScope( 'ruby' ) ) {
				$this->stack->generateImpliedEndTags( 'rtc' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'math':
			$this->afe->reconstruct( $this->stack );
			# We skip the spec's "adjust MathML attributes" and
			# "adjust foreign attributes" steps, since the browser will
			# do this later when it parses the output and it doesn't affect
			# balancing.
			$this->stack->insertForeignElement(
				BalanceSets::MATHML_NAMESPACE, $value, $attribs
			);
			if ( $selfclose ) {
				# emit explicit </math> tag.
				$this->stack->pop();
			}
			return true;

		case 'svg':
			$this->afe->reconstruct( $this->stack );
			# We skip the spec's "adjust SVG attributes" and
			# "adjust foreign attributes" steps, since the browser will
			# do this later when it parses the output and it doesn't affect
			# balancing.
			$this->stack->insertForeignElement(
				BalanceSets::SVG_NAMESPACE, $value, $attribs
			);
			if ( $selfclose ) {
				# emit explicit </svg> tag.
				$this->stack->pop();
			}
			return true;

		case 'caption':
		case 'col':
		case 'colgroup':
		# OMITTED: <frame>
		case 'head':
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			// Ignore table tags if we're not inTableMode
			return true;
		}

		// Handle any other start tag here
		$this->afe->reconstruct( $this->stack );
		$this->stack->insertHTMLElement( $value, $attribs );
		return true;
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
		# </body>,</html> are unsupported.

		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );

		case 'address':
		case 'article':
		case 'aside':
		case 'blockquote':
		case 'button':
		case 'center':
		case 'details':
		case 'dialog':
		case 'dir':
		case 'div':
		case 'dl':
		case 'fieldset':
		case 'figcaption':
		case 'figure':
		case 'footer':
		case 'header':
		case 'hgroup':
		case 'listing':
		case 'main':
		case 'menu':
		case 'nav':
		case 'ol':
		case 'pre':
		case 'section':
		case 'summary':
		case 'ul':
			// Ignore if there is not a matching open tag
			if ( !$this->stack->inScope( $value ) ) {
				return true;
			}
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( $value );
			return true;

		# OMITTED: <form>

		case 'p':
			if ( !$this->stack->inButtonScope( 'p' ) ) {
				// No <p> to close: open an empty one, then reprocess.
				$this->inBodyMode( 'tag', 'p', [] );
				return $this->insertToken( $token, $value, $attribs, $selfclose );
			}
			$this->stack->generateImpliedEndTags( $value );
			$this->stack->popTag( $value );
			return true;

		case 'li':
			if ( !$this->stack->inListItemScope( $value ) ) {
				return true; # ignore
			}
			$this->stack->generateImpliedEndTags( $value );
			$this->stack->popTag( $value );
			return true;

		case 'dd':
		case 'dt':
			if ( !$this->stack->inScope( $value ) ) {
				return true; # ignore
			}
			$this->stack->generateImpliedEndTags( $value );
			$this->stack->popTag( $value );
			return true;

		case 'h1':
		case 'h2':
		case 'h3':
		case 'h4':
		case 'h5':
		case 'h6':
			if ( !$this->stack->inScope( BalanceSets::$headingSet ) ) {
				# Ignore. This must return true like every other ignored
				# end tag: advance() serializes a tag as literal text when
				# the tree builder returns a falsy value.
				return true;
			}
			$this->stack->generateImpliedEndTags();
			// Any open heading is closed, whatever its level.
			$this->stack->popTag( BalanceSets::$headingSet );
			return true;

		case 'sarcasm':
			# Take a deep breath, then:
			break;

		case 'a':
		case 'b':
		case 'big':
		case 'code':
		case 'em':
		case 'font':
		case 'i':
		case 'nobr':
		case 's':
		case 'small':
		case 'strike':
		case 'strong':
		case 'tt':
		case 'u':
			if ( $this->stack->adoptionAgency( $value, $this->afe ) ) {
				return true; # If we did something, we're done.
			}
			break; # Go to the "any other end tag" case.

		case 'applet':
		case 'marquee':
		case 'object':
			if ( !$this->stack->inScope( $value ) ) {
				return true; # ignore
			}
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( $value );
			$this->afe->clearToMarker();
			return true;

		case 'br':
			# Turn </br> into <br>
			return $this->inBodyMode( 'tag', $value, [] );
		}

		// Any other end tag goes here
		foreach ( $this->stack as $i => $node ) {
			if ( $node->isHtmlNamed( $value ) ) {
				$this->stack->generateImpliedEndTags( $value );
				$this->stack->popTo( $i ); # including $i
				break;
			} elseif ( $node->isA( BalanceSets::$specialSet ) ) {
				return true; // ignore this close token.
			}
		}
		return true;
	} else {
		Assert::invariant( false, "Bad token type: $token" );
	}
}
2734
/**
 * Insertion mode handler implementing the supported parts of the
 * "in table" rules.
 *
 * Tokens not explicitly handled here fall into the "anything else"
 * case: they are processed by inBodyMode() with foster parenting
 * enabled on the stack.
 *
 * @param string $token Token type.
 * @param string|null $value Tag name or text content.
 * @param array|null $attribs Attributes accompanying a tag token.
 * @param bool $selfclose Whether the tag was written in self-closing form.
 * @return bool|null True when handled here, otherwise the result of the
 *   handler the token was delegated to.
 */
private function inTableMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		if ( $this->textIntegrationMode ) {
			return $this->inBodyMode( $token, $value, $attribs, $selfclose );
		} elseif ( $this->stack->currentNode->isA( BalanceSets::$tableSectionRowSet ) ) {
			// Collect the text in inTableTextMode and decide what to do
			// with it once the next non-text token arrives.
			$this->pendingTableText = '';
			$this->originalInsertionMode = $this->parseMode;
			return $this->switchModeAndReprocess( 'inTableTextMode', $token, $value, $attribs, $selfclose );
		}
		// fall through to default case.
	} elseif ( $token === 'eof' ) {
		$this->stopParsing();
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'caption':
			// The spec starts this case with "clear the stack back to a
			// table context", just like <colgroup> and <tbody> below.
			// Without it, a foster-parented element still open on the
			// stack would become the parent of the caption.
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->afe->insertMarker();
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inCaptionMode' );
			return true;
		case 'colgroup':
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inColumnGroupMode' );
			return true;
		case 'col':
			// Synthesize the missing <colgroup>, then reprocess.
			$this->inTableMode( 'tag', 'colgroup', [] );
			return $this->insertToken( $token, $value, $attribs, $selfclose );
		case 'tbody':
		case 'tfoot':
		case 'thead':
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inTableBodyMode' );
			return true;
		case 'td':
		case 'th':
		case 'tr':
			// Synthesize the missing <tbody>, then reprocess.
			$this->inTableMode( 'tag', 'tbody', [] );
			return $this->insertToken( $token, $value, $attribs, $selfclose );
		case 'table':
			if ( !$this->stack->inTableScope( $value ) ) {
				return true; // Ignore this tag.
			}
			// A nested <table> start tag closes the open table first.
			$this->inTableMode( 'endtag', $value );
			return $this->insertToken( $token, $value, $attribs, $selfclose );

		case 'style':
		# OMITTED: <script>
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );

		case 'input':
			// Only <input type="hidden"> may stay inside the table.
			if ( !isset( $attribs['type'] ) || strcasecmp( $attribs['type'], 'hidden' ) !== 0 ) {
				break; // Handle this as "everything else"
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		# OMITTED: <form>
		}
		// Fall through for "anything else" clause.
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'table':
			if ( !$this->stack->inTableScope( $value ) ) {
				return true; // Ignore.
			}
			$this->stack->popTag( $value );
			$this->resetInsertionMode();
			return true;
		# OMITTED: <body>
		case 'caption':
		case 'col':
		case 'colgroup':
		# OMITTED: <html>
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			return true; // Ignore the token.
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
		// Fall through for "anything else" clause.
	}
	// This is the "anything else" case:
	$this->stack->fosterParentMode = true;
	$this->inBodyMode( $token, $value, $attribs, $selfclose );
	$this->stack->fosterParentMode = false;
	return true;
}
2830
/**
 * Insertion mode handler that accumulates text found inside a table.
 *
 * Text tokens are appended to the pending buffer. The first non-text
 * token flushes the buffer and is then reprocessed in the original
 * insertion mode.
 *
 * @param string $token Token type.
 * @param string|null $value Tag name or text content.
 * @param array|null $attribs Attributes accompanying a tag token.
 * @param bool $selfclose Whether the tag was written in self-closing form.
 * @return bool|null True for text, otherwise the result of reprocessing
 *   the token.
 */
private function inTableTextMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		$this->pendingTableText .= $value;
		return true;
	}

	// Non-text token: flush what has been collected so far.
	$pending = $this->pendingTableText;
	$this->pendingTableText = '';
	if ( !preg_match( '/[^\x09\x0A\x0C\x0D\x20]/', $pending ) ) {
		// Pending text is just whitespace.
		$this->stack->insertText( $pending );
	} else {
		// This should match the "anything else" case inTableMode
		$this->stack->fosterParentMode = true;
		$this->inBodyMode( 'text', $pending );
		$this->stack->fosterParentMode = false;
	}

	return $this->switchModeAndReprocess(
		$this->originalInsertionMode, $token, $value, $attribs, $selfclose
	);
}
2852
/**
 * Helper for inCaptionMode: close the open <caption>, if there is one in
 * table scope, and return to "inTableMode".
 *
 * @return bool True if a caption was closed, false if none was in scope.
 */
private function endCaption() {
	if ( $this->stack->inTableScope( 'caption' ) ) {
		$this->stack->generateImpliedEndTags();
		$this->stack->popTag( 'caption' );
		$this->afe->clearToMarker();
		$this->switchMode( 'inTableMode' );
		return true;
	}
	return false;
}
2864
/**
 * Insertion mode handler implementing the "in caption" rules.
 *
 * @param string $token Token type.
 * @param string|null $value Tag name or text content.
 * @param array|null $attribs Attributes accompanying a tag token.
 * @param bool $selfclose Whether the tag was written in self-closing form.
 * @return bool|null True when handled or ignored here, otherwise the
 *   result of inBodyMode().
 */
private function inCaptionMode( $token, $value, $attribs = null, $selfclose = false ) {
	// Start tags that implicitly end the caption.
	$closingStartTags = [
		'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
	];
	// End tags that are dropped while inside a caption.
	# OMITTED: <html>
	$ignoredEndTags = [
		'body', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
	];

	if ( $token === 'tag' ) {
		if ( in_array( $value, $closingStartTags, true ) ) {
			// Close the caption first; reprocess only if that worked.
			if ( $this->endCaption() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'caption' ) {
			$this->endCaption();
			return true;
		}
		if ( $value === 'table' ) {
			if ( $this->endCaption() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}
		if ( in_array( $value, $ignoredEndTags, true ) ) {
			// Ignore the token
			return true;
		}
	}

	// The Anything Else case
	return $this->inBodyMode( $token, $value, $attribs, $selfclose );
}
2911
/**
 * Insertion mode handler implementing the supported parts of the
 * "in column group" rules.
 *
 * @param string $token Token type.
 * @param string|null $value Tag name or text content.
 * @param array|null $attribs Attributes accompanying a tag token.
 * @param bool $selfclose Whether the tag was written in self-closing form.
 * @return bool|null True when handled or ignored here, otherwise the
 *   result of the handler the token was passed on to.
 */
private function inColumnGroupMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'eof' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfclose );
	}

	if ( $token === 'text' ) {
		// Leading whitespace stays inside the column group.
		$leading = [];
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $leading ) ) {
			$this->stack->insertText( $leading[0] );
			$value = substr( $value, strlen( $leading[0] ) );
		}
		if ( strlen( $value ) === 0 ) {
			return true; // All text handled.
		}
		// Non-whitespace text is handled as "anything else" below.
	} elseif ( $token === 'tag' ) {
		# OMITTED: <html>
		if ( $value === 'col' ) {
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;
		}
		if ( $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'colgroup' ) {
			if ( $this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
				$this->stack->pop();
				$this->switchMode( 'inTableMode' );
			}
			// Otherwise the token is ignored.
			return true;
		}
		if ( $value === 'col' ) {
			return true; // Ignore the token.
		}
		if ( $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
	}

	// Anything else
	if ( !$this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
		return true; // Ignore the token.
	}
	// Close the column group, then reprocess the token.
	$this->inColumnGroupMode( 'endtag', 'colgroup' );
	return $this->insertToken( $token, $value, $attribs, $selfclose );
}
2959
/**
 * Helper function for inTableBodyMode: close the open table section
 * (<tbody>, <thead> or <tfoot>) and return to "inTableMode".
 *
 * @return bool True if a section was closed, false if none of the
 *   section elements is in table scope.
 */
private function endSection() {
	$sectionOpen =
		$this->stack->inTableScope( 'tbody' ) ||
		$this->stack->inTableScope( 'thead' ) ||
		$this->stack->inTableScope( 'tfoot' );
	if ( !$sectionOpen ) {
		return false;
	}

	$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
	$this->stack->pop();
	$this->switchMode( 'inTableMode' );
	return true;
}
/**
 * Insertion mode handler implementing the supported parts of the
 * "in table body" rules.
 *
 * @param string $token Token type.
 * @param string|null $value Tag name or text content.
 * @param array|null $attribs Attributes accompanying a tag token.
 * @param bool $selfclose Whether the tag was written in self-closing form.
 * @return bool|null True when handled or ignored here, otherwise the
 *   result of inTableMode().
 */
private function inTableBodyMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'tag' ) {
		if ( $value === 'tr' ) {
			$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inRowMode' );
			return true;
		}
		if ( $value === 'th' || $value === 'td' ) {
			// Synthesize the missing <tr>, then reprocess the cell tag.
			$this->inTableBodyMode( 'tag', 'tr', [] );
			$this->insertToken( $token, $value, $attribs, $selfclose );
			return true;
		}
		$sectionEnders = [ 'caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead' ];
		if ( in_array( $value, $sectionEnders, true ) ) {
			// Close the section first; reprocess only if that worked.
			if ( $this->endSection() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'table' ) {
			if ( $this->endSection() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}
		if ( $value === 'tbody' || $value === 'tfoot' || $value === 'thead' ) {
			if ( $this->stack->inTableScope( $value ) ) {
				$this->endSection();
			}
			return true;
		}
		# OMITTED: <body>
		# OMITTED: <html>
		$ignored = [ 'caption', 'col', 'colgroup', 'td', 'th', 'tr' ];
		if ( in_array( $value, $ignored, true ) ) {
			return true; // Ignore the token.
		}
	}
	// Anything else:
	return $this->inTableMode( $token, $value, $attribs, $selfclose );
}
3026
3027 // Helper function for inRowMode
/**
 * Close the open table row, if there is one.
 *
 * When a tr element is in table scope, the stack is cleared back to
 * the table row context, the current node is popped, and the
 * insertion mode becomes 'inTableBodyMode'.
 *
 * @return bool True if a row was closed; false if no tr was in table
 *   scope (nothing is changed in that case).
 */
private function endRow() {
	if ( $this->stack->inTableScope( 'tr' ) ) {
		$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
		$this->stack->pop();
		$this->switchMode( 'inTableBodyMode' );
		return true;
	}
	return false;
}
/**
 * Token handler for the "in row" insertion mode.
 *
 * Handles cell start tags and the tags that implicitly or explicitly
 * close the current row; every other token is delegated to
 * inTableMode().
 *
 * @param string $token Token type; 'tag' and 'endtag' get special
 *   treatment here.
 * @param string $value Tag name for 'tag'/'endtag' tokens.
 * @param array|null $attribs Attributes, passed through unchanged.
 * @param bool $selfclose Self-closing flag, passed through unchanged.
 * @return bool True when the token was consumed or ignored here,
 *   otherwise the result of inTableMode().
 */
private function inRowMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'tag' ) {
		if ( $value === 'th' || $value === 'td' ) {
			$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inCellMode' );
			// The marker is removed again by clearToMarker() when the
			// cell is closed in inCellMode().
			$this->afe->insertMarker();
			return true;
		}
		$closesRow = [
			'caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'
		];
		if ( in_array( $value, $closesRow, true ) ) {
			// Reprocess only if a row was actually closed.
			if ( $this->endRow() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'tr' ) {
			$this->endRow();
			return true;
		}
		if ( $value === 'table' ) {
			if ( $this->endRow() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}
		if ( in_array( $value, [ 'tbody', 'tfoot', 'thead' ], true ) ) {
			// The named section must be in table scope, and a row must
			// have been closed, before the end tag is reprocessed.
			$sectionOpen = $this->stack->inTableScope( $value );
			if ( $sectionOpen && $this->endRow() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}
		# OMITTED: <body>
		# OMITTED: <html>
		$ignoredEndTags = [ 'caption', 'col', 'colgroup', 'td', 'th' ];
		if ( in_array( $value, $ignoredEndTags, true ) ) {
			return true; // Ignore the token.
		}
	}

	// Anything else:
	return $this->inTableMode( $token, $value, $attribs, $selfclose );
}
3092
3093 // Helper for inCellMode
/**
 * Close the open table cell, if there is one.
 *
 * Looks for a td in table scope first, then a th, and feeds the
 * matching end tag to inCellMode() for the first one found.
 *
 * @return bool True if a cell end tag was processed; false if neither
 *   td nor th was in table scope.
 */
private function endCell() {
	foreach ( [ 'td', 'th' ] as $cellName ) {
		if ( $this->stack->inTableScope( $cellName ) ) {
			$this->inCellMode( 'endtag', $cellName );
			return true;
		}
	}
	return false;
}
/**
 * Token handler for the "in cell" insertion mode.
 *
 * Handles the table-structure tags that close the current cell;
 * every other token is delegated to inBodyMode().
 *
 * @param string $token Token type; 'tag' and 'endtag' get special
 *   treatment here.
 * @param string $value Tag name for 'tag'/'endtag' tokens.
 * @param array|null $attribs Attributes, passed through unchanged.
 * @param bool $selfclose Self-closing flag, passed through unchanged.
 * @return bool True when the token was consumed or ignored here,
 *   otherwise the result of inBodyMode().
 */
private function inCellMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'tag' ) {
		$closesCell = [
			'caption', 'col', 'colgroup', 'tbody', 'td',
			'tfoot', 'th', 'thead', 'tr'
		];
		if ( in_array( $value, $closesCell, true ) ) {
			// Reprocess only if a cell was actually closed.
			if ( $this->endCell() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'td' || $value === 'th' ) {
			if ( $this->stack->inTableScope( $value ) ) {
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( $value );
				$this->afe->clearToMarker();
				$this->switchMode( 'inRowMode' );
			}
			return true;
		}
		# OMITTED: <body>
		# OMITTED: <html>
		if ( in_array( $value, [ 'caption', 'col', 'colgroup' ], true ) ) {
			return true;
		}
		$outerEndTags = [ 'table', 'tbody', 'tfoot', 'thead', 'tr' ];
		if ( in_array( $value, $outerEndTags, true ) ) {
			if ( $this->stack->inTableScope( $value ) ) {
				// Close whichever cell is open, then reprocess the
				// end tag.
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( BalanceSets::$tableCellSet );
				$this->afe->clearToMarker();
				$this->switchMode( 'inRowMode' );
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}
	}

	// Anything else:
	return $this->inBodyMode( $token, $value, $attribs, $selfclose );
}
3158
3159 # OMITTED: <select>
3160 /*
3161 private function inSelectMode( $token, $value, $attribs = null, $selfclose = false ) {
3162 Assert::invariant( false, 'Unimplemented' );
3163 }
3164
3165 private function inSelectInTableMode( $token, $value, $attribs = null, $selfclose = false ) {
3166 Assert::invariant( false, 'Unimplemented' );
3167 }
3168 */
3169
/**
 * Token handler for the "in template" insertion mode.
 *
 * Text goes to inBodyMode(). Head-content start tags and the template
 * end tag go to inHeadMode(). Other start tags switch to the
 * insertion mode that fits the tag and are reprocessed there. Other
 * end tags are ignored. At EOF any open template is popped and the
 * token reprocessed, or parsing stops if no template is open.
 *
 * @param string $token Token type: 'text', 'eof', 'tag' or 'endtag'.
 * @param string $value Tag name or text, depending on $token.
 * @param array|null $attribs Attributes, passed through unchanged.
 * @param bool $selfclose Self-closing flag, passed through unchanged.
 * @return bool|null Result of the handler the token was delegated to,
 *   or true when it was handled or ignored here. Any other token type
 *   trips an invariant assertion.
 */
private function inTemplateMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfclose );
	}

	if ( $token === 'eof' ) {
		if ( $this->stack->indexOf( 'template' ) >= 0 ) {
			// A template is still open: unwind it, then let the
			// restored insertion mode see the EOF token.
			$this->stack->popTag( 'template' );
			$this->afe->clearToMarker();
			array_pop( $this->templateInsertionModes );
			$this->resetInsertionMode();
			$this->insertToken( $token, $value, $attribs, $selfclose );
		} else {
			$this->stopParsing();
		}
		return true;
	}

	if ( $token === 'endtag' ) {
		if ( $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
		// Every other end tag is ignored.
		return true;
	}

	if ( $token === 'tag' ) {
		# OMITTED: <script>
		# OMITTED: <title>
		$headTags = [
			'base', 'basefont', 'bgsound', 'link', 'meta',
			'noframes', 'style', 'template'
		];
		if ( in_array( $value, $headTags, true ) ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}

		// Insertion mode to switch to for table-related start tags;
		// any tag not listed here is reprocessed in inBodyMode.
		$modeForTag = [
			'caption' => 'inTableMode',
			'colgroup' => 'inTableMode',
			'tbody' => 'inTableMode',
			'tfoot' => 'inTableMode',
			'thead' => 'inTableMode',
			'col' => 'inColumnGroupMode',
			'tr' => 'inTableBodyMode',
			'td' => 'inRowMode',
			'th' => 'inRowMode',
		];
		$nextMode = isset( $modeForTag[$value] ) ? $modeForTag[$value] : 'inBodyMode';
		return $this->switchModeAndReprocess(
			$nextMode, $token, $value, $attribs, $selfclose
		);
	}

	Assert::invariant( false, "Bad token type: $token" );
}
3236 }