0fa96bd37a956b44d3149d5467c0bb3baa6a0c89
[lhc/web/wiklou.git] / includes / tidy / Balancer.php
1 <?php
2 /**
3 * An implementation of the tree building portion of the HTML5 parsing
4 * spec.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
20 *
21 * @file
22 * @ingroup Parser
23 * @since 1.27
24 * @author C. Scott Ananian, 2016
25 */
26 namespace MediaWiki\Tidy;
27
28 use Wikimedia\Assert\Assert;
29 use Wikimedia\Assert\ParameterAssertionException;
30 use \ExplodeIterator;
31 use \IteratorAggregate;
32 use \ReverseArrayIterator;
33 use \Sanitizer;
34
35 # A note for future librarization[1] -- this file is a good candidate
36 # for splitting into an independent library, except that it is currently
37 # highly optimized for MediaWiki use. It only implements the portions
38 # of the HTML5 tree builder used by tags supported by MediaWiki, and
39 # does not contain a true tokenizer pass, instead relying on
40 # comment stripping, attribute normalization, and escaping done by
41 # the MediaWiki Sanitizer. It also deliberately avoids building
42 # a true DOM in memory, instead serializing elements to an output string
43 # as soon as possible (usually as soon as the tag is closed) to reduce
44 # its memory footprint.
45
46 # On the other hand, I've been pretty careful to note with comments in the
47 # code the places where this implementation omits features of the spec or
48 # depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
49 # implement the missing pieces and make this a standalone PHP HTML5 parser.
50 # In order to do so, some sort of MediaWiki-specific API will need
51 # to be added to (a) allow the Balancer to bypass the tokenizer,
52 # and (b) support on-the-fly flattening instead of DOM node creation.
53
54 # [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki
55
/**
 * Constant lookup tables for the HTML5 tree building algorithm.
 *
 * Every table is a two-level associative array: the outer key is a
 * namespace URI (one of the *_NAMESPACE constants below), the inner key
 * is a lower-cased tag name, and the value is always true. Membership is
 * therefore a pair of isset() checks.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceSets {
	const HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml';
	const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
	const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';

	/** HTML tags named as unsupported (how they are rejected is decided by the caller). */
	public static $unsupportedSet = [
		self::HTML_NAMESPACE => [
			'html' => true, 'head' => true, 'body' => true,
			'frameset' => true, 'frame' => true,
			'plaintext' => true, 'isindex' => true, 'textarea' => true,
			'xmp' => true, 'iframe' => true, 'noembed' => true,
			'noscript' => true, 'script' => true,
			'title' => true
		]
	];

	/** HTML elements that are serialized as a single self-closed tag with no children. */
	public static $emptyElementSet = [
		self::HTML_NAMESPACE => [
			'area' => true, 'base' => true, 'basefont' => true, 'bgsound' => true,
			'br' => true, 'col' => true, 'command' => true, 'embed' => true,
			'frame' => true, 'hr' => true, 'img' => true, 'input' => true,
			'keygen' => true, 'link' => true, 'meta' => true, 'param' => true,
			'source' => true, 'track' => true, 'wbr' => true
		]
	];

	/** The six HTML heading elements. */
	public static $headingSet = [
		self::HTML_NAMESPACE => [
			'h1' => true, 'h2' => true, 'h3' => true,
			'h4' => true, 'h5' => true, 'h6' => true
		]
	];

	/** Elements in the "special" category, across the HTML, SVG and MathML namespaces. */
	public static $specialSet = [
		self::HTML_NAMESPACE => [
			'address' => true, 'applet' => true, 'area' => true, 'article' => true,
			'aside' => true, 'base' => true, 'basefont' => true, 'bgsound' => true,
			'blockquote' => true, 'body' => true, 'br' => true, 'button' => true,
			'caption' => true, 'center' => true, 'col' => true, 'colgroup' => true,
			'dd' => true, 'details' => true, 'dir' => true, 'div' => true,
			'dl' => true, 'dt' => true, 'embed' => true, 'fieldset' => true,
			'figcaption' => true, 'figure' => true, 'footer' => true, 'form' => true,
			'frame' => true, 'frameset' => true, 'h1' => true, 'h2' => true,
			'h3' => true, 'h4' => true, 'h5' => true, 'h6' => true,
			'head' => true, 'header' => true, 'hgroup' => true, 'hr' => true,
			'html' => true, 'iframe' => true, 'img' => true, 'input' => true,
			'isindex' => true, 'li' => true, 'link' => true, 'listing' => true,
			'main' => true, 'marquee' => true, 'menu' => true, 'menuitem' => true,
			'meta' => true, 'nav' => true, 'noembed' => true, 'noframes' => true,
			'noscript' => true, 'object' => true, 'ol' => true, 'p' => true,
			'param' => true, 'plaintext' => true, 'pre' => true, 'script' => true,
			'section' => true, 'select' => true, 'source' => true, 'style' => true,
			'summary' => true, 'table' => true, 'tbody' => true, 'td' => true,
			'template' => true, 'textarea' => true, 'tfoot' => true, 'th' => true,
			'thead' => true, 'title' => true, 'tr' => true, 'track' => true,
			'ul' => true, 'wbr' => true, 'xmp' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true, 'desc' => true, 'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
			'mtext' => true, 'annotation-xml' => true
		]
	];

	/** The three HTML tags address, div and p. */
	public static $addressDivPSet = [
		self::HTML_NAMESPACE => [
			'address' => true, 'div' => true, 'p' => true
		]
	];

	/** The table element itself plus its section and row elements. */
	public static $tableSectionRowSet = [
		self::HTML_NAMESPACE => [
			'table' => true,
			'thead' => true, 'tbody' => true, 'tfoot' => true,
			'tr' => true
		]
	];

	/** Elements closed when generating implied end tags (normal mode). */
	public static $impliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'dd' => true, 'dt' => true, 'li' => true,
			'optgroup' => true, 'option' => true, 'p' => true,
			'rb' => true, 'rp' => true, 'rt' => true, 'rtc' => true
		]
	];

	/** Elements closed when generating implied end tags thoroughly. */
	public static $thoroughImpliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'caption' => true, 'colgroup' => true,
			'dd' => true, 'dt' => true, 'li' => true,
			'optgroup' => true, 'option' => true, 'p' => true,
			'rb' => true, 'rp' => true, 'rt' => true, 'rtc' => true,
			'tbody' => true, 'td' => true, 'tfoot' => true, 'th' => true,
			'thead' => true, 'tr' => true
		]
	];

	/** The two table cell elements. */
	public static $tableCellSet = [
		self::HTML_NAMESPACE => [
			'td' => true, 'th' => true
		]
	];

	/** Elements delimiting a table context. */
	public static $tableContextSet = [
		self::HTML_NAMESPACE => [
			'table' => true, 'template' => true, 'html' => true
		]
	];

	/** Elements delimiting a table body context. */
	public static $tableBodyContextSet = [
		self::HTML_NAMESPACE => [
			'tbody' => true, 'tfoot' => true, 'thead' => true,
			'template' => true, 'html' => true
		]
	];

	/** Elements delimiting a table row context. */
	public static $tableRowContextSet = [
		self::HTML_NAMESPACE => [
			'tr' => true, 'template' => true, 'html' => true
		]
	];

	/**
	 * Form-associated elements.
	 * @see https://html.spec.whatwg.org/multipage/forms.html#form-associated-element
	 */
	public static $formAssociatedSet = [
		self::HTML_NAMESPACE => [
			'button' => true, 'fieldset' => true, 'input' => true,
			'keygen' => true, 'object' => true, 'output' => true,
			'select' => true, 'textarea' => true, 'img' => true
		]
	];

	/**
	 * Boundary elements for the plain "in scope" test. This is also the
	 * base from which the list-item and button scope sets are derived.
	 */
	public static $inScopeSet = [
		self::HTML_NAMESPACE => [
			'applet' => true, 'caption' => true, 'html' => true,
			'marquee' => true, 'object' => true,
			'table' => true, 'td' => true, 'template' => true,
			'th' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true, 'desc' => true, 'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
			'mtext' => true, 'annotation-xml' => true
		]
	];

	/** Cache for inListItemScopeSet(); null until first requested. */
	private static $inListItemScopeSet = null;

	/**
	 * Boundary set for "list item scope": $inScopeSet plus HTML ol and ul.
	 * Built on first use and cached afterwards.
	 * @return array
	 */
	public static function inListItemScopeSet() {
		if ( self::$inListItemScopeSet !== null ) {
			return self::$inListItemScopeSet;
		}
		$derived = self::$inScopeSet;
		$derived[self::HTML_NAMESPACE]['ol'] = true;
		$derived[self::HTML_NAMESPACE]['ul'] = true;
		self::$inListItemScopeSet = $derived;
		return $derived;
	}

	/** Cache for inButtonScopeSet(); null until first requested. */
	private static $inButtonScopeSet = null;

	/**
	 * Boundary set for "button scope": $inScopeSet plus HTML button.
	 * Built on first use and cached afterwards.
	 * @return array
	 */
	public static function inButtonScopeSet() {
		if ( self::$inButtonScopeSet !== null ) {
			return self::$inButtonScopeSet;
		}
		$derived = self::$inScopeSet;
		$derived[self::HTML_NAMESPACE]['button'] = true;
		self::$inButtonScopeSet = $derived;
		return $derived;
	}

	/** Boundary elements for the "table scope" test. */
	public static $inTableScopeSet = [
		self::HTML_NAMESPACE => [
			'html' => true, 'table' => true, 'template' => true
		]
	];

	/**
	 * The elements that do NOT terminate a "select scope" search; every
	 * other element acts as a boundary (hence "inverted").
	 */
	public static $inInvertedSelectScopeSet = [
		self::HTML_NAMESPACE => [
			'option' => true, 'optgroup' => true
		]
	];

	/** MathML text integration points. */
	public static $mathmlTextIntegrationPointSet = [
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true,
			'mtext' => true
		]
	];

	/** SVG elements that are HTML integration points. */
	public static $htmlIntegrationPointSet = [
		self::SVG_NAMESPACE => [
			'foreignobject' => true,
			'desc' => true,
			'title' => true
		]
	];

	/** Tidy compatibility: elements whose direct text content gets p-wrapped. */
	public static $tidyPWrapSet = [
		self::HTML_NAMESPACE => [
			'body' => true, 'blockquote' => true,
			// Fragments are parsed with <body> as the context, yet the
			// element actually sitting at the bottom of the stack is
			// <html>. Listing <html> here is simpler than consulting the
			// "adjusted current node" at every call site.
			'html' => true,
		],
	];

	/** Tidy compatibility: elements treated as inline content. */
	public static $tidyInlineSet = [
		self::HTML_NAMESPACE => [
			'a' => true, 'abbr' => true, 'acronym' => true, 'applet' => true,
			'b' => true, 'basefont' => true, 'bdo' => true, 'big' => true,
			'br' => true, 'button' => true, 'cite' => true, 'code' => true,
			'dfn' => true, 'em' => true, 'font' => true, 'i' => true,
			'iframe' => true, 'img' => true, 'input' => true, 'kbd' => true,
			'label' => true, 'legend' => true, 'map' => true, 'object' => true,
			'param' => true, 'q' => true, 'rb' => true, 'rbc' => true,
			'rp' => true, 'rt' => true, 'rtc' => true, 'ruby' => true,
			's' => true, 'samp' => true, 'select' => true, 'small' => true,
			'span' => true, 'strike' => true, 'strong' => true, 'sub' => true,
			'sup' => true, 'textarea' => true, 'tt' => true, 'u' => true,
			'var' => true,
		],
	];
}
287
/**
 * A BalanceElement is a simplified version of a DOM Node. The main
 * difference is that we only keep BalanceElements around for nodes
 * currently on the BalanceStack of open elements. As soon as an
 * element is closed, with some minor exceptions relating to the
 * tree builder "adoption agency algorithm", the element and all its
 * children are serialized to a string using the flatten() method.
 * This keeps our memory usage low.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceElement {
	/**
	 * The namespace of the element.
	 * @var string $namespaceURI
	 */
	public $namespaceURI;
	/**
	 * The lower-cased name of the element.
	 * @var string $localName
	 */
	public $localName;
	/**
	 * Attributes for the element, in array form (name => value).
	 * @var array $attribs
	 */
	public $attribs;

	/**
	 * Parent of this element: a BalanceElement while attached, null while
	 * detached, or the string "flat" once this element has been flattened
	 * into its parent.
	 * @var BalanceElement|string|null $parent
	 */
	public $parent;

	/**
	 * An array of children of this element. Typically only the last
	 * child will be an actual BalanceElement object; the rest will
	 * be strings, representing either text nodes or flattened
	 * BalanceElement objects.
	 * @var array $children
	 */
	public $children;

	/**
	 * Set to '' by the constructor and not read anywhere in this class.
	 * Declared explicitly so the constructor no longer creates a dynamic
	 * property; kept public so any outside reader keeps working.
	 * @var string $contents
	 */
	public $contents;

	/**
	 * A unique string identifier for Noah's Ark purposes, lazy initialized
	 * by getNoahKey().
	 * @var string|null $noahKey
	 */
	private $noahKey;

	/**
	 * The next active formatting element in the list, or null if this is the
	 * end of the AFE list or if the element is not in the AFE list.
	 */
	public $nextAFE;

	/**
	 * The previous active formatting element in the list, or null if this is
	 * the start of the list or if the element is not in the AFE list.
	 */
	public $prevAFE;

	/**
	 * The next element in the Noah's Ark species bucket.
	 */
	public $nextNoah;

	/**
	 * Make a new BalanceElement corresponding to the HTML DOM Element
	 * with the given localname, namespace, and attributes. The new
	 * element is detached (no parent) and has no children.
	 *
	 * @param string $namespaceURI The namespace of the element.
	 * @param string $localName The lowercased name of the tag.
	 * @param array $attribs Attributes of the element
	 */
	public function __construct( $namespaceURI, $localName, array $attribs ) {
		$this->localName = $localName;
		$this->namespaceURI = $namespaceURI;
		$this->attribs = $attribs;
		$this->contents = '';
		$this->parent = null;
		$this->children = [];
	}

	/**
	 * Remove the given child from this element, leaving it detached
	 * (its parent becomes null).
	 * @param BalanceElement $elt
	 */
	private function removeChild( BalanceElement $elt ) {
		// The message is a constant on purpose: interpolating $this here
		// would serialize the whole subtree on every call, even when the
		// assertion passes.
		Assert::precondition(
			$this->parent !== 'flat', "Can't removeChild after flattening."
		);
		Assert::parameter(
			$elt->parent === $this, 'elt', 'must have $this as a parent'
		);
		$idx = array_search( $elt, $this->children, true );
		Assert::parameter( $idx !== false, '$elt', 'must be a child of $this' );
		$elt->parent = null;
		array_splice( $this->children, $idx, 1 );
	}

	/**
	 * Find $a in the list of children and insert $b before it.
	 * If $b is an element that currently has a parent, it is removed
	 * from that parent first.
	 * @param BalanceElement $a
	 * @param BalanceElement|string $b
	 */
	public function insertBefore( BalanceElement $a, $b ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't insertBefore after flattening."
		);
		$idx = array_search( $a, $this->children, true );
		Assert::parameter( $idx !== false, '$a', 'must be a child of $this' );
		if ( is_string( $b ) ) {
			array_splice( $this->children, $idx, 0, [ $b ] );
			return;
		}
		Assert::parameter( $b->parent !== 'flat', '$b', "Can't be flat" );
		if ( $b === $a ) {
			// Inserting a node before itself leaves the child list as it is.
			return;
		}
		if ( $b->parent !== null ) {
			$wasSibling = ( $b->parent === $this );
			$b->parent->removeChild( $b );
			if ( $wasSibling ) {
				// Detaching an earlier sibling shifts $a down by one, so the
				// index found above may be stale; look $a up again.
				$idx = array_search( $a, $this->children, true );
			}
		}
		array_splice( $this->children, $idx, 0, [ $b ] );
		$b->parent = $this;
	}

	/**
	 * Append $elt to the end of the list of children. An element that
	 * already has a parent is removed from that parent first.
	 * @param BalanceElement|string $elt
	 */
	public function appendChild( $elt ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't appendChild after flattening."
		);
		if ( is_string( $elt ) ) {
			array_push( $this->children, $elt );
			return;
		}
		// Remove $elt from parent, if it had one.
		if ( $elt->parent !== null ) {
			$elt->parent->removeChild( $elt );
		}
		array_push( $this->children, $elt );
		$elt->parent = $this;
	}

	/**
	 * Transfer all of the children of $elt to $this, leaving $elt with
	 * no children.
	 * @param BalanceElement $elt
	 */
	public function adoptChildren( BalanceElement $elt ) {
		Assert::precondition(
			$elt->parent !== 'flat', "Can't adoptChildren after flattening."
		);
		foreach ( $elt->children as $child ) {
			if ( !is_string( $child ) ) {
				// This is an optimization which avoids an O(n^2) set of
				// array_splice operations: with the parent cleared,
				// appendChild() skips removeChild(), and $elt's child list
				// is emptied wholesale below.
				$child->parent = null;
			}
			$this->appendChild( $child );
		}
		$elt->children = [];
	}

	/**
	 * Flatten this node and all of its children into a string, as specified
	 * by the HTML serialization specification, and replace this node
	 * in its parent by that string. Afterwards $this->parent is the
	 * string "flat".
	 *
	 * In tidy compatibility mode, child elements are flattened first, an
	 * 'mw:p-wrap' element is renamed to 'p' (and dropped entirely, i.e.
	 * flattened to '', when it holds only whitespace), and an
	 * attribute-less 'tr' or 'li' holding only whitespace is given the
	 * class 'mw-empty-elt'.
	 *
	 * @param bool $tidyCompat Whether to apply the tidy compatibility rules.
	 * @return string The string this element was replaced by.
	 * @see __toString()
	 */
	public function flatten( $tidyCompat = false ) {
		Assert::parameter( $this->parent !== null, '$this', 'must be a child' );
		Assert::parameter( $this->parent !== 'flat', '$this', 'already flat' );
		$idx = array_search( $this, $this->parent->children, true );
		Assert::parameter(
			$idx !== false, '$this', 'must be a child of its parent'
		);
		if ( $tidyCompat ) {
			// $blank stays true while no child has contributed anything
			// other than tab, LF, FF, CR or space.
			$blank = true;
			foreach ( $this->children as $elt ) {
				if ( !is_string( $elt ) ) {
					$elt = $elt->flatten( $tidyCompat );
				}
				if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) {
					$blank = false;
				}
			}
			if ( $this->isHtmlNamed( 'mw:p-wrap' ) ) {
				$this->localName = 'p';
			} elseif ( $blank ) {
				// Add 'mw-empty-elt' class so elements can be hidden via CSS
				// for compatibility with legacy tidy.
				if ( !count( $this->attribs ) &&
					( $this->localName === 'tr' || $this->localName === 'li' )
				) {
					$this->attribs = [ 'class' => "mw-empty-elt" ];
				}
				// Only blank p-wraps are dropped; everything else is emitted.
				$blank = false;
			}
			$flat = $blank ? '' : "{$this}";
		} else {
			$flat = "{$this}";
		}
		$this->parent->children[$idx] = $flat;
		$this->parent = 'flat'; # for assertion checking
		return $flat;
	}

	/**
	 * Serialize this node and all of its children to a string, as specified
	 * by the HTML serialization specification. Members of
	 * BalanceSets::$emptyElementSet are written as a single self-closed tag.
	 *
	 * @return string The serialization of the BalanceElement
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments
	 */
	public function __toString() {
		$encAttribs = '';
		foreach ( $this->attribs as $name => $value ) {
			$encValue = Sanitizer::encodeAttribute( $value );
			$encAttribs .= " $name=\"$encValue\"";
		}
		if ( !$this->isA( BalanceSets::$emptyElementSet ) ) {
			$out = "<{$this->localName}{$encAttribs}>";
			// flatten children (element children serialize recursively
			// through their own __toString)
			foreach ( $this->children as $elt ) {
				$out .= "{$elt}";
			}
			$out .= "</{$this->localName}>";
		} else {
			$out = "<{$this->localName}{$encAttribs} />";
			Assert::invariant(
				count( $this->children ) === 0,
				"Empty elements shouldn't have children."
			);
		}
		return $out;
	}

	# Utility functions on BalanceElements.

	/**
	 * Determine if $this represents a specific HTML tag, is a member of
	 * a tag set, or is equal to another BalanceElement.
	 *
	 * @param BalanceElement|array|string $set The target BalanceElement
	 *   (compared by identity), set (from the BalanceSets class), or
	 *   string (HTML tag name).
	 * @return bool
	 */
	public function isA( $set ) {
		if ( $set instanceof BalanceElement ) {
			return $this === $set;
		} elseif ( is_array( $set ) ) {
			return isset( $set[$this->namespaceURI] ) &&
				isset( $set[$this->namespaceURI][$this->localName] );
		} else {
			# assume this is an HTML element name.
			return $this->isHtml() && $this->localName === $set;
		}
	}

	/**
	 * Determine if this element is an HTML element with the specified name
	 * @param string $tagName
	 * @return bool
	 */
	public function isHtmlNamed( $tagName ) {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE
			&& $this->localName === $tagName;
	}

	/**
	 * Determine if $this represents an element in the HTML namespace.
	 *
	 * @return bool
	 */
	public function isHtml() {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE;
	}

	/**
	 * Determine if $this represents a MathML text integration point,
	 * as defined in the HTML5 specification.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#mathml-text-integration-point
	 */
	public function isMathmlTextIntegrationPoint() {
		return $this->isA( BalanceSets::$mathmlTextIntegrationPointSet );
	}

	/**
	 * Determine if $this represents an HTML integration point,
	 * as defined in the HTML5 specification: either a member of
	 * BalanceSets::$htmlIntegrationPointSet, or a MathML annotation-xml
	 * element whose 'encoding' attribute is (case-insensitively)
	 * 'text/html' or 'application/xhtml+xml'.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#html-integration-point
	 */
	public function isHtmlIntegrationPoint() {
		if ( $this->isA( BalanceSets::$htmlIntegrationPointSet ) ) {
			return true;
		}
		if (
			$this->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
			$this->localName === 'annotation-xml' &&
			isset( $this->attribs['encoding'] ) &&
			( strcasecmp( $this->attribs['encoding'], 'text/html' ) === 0 ||
			strcasecmp( $this->attribs['encoding'], 'application/xhtml+xml' ) === 0 )
		) {
			return true;
		}
		return false;
	}

	/**
	 * Get a string key for the Noah's Ark algorithm. Two elements share a
	 * key when they have the same namespace, local name and attributes;
	 * attributes are sorted by name first, so their order does not matter.
	 * The key is computed once and cached.
	 *
	 * @return string
	 */
	public function getNoahKey() {
		if ( $this->noahKey === null ) {
			$attribs = $this->attribs;
			ksort( $attribs );
			$this->noahKey = serialize( [ $this->namespaceURI, $this->localName, $attribs ] );
		}
		return $this->noahKey;
	}
}
613
614 /**
615 * The "stack of open elements" as defined in the HTML5 tree builder
616 * spec. This contains methods to ensure that content (start tags, text)
617 * are inserted at the correct place in the output string, and to
618 * flatten BalanceElements as they are closed to avoid holding onto
619 * a complete DOM tree for the document in memory.
620 *
621 * The stack defines a PHP iterator to traverse it in "reverse order",
622 * that is, the most-recently-added element is visited first in a
623 * foreach loop.
624 *
625 * @ingroup Parser
626 * @since 1.27
627 * @see https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
628 */
629 class BalanceStack implements IteratorAggregate {
630 /**
631 * Backing storage for the stack.
632 * @var array $elements
633 */
634 private $elements = [];
635 /**
636 * Foster parent mode determines how nodes are inserted into the
637 * stack.
638 * @var bool $fosterParentMode
639 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
640 */
641 public $fosterParentMode = false;
642 /**
643 * Tidy compatibility mode, determines behavior of body/blockquote
644 */
645 public $tidyCompat = false;
646 /**
647 * Reference to the current element
648 */
649 public $currentNode;
650
/**
 * Set up a stack whose only entry is a fresh, attribute-less HTML
 * &lt;html&gt; element; that root also becomes the current node.
 */
public function __construct() {
	# the root <html> element always sits at position 0
	$root = new BalanceElement( BalanceSets::HTML_NAMESPACE, 'html', [] );
	$this->elements[] = $root;
	$this->currentNode = $this->elements[0];
}
663
/**
 * Produce the tree builder's output: the concatenation of every child
 * of the root &lt;html&gt; node, without the root's own tags. Children
 * that are still elements are flattened (honoring $tidyCompat) first.
 * @return string
 */
public function getOutput() {
	$root = $this->elements[0];
	$pieces = [];
	foreach ( $root->children as $child ) {
		if ( is_string( $child ) ) {
			$pieces[] = $child;
		} else {
			$pieces[] = $child->flatten( $this->tidyCompat );
		}
	}
	// The outer '<html>....</html>' is deliberately left out.
	return implode( '', $pieces );
}
678
/**
 * Insert text at the appropriate place for inserting a node.
 *
 * Three cases, checked in order:
 *  - foster parent mode with a table/section/row as current node: the
 *    text is handed to fosterParent();
 *  - tidy compatibility mode with a member of
 *    BalanceSets::$tidyPWrapSet as current node: an 'mw:p-wrap'
 *    element is opened and the text is then inserted again, now
 *    relative to that wrapper;
 *  - otherwise the text is appended to the current node.
 *
 * @param string $value
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertText( $value ) {
	if (
		$this->fosterParentMode &&
		$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
	) {
		$this->fosterParent( $value );
	} elseif (
		$this->tidyCompat &&
		$this->currentNode->isA( BalanceSets::$tidyPWrapSet )
	) {
		// Open the wrapper, then retry: the wrapper is now the current
		// node, so the retry does not take this branch again.
		$this->insertHTMLElement( 'mw:p-wrap', [] );
		$this->insertText( $value );
	} else {
		$this->currentNode->appendChild( $value );
	}
}
700
/**
 * Create a BalanceElement in the given namespace, insert it at the
 * appropriate place and push it on to the open elements stack.
 * @param string $namespaceURI The element namespace
 * @param string $tag The tag name
 * @param array $attribs Normalized attributes, in array form (passed
 *   straight to the BalanceElement constructor, which requires an array).
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-foreign-element
 */
public function insertForeignElement( $namespaceURI, $tag, $attribs ) {
	$created = new BalanceElement( $namespaceURI, $tag, $attribs );
	return $this->insertElement( $created );
}
715
/**
 * Create an element in the HTML namespace, insert it at the
 * appropriate place and push it on to the open elements stack.
 * @param string $tag The tag name
 * @param array $attribs Normalized attributes, in array form.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-an-html-element
 */
public function insertHTMLElement( $tag, $attribs ) {
	$htmlNs = BalanceSets::HTML_NAMESPACE;
	return $this->insertForeignElement( $htmlNs, $tag, $attribs );
}
729
/**
 * Insert an element at the appropriate place and push it on to the
 * open elements stack, making it the current node.
 *
 * If the current node is an 'mw:p-wrap' and $elt is not in
 * BalanceSets::$tidyInlineSet, the wrapper is popped first. The element
 * is then either foster-parented (foster parent mode with a
 * table/section/row as current node) or appended to the current node.
 *
 * @param BalanceElement $elt
 * @return BalanceElement The element that was pushed (the result of
 *   fosterParent() when foster parenting applies, otherwise $elt).
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertElement( BalanceElement $elt ) {
	if (
		$this->currentNode->isHtmlNamed( 'mw:p-wrap' ) &&
		!$elt->isA( BalanceSets::$tidyInlineSet )
	) {
		// Tidy compatibility.
		$this->pop();
	}
	if (
		$this->fosterParentMode &&
		$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
	) {
		$elt = $this->fosterParent( $elt );
	} else {
		$this->currentNode->appendChild( $elt );
	}
	// Constant messages on purpose: interpolating $elt would serialize
	// the element (attribute encoding included) on every insertion, even
	// though the text is only needed when an invariant fails.
	Assert::invariant(
		$elt->parent !== null, 'Inserted element must be in tree'
	);
	Assert::invariant(
		$elt->parent !== 'flat',
		'Inserted element must not have been previously flattened'
	);
	array_push( $this->elements, $elt );
	$this->currentNode = $elt;
	return $elt;
}
759
/**
 * Test whether $tag is in scope on the stack, using
 * BalanceSets::$inScopeSet as the boundary set.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-scope
 */
public function inScope( $tag ) {
	$boundary = BalanceSets::$inScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
769
/**
 * Test whether $tag is in button scope on the stack, using
 * BalanceSets::inButtonScopeSet() as the boundary set.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-button-scope
 */
public function inButtonScope( $tag ) {
	$boundary = BalanceSets::inButtonScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
779
/**
 * Test whether $tag is in list item scope on the stack, using
 * BalanceSets::inListItemScopeSet() as the boundary set.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-list-item-scope
 */
public function inListItemScope( $tag ) {
	$boundary = BalanceSets::inListItemScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
789
/**
 * Test whether $tag is in table scope on the stack, using
 * BalanceSets::$inTableScopeSet as the boundary set.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-table-scope
 */
public function inTableScope( $tag ) {
	$boundary = BalanceSets::$inTableScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
799
/**
 * Test whether $tag is in select scope on the stack.
 *
 * Select scope is defined by the elements that do NOT end the search
 * (BalanceSets::$inInvertedSelectScopeSet), so inSpecificScope() cannot
 * be reused and the walk is written out here.
 *
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-select-scope
 */
public function inSelectScope( $tag ) {
	$passThrough = BalanceSets::$inInvertedSelectScopeSet;
	$found = false;
	// Walk from the current node towards the root.
	foreach ( $this as $node ) {
		$found = $node->isA( $tag );
		// Stop on a match, or on the first node that is not transparent
		// to select scope.
		if ( $found || !$node->isA( $passThrough ) ) {
			break;
		}
	}
	return $found;
}
819
/**
 * Test whether $tag is in the scope delimited by $set: walking from the
 * current node towards the root, $tag must be met before any member of
 * $set. The match test runs first, so a node matching both counts as
 * in scope.
 * @param BalanceElement|array|string $tag
 * @param BalanceElement|array|string $set
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-the-specific-scope
 */
public function inSpecificScope( $tag, $set ) {
	$found = false;
	foreach ( $this as $node ) {
		$found = $node->isA( $tag );
		// Stop on a match, or when a scope boundary is reached.
		if ( $found || $node->isA( $set ) ) {
			break;
		}
	}
	return $found;
}
838
/**
 * Generate implied end tags: keep popping the current node for as long
 * as it belongs to the implied-end-tag set, stopping early at an HTML
 * element named $butnot.
 * @param string $butnot HTML tag name to stop at, or null for none.
 * @param bool $thorough True if we should generate end tags thoroughly
 *   (uses BalanceSets::$thoroughImpliedEndTagsSet).
 * @see https://html.spec.whatwg.org/multipage/syntax.html#generate-implied-end-tags
 */
public function generateImpliedEndTags( $butnot = null, $thorough = false ) {
	if ( $thorough ) {
		$closable = BalanceSets::$thoroughImpliedEndTagsSet;
	} else {
		$closable = BalanceSets::$impliedEndTagsSet;
	}
	while ( $this->currentNode ) {
		$node = $this->currentNode;
		$excluded = $butnot !== null && $node->isHtmlNamed( $butnot );
		if ( $excluded || !$node->isA( $closable ) ) {
			return;
		}
		$this->pop();
	}
}
859
/**
 * Return the adjusted current node: $fragmentContext when one is given
 * and the stack holds exactly one element, otherwise the current node.
 */
public function adjustedCurrentNode( $fragmentContext ) {
	if ( $fragmentContext && count( $this->elements ) === 1 ) {
		return $fragmentContext;
	}
	return $this->currentNode;
}
867
/**
 * Build the iterator used by foreach over this stack; it is a
 * ReverseArrayIterator over the backing array, so the current node
 * comes first and the root node last.
 * @return Iterator
 */
public function getIterator() {
	$reversed = new ReverseArrayIterator( $this->elements );
	return $reversed;
}
876
/**
 * Look up the BalanceElement stored at position $idx of the stack;
 * position 0 is the root element.
 * @param int $idx
 * @return BalanceElement
 */
public function node( $idx ) {
	$found = $this->elements[ $idx ];
	return $found;
}
886
/**
 * Put $elt at position $idx of the BalanceStack in place of the element
 * currently stored there. Neither element may already be flattened.
 * When $idx is the top of the stack, $elt also becomes the current node.
 * @param int $idx
 * @param BalanceElement $elt
 */
public function replaceAt( $idx, BalanceElement $elt ) {
	$outgoing = $this->elements[$idx];
	Assert::precondition(
		$outgoing->parent !== 'flat',
		'Replaced element should not have already been flattened.'
	);
	Assert::precondition(
		$elt->parent !== 'flat',
		'New element should not have already been flattened.'
	);
	$this->elements[$idx] = $elt;
	$top = count( $this->elements ) - 1;
	if ( $idx === $top ) {
		$this->currentNode = $elt;
	}
}
906
/**
 * Find the position in the BalanceStack of the topmost element matching
 * the given BalanceElement, set, or HTML tag name string. The search
 * runs from the current node down to the root.
 * @param BalanceElement|array|string $tag
 * @return int The position, or -1 when nothing matches.
 */
public function indexOf( $tag ) {
	$pos = count( $this->elements );
	while ( $pos-- > 0 ) {
		if ( $this->elements[$pos]->isA( $tag ) ) {
			return $pos;
		}
	}
	return -1;
}
921
/**
 * Report how many elements the BalanceStack currently holds.
 * @return int
 */
public function length() {
	$size = count( $this->elements );
	return $size;
}
929
/**
 * Take the current node off the BalanceStack. The new current node is
 * the element below it, or null when the stack is left empty. The
 * removed element is flattened (honoring $tidyCompat) unless it is an
 * 'mw:p-wrap' element.
 */
public function pop() {
	$removed = array_pop( $this->elements );
	$remaining = count( $this->elements );
	$this->currentNode = $remaining ? $this->elements[ $remaining - 1 ] : null;
	if ( $removed->isHtmlNamed( 'mw:p-wrap' ) ) {
		return;
	}
	$removed->flatten( $this->tidyCompat );
}
945
/**
 * Remove all nodes up to and including position $idx from the
 * BalanceStack, flattening them in the process. Afterwards the stack
 * holds exactly $idx elements; nothing happens when it already holds
 * $idx or fewer.
 * @param int $idx
 */
public function popTo( $idx ) {
	// Each pop() shortens the stack by one, so counting down from the
	// initial length performs exactly (length - $idx) pops.
	for ( $length = count( $this->elements ); $length > $idx; $length-- ) {
		$this->pop();
	}
}
957
/**
 * Pop elements off the stack until the first element with the
 * specified HTML tagname (or matching the given set or element) has
 * itself been popped, or until no current node is left.
 * @param BalanceElement|array|string $tag
 */
public function popTag( $tag ) {
	while ( $this->currentNode ) {
		// Decide before popping: pop() changes the current node.
		$isTarget = $this->currentNode->isA( $tag );
		$this->pop();
		if ( $isTarget ) {
			return;
		}
	}
}
973
974 /**
975 * Pop elements off the stack *not including* the first element
976 * in the specified set.
977 * @param BalanceElement|array|string $set
978 */
public function clearToContext( $set ) {
	// Stop while one entry is still left: the bottom-most element
	// (the <html> element) must never be popped off.
	$remaining = count( $this->elements );
	while ( $remaining > 1 && !$this->currentNode->isA( $set ) ) {
		$this->pop();
		$remaining--;
	}
}
988
989 /**
990 * Remove the given $elt from the BalanceStack, optionally
991 * flattening it in the process.
992 * @param BalanceElement $elt The element to remove.
993 * @param bool $flatten Whether to flatten the removed element.
994 */
public function removeElement( BalanceElement $elt, $flatten = true ) {
	// Neither $elt nor its parent may already have been serialized
	// ("flattened"); a flattened node has the string 'flat' as its parent.
	Assert::parameter(
		$elt->parent !== 'flat',
		'$elt',
		'$elt should not already have been flattened.'
	);
	Assert::parameter(
		$elt->parent->parent !== 'flat',
		'$elt',
		'The parent of $elt should not already have been flattened.'
	);
	$idx = array_search( $elt, $this->elements, true );
	Assert::parameter( $idx !== false, '$elt', 'must be in stack' );
	array_splice( $this->elements, $idx, 1 );
	if ( $idx === count( $this->elements ) ) {
		// We removed the top of the stack, so the entry below it (if any)
		// becomes the current node. When the stack is now empty there is
		// no current node; mirror pop() and store null explicitly rather
		// than reading the non-existent offset -1.
		$this->currentNode = $idx > 0 ? $this->elements[$idx - 1] : null;
	}
	if ( $flatten ) {
		// serialize $elt into its parent
		// otherwise, it will eventually serialize when the parent
		// is serialized, we just hold onto the memory for its
		// tree of objects a little longer.
		$elt->flatten( $this->tidyCompat );
	}
	Assert::postcondition(
		array_search( $elt, $this->elements, true ) === false,
		'$elt should no longer be in open elements stack'
	);
}
1024
1025 /**
1026 * Find $a in the BalanceStack and insert $b after it.
1027 * @param BalanceElement $a
1028 * @param BalanceElement $b
1029 */
public function insertAfter( BalanceElement $a, BalanceElement $b ) {
	$idx = $this->indexOf( $a );
	// indexOf() signals "not found" with -1 (never false), so that is the
	// value to test for; otherwise a missing $a would silently cause $b
	// to be spliced in at the bottom of the stack.
	Assert::parameter( $idx >= 0, '$a', 'must be in stack' );
	if ( $idx === count( $this->elements ) - 1 ) {
		// $a is the top of the stack: $b becomes the new current node.
		array_push( $this->elements, $b );
		$this->currentNode = $b;
	} else {
		array_splice( $this->elements, $idx + 1, 0, [ $b ] );
	}
}
1040
1041 # Fostering and adoption.
1042
1043 /**
1044 * Foster parent the given $elt in the stack of open elements.
1045 * @param BalanceElement|string $elt
1046 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
1047 */
private function fosterParent( $elt ) {
	$tableIdx = $this->indexOf( 'table' );
	$templateIdx = $this->indexOf( 'template' );
	$before = null;

	// Choose the foster parent: the last template if it sits above the
	// last table (or there is no table), otherwise the parent of the
	// last table (inserting just before the table), otherwise the
	// bottom-most stack entry.
	if ( $templateIdx >= 0 && ( $tableIdx < 0 || $templateIdx > $tableIdx ) ) {
		$parent = $this->elements[$templateIdx];
	} elseif ( $tableIdx >= 0 ) {
		$before = $this->elements[$tableIdx];
		$parent = $before->parent;
		# Assume all tables have parents, since we're not running scripts!
		Assert::invariant(
			$parent !== null, "All tables should have parents"
		);
	} else {
		$parent = $this->elements[0]; // the `html` element.
	}

	if ( $this->tidyCompat ) {
		if ( is_string( $elt ) ) {
			// Fostered text: wrap it in a p-wrapper when the parent
			// belongs to the tidy p-wrap set.
			if ( $parent->isA( BalanceSets::$tidyPWrapSet ) ) {
				$this->insertHTMLElement( 'mw:p-wrap', [] );
				$this->insertText( $elt );
				return $elt;
			}
		} elseif ( $elt->isHtmlNamed( 'mw:p-wrap' ) ) {
			// Fostered p-wrapper: if the sibling it would land after is
			// already a p-wrapper, hand that one back instead of
			// inserting a second one.
			if ( $before ) {
				$idx = array_search( $before, $parent->children, true );
			} else {
				$idx = count( $parent->children );
			}
			$after = $idx > 0 ? $parent->children[$idx - 1] : '';
			if (
				$after instanceof BalanceElement &&
				$after->isHtmlNamed( 'mw:p-wrap' )
			) {
				return $after; // Re-use existing p-wrapper.
			}
		}
	}

	// Regular insertion: before the table when there is one, else at
	// the end of the chosen parent.
	if ( $before ) {
		$parent->insertBefore( $before, $elt );
	} else {
		$parent->appendChild( $elt );
	}
	return $elt;
}
1099
1100 /**
1101 * Run the "adoption agency algorithm" (AAA) for the given subject
1102 * tag name.
1103 * @param string $tag The subject tag name.
1104 * @param BalanceActiveFormattingElements $afe The current
1105 * active formatting elements list.
1106 * @return true if the adoption agency algorithm "did something", false
1107 * if more processing is required by the caller.
1108 * @see https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
1109 */
public function adoptionAgency( $tag, $afe ) {
	// Shortcut: when the current node is an HTML element named $tag and
	// is not an active formatting element, simply pop it and stop.
	$current = $this->currentNode;
	if ( $current->isHtmlNamed( $tag ) && !$afe->isInList( $current ) ) {
		$this->pop();
		return true; // no more handling required
	}

	// Outer loop: the spec gives up after eight rounds.
	for ( $outer = 0; $outer < 8; $outer++ ) {
		// The formatting element is the last entry named $tag in the
		// list of active formatting elements, looking no further back
		// than the last marker.
		$formatting = $afe->findElementByTag( $tag );
		if ( !$formatting ) {
			// Nothing to adopt: the caller must fall back to the
			// "any other end tag" handling.
			return false;
		}

		// Parse error: the formatting element is no longer on the stack
		// of open elements. Drop it from the list and stop.
		$fmtIndex = $this->indexOf( $formatting );
		if ( $fmtIndex < 0 ) {
			$afe->remove( $formatting );
			return true; // true means no more handling required
		}

		// Parse error: on the stack but not in scope. Ignore the token.
		if ( !$this->inScope( $formatting ) ) {
			return true;
		}

		// The furthest block is the first element of the special
		// category found below the formatting element on the stack
		// (i.e. at a higher index). There might not be one.
		$furthest = null;
		$furthestIndex = -1;
		$depth = $this->length();
		for ( $i = $fmtIndex + 1; $i < $depth; $i++ ) {
			$candidate = $this->node( $i );
			if ( $candidate->isA( BalanceSets::$specialSet ) ) {
				$furthest = $candidate;
				$furthestIndex = $i;
				break;
			}
		}

		// No furthest block: pop everything up to and including the
		// formatting element, forget it as a formatting element, done.
		if ( !$furthest ) {
			$this->popTag( $formatting );
			$afe->remove( $formatting );
			return true;
		}

		// The common ancestor is the stack entry immediately above the
		// formatting element.
		$ancestor = $this->node( $fmtIndex - 1 );

		// A placeholder element records where the formatting element
		// sits in the list of active formatting elements.
		$bookmark = new BalanceElement( '[bookmark]', '[bookmark]', [] );
		$afe->insertAfter( $formatting, $bookmark );

		// "node" and "last node" both start out at the furthest block.
		$lastNode = $furthest;
		$nodeIndex = $furthestIndex;

		// Inner loop; $inner is the spec's inner loop counter and is
		// already incremented (1 on the first pass) when the body runs.
		for ( $inner = 1; ; $inner++ ) {
			// Step to the entry just above the previous node. When the
			// previous node was removed from the stack, decrementing
			// the index still lands on the entry that used to be
			// directly above it.
			$node = $this->node( --$nodeIndex );

			// Reached the formatting element: inner loop is finished.
			if ( $node === $formatting ) {
				break;
			}

			// After more than three passes, nodes are evicted from the
			// list of active formatting elements.
			$listed = $afe->isInList( $node );
			if ( $listed && $inner > 3 ) {
				$afe->remove( $node );
				$listed = false;
			}

			// Nodes that are not active formatting elements are taken
			// off the stack of open elements, then we loop again.
			if ( !$listed ) {
				// Don't flatten here, since we're about to relocate
				// parts of this $node.
				$this->removeElement( $node, false );
				continue;
			}

			// Clone the node, and let the clone take the node's place
			// both in the list of active formatting elements and on the
			// stack of open elements.
			$clone = new BalanceElement(
				$node->namespaceURI, $node->localName, $node->attribs );
			$afe->replace( $node, $clone );
			$this->replaceAt( $nodeIndex, $clone );

			// While last node is still the furthest block, the bookmark
			// moves to just after the clone.
			if ( $lastNode === $furthest ) {
				$afe->remove( $bookmark );
				$afe->insertAfter( $clone, $bookmark );
			}

			// Re-parent last node under the clone, which then becomes
			// the new last node.
			$clone->appendChild( $lastNode );
			$lastNode = $clone;
		}

		// Place whatever last node ended up being: foster parent it when
		// the common ancestor is a table, tbody, tfoot, thead, or tr
		// element (and foster parenting is on), else append it to the
		// common ancestor.
		if (
			$this->fosterParentMode &&
			$ancestor->isA( BalanceSets::$tableSectionRowSet )
		) {
			$this->fosterParent( $lastNode );
		} else {
			$ancestor->appendChild( $lastNode );
		}

		// Build a fresh copy of the formatting element, hand it all of
		// the furthest block's children, and make it the furthest
		// block's child.
		$replacement = new BalanceElement(
			$formatting->namespaceURI, $formatting->localName, $formatting->attribs );
		$replacement->adoptChildren( $furthest );
		$furthest->appendChild( $replacement );

		// In the list of active formatting elements the copy goes where
		// the bookmark is, and the old formatting element is dropped.
		$afe->remove( $formatting );
		$afe->replace( $bookmark, $replacement );

		// On the stack of open elements the old formatting element is
		// removed and the copy is inserted immediately below the
		// furthest block.
		$this->removeElement( $formatting );
		$this->insertAfter( $furthest, $replacement );
	}

	return true;
}
1321
1322 /**
1323 * Return the contents of the open elements stack as a string for
1324 * debugging.
1325 * @return string
1326 */
public function __toString() {
	// Collect the local name of every open element, bottom of the
	// stack first.
	$names = [];
	foreach ( $this->elements as $elt ) {
		$names[] = $elt->localName;
	}
	// Separator first: the reversed (array, separator) argument order
	// is no longer accepted by PHP 8.
	return implode( ' ', $names );
}
1334 }
1335
1336 /**
1337 * A pseudo-element used as a marker in the list of active formatting elements
1338 *
1339 * @ingroup Parser
1340 * @since 1.27
1341 */
class BalanceMarker {
	/** Next entry in the list of active formatting elements, if any. */
	public $nextAFE;

	/** Previous entry in the list of active formatting elements, if any. */
	public $prevAFE;

	/**
	 * Declared only so that code which resets the link fields of every
	 * list entry uniformly (BalanceActiveFormattingElements::__destruct
	 * assigns null to it) does not create an undeclared dynamic property
	 * on markers.
	 */
	public $nextNoah;
}
1346
1347 /**
1348 * The list of active formatting elements, which is used to handle
1349 * mis-nested formatting element tags in the HTML5 tree builder
1350 * specification.
1351 *
1352 * @ingroup Parser
1353 * @since 1.27
1354 * @see https://html.spec.whatwg.org/multipage/syntax.html#list-of-active-formatting-elements
1355 */
class BalanceActiveFormattingElements {
	/** The last (most recent) element in the list */
	private $tail;

	/** The first (least recent) element in the list */
	private $head;

	/**
	 * An array of arrays representing the population of elements in each bucket
	 * according to the Noah's Ark clause. The outer array is stack-like, with each
	 * integer-indexed element representing a segment of the list, bounded by
	 * markers. The first element represents the segment of the list before the
	 * first marker.
	 *
	 * The inner arrays are indexed by "Noah key", which is a string which uniquely
	 * identifies each bucket according to the rules in the spec. The value in
	 * the inner array is the first (least recently inserted) element in the bucket,
	 * and subsequent members of the bucket can be found by iterating through the
	 * singly-linked list via $node->nextNoah.
	 *
	 * This is optimised for the most common case of inserting into a bucket
	 * with zero members, and deleting a bucket containing one member. In the
	 * worst case, iteration through the list is still O(1) in the document
	 * size, since each bucket can have at most 3 members.
	 */
	private $noahTableStack = [ [] ];

	/**
	 * Break every link between list entries so that the cyclic
	 * prev/next references do not keep the nodes alive.
	 */
	public function __destruct() {
		for ( $node = $this->head; $node; $node = $next ) {
			$next = $node->nextAFE;
			$node->prevAFE = $node->nextAFE = null;
			// Markers never take part in Noah buckets; only elements
			// carry a nextNoah link that needs clearing.
			if ( !( $node instanceof BalanceMarker ) ) {
				$node->nextNoah = null;
			}
		}
		$this->head = $this->tail = $this->noahTableStack = null;
	}

	/**
	 * Append a marker to the list and open a fresh, empty Noah table for
	 * the segment that follows it.
	 */
	public function insertMarker() {
		$elt = new BalanceMarker;
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
		$this->noahTableStack[] = [];
	}

	/**
	 * Follow the steps required when the spec requires us to "push onto the
	 * list of active formatting elements".
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException If $elt is already in the list.
	 */
	public function push( BalanceElement $elt ) {
		// Must not be in the list already
		if ( $elt->prevAFE !== null || $this->head === $elt ) {
			throw new ParameterAssertionException( '$elt',
				'Cannot insert a node into the AFE list twice' );
		}

		// "Noah's Ark clause" -- if the bucket for this element in the
		// current segment already holds three members, evict the least
		// recently inserted one before adding the new element.
		$noahKey = $elt->getNoahKey();
		$table = $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( isset( $table[$noahKey] ) ) {
			$oldest = $table[$noahKey];
			$count = 1;
			for ( $member = $oldest; $member->nextNoah; $member = $member->nextNoah ) {
				$count++;
			}
			if ( $count >= 3 ) {
				// remove() also unlinks $oldest from its Noah bucket.
				$this->remove( $oldest );
			}
		}
		$this->addToNoahList( $elt );

		// Add to the main AFE list
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
	}

	/**
	 * Follow the steps required when the spec asks us to "clear the list of
	 * active formatting elements up to the last marker".
	 */
	public function clearToMarker() {
		// Iterate back through the list starting from the tail
		$tail = $this->tail;
		while ( $tail && !( $tail instanceof BalanceMarker ) ) {
			// Unlink the element
			$prev = $tail->prevAFE;
			$tail->prevAFE = null;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail->nextNoah = null;
			$tail = $prev;
		}
		// If we finished on a marker, unlink it and pop it off the Noah table stack
		if ( $tail ) {
			$prev = $tail->prevAFE;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail = $prev;
			array_pop( $this->noahTableStack );
		} else {
			// No marker: wipe the top-level Noah table (which is the only one)
			$this->noahTableStack[0] = [];
		}
		// If we removed all the elements, clear the head pointer
		if ( !$tail ) {
			$this->head = null;
		}
		$this->tail = $tail;
	}

	/**
	 * Find and return the last element with the specified tag between the
	 * end of the list and the last marker on the list.
	 * Used when parsing &lt;a&gt; "in body mode".
	 * @param string $tag The local name to look for.
	 * @return BalanceElement|null The matching element, or null if there
	 *   is none after the last marker.
	 */
	public function findElementByTag( $tag ) {
		$elt = $this->tail;
		while ( $elt && !( $elt instanceof BalanceMarker ) ) {
			if ( $elt->localName === $tag ) {
				return $elt;
			}
			$elt = $elt->prevAFE;
		}
		return null;
	}

	/**
	 * Determine whether an element is in the list of formatting elements.
	 * @param BalanceElement $elt
	 * @return bool
	 */
	public function isInList( BalanceElement $elt ) {
		// Every list member except the head has a predecessor.
		return $this->head === $elt || $elt->prevAFE;
	}

	/**
	 * Find the element $elt in the list and remove it.
	 * Used when parsing &lt;a&gt; in body mode.
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException If $elt is not in the list.
	 */
	public function remove( BalanceElement $elt ) {
		if ( $this->head !== $elt && !$elt->prevAFE ) {
			throw new ParameterAssertionException( '$elt',
				"Attempted to remove an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $elt ) {
			$this->head = $elt->nextAFE;
		}
		if ( $this->tail === $elt ) {
			$this->tail = $elt->prevAFE;
		}
		// Update previous element
		if ( $elt->prevAFE ) {
			$elt->prevAFE->nextAFE = $elt->nextAFE;
		}
		// Update next element
		if ( $elt->nextAFE ) {
			$elt->nextAFE->prevAFE = $elt->prevAFE;
		}
		// Clear pointers so that isInList() etc. will work
		$elt->prevAFE = $elt->nextAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $elt );
	}

	/**
	 * Append $elt to the end of its Noah bucket in the table for the
	 * current (last) segment, creating the bucket if necessary.
	 * @param BalanceElement $elt
	 */
	private function addToNoahList( BalanceElement $elt ) {
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			$tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
			}
			$tail->nextNoah = $elt;
		}
	}

	/**
	 * Unlink $elt from its Noah bucket in the table for the current
	 * (last) segment. Does nothing if that table has no bucket for the
	 * element's key or the bucket does not contain the element.
	 * @param BalanceElement $elt
	 */
	private function removeFromNoahList( BalanceElement $elt ) {
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		$key = $elt->getNoahKey();
		if ( !isset( $table[$key] ) ) {
			// No bucket for this key in the current segment, so there
			// is nothing to unlink (and nothing to dereference).
			return;
		}
		$first = $table[$key];
		if ( $first === $elt ) {
			// $elt heads the bucket: promote its successor, or drop the
			// bucket when it was the only member.
			if ( $elt->nextNoah ) {
				$table[$key] = $elt->nextNoah;
				$elt->nextNoah = null;
			} else {
				unset( $table[$key] );
			}
			return;
		}
		// Otherwise look for the member whose successor is $elt.
		for ( $prev = $first; $prev->nextNoah; $prev = $prev->nextNoah ) {
			if ( $prev->nextNoah === $elt ) {
				// Found it, unlink
				$prev->nextNoah = $elt->nextNoah;
				$elt->nextNoah = null;
				return;
			}
		}
	}

	/**
	 * Find element $a in the list and replace it with element $b
	 * @param BalanceElement $a
	 * @param BalanceElement $b
	 * @throws ParameterAssertionException If $a is not in the list.
	 */
	public function replace( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to replace an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $a ) {
			$this->head = $b;
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		// Update previous element
		if ( $a->prevAFE ) {
			$a->prevAFE->nextAFE = $b;
		}
		// Update next element
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->prevAFE = $a->prevAFE;
		$b->nextAFE = $a->nextAFE;
		$a->nextAFE = $a->prevAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $a );
		$this->addToNoahList( $b );
	}

	/**
	 * Find $a in the list and insert $b after it.
	 * @param BalanceElement $a
	 * @param BalanceElement $b
	 * @throws ParameterAssertionException If $a is not in the list.
	 */
	public function insertAfter( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to insert after an element which is not in the AFE list" );
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->nextAFE = $a->nextAFE;
		$b->prevAFE = $a;
		$a->nextAFE = $b;
		$this->addToNoahList( $b );
	}

	// @codingStandardsIgnoreStart Generic.Files.LineLength.TooLong
	/**
	 * Reconstruct the active formatting elements.
	 * @param BalanceStack $stack The open elements stack
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#reconstruct-the-active-formatting-elements
	 */
	// @codingStandardsIgnoreEnd
	public function reconstruct( $stack ) {
		$entry = $this->tail;
		// If there are no entries in the list of active formatting elements,
		// then there is nothing to reconstruct
		if ( !$entry ) {
			return;
		}
		// If the last is a marker, do nothing.
		if ( $entry instanceof BalanceMarker ) {
			return;
		}
		// Or if it is an open element, do nothing.
		if ( $stack->indexOf( $entry ) >= 0 ) {
			return;
		}

		// Loop backward through the list until we find a marker or an
		// open element
		$foundit = false;
		while ( $entry->prevAFE ) {
			$entry = $entry->prevAFE;
			if ( $entry instanceof BalanceMarker || $stack->indexOf( $entry ) >= 0 ) {
				$foundit = true;
				break;
			}
		}

		// Now loop forward, starting from the element after the current one (or
		// the first element if we didn't find a marker or open element),
		// recreating formatting elements and pushing them back onto the list
		// of open elements.
		if ( $foundit ) {
			$entry = $entry->nextAFE;
		}
		do {
			$newElement = $stack->insertHTMLElement(
				$entry->localName,
				$entry->attribs );
			$this->replace( $entry, $newElement );
			$entry = $newElement->nextAFE;
		} while ( $entry );
	}

	/**
	 * Get a string representation of the AFE list, for debugging
	 * @return string
	 */
	public function __toString() {
		$prev = null;
		$s = '';
		for ( $node = $this->head; $node; $prev = $node, $node = $node->nextAFE ) {
			if ( $node instanceof BalanceMarker ) {
				$s .= "MARKER\n";
				continue;
			}
			$s .= $node->localName . '#' . substr( md5( spl_object_hash( $node ) ), 0, 8 );
			if ( $node->nextNoah ) {
				$s .= " (noah sibling: {$node->nextNoah->localName}#" .
					substr( md5( spl_object_hash( $node->nextNoah ) ), 0, 8 ) .
					')';
			}
			if ( $node->nextAFE && $node->nextAFE->prevAFE !== $node ) {
				$s .= " (reverse link is wrong!)";
			}
			$s .= "\n";
		}
		if ( $prev !== $this->tail ) {
			$s .= "(tail pointer is wrong!)\n";
		}
		return $s;
	}
}
1702
1703 /**
1704 * An implementation of the tree building portion of the HTML5 parsing
1705 * spec.
1706 *
1707 * This is used to balance and tidy output so that the result can
1708 * always be cleanly serialized/deserialized by an HTML5 parser. It
1709 * does *not* guarantee "conforming" output -- the HTML5 spec contains
1710 * a number of constraints which are not enforced by the HTML5 parsing
1711 * process. But the result will be free of gross errors: misnested or
1712 * unclosed tags, for example, and will be unchanged by spec-compliant
1713 * parsing followed by serialization.
1714 *
1715 * The tree building stage is structured as a state machine.
1716 * When comparing the implementation to
1717 * https://www.w3.org/TR/html5/syntax.html#tree-construction
1718 * note that each state is implemented as a function with a
1719 * name ending in `Mode` (because the HTML spec refers to them
1720 * as insertion modes). The current insertion mode is held by
1721 * the $parseMode property.
1722 *
1723 * The following simplifications have been made:
1724 * - We handle body content only (ie, we start `in body`.)
1725 * - The document is never in "quirks mode".
1726 * - All occurrences of < and > have been entity escaped, so we
1727 * can parse tags by simply splitting on those two characters.
1728 * Similarly, all attributes have been "cleaned" and are double-quoted
1729 * and escaped.
1730 * - All comments and null characters are assumed to have been removed.
1731 * - We don't alter linefeeds after <pre>/<listing>.
1732 * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
1733 * <frame>, <plaintext>, <isindex>, <textarea>, <xmp>, <iframe>,
1734 * <noembed>, <noscript>, <script>, <title>. As a result,
1735 * further simplifications can be made:
1736 * - `frameset-ok` is not tracked.
1737 * - `head element pointer` is not tracked (but presumed non-null)
1738 * - Tokenizer has only a single mode.
1739 *
1740 * We generally mark places where we omit cases from the spec due to
1741 * disallowed elements with a comment: `# OMITTED: <element-name>`.
1742 *
1743 * The HTML spec keeps a flag during the parsing process to track
1744 * whether or not a "parse error" has been encountered. We don't
1745 * bother to track that flag, we just implement the error-handling
1746 * process as specified.
1747 *
1748 * @ingroup Parser
1749 * @since 1.27
1750 * @see https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
1751 */
1752 class Balancer {
private $parseMode;
private $bitsIterator;
private $allowedHtmlElements;
private $afe;
private $stack;
private $strict;
private $tidyCompat;

private $textIntegrationMode = false;
private $pendingTableText;
private $originalInsertionMode;
private $fragmentContext;
private $formElementPointer;

// The next two are assigned by balance(); declaring them here avoids
// creating undeclared (dynamic) properties at run time.
/** @var callable|null Callback given to balance() */
private $processingCallback;
/** @var array|bool Arguments for the processing callback */
private $processingArgs;
1766
1767 /**
1768 * Create a new Balancer.
1769 * @param array $config Balancer configuration. Includes:
1770 * 'strict' : boolean, defaults to false.
1771 * When true, enforces syntactic constraints on input:
1772 * all non-tag '<' must be escaped, all attributes must be
1773 * separated by a single space and double-quoted. This is
1774 * consistent with the output of the Sanitizer.
1775 * 'allowedHtmlElements' : array, defaults to null.
1776 * When present, the keys of this associative array give
1777 * the acceptable HTML tag names. When not present, no
1778 * tag sanitization is done.
1779 * 'tidyCompat' : boolean, defaults to false.
1780 * When true, the serialization algorithm is tweaked to
1781 * provide historical compatibility with the old "tidy"
1782 * program: <p>-wrapping is done to the children of
1783 * <body> and <blockquote> elements, and empty elements
1784 * are removed.
1785 */
public function __construct( array $config = [] ) {
	// Fill in defaults for any option the caller did not supply.
	$config = $config + [
		'strict' => false,
		'allowedHtmlElements' => null,
		'tidyCompat' => false,
	];
	$this->allowedHtmlElements = $config['allowedHtmlElements'];
	$this->strict = $config['strict'];
	$this->tidyCompat = $config['tidyCompat'];
	if ( $this->allowedHtmlElements !== null ) {
		# Sanity check!
		// Only the keys (tag names) matter: any allowed tag that is also
		// in the set of unsupported HTML elements is a configuration
		// error.
		$bad = array_intersect_key(
			$this->allowedHtmlElements,
			BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE]
		);
		if ( count( $bad ) > 0 ) {
			// Separator first: the reversed (array, separator) argument
			// order is no longer accepted by PHP 8.
			$badstr = implode( ',', array_keys( $bad ) );
			throw new ParameterAssertionException(
				'$config',
				'Balance attempted with sanitization including ' .
				"unsupported elements: {$badstr}"
			);
		}
	}
}
1816
1817 /**
1818 * Return a balanced HTML string for the HTML fragment given by $text,
1819 * subject to the caveats listed in the class description. The result
1820 * will typically be idempotent -- that is, rebalancing the output
1821 * would result in no change.
1822 *
1823 * @param string $text The markup to be balanced
1824 * @param callable $processingCallback Callback to do any variable or
1825 * parameter replacements in HTML attributes values
1826 * @param array|bool $processingArgs Arguments for the processing callback
1827 * @return string The balanced markup
1828 */
public function balance( $text, $processingCallback = null, $processingArgs = [] ) {
	// Set up fresh parser state for this run.
	$this->parseMode = 'inBodyMode';
	$this->bitsIterator = new ExplodeIterator( '<', $text );
	$this->afe = new BalanceActiveFormattingElements();
	$this->stack = new BalanceStack();
	$this->stack->tidyCompat = $this->tidyCompat;
	$this->processingCallback = $processingCallback;
	$this->processingArgs = $processingArgs;

	# The stack is constructed with an <html> element already on it.
	# Set this up as a fragment parsed with <body> as the context.
	$this->fragmentContext =
		new BalanceElement( BalanceSets::HTML_NAMESPACE, 'body', [] );
	$this->resetInsertionMode();

	// The form element pointer is the nearest <form> found by walking
	// up from the fragment context, or null if there is none.
	$this->formElementPointer = null;
	$ancestor = $this->fragmentContext;
	while ( $ancestor != null ) {
		if ( $ancestor->isHtmlNamed( 'form' ) ) {
			$this->formElementPointer = $ancestor;
			break;
		}
		$ancestor = $ancestor->parent;
	}

	// The text was split on '<', so the first piece is plain text that
	// precedes any tag; escape stray '>' characters in it.
	$leadingText = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$this->insertToken( 'text', str_replace( '>', '&gt;', $leadingText ) );

	// Every remaining piece begins with a tag.
	while ( $this->bitsIterator->valid() ) {
		$this->advance();
	}
	$this->insertToken( 'eof', null );
	$balanced = $this->stack->getOutput();

	// Free memory before returning.
	$this->bitsIterator = null;
	$this->afe = null;
	$this->stack = null;
	$this->fragmentContext = null;
	$this->formElementPointer = null;
	return $balanced;
}
1869
1870 /**
1871 * Pass a token to the tree builder. The $token will be one of the
1872 * strings "tag", "endtag", "text", or "eof".
1873 */
private function insertToken( $token, $value, $attribs = null, $selfclose = false ) {
	// Tags are first validated against $unsupportedSet.
	$isTagToken = ( $token === 'tag' || $token === 'endtag' );
	if ( $isTagToken ) {
		if ( isset( BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE][$value] ) ) {
			# As described in "simplifications" above, these tags are
			# not supported in the balancer.
			Assert::invariant(
				!$this->strict,
				"Unsupported $token <$value> found."
			);
			return false;
		}
	} elseif ( $token === 'text' && $value === '' ) {
		# Don't actually inject the empty string as a text token.
		return true;
	}

	// Decide whether the token is handled by the current insertion mode
	// (HTML rules) or by the foreign-content rules, based on the
	// adjusted current node.
	$adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );

	if (
		$this->stack->length() === 0 ||
		$adjusted->isHtml() ||
		$token === 'eof'
	) {
		$useHtmlRules = true;
	} elseif ( $adjusted->isMathmlTextIntegrationPoint() ) {
		// Text, and any start tag other than mglyph/malignmark, follow
		// the HTML rules here; everything else stays foreign.
		$useHtmlRules = $token === 'text' ||
			( $token === 'tag' && $value !== 'mglyph' && $value !== 'malignmark' );
	} elseif (
		$adjusted->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
		$adjusted->localName === 'annotation-xml' &&
		$token === 'tag' && $value === 'svg'
	) {
		$useHtmlRules = true;
	} else {
		// At an HTML integration point, start tags and text follow the
		// HTML rules.
		$useHtmlRules = $adjusted->isHtmlIntegrationPoint() &&
			( $token === 'tag' || $token === 'text' );
	}

	if ( !$useHtmlRules ) {
		return $this->insertForeignToken( $token, $value, $attribs, $selfclose );
	}
	// The insertion mode is stored as the name of the handling method.
	$mode = $this->parseMode;
	return $this->$mode( $token, $value, $attribs, $selfclose );
}
1928
/**
 * Handle a token according to the HTML5 "rules for parsing tokens in
 * foreign content".  insertToken() routes tokens here when the adjusted
 * current node is not an HTML element and no integration point applies.
 *
 * @param string $token Token type: "text", "tag" or "endtag".
 * @param string $value Tag name or text content.
 * @param array|null $attribs Attributes of a "tag" token.
 * @param bool $selfclose Whether the tag was written with a trailing "/>".
 * @return bool|null True once the token is consumed, or the result of
 *   reprocessing it under the HTML rules.  Token types other than the
 *   three above fall off the end and yield null.
 */
private function insertForeignToken( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'font':
			# The spec only treats <font> as a "breakout" tag when it
			# carries a color, face or size attribute.  A bare <font>
			# is an ordinary foreign element ("any other start tag").
			if ( !isset( $attribs['color'] )
				&& !isset( $attribs['face'] )
				&& !isset( $attribs['size'] )
			) {
				break;
			}
			/* otherwise, fall through */
		case 'b':
		case 'big':
		case 'blockquote':
		case 'body':
		case 'br':
		case 'center':
		case 'code':
		case 'dd':
		case 'div':
		case 'dl':
		case 'dt':
		case 'em':
		case 'embed':
		case 'h1':
		case 'h2':
		case 'h3':
		case 'h4':
		case 'h5':
		case 'h6':
		case 'head':
		case 'hr':
		case 'i':
		case 'img':
		case 'li':
		case 'listing':
		case 'menu':
		case 'meta':
		case 'nobr':
		case 'ol':
		case 'p':
		case 'pre':
		case 'ruby':
		case 's':
		case 'small':
		case 'span':
		case 'strong':
		case 'strike':
		case 'sub':
		case 'sup':
		case 'table':
		case 'tt':
		case 'u':
		case 'ul':
		case 'var':
			if ( $this->fragmentContext ) {
				# When parsing a fragment, breakout tags are handled
				# like any other start tag.
				break;
			}
			# Pop until we are back in HTML content (or at an
			# integration point), then reprocess the token there.
			while ( true ) {
				$this->stack->pop();
				$node = $this->stack->currentNode;
				if (
					$node->isMathmlTextIntegrationPoint() ||
					$node->isHtmlIntegrationPoint() ||
					$node->isHtml()
				) {
					break;
				}
			}
			return $this->insertToken( $token, $value, $attribs, $selfclose );
		}
		// "Any other start tag": the new element inherits the namespace
		// of the adjusted current node.
		$adjusted = ( $this->fragmentContext && $this->stack->length() === 1 ) ?
			$this->fragmentContext : $this->stack->currentNode;
		$this->stack->insertForeignElement(
			$adjusted->namespaceURI, $value, $attribs
		);
		if ( $selfclose ) {
			$this->stack->pop();
		}
		return true;
	} elseif ( $token === 'endtag' ) {
		$first = true;
		// Walk the stack from the current node towards the root
		// (index 0 is the bottom of the stack).
		foreach ( $this->stack as $i => $node ) {
			if ( $node->isHtml() && !$first ) {
				// process the end tag as HTML
				$func = $this->parseMode;
				return $this->$func( $token, $value, $attribs, $selfclose );
			} elseif ( $i === 0 ) {
				// Reached the root without a match: ignore the tag.
				return true;
			} elseif ( $node->localName === $value ) {
				$this->stack->popTag( $node );
				return true;
			}
			$first = false;
		}
	}
}
2030
/**
 * Consume the next piece from $bitsIterator and feed it to the tree
 * builder.  Each piece is the text that followed a "<" in the input: it
 * is treated as an open/close tag plus trailing text when it parses as
 * one (and, if sanitizing, when the Sanitizer approves), and as literal
 * text otherwise.
 */
private function advance() {
	$piece = $this->bitsIterator->current();
	$this->bitsIterator->next();

	# $slash: Does the current element start with a '/'?
	# $tagName: Current element name
	# $attribStr: String between element name and >
	# $brace: Ending '>' or '/>'
	# $rest: Everything until the next element from the $bitsIterator
	$matches = [];
	if ( preg_match( Sanitizer::ELEMENT_BITS_REGEX, $piece, $matches ) ) {
		list( /* $qbar */, $slash, $tagName, $attribStr, $brace, $rest ) = $matches;
		$tagName = strtolower( $tagName );
		if ( $this->strict ) {
			/* Verify that attributes are all properly double-quoted */
			Assert::invariant(
				preg_match(
					'/^( [:_A-Z0-9][-.:_A-Z0-9]*="[^"]*")*[ ]*$/i', $attribStr
				),
				"Bad attribute string found"
			);
		}
	} else {
		Assert::invariant(
			!$this->strict, "< found which does not start a valid tag"
		);
		$slash = $tagName = $attribStr = $brace = $rest = null;
	}

	// Sanitizing is enabled whenever a whitelist of elements was given.
	$sanitize = $this->allowedHtmlElements !== null;
	$goodtag = $sanitize
		? ( $tagName && isset( $this->allowedHtmlElements[$tagName] ) )
		: $tagName;

	if ( $goodtag ) {
		if ( is_callable( $this->processingCallback ) ) {
			// The callback receives the attribute string by reference.
			call_user_func_array( $this->processingCallback, [ &$attribStr, $this->processingArgs ] );
		}
		if ( $sanitize ) {
			$goodtag = Sanitizer::validateTag( $attribStr, $tagName );
		}
	}

	if ( $goodtag ) {
		$attribs = Sanitizer::decodeTagAttributes( $attribStr );
		if ( $sanitize ) {
			$attribs = Sanitizer::validateTagAttributes( $attribs, $tagName );
		}
		$goodtag = $this->insertToken(
			$slash ? 'endtag' : 'tag', $tagName, $attribs, $brace === '/>'
		);
	}

	if ( $goodtag ) {
		// Tag accepted: emit the text that followed it.
		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
	} else {
		# bad tag; serialize entire thing as text.
		$this->insertToken( 'text', '&lt;' . str_replace( '>', '&gt;', $piece ) );
	}
}
2094
/**
 * Change the current insertion mode.
 *
 * @param string $mode Name of the handler method for the new mode; must
 *   end in "Mode".
 * @return string The insertion mode that was active before the switch.
 */
private function switchMode( $mode ) {
	Assert::parameter(
		substr( $mode, -4 ) === 'Mode', '$mode', 'should end in Mode'
	);
	$previousMode = $this->parseMode;
	$this->parseMode = $mode;
	return $previousMode;
}
2103
/**
 * Switch to a new insertion mode, then run the given token through the
 * tree builder again so that the new mode gets to handle it.
 *
 * @param string $mode Name of the new insertion mode.
 * @param string $token Token type.
 * @param string|null $value Tag name or text content.
 * @param array|null $attribs Attributes of a "tag" token.
 * @param bool $selfclose Whether the tag was written with a trailing "/>".
 * @return bool|null Result of reprocessing the token.
 */
private function switchModeAndReprocess( $mode, $token, $value, $attribs, $selfclose ) {
	$this->switchMode( $mode );
	$result = $this->insertToken( $token, $value, $attribs, $selfclose );
	return $result;
}
2108
/**
 * "Reset the insertion mode appropriately": pick the insertion mode that
 * matches the current contents of the stack of open elements.
 *
 * The stack is walked from the current node towards the root; index 0
 * is the root, where the fragment context (if any) stands in for the
 * node.  The first node with a dedicated insertion mode decides the
 * result; if none is found the mode becomes inBodyMode.
 */
private function resetInsertionMode() {
	$last = false;
	foreach ( $this->stack as $i => $node ) {
		if ( $i === 0 ) {
			$last = true;
			if ( $this->fragmentContext ) {
				$node = $this->fragmentContext;
			}
		}
		if ( $node->isHtml() ) {
			switch ( $node->localName ) {
			case 'select':
				# Look at the ancestors of the <select>, i.e. the nodes
				# below it on the stack (indexes $i-1 down to 0), and
				# stop at the first <template> or <table>.
				# NOTE(review): assumes BalanceStack::node() takes an
				# index counted from the root, like the foreach key.
				for ( $j = $i - 1; $j >= 0; $j-- ) {
					$ancestor = $this->stack->node( $j );
					if ( $ancestor->isHtmlNamed( 'template' ) ) {
						break;
					}
					if ( $ancestor->isHtmlNamed( 'table' ) ) {
						$this->switchMode( 'inSelectInTableMode' );
						return;
					}
				}
				$this->switchMode( 'inSelectMode' );
				return;
			case 'tr':
				$this->switchMode( 'inRowMode' );
				return;
			case 'tbody':
			case 'tfoot':
			case 'thead':
				$this->switchMode( 'inTableBodyMode' );
				return;
			case 'caption':
				$this->switchMode( 'inCaptionMode' );
				return;
			case 'colgroup':
				$this->switchMode( 'inColumnGroupMode' );
				return;
			case 'table':
				$this->switchMode( 'inTableMode' );
				return;
			case 'template':
				# Use the most recently pushed template insertion mode.
				$this->switchMode(
					array_slice( $this->templateInsertionModes, -1 )[0]
				);
				return;
			case 'body':
				$this->switchMode( 'inBodyMode' );
				return;
			# OMITTED: <frameset>
			# OMITTED: <html>
			# OMITTED: <head>
			default:
				if ( !$last ) {
					# OMITTED: <head>
					if ( $node->isA( BalanceSets::$tableCellSet ) ) {
						$this->switchMode( 'inCellMode' );
						return;
					}
				}
			}
		}
		if ( $last ) {
			$this->switchMode( 'inBodyMode' );
			return;
		}
	}
}
2178
/**
 * Finish parsing.  Of the spec's "stop parsing" steps only step 2, "pop
 * all the nodes off the stack of open elements", applies here, and even
 * then the top-most <html> element is left on the stack.
 */
private function stopParsing() {
	# Drop the list of active formatting elements before popping,
	# otherwise the element objects will stay live during serialization,
	# potentially using O(N^2) memory.  This is safe because popping the
	# stack never reconstructs the active formatting elements.
	$this->afe = null;

	# Pop everything above the <html> element.
	$this->stack->popTo( 1 );
}
2191
/**
 * Insert an HTML element whose content is raw text and switch to
 * inTextMode, remembering the insertion mode to return to afterwards.
 *
 * @param string $value Tag name of the element.
 * @param array|null $attribs Attributes of the element.
 * @return bool Always true.
 */
private function parseRawText( $value, $attribs = null ) {
	$this->stack->insertHTMLElement( $value, $attribs );
	// XXX switch tokenizer to rawtext state?
	$previousMode = $this->switchMode( 'inTextMode' );
	$this->originalInsertionMode = $previousMode;
	return true;
}
2198
/**
 * Insertion mode "text", entered via parseRawText().
 *
 * Text is appended to the current node.  An end tag or end-of-file pops
 * the current node and restores the original insertion mode, which then
 * also reprocesses an "eof" token.  Any other token is ignored.
 *
 * @param string $token Token type.
 * @param string|null $value Tag name or text content.
 * @param array|null $attribs Attributes of a "tag" token.
 * @param bool $selfclose Whether the tag was written with a trailing "/>".
 * @return bool|null
 */
private function inTextMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	}
	if ( $token === 'eof' ) {
		$this->stack->pop();
		return $this->switchModeAndReprocess(
			$this->originalInsertionMode, $token, $value, $attribs, $selfclose
		);
	}
	if ( $token === 'endtag' ) {
		$this->stack->pop();
		$this->switchMode( $this->originalInsertionMode );
	}
	// End tags handled above, and everything else, report success.
	return true;
}
2215
/**
 * Insertion mode "in head".
 *
 * Handles leading whitespace, the metadata elements (<base>, <link>,
 * <meta>, ...), the raw text elements <noframes>/<style>, and the
 * <template> start and end tags.  Anything not handled here is sent
 * back through insertToken() after a synthetic </head>.
 *
 * @param string $token Token type.
 * @param string|null $value Tag name or text content.
 * @param array|null $attribs Attributes of a "tag" token.
 * @param bool $selfclose Whether the tag was written with a trailing "/>".
 * @return bool|null
 */
private function inHeadMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		// Leading whitespace is inserted as-is; the remainder, if any,
		// is handled at the bottom of the function.
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $leading ) ) {
			$this->stack->insertText( $leading[0] );
			$value = substr( $value, strlen( $leading[0] ) );
		}
		if ( strlen( $value ) === 0 ) {
			return true; // All text handled.
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'meta':
			# OMITTED: in a full HTML parser, this might change the encoding.
			/* falls through */
		# OMITTED: <html>
		case 'base':
		case 'basefont':
		case 'bgsound':
		case 'link':
			// Void elements: insert and immediately close.
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;
		# OMITTED: <title>
		# OMITTED: <noscript>
		case 'noframes':
		case 'style':
			return $this->parseRawText( $value, $attribs );
		# OMITTED: <script>
		case 'template':
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->afe->insertMarker();
			# OMITTED: frameset_ok
			$this->switchMode( 'inTemplateMode' );
			// Record the mode we just switched to.
			$this->templateInsertionModes[] = $this->parseMode;
			return true;
		# OMITTED: <head>
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
		# OMITTED: <head>
		# OMITTED: <body>
		# OMITTED: <html>
		case 'br':
			break; // handle at the bottom of the function
		case 'template':
			if ( $this->stack->indexOf( $value ) < 0 ) {
				return true; // Ignore the token.
			}
			$this->stack->generateImpliedEndTags( null, true /* thorough */ );
			$this->stack->popTag( $value );
			$this->afe->clearToMarker();
			array_pop( $this->templateInsertionModes );
			$this->resetInsertionMode();
			return true;
		default:
			// ignore any other end tag
			return true;
		}
	}

	// Not handled above: act as if </head> had been seen...
	$this->inHeadMode( 'endtag', 'head' ); // synthetic </head>
	// ...and then reprocess the current token.
	return $this->insertToken( $token, $value, $attribs, $selfclose );
}
2282
/**
 * Insertion mode "in body": the main workhorse of the tree builder.
 *
 * @param string $token Token type: "text", "eof", "tag" or "endtag";
 *   anything else trips an invariant assertion.
 * @param string|null $value Tag name or text content.
 * @param array|null $attribs Attributes of a "tag" token.
 * @param bool $selfclose Whether the tag was written with a trailing "/>".
 * @return bool|null
 */
private function inBodyMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		$this->afe->reconstruct( $this->stack );
		$this->stack->insertText( $value );
		return true;
	}

	if ( $token === 'eof' ) {
		if ( !empty( $this->templateInsertionModes ) ) {
			return $this->inTemplateMode( $token, $value, $attribs, $selfclose );
		}
		$this->stopParsing();
		return true;
	}

	if ( $token === 'tag' ) {
		switch ( $value ) {
		# OMITTED: <html>
		case 'base':
		case 'basefont':
		case 'bgsound':
		case 'link':
		case 'meta':
		case 'noframes':
		# OMITTED: <script>
		case 'style':
		case 'template':
		# OMITTED: <title>
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		# OMITTED: <body>
		# OMITTED: <frameset>

		case 'address':
		case 'article':
		case 'aside':
		case 'blockquote':
		case 'center':
		case 'details':
		case 'dialog':
		case 'dir':
		case 'div':
		case 'dl':
		case 'fieldset':
		case 'figcaption':
		case 'figure':
		case 'footer':
		case 'header':
		case 'hgroup':
		case 'main':
		case 'menu':
		case 'nav':
		case 'ol':
		case 'p':
		case 'section':
		case 'summary':
		case 'ul':
		# <pre> and <listing> are treated like the other block elements.
		# As described in "simplifications" above:
		# 1. We don't touch the next token, even if it's a linefeed.
		# 2. OMITTED: frameset_ok
		case 'pre':
		case 'listing':
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'h1':
		case 'h2':
		case 'h3':
		case 'h4':
		case 'h5':
		case 'h6':
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			// Headings do not nest: close an open heading first.
			if ( $this->stack->currentNode->isA( BalanceSets::$headingSet ) ) {
				$this->stack->pop();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'form':
			if (
				$this->formElementPointer &&
				$this->stack->indexOf( 'template' ) < 0
			) {
				return true; // in a form, not in a template.
			}
			if ( $this->stack->inButtonScope( "p" ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$formElement = $this->stack->insertHTMLElement( $value, $attribs );
			if ( $this->stack->indexOf( 'template' ) < 0 ) {
				$this->formElementPointer = $formElement;
			}
			return true;

		case 'li':
			# OMITTED: frameset_ok
			foreach ( $this->stack as $openNode ) {
				if ( $openNode->isHtmlNamed( 'li' ) ) {
					$this->inBodyMode( 'endtag', 'li' );
					break;
				}
				if (
					$openNode->isA( BalanceSets::$specialSet ) &&
					!$openNode->isA( BalanceSets::$addressDivPSet )
				) {
					break;
				}
			}
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'dd':
		case 'dt':
			# OMITTED: frameset_ok
			foreach ( $this->stack as $openNode ) {
				if ( $openNode->isHtmlNamed( 'dd' ) ) {
					$this->inBodyMode( 'endtag', 'dd' );
					break;
				}
				if ( $openNode->isHtmlNamed( 'dt' ) ) {
					$this->inBodyMode( 'endtag', 'dt' );
					break;
				}
				if (
					$openNode->isA( BalanceSets::$specialSet ) &&
					!$openNode->isA( BalanceSets::$addressDivPSet )
				) {
					break;
				}
			}
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		# OMITTED: <plaintext>

		case 'button':
			if ( $this->stack->inScope( 'button' ) ) {
				// Close the open button, then reprocess this tag.
				$this->inBodyMode( 'endtag', 'button' );
				return $this->insertToken( $token, $value, $attribs, $selfclose );
			}
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'a':
			$openAnchor = $this->afe->findElementByTag( 'a' );
			if ( $openAnchor ) {
				$this->inBodyMode( 'endtag', 'a' );
				if ( $this->afe->isInList( $openAnchor ) ) {
					$this->afe->remove( $openAnchor );
					// Don't flatten here, since when we fall
					// through below we might foster parent
					// the new <a> tag inside this one.
					$this->stack->removeElement( $openAnchor, false );
				}
			}
			/* Falls through */
		case 'b':
		case 'big':
		case 'code':
		case 'em':
		case 'font':
		case 'i':
		case 's':
		case 'small':
		case 'strike':
		case 'strong':
		case 'tt':
		case 'u':
			$this->afe->reconstruct( $this->stack );
			$this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ), $attribs );
			return true;

		case 'nobr':
			$this->afe->reconstruct( $this->stack );
			if ( $this->stack->inScope( 'nobr' ) ) {
				$this->inBodyMode( 'endtag', 'nobr' );
				$this->afe->reconstruct( $this->stack );
			}
			$this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ), $attribs );
			return true;

		case 'applet':
		case 'marquee':
		case 'object':
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->afe->insertMarker();
			# OMITTED: frameset_ok
			return true;

		case 'table':
			# The document is never in "quirks mode"; see simplifications
			# above.
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			# OMITTED: frameset_ok
			$this->switchMode( 'inTableMode' );
			return true;

		case 'area':
		case 'br':
		case 'embed':
		case 'img':
		case 'keygen':
		case 'wbr':
		# <input> needs no special treatment: frameset_ok is OMITTED,
		# hence we don't need to examine the tag's "type" attribute.
		case 'input':
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			# OMITTED: frameset_ok
			return true;

		case 'menuitem':
		case 'param':
		case 'source':
		case 'track':
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		case 'hr':
			if ( $this->stack->inButtonScope( 'p' ) ) {
				$this->inBodyMode( 'endtag', 'p' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		case 'image':
			# warts!
			return $this->inBodyMode( $token, 'img', $attribs, $selfclose );

		# OMITTED: <isindex>
		# OMITTED: <textarea>
		# OMITTED: <xmp>
		# OMITTED: <iframe>
		# OMITTED: <noembed>
		# OMITTED: <noscript>

		case 'select':
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			// A <select> opened from one of the table-related modes
			// gets the "in select in table" mode.
			$openedInTable = in_array(
				$this->parseMode,
				[
					'inTableMode',
					'inCaptionMode',
					'inTableBodyMode',
					'inRowMode',
					'inCellMode',
				],
				true
			);
			$this->switchMode( $openedInTable ? 'inSelectInTableMode' : 'inSelectMode' );
			return true;

		case 'optgroup':
		case 'option':
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$this->inBodyMode( 'endtag', 'option' );
			}
			$this->afe->reconstruct( $this->stack );
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'rb':
		case 'rtc':
			if ( $this->stack->inScope( 'ruby' ) ) {
				$this->stack->generateImpliedEndTags();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'rp':
		case 'rt':
			if ( $this->stack->inScope( 'ruby' ) ) {
				$this->stack->generateImpliedEndTags( 'rtc' );
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;

		case 'math':
		case 'svg':
			$this->afe->reconstruct( $this->stack );
			# We skip the spec's "adjust MathML/SVG attributes" and
			# "adjust foreign attributes" steps, since the browser will
			# do this later when it parses the output and it doesn't affect
			# balancing.
			$foreignNamespace = ( $value === 'math' )
				? BalanceSets::MATHML_NAMESPACE
				: BalanceSets::SVG_NAMESPACE;
			$this->stack->insertForeignElement(
				$foreignNamespace, $value, $attribs
			);
			if ( $selfclose ) {
				# emit explicit </math> or </svg> tag.
				$this->stack->pop();
			}
			return true;

		case 'caption':
		case 'col':
		case 'colgroup':
		# OMITTED: <frame>
		case 'head':
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			// Ignore table tags if we're not inTableMode
			return true;
		}

		// Handle any other start tag here
		$this->afe->reconstruct( $this->stack );
		$this->stack->insertHTMLElement( $value, $attribs );
		return true;
	}

	if ( $token === 'endtag' ) {
		switch ( $value ) {
		# </body>,</html> are unsupported.

		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );

		case 'address':
		case 'article':
		case 'aside':
		case 'blockquote':
		case 'button':
		case 'center':
		case 'details':
		case 'dialog':
		case 'dir':
		case 'div':
		case 'dl':
		case 'fieldset':
		case 'figcaption':
		case 'figure':
		case 'footer':
		case 'header':
		case 'hgroup':
		case 'listing':
		case 'main':
		case 'menu':
		case 'nav':
		case 'ol':
		case 'pre':
		case 'section':
		case 'summary':
		case 'ul':
			// Ignore if there is not a matching open tag
			if ( !$this->stack->inScope( $value ) ) {
				return true;
			}
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( $value );
			return true;

		case 'form':
			if ( $this->stack->indexOf( 'template' ) < 0 ) {
				$openForm = $this->formElementPointer;
				$this->formElementPointer = null;
				if ( !$openForm || !$this->stack->inScope( $openForm ) ) {
					return true;
				}
				$this->stack->generateImpliedEndTags();
				// Don't flatten yet if we're removing a <form> element
				// out-of-order. (eg. `<form><div></form>`)
				$isCurrent = ( $this->stack->currentNode === $openForm );
				$this->stack->removeElement( $openForm, $isCurrent );
			} else {
				if ( !$this->stack->inScope( 'form' ) ) {
					return true;
				}
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( 'form' );
			}
			return true;

		case 'p':
			if ( !$this->stack->inButtonScope( 'p' ) ) {
				// No open <p>: synthesize one, then reprocess </p>.
				$this->inBodyMode( 'tag', 'p', [] );
				return $this->insertToken( $token, $value, $attribs, $selfclose );
			}
			$this->stack->generateImpliedEndTags( $value );
			$this->stack->popTag( $value );
			return true;

		case 'li':
			if ( !$this->stack->inListItemScope( $value ) ) {
				return true; # ignore
			}
			$this->stack->generateImpliedEndTags( $value );
			$this->stack->popTag( $value );
			return true;

		case 'dd':
		case 'dt':
			if ( !$this->stack->inScope( $value ) ) {
				return true; # ignore
			}
			$this->stack->generateImpliedEndTags( $value );
			$this->stack->popTag( $value );
			return true;

		case 'h1':
		case 'h2':
		case 'h3':
		case 'h4':
		case 'h5':
		case 'h6':
			// Any open heading matches, whatever its level.
			if ( !$this->stack->inScope( BalanceSets::$headingSet ) ) {
				return true; # ignore
			}
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( BalanceSets::$headingSet );
			return true;

		case 'sarcasm':
			# Take a deep breath, then:
			break;

		case 'a':
		case 'b':
		case 'big':
		case 'code':
		case 'em':
		case 'font':
		case 'i':
		case 'nobr':
		case 's':
		case 'small':
		case 'strike':
		case 'strong':
		case 'tt':
		case 'u':
			if ( $this->stack->adoptionAgency( $value, $this->afe ) ) {
				return true; # If we did something, we're done.
			}
			break; # Go to the "any other end tag" case.

		case 'applet':
		case 'marquee':
		case 'object':
			if ( !$this->stack->inScope( $value ) ) {
				return true; # ignore
			}
			$this->stack->generateImpliedEndTags();
			$this->stack->popTag( $value );
			$this->afe->clearToMarker();
			return true;

		case 'br':
			# Turn </br> into <br>
			return $this->inBodyMode( 'tag', $value, [] );
		}

		// Any other end tag goes here
		foreach ( $this->stack as $idx => $openNode ) {
			if ( $openNode->isHtmlNamed( $value ) ) {
				$this->stack->generateImpliedEndTags( $value );
				$this->stack->popTo( $idx ); # including $idx
				return true;
			}
			if ( $openNode->isA( BalanceSets::$specialSet ) ) {
				return true; // ignore this close token.
			}
		}
		return true;
	}

	Assert::invariant( false, "Bad token type: $token" );
}
2789
/**
 * Insertion mode "in table".
 *
 * Table structure tags are handled here; anything else is processed by
 * inBodyMode() with foster parenting enabled on the stack.
 *
 * @param string $token Token type.
 * @param string|null $value Tag name or text content.
 * @param array|null $attribs Attributes of a "tag" token.
 * @param bool $selfclose Whether the tag was written with a trailing "/>".
 * @return bool|null
 */
private function inTableMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		if ( $this->textIntegrationMode ) {
			return $this->inBodyMode( $token, $value, $attribs, $selfclose );
		}
		if ( $this->stack->currentNode->isA( BalanceSets::$tableSectionRowSet ) ) {
			// Collect the text in inTableTextMode first.
			$this->pendingTableText = '';
			$this->originalInsertionMode = $this->parseMode;
			return $this->switchModeAndReprocess( 'inTableTextMode', $token, $value, $attribs, $selfclose );
		}
		// Otherwise handled by the "anything else" clause.
	} elseif ( $token === 'eof' ) {
		$this->stopParsing();
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		case 'caption':
			$this->afe->insertMarker();
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inCaptionMode' );
			return true;

		case 'colgroup':
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inColumnGroupMode' );
			return true;

		case 'col':
			// Synthesize the missing <colgroup>, then reprocess.
			$this->inTableMode( 'tag', 'colgroup', [] );
			return $this->insertToken( $token, $value, $attribs, $selfclose );

		case 'tbody':
		case 'tfoot':
		case 'thead':
			$this->stack->clearToContext( BalanceSets::$tableContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inTableBodyMode' );
			return true;

		case 'td':
		case 'th':
		case 'tr':
			// Synthesize the missing <tbody>, then reprocess.
			$this->inTableMode( 'tag', 'tbody', [] );
			return $this->insertToken( $token, $value, $attribs, $selfclose );

		case 'table':
			if ( !$this->stack->inTableScope( $value ) ) {
				return true; // Ignore this tag.
			}
			// Close the open table, then reprocess the new <table>.
			$this->inTableMode( 'endtag', $value );
			return $this->insertToken( $token, $value, $attribs, $selfclose );

		case 'style':
		# OMITTED: <script>
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );

		case 'input':
			if ( !isset( $attribs['type'] ) || strcasecmp( $attribs['type'], 'hidden' ) !== 0 ) {
				break; // Handle this as "everything else"
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;

		case 'form':
			if (
				$this->formElementPointer ||
				$this->stack->indexOf( 'template' ) >= 0
			) {
				return true; // ignore this token
			}
			$this->formElementPointer =
				$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->popTag( $this->formElementPointer );
			return true;
		}
		// Unhandled start tags use the "anything else" clause.
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'table':
			if ( !$this->stack->inTableScope( $value ) ) {
				return true; // Ignore.
			}
			$this->stack->popTag( $value );
			$this->resetInsertionMode();
			return true;
		# OMITTED: <body>
		case 'caption':
		case 'col':
		case 'colgroup':
		# OMITTED: <html>
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			return true; // Ignore the token.
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
		// Unhandled end tags use the "anything else" clause.
	}

	// This is the "anything else" case: process using the in-body
	// rules, with foster parenting switched on for the duration.
	$this->stack->fosterParentMode = true;
	$this->inBodyMode( $token, $value, $attribs, $selfclose );
	$this->stack->fosterParentMode = false;
	return true;
}
2895
/**
 * Insertion mode "in table text".
 *
 * Text tokens are accumulated in $pendingTableText.  The first non-text
 * token flushes the accumulated text, then restores the original
 * insertion mode and reprocesses that token there.
 *
 * @param string $token Token type.
 * @param string|null $value Tag name or text content.
 * @param array|null $attribs Attributes of a "tag" token.
 * @param bool $selfclose Whether the tag was written with a trailing "/>".
 * @return bool|null
 */
private function inTableTextMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		$this->pendingTableText .= $value;
		return true;
	}

	// Non-text token: flush what has been collected so far.
	$pending = $this->pendingTableText;
	$this->pendingTableText = '';
	$onlyWhitespace = !preg_match( '/[^\x09\x0A\x0C\x0D\x20]/', $pending );
	if ( $onlyWhitespace ) {
		// Pending text is just whitespace.
		$this->stack->insertText( $pending );
	} else {
		// This should match the "anything else" case inTableMode
		$this->stack->fosterParentMode = true;
		$this->inBodyMode( 'text', $pending );
		$this->stack->fosterParentMode = false;
	}

	return $this->switchModeAndReprocess(
		$this->originalInsertionMode, $token, $value, $attribs, $selfclose
	);
}
2917
/**
 * Helper for inCaptionMode: close the open <caption>, if there is one in
 * table scope, and go back to inTableMode.
 *
 * @return bool False if no <caption> is in table scope (nothing is
 *   changed), true once the caption has been closed.
 */
private function endCaption() {
	$captionOpen = $this->stack->inTableScope( 'caption' );
	if ( !$captionOpen ) {
		return false;
	}
	$this->stack->generateImpliedEndTags();
	$this->stack->popTag( 'caption' );
	$this->afe->clearToMarker();
	$this->switchMode( 'inTableMode' );
	return true;
}
2929
/**
 * Insertion mode "in caption".
 *
 * Table structure tags close the caption (when one is open) and are then
 * reprocessed; everything else is handled by inBodyMode().
 *
 * @param string $token Token type.
 * @param string|null $value Tag name or text content.
 * @param array|null $attribs Attributes of a "tag" token.
 * @param bool $selfclose Whether the tag was written with a trailing "/>".
 * @return bool|null
 */
private function inCaptionMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'tag' ) {
		switch ( $value ) {
		case 'caption':
		case 'col':
		case 'colgroup':
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			// Implicitly end the caption, then reprocess the tag.
			if ( $this->endCaption() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'caption':
			$this->endCaption();
			return true;

		case 'table':
			// Implicitly end the caption, then reprocess </table>.
			if ( $this->endCaption() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;

		case 'body':
		case 'col':
		case 'colgroup':
		# OMITTED: <html>
		case 'tbody':
		case 'td':
		case 'tfoot':
		case 'th':
		case 'thead':
		case 'tr':
			// Ignore the token
			return true;
		}
	}

	// The Anything Else case
	return $this->inBodyMode( $token, $value, $attribs, $selfclose );
}
2976
/**
 * Insertion mode "in column group".
 *
 * Accepts leading whitespace, <col> and <template>.  Any other token
 * closes the open <colgroup> (if it is the current node) and is then
 * reprocessed; otherwise it is ignored.
 *
 * @param string $token Token type.
 * @param string|null $value Tag name or text content.
 * @param array|null $attribs Attributes of a "tag" token.
 * @param bool $selfclose Whether the tag was written with a trailing "/>".
 * @return bool|null
 */
private function inColumnGroupMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'eof' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfclose );
	}

	if ( $token === 'text' ) {
		// Leading whitespace is inserted as-is; what remains is treated
		// as "anything else" below.
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $leading ) ) {
			$this->stack->insertText( $leading[0] );
			$value = substr( $value, strlen( $leading[0] ) );
		}
		if ( strlen( $value ) === 0 ) {
			return true; // All text handled.
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
		# OMITTED: <html>
		case 'col':
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->stack->pop();
			return true;
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
		case 'colgroup':
			if ( !$this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
				return true; // Ignore the token.
			}
			$this->stack->pop();
			$this->switchMode( 'inTableMode' );
			return true;
		case 'col':
			return true; // Ignore the token.
		case 'template':
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
	}

	// Anything else
	if ( !$this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
		return true; // Ignore the token.
	}
	// Close the <colgroup>, then reprocess the token.
	$this->inColumnGroupMode( 'endtag', 'colgroup' );
	return $this->insertToken( $token, $value, $attribs, $selfclose );
}
3024
/**
 * Helper for inTableBodyMode: close the currently open table section.
 *
 * Does nothing unless one of <tbody>, <thead> or <tfoot> is in table
 * scope. Otherwise the stack is cleared back to the table body
 * context, the current node is popped and the mode is switched to
 * inTableMode.
 *
 * @return bool True if a section was closed, false if none was in scope
 */
private function endSection() {
	$sectionInScope = false;
	foreach ( [ 'tbody', 'thead', 'tfoot' ] as $sectionName ) {
		if ( $this->stack->inTableScope( $sectionName ) ) {
			$sectionInScope = true;
			break;
		}
	}
	if ( !$sectionInScope ) {
		return false;
	}

	$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
	$this->stack->pop();
	$this->switchMode( 'inTableMode' );
	return true;
}
/**
 * Token handler for the "in table body" insertion mode.
 *
 * @param string $token Token type ('tag' and 'endtag' get special
 *   treatment here; everything else is delegated)
 * @param string $value Tag name for 'tag' and 'endtag' tokens
 * @param array|null $attribs Attributes handed to an inserted element
 * @param bool $selfclose Forwarded unchanged when the token is delegated
 * @return bool True once the token was handled (or ignored) here;
 *   otherwise the result of inTableMode()
 */
private function inTableBodyMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'tag' ) {
		$isCell = ( $value === 'th' || $value === 'td' );
		if ( $value === 'tr' || $isCell ) {
			// A cell start tag implies an attribute-less <tr>, after
			// which the cell token itself is reprocessed.
			$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
			$this->stack->insertHTMLElement( 'tr', $isCell ? [] : $attribs );
			$this->switchMode( 'inRowMode' );
			if ( $isCell ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}

		$sectionEnders = [ 'caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead' ];
		if ( in_array( $value, $sectionEnders, true ) ) {
			// Close the open section first; the token is dropped if
			// there is no section to close.
			if ( $this->endSection() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'table' ) {
			if ( $this->endSection() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}

		if ( in_array( $value, [ 'tbody', 'tfoot', 'thead' ], true ) ) {
			// Ignored unless a section of that very name is in scope.
			if ( $this->stack->inTableScope( $value ) ) {
				$this->endSection();
			}
			return true;
		}

		# OMITTED: <body>, <html>
		$ignored = [ 'caption', 'col', 'colgroup', 'td', 'th', 'tr' ];
		if ( in_array( $value, $ignored, true ) ) {
			// Ignore the token.
			return true;
		}
	}

	// Anything else is processed by the "in table" rules.
	return $this->inTableMode( $token, $value, $attribs, $selfclose );
}
3091
/**
 * Helper for inRowMode: close the currently open table row.
 *
 * When a <tr> is in table scope, the stack is cleared back to the
 * table row context, the current node is popped and the mode is
 * switched to inTableBodyMode.
 *
 * @return bool True if a row was closed, false if no <tr> was in scope
 */
private function endRow() {
	if ( $this->stack->inTableScope( 'tr' ) ) {
		$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
		$this->stack->pop();
		$this->switchMode( 'inTableBodyMode' );
		return true;
	}
	return false;
}
/**
 * Token handler for the "in row" insertion mode.
 *
 * @param string $token Token type ('tag' and 'endtag' get special
 *   treatment here; everything else is delegated)
 * @param string $value Tag name for 'tag' and 'endtag' tokens
 * @param array|null $attribs Attributes handed to an inserted element
 * @param bool $selfclose Forwarded unchanged when the token is delegated
 * @return bool True once the token was handled (or ignored) here;
 *   otherwise the result of inTableMode()
 */
private function inRowMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'tag' ) {
		if ( $value === 'th' || $value === 'td' ) {
			// Open the cell and drop a marker on the list of active
			// formatting elements.
			$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
			$this->stack->insertHTMLElement( $value, $attribs );
			$this->switchMode( 'inCellMode' );
			$this->afe->insertMarker();
			return true;
		}

		$rowEnders = [ 'caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr' ];
		if ( in_array( $value, $rowEnders, true ) ) {
			// Close the row, then reprocess; the token is dropped if
			// there is no row to close.
			if ( $this->endRow() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'tr' ) {
			$this->endRow();
			return true;
		}

		if ( $value === 'table' ) {
			if ( $this->endRow() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}

		if ( in_array( $value, [ 'tbody', 'tfoot', 'thead' ], true ) ) {
			// The named section must be in scope before the row is
			// closed and the token reprocessed.
			if ( $this->stack->inTableScope( $value ) && $this->endRow() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}

		# OMITTED: <body>, <html>
		if ( in_array( $value, [ 'caption', 'col', 'colgroup', 'td', 'th' ], true ) ) {
			// Ignore the token.
			return true;
		}
	}

	// Anything else is processed by the "in table" rules.
	return $this->inTableMode( $token, $value, $attribs, $selfclose );
}
3157
/**
 * Helper for inCellMode: close the currently open table cell.
 *
 * Checks for a <td> in table scope first, then for a <th>, and feeds
 * the matching end tag to inCellMode().
 *
 * @return bool True if a cell end tag was issued, false if neither
 *   <td> nor <th> was in table scope
 */
private function endCell() {
	foreach ( [ 'td', 'th' ] as $cellName ) {
		if ( $this->stack->inTableScope( $cellName ) ) {
			$this->inCellMode( 'endtag', $cellName );
			return true;
		}
	}
	return false;
}
/**
 * Token handler for the "in cell" insertion mode.
 *
 * @param string $token Token type ('tag' and 'endtag' get special
 *   treatment here; everything else is delegated)
 * @param string $value Tag name for 'tag' and 'endtag' tokens
 * @param array|null $attribs Forwarded unchanged when the token is
 *   reprocessed or delegated
 * @param bool $selfclose Forwarded unchanged when the token is
 *   reprocessed or delegated
 * @return bool True once the token was handled (or ignored) here;
 *   otherwise the result of inBodyMode()
 */
private function inCellMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'tag' ) {
		$cellEnders = [
			'caption', 'col', 'colgroup', 'tbody', 'td',
			'tfoot', 'th', 'thead', 'tr'
		];
		if ( in_array( $value, $cellEnders, true ) ) {
			// Close the cell, then reprocess; the token is dropped if
			// there is no cell to close.
			if ( $this->endCell() ) {
				$this->insertToken( $token, $value, $attribs, $selfclose );
			}
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		# OMITTED: <body>, <html>
		if ( in_array( $value, [ 'caption', 'col', 'colgroup' ], true ) ) {
			// Ignore the token.
			return true;
		}

		$closesOwnCell = ( $value === 'td' || $value === 'th' );
		$closesAncestor = in_array(
			$value, [ 'table', 'tbody', 'tfoot', 'thead', 'tr' ], true
		);
		if ( $closesOwnCell || $closesAncestor ) {
			// Ignored unless an element of that name is in table scope.
			if ( $this->stack->inTableScope( $value ) ) {
				$this->stack->generateImpliedEndTags();
				// </td> and </th> pop up to their own name; the table
				// structure end tags pop up to whichever cell is open.
				$this->stack->popTag(
					$closesOwnCell ? $value : BalanceSets::$tableCellSet
				);
				$this->afe->clearToMarker();
				$this->switchMode( 'inRowMode' );
				if ( $closesAncestor ) {
					// The end tag still has to be handled by the row.
					$this->insertToken( $token, $value, $attribs, $selfclose );
				}
			}
			return true;
		}
	}

	// Anything else is processed by the "in body" rules.
	return $this->inBodyMode( $token, $value, $attribs, $selfclose );
}
3223
/**
 * Token handler for the "in select" insertion mode.
 *
 * @param string $token Token type: 'text', 'eof', 'tag' or 'endtag'
 * @param string $value Text content for 'text' tokens, tag name for
 *   'tag' and 'endtag' tokens
 * @param array|null $attribs Attributes handed to an inserted element
 * @param bool $selfclose Forwarded unchanged when the token is
 *   reprocessed or delegated
 * @return bool True once the token was handled (or ignored) here;
 *   otherwise whatever the delegated handler returns
 */
private function inSelectMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	}
	if ( $token === 'eof' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfclose );
	}

	if ( $token === 'tag' ) {
		# OMITTED: <html>
		if ( $value === 'option' || $value === 'optgroup' ) {
			// A new <option>/<optgroup> closes an open <option>; a new
			// <optgroup> additionally closes an open <optgroup>.
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$this->stack->pop();
			}
			if (
				$value === 'optgroup' &&
				$this->stack->currentNode->isHtmlNamed( 'optgroup' )
			) {
				$this->stack->pop();
			}
			$this->stack->insertHTMLElement( $value, $attribs );
			return true;
		}

		if ( $value === 'select' ) {
			// A nested <select> start tag is treated like </select>.
			$this->inSelectMode( 'endtag', $value );
			return true;
		}

		if ( in_array( $value, [ 'input', 'keygen', 'textarea' ], true ) ) {
			if ( $this->stack->inSelectScope( 'select' ) ) {
				// Close the <select>, then reprocess the token.
				$this->inSelectMode( 'endtag', 'select' );
				return $this->insertToken( $token, $value, $attribs, $selfclose );
			}
			// Fragment case: ignore the token.
			return true;
		}

		if ( $value === 'script' || $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}

		if ( $value === 'select' ) {
			// Outside select scope this is the fragment case: ignore.
			if ( $this->stack->inSelectScope( $value ) ) {
				$this->stack->popTag( $value );
				$this->resetInsertionMode();
			}
			return true;
		}

		if ( $value === 'option' ) {
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$this->stack->pop();
			}
			return true;
		}

		if ( $value === 'optgroup' ) {
			// An <option> sitting directly inside an <optgroup> is
			// closed first, so that the <optgroup> can be closed too.
			if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
				$depth = $this->stack->length();
				if (
					$depth >= 2 &&
					$this->stack->node( $depth - 2 )->isHtmlNamed( 'optgroup' )
				) {
					$this->stack->pop();
				}
			}
			if ( $this->stack->currentNode->isHtmlNamed( 'optgroup' ) ) {
				$this->stack->pop();
			}
			return true;
		}
	}

	// Anything else: just ignore the token.
	return true;
}
3296
/**
 * Token handler for the "in select in table" insertion mode.
 *
 * Table-structure start and end tags close the open <select> and are
 * then reprocessed; every other token is handled by inSelectMode().
 *
 * @param string $token Token type
 * @param string $value Tag name for 'tag' and 'endtag' tokens
 * @param array|null $attribs Forwarded unchanged
 * @param bool $selfclose Forwarded unchanged
 * @return bool True if the token was ignored here; otherwise whatever
 *   the delegated handler returns
 */
private function inSelectInTableMode( $token, $value, $attribs = null, $selfclose = false ) {
	$tableTags = [
		'caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'
	];
	$isTagToken = ( $token === 'tag' || $token === 'endtag' );

	if ( $isTagToken && in_array( $value, $tableTags, true ) ) {
		// An end tag only counts when a matching element is in table
		// scope; otherwise it is ignored.
		if ( $token === 'endtag' && !$this->stack->inTableScope( $value ) ) {
			return true;
		}
		// Close the <select> via the plain "in select" rules ('select'
		// is not one of the tags special-cased here), then reprocess.
		$this->inSelectMode( 'endtag', 'select' );
		return $this->insertToken( $token, $value, $attribs, $selfclose );
	}

	// Anything else
	return $this->inSelectMode( $token, $value, $attribs, $selfclose );
}
3321
/**
 * Token handler for the "in template" insertion mode.
 *
 * @param string $token Token type: 'text', 'eof', 'tag' or 'endtag';
 *   any other value fails an invariant assertion
 * @param string $value Tag name for 'tag' and 'endtag' tokens
 * @param array|null $attribs Forwarded unchanged
 * @param bool $selfclose Forwarded unchanged
 * @return bool True if the token was handled (or ignored) here;
 *   otherwise whatever the delegated handler returns
 */
private function inTemplateMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfclose );
	}

	if ( $token === 'endtag' ) {
		// Only </template> is meaningful here; other end tags are ignored.
		if ( $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
		return true;
	}

	if ( $token === 'eof' ) {
		if ( $this->stack->indexOf( 'template' ) < 0 ) {
			$this->stopParsing();
			return true;
		}
		// An unclosed <template> is still open: close it, restore the
		// insertion mode and let the new mode see the EOF as well.
		$this->stack->popTag( 'template' );
		$this->afe->clearToMarker();
		array_pop( $this->templateInsertionModes );
		$this->resetInsertionMode();
		$this->insertToken( $token, $value, $attribs, $selfclose );
		return true;
	}

	if ( $token === 'tag' ) {
		# OMITTED: <script>, <title>
		$headTags = [
			'base', 'basefont', 'bgsound', 'link', 'meta',
			'noframes', 'style', 'template'
		];
		if ( in_array( $value, $headTags, true ) ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}

		// Table-related start tags pick the matching table insertion
		// mode; all remaining start tags continue in the body mode.
		$modeForTag = [
			'caption' => 'inTableMode',
			'colgroup' => 'inTableMode',
			'tbody' => 'inTableMode',
			'tfoot' => 'inTableMode',
			'thead' => 'inTableMode',
			'col' => 'inColumnGroupMode',
			'tr' => 'inTableBodyMode',
			'td' => 'inRowMode',
			'th' => 'inRowMode',
		];
		$nextMode = isset( $modeForTag[$value] ) ? $modeForTag[$value] : 'inBodyMode';
		return $this->switchModeAndReprocess(
			$nextMode, $token, $value, $attribs, $selfclose
		);
	}

	Assert::invariant( false, "Bad token type: $token" );
}
3388 }