Support tokenizing simple HTML comments in the Balancer.
[lhc/web/wiklou.git] / includes / tidy / Balancer.php
1 <?php
2 /**
3 * An implementation of the tree building portion of the HTML5 parsing
4 * spec.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
20 *
21 * @file
22 * @ingroup Parser
23 * @since 1.27
24 * @author C. Scott Ananian, 2016
25 */
26 namespace MediaWiki\Tidy;
27
28 use Wikimedia\Assert\Assert;
29 use Wikimedia\Assert\ParameterAssertionException;
30 use \ExplodeIterator;
31 use \IteratorAggregate;
32 use \ReverseArrayIterator;
33 use \Sanitizer;
34
35 # A note for future librarization[1] -- this file is a good candidate
36 # for splitting into an independent library, except that it is currently
37 # highly optimized for MediaWiki use. It only implements the portions
38 # of the HTML5 tree builder used by tags supported by MediaWiki, and
39 # does not contain a true tokenizer pass, instead relying on
40 # comment stripping, attribute normalization, and escaping done by
41 # the MediaWiki Sanitizer. It also deliberately avoids building
42 # a true DOM in memory, instead serializing elements to an output string
43 # as soon as possible (usually as soon as the tag is closed) to reduce
44 # its memory footprint.
45
46 # We've been gradually lifting some of these restrictions to handle
47 # non-sanitized output generated by extensions, but we shortcut the tokenizer
48 # for speed (primarily by splitting on `<`) and so rely on syntactic
49 # well-formedness.
50
51 # On the other hand, I've been pretty careful to note with comments in the
52 # code the places where this implementation omits features of the spec or
53 # depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
54 # implement the missing pieces and make this a standalone PHP HTML5 parser.
55 # In order to do so, some sort of MediaWiki-specific API will need
56 # to be added to (a) allow the Balancer to bypass the tokenizer,
57 # and (b) support on-the-fly flattening instead of DOM node creation.
58
59 # [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki
60
/**
 * Constants and tag-name sets used by the HTML5 tree building algorithm.
 *
 * Every set is an associative array keyed first by namespace URI and then
 * by lower-cased tag name; the stored value is always true, so membership
 * can be tested with isset().
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceSets {
	const HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml';
	const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
	const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';

	// NOTE(review): presumably the tags the balancer does not handle;
	// the code consuming this set is not part of this class.
	public static $unsupportedSet = [
		self::HTML_NAMESPACE => [
			'html' => true, 'head' => true, 'body' => true, 'frameset' => true, 'frame' => true,
			'plaintext' => true, 'isindex' => true, 'textarea' => true, 'xmp' => true,
			'iframe' => true, 'noembed' => true, 'noscript' => true, 'script' => true,
			'title' => true
		]
	];

	// Elements which never have children or an end tag.
	public static $emptyElementSet = [
		self::HTML_NAMESPACE => [
			'area' => true, 'base' => true, 'basefont' => true, 'bgsound' => true, 'br' => true,
			'col' => true, 'command' => true, 'embed' => true, 'frame' => true, 'hr' => true,
			'img' => true, 'input' => true, 'keygen' => true, 'link' => true, 'meta' => true,
			'param' => true, 'source' => true, 'track' => true, 'wbr' => true
		]
	];

	// The six heading elements.
	public static $headingSet = [
		self::HTML_NAMESPACE => [
			'h1' => true, 'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true, 'h6' => true
		]
	];

	// The "special" category of the HTML5 tree builder.
	public static $specialSet = [
		self::HTML_NAMESPACE => [
			'address' => true, 'applet' => true, 'area' => true, 'article' => true,
			'aside' => true, 'base' => true, 'basefont' => true, 'bgsound' => true,
			'blockquote' => true, 'body' => true, 'br' => true, 'button' => true,
			'caption' => true, 'center' => true, 'col' => true, 'colgroup' => true,
			'dd' => true, 'details' => true, 'dir' => true, 'div' => true, 'dl' => true,
			'dt' => true, 'embed' => true, 'fieldset' => true, 'figcaption' => true,
			'figure' => true, 'footer' => true, 'form' => true, 'frame' => true,
			'frameset' => true, 'h1' => true, 'h2' => true, 'h3' => true, 'h4' => true,
			'h5' => true, 'h6' => true, 'head' => true, 'header' => true, 'hgroup' => true,
			'hr' => true, 'html' => true, 'iframe' => true, 'img' => true, 'input' => true,
			'isindex' => true, 'li' => true, 'link' => true, 'listing' => true, 'main' => true,
			'marquee' => true, 'menu' => true, 'menuitem' => true, 'meta' => true, 'nav' => true,
			'noembed' => true, 'noframes' => true, 'noscript' => true, 'object' => true,
			'ol' => true, 'p' => true, 'param' => true, 'plaintext' => true, 'pre' => true,
			'script' => true, 'section' => true, 'select' => true, 'source' => true,
			'style' => true, 'summary' => true, 'table' => true, 'tbody' => true, 'td' => true,
			'template' => true, 'textarea' => true, 'tfoot' => true, 'th' => true,
			'thead' => true, 'title' => true, 'tr' => true, 'track' => true, 'ul' => true,
			'wbr' => true, 'xmp' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true, 'desc' => true, 'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true, 'mtext' => true,
			'annotation-xml' => true
		]
	];

	public static $addressDivPSet = [
		self::HTML_NAMESPACE => [ 'address' => true, 'div' => true, 'p' => true ]
	];

	public static $tableSectionRowSet = [
		self::HTML_NAMESPACE => [
			'table' => true, 'thead' => true, 'tbody' => true, 'tfoot' => true, 'tr' => true
		]
	];

	// Elements closed by "generate implied end tags".
	public static $impliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'dd' => true, 'dt' => true, 'li' => true, 'optgroup' => true, 'option' => true,
			'p' => true, 'rb' => true, 'rp' => true, 'rt' => true, 'rtc' => true
		]
	];

	// Elements closed by the "thorough" variant of implied end tag generation.
	public static $thoroughImpliedEndTagsSet = [
		self::HTML_NAMESPACE => [
			'caption' => true, 'colgroup' => true, 'dd' => true, 'dt' => true, 'li' => true,
			'optgroup' => true, 'option' => true, 'p' => true, 'rb' => true, 'rp' => true,
			'rt' => true, 'rtc' => true, 'tbody' => true, 'td' => true, 'tfoot' => true,
			'th' => true, 'thead' => true, 'tr' => true
		]
	];

	public static $tableCellSet = [
		self::HTML_NAMESPACE => [ 'td' => true, 'th' => true ]
	];

	public static $tableContextSet = [
		self::HTML_NAMESPACE => [ 'table' => true, 'template' => true, 'html' => true ]
	];

	public static $tableBodyContextSet = [
		self::HTML_NAMESPACE => [
			'tbody' => true, 'tfoot' => true, 'thead' => true, 'template' => true, 'html' => true
		]
	];

	public static $tableRowContextSet = [
		self::HTML_NAMESPACE => [ 'tr' => true, 'template' => true, 'html' => true ]
	];

	// See https://html.spec.whatwg.org/multipage/forms.html#form-associated-element
	public static $formAssociatedSet = [
		self::HTML_NAMESPACE => [
			'button' => true, 'fieldset' => true, 'input' => true, 'keygen' => true,
			'object' => true, 'output' => true, 'select' => true, 'textarea' => true,
			'img' => true
		]
	];

	// Scope boundaries for the plain "in scope" test; also the base from
	// which the list-item and button scope sets below are derived.
	public static $inScopeSet = [
		self::HTML_NAMESPACE => [
			'applet' => true, 'caption' => true, 'html' => true, 'marquee' => true,
			'object' => true, 'table' => true, 'td' => true, 'template' => true, 'th' => true
		],
		self::SVG_NAMESPACE => [
			'foreignobject' => true, 'desc' => true, 'title' => true
		],
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true, 'mtext' => true,
			'annotation-xml' => true
		]
	];

	private static $inListItemScopeSet = null;

	/**
	 * Return $inScopeSet extended with the HTML 'ol' and 'ul' elements.
	 * The set is built on first use and cached afterwards.
	 * @return array
	 */
	public static function inListItemScopeSet() {
		if ( self::$inListItemScopeSet !== null ) {
			return self::$inListItemScopeSet;
		}
		// Arrays are copied by value, so $inScopeSet itself is untouched.
		$scope = self::$inScopeSet;
		$scope[self::HTML_NAMESPACE]['ol'] = true;
		$scope[self::HTML_NAMESPACE]['ul'] = true;
		self::$inListItemScopeSet = $scope;
		return self::$inListItemScopeSet;
	}

	private static $inButtonScopeSet = null;

	/**
	 * Return $inScopeSet extended with the HTML 'button' element.
	 * The set is built on first use and cached afterwards.
	 * @return array
	 */
	public static function inButtonScopeSet() {
		if ( self::$inButtonScopeSet !== null ) {
			return self::$inButtonScopeSet;
		}
		// Arrays are copied by value, so $inScopeSet itself is untouched.
		$scope = self::$inScopeSet;
		$scope[self::HTML_NAMESPACE]['button'] = true;
		self::$inButtonScopeSet = $scope;
		return self::$inButtonScopeSet;
	}

	public static $inTableScopeSet = [
		self::HTML_NAMESPACE => [ 'html' => true, 'table' => true, 'template' => true ]
	];

	// Select scope is defined by exclusion: these are the only elements
	// which do NOT end the scope.
	public static $inInvertedSelectScopeSet = [
		self::HTML_NAMESPACE => [ 'option' => true, 'optgroup' => true ]
	];

	public static $mathmlTextIntegrationPointSet = [
		self::MATHML_NAMESPACE => [
			'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true, 'mtext' => true
		]
	];

	public static $htmlIntegrationPointSet = [
		self::SVG_NAMESPACE => [
			'foreignobject' => true, 'desc' => true, 'title' => true
		]
	];

	// For tidy compatibility.
	public static $tidyPWrapSet = [
		self::HTML_NAMESPACE => [
			'body' => true,
			'blockquote' => true,
			// We parse with <body> as the fragment context, but the top-level
			// element on the stack is actually <html>. We could use the
			// "adjusted current node" everywhere to work around this, but it's
			// easier just to add <html> to the p-wrap set.
			'html' => true,
		],
	];

	// For tidy compatibility: elements which may stay inside a p-wrapper.
	public static $tidyInlineSet = [
		self::HTML_NAMESPACE => [
			'a' => true, 'abbr' => true, 'acronym' => true, 'applet' => true, 'b' => true,
			'basefont' => true, 'bdo' => true, 'big' => true, 'br' => true, 'button' => true,
			'cite' => true, 'code' => true, 'dfn' => true, 'em' => true, 'font' => true,
			'i' => true, 'iframe' => true, 'img' => true, 'input' => true, 'kbd' => true,
			'label' => true, 'legend' => true, 'map' => true, 'object' => true, 'param' => true,
			'q' => true, 'rb' => true, 'rbc' => true, 'rp' => true, 'rt' => true, 'rtc' => true,
			'ruby' => true, 's' => true, 'samp' => true, 'select' => true, 'small' => true,
			'span' => true, 'strike' => true, 'strong' => true, 'sub' => true, 'sup' => true,
			'textarea' => true, 'tt' => true, 'u' => true, 'var' => true,
		],
	];
}
292
/**
 * A BalanceElement is a simplified version of a DOM Node.  The main
 * difference is that we only keep BalanceElements around for nodes
 * currently on the BalanceStack of open elements.  As soon as an
 * element is closed, with some minor exceptions relating to the
 * tree builder "adoption agency algorithm", the element and all its
 * children are serialized to a string using the flatten() method.
 * This keeps our memory usage low.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceElement {
	/**
	 * The namespace of the element.
	 * @var string $namespaceURI
	 */
	public $namespaceURI;
	/**
	 * The lower-cased name of the element.
	 * @var string $localName
	 */
	public $localName;
	/**
	 * Attributes for the element, in array form
	 * @var array $attribs
	 */
	public $attribs;

	/**
	 * Set to the empty string by the constructor and not read anywhere in
	 * this class.  It is declared explicitly so that the constructor does
	 * not create a dynamic property (deprecated as of PHP 8.2).
	 * NOTE(review): confirm nothing outside this class reads it, then drop.
	 * @var string $contents
	 */
	public $contents;

	/**
	 * Parent of this element, the string "flat" if this element has
	 * already been flattened into its parent, or null if it has no parent.
	 * @var BalanceElement|string|null $parent
	 */
	public $parent;

	/**
	 * An array of children of this element.  Typically only the last
	 * child will be an actual BalanceElement object; the rest will
	 * be strings, representing either text nodes or flattened
	 * BalanceElement objects.
	 * @var array $children
	 */
	public $children;

	/**
	 * A unique string identifier for Noah's Ark purposes, lazy initialized
	 * by getNoahKey().
	 * @var string|null $noahKey
	 */
	private $noahKey;

	/**
	 * The next active formatting element in the list, or null if this is the
	 * end of the AFE list or if the element is not in the AFE list.
	 */
	public $nextAFE;

	/**
	 * The previous active formatting element in the list, or null if this is
	 * the start of the list or if the element is not in the AFE list.
	 */
	public $prevAFE;

	/**
	 * The next element in the Noah's Ark species bucket.
	 */
	public $nextNoah;

	/**
	 * Make a new BalanceElement corresponding to the HTML DOM Element
	 * with the given localname, namespace, and attributes.  The new element
	 * has no parent and no children.
	 *
	 * @param string $namespaceURI The namespace of the element.
	 * @param string $localName The lowercased name of the tag.
	 * @param array $attribs Attributes of the element
	 */
	public function __construct( $namespaceURI, $localName, array $attribs ) {
		$this->localName = $localName;
		$this->namespaceURI = $namespaceURI;
		$this->attribs = $attribs;
		$this->contents = '';
		$this->parent = null;
		$this->children = [];
	}

	/**
	 * Remove the given child from this element, leaving it parentless.
	 * @param BalanceElement $elt
	 */
	private function removeChild( BalanceElement $elt ) {
		if ( $this->parent === 'flat' ) {
			// The message interpolates $this, which serializes the whole
			// subtree; only pay for that when the check actually fails.
			Assert::precondition(
				false, "Can't removeChild after flattening $this"
			);
		}
		Assert::parameter(
			$elt->parent === $this, 'elt', 'must have $this as a parent'
		);
		$idx = array_search( $elt, $this->children, true );
		Assert::parameter( $idx !== false, '$elt', 'must be a child of $this' );
		$elt->parent = null;
		array_splice( $this->children, $idx, 1 );
	}

	/**
	 * Find $a in the list of children and insert $b before it.  If $b is
	 * an element which currently has a parent, it is detached from that
	 * parent first.
	 * @param BalanceElement $a
	 * @param BalanceElement|string $b
	 */
	public function insertBefore( BalanceElement $a, $b ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't insertBefore after flattening."
		);
		$idx = array_search( $a, $this->children, true );
		Assert::parameter( $idx !== false, '$a', 'must be a child of $this' );
		if ( !is_string( $b ) ) {
			Assert::parameter( $b->parent !== 'flat', '$b', "Can't be flat" );
			if ( $a === $b ) {
				// Inserting an element before itself leaves the tree unchanged.
				return;
			}
			if ( $b->parent !== null ) {
				$b->parent->removeChild( $b );
				// If $b was an earlier sibling of $a, removing it shifted
				// $a down by one; look the position up again.
				$idx = array_search( $a, $this->children, true );
			}
			$b->parent = $this;
		}
		// Wrap $b so that array_splice() does not cast an object to an array.
		array_splice( $this->children, $idx, 0, [ $b ] );
	}

	/**
	 * Append $elt to the end of the list of children.  If $elt is an
	 * element which currently has a parent, it is detached from that
	 * parent first.
	 * @param BalanceElement|string $elt
	 */
	public function appendChild( $elt ) {
		Assert::precondition(
			$this->parent !== 'flat', "Can't appendChild after flattening."
		);
		if ( is_string( $elt ) ) {
			array_push( $this->children, $elt );
			return;
		}
		// Remove $elt from parent, if it had one.
		if ( $elt->parent !== null ) {
			$elt->parent->removeChild( $elt );
		}
		array_push( $this->children, $elt );
		$elt->parent = $this;
	}

	/**
	 * Transfer all of the children of $elt to $this, leaving $elt with
	 * no children.
	 * @param BalanceElement $elt
	 */
	public function adoptChildren( BalanceElement $elt ) {
		Assert::precondition(
			$elt->parent !== 'flat', "Can't adoptChildren after flattening."
		);
		foreach ( $elt->children as $child ) {
			if ( !is_string( $child ) ) {
				// This is an optimization which avoids an O(n^2) set of
				// array_splice operations.
				$child->parent = null;
			}
			$this->appendChild( $child );
		}
		$elt->children = [];
	}

	/**
	 * Flatten this node and all of its children into a string, as specified
	 * by the HTML serialization specification, and replace this node
	 * in its parent by that string.
	 *
	 * In tidy compatibility mode, an element named 'mw:p-wrap' is renamed
	 * to 'p', and is dropped entirely (flattened to the empty string) if it
	 * contains nothing but whitespace.  Other whitespace-only elements are
	 * kept; attribute-less 'tr' and 'li' elements among them get the class
	 * 'mw-empty-elt'.
	 *
	 * @param bool $tidyCompat Whether to apply tidy compatibility rules.
	 * @return string The serialization which replaced this node.
	 * @see __toString()
	 */
	public function flatten( $tidyCompat = false ) {
		Assert::parameter( $this->parent !== null, '$this', 'must be a child' );
		Assert::parameter( $this->parent !== 'flat', '$this', 'already flat' );
		$idx = array_search( $this, $this->parent->children, true );
		Assert::parameter(
			$idx !== false, '$this', 'must be a child of its parent'
		);
		if ( $tidyCompat ) {
			$blank = true;
			// Flatten any children which are still elements, and note
			// whether anything other than whitespace is present.
			foreach ( $this->children as $elt ) {
				if ( !is_string( $elt ) ) {
					$elt = $elt->flatten( $tidyCompat );
				}
				if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) {
					$blank = false;
				}
			}
			if ( $this->isHtmlNamed( 'mw:p-wrap' ) ) {
				$this->localName = 'p';
			} elseif ( $blank ) {
				// Add 'mw-empty-elt' class so elements can be hidden via CSS
				// for compatibility with legacy tidy.
				if ( !count( $this->attribs ) &&
					( $this->localName === 'tr' || $this->localName === 'li' )
				) {
					$this->attribs = [ 'class' => "mw-empty-elt" ];
				}
				// Only p-wrappers are ever omitted from the output.
				$blank = false;
			}
			$flat = $blank ? '' : "{$this}";
		} else {
			$flat = "{$this}";
		}
		$this->parent->children[$idx] = $flat;
		$this->parent = 'flat'; # for assertion checking
		return $flat;
	}

	/**
	 * Serialize this node and all of its children to a string, as specified
	 * by the HTML serialization specification.  Members of
	 * BalanceSets::$emptyElementSet are written in self-closing form.
	 *
	 * @return string The serialization of the BalanceElement
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments
	 */
	public function __toString() {
		$encAttribs = '';
		foreach ( $this->attribs as $name => $value ) {
			$encValue = Sanitizer::encodeAttribute( $value );
			$encAttribs .= " $name=\"$encValue\"";
		}
		if ( !$this->isA( BalanceSets::$emptyElementSet ) ) {
			$out = "<{$this->localName}{$encAttribs}>";
			// flatten children
			foreach ( $this->children as $elt ) {
				$out .= "{$elt}";
			}
			$out .= "</{$this->localName}>";
		} else {
			$out = "<{$this->localName}{$encAttribs} />";
			Assert::invariant(
				count( $this->children ) === 0,
				"Empty elements shouldn't have children."
			);
		}
		return $out;
	}

	# Utility functions on BalanceElements.

	/**
	 * Determine if $this represents a specific HTML tag, is a member of
	 * a tag set, or is equal to another BalanceElement.
	 *
	 * @param BalanceElement|array|string $set The target BalanceElement,
	 *   set (from the BalanceSets class), or string (HTML tag name).
	 * @return bool
	 */
	public function isA( $set ) {
		if ( $set instanceof BalanceElement ) {
			return $this === $set;
		} elseif ( is_array( $set ) ) {
			return isset( $set[$this->namespaceURI] ) &&
				isset( $set[$this->namespaceURI][$this->localName] );
		} else {
			# assume this is an HTML element name.
			return $this->isHtml() && $this->localName === $set;
		}
	}

	/**
	 * Determine if this element is an HTML element with the specified name
	 * @param string $tagName
	 * @return bool
	 */
	public function isHtmlNamed( $tagName ) {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE
			&& $this->localName === $tagName;
	}

	/**
	 * Determine if $this represents an element in the HTML namespace.
	 *
	 * @return bool
	 */
	public function isHtml() {
		return $this->namespaceURI === BalanceSets::HTML_NAMESPACE;
	}

	/**
	 * Determine if $this represents a MathML text integration point,
	 * as defined in the HTML5 specification.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#mathml-text-integration-point
	 */
	public function isMathmlTextIntegrationPoint() {
		return $this->isA( BalanceSets::$mathmlTextIntegrationPointSet );
	}

	/**
	 * Determine if $this represents an HTML integration point,
	 * as defined in the HTML5 specification.
	 *
	 * @return bool
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#html-integration-point
	 */
	public function isHtmlIntegrationPoint() {
		if ( $this->isA( BalanceSets::$htmlIntegrationPointSet ) ) {
			return true;
		}
		// A MathML annotation-xml element also qualifies when its
		// 'encoding' attribute names an HTML media type (case-insensitive).
		if (
			$this->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
			$this->localName === 'annotation-xml' &&
			isset( $this->attribs['encoding'] ) &&
			( strcasecmp( $this->attribs['encoding'], 'text/html' ) === 0 ||
			strcasecmp( $this->attribs['encoding'], 'application/xhtml+xml' ) === 0 )
		) {
			return true;
		}
		return false;
	}

	/**
	 * Get a string key for the Noah's Ark algorithm.  The key is built from
	 * the namespace, the local name and the attributes sorted by name, so
	 * it does not depend on attribute order.  It is computed once and then
	 * cached.
	 *
	 * @return string
	 */
	public function getNoahKey() {
		if ( $this->noahKey === null ) {
			$attribs = $this->attribs;
			ksort( $attribs );
			$this->noahKey = serialize( [ $this->namespaceURI, $this->localName, $attribs ] );
		}
		return $this->noahKey;
	}
}
618
619 /**
620 * The "stack of open elements" as defined in the HTML5 tree builder
621 * spec. This contains methods to ensure that content (start tags, text)
622 * are inserted at the correct place in the output string, and to
623 * flatten BalanceElements as they are closed to avoid holding onto
624 * a complete DOM tree for the document in memory.
625 *
626 * The stack defines a PHP iterator to traverse it in "reverse order",
627 * that is, the most-recently-added element is visited first in a
628 * foreach loop.
629 *
630 * @ingroup Parser
631 * @since 1.27
632 * @see https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
633 */
634 class BalanceStack implements IteratorAggregate {
/**
 * Backing storage for the stack.  Position 0 holds the root element;
 * new elements are pushed on to the end.
 * @var BalanceElement[] $elements
 */
private $elements = [];
/**
 * Foster parent mode determines how nodes are inserted into the
 * stack.
 * @var bool $fosterParentMode
 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
 */
public $fosterParentMode = false;
/**
 * Tidy compatibility mode, determines behavior of body/blockquote
 * @var bool $tidyCompat
 */
public $tidyCompat = false;
/**
 * Reference to the current element: the most recently pushed element,
 * or null once the stack has been emptied.
 * @var BalanceElement|null $currentNode
 */
public $currentNode;
655
/**
 * Set up the stack so that it holds exactly one element, the root
 * &lt;html&gt; node, which is also the current node.
 */
public function __construct() {
	// The root <html> element always sits at the bottom of the stack.
	$root = new BalanceElement( BalanceSets::HTML_NAMESPACE, 'html', [] );
	$this->elements[] = $root;
	$this->currentNode = $root;
}
668
/**
 * Produce the output of the tree builder, which is the serialization
 * of every child of the root &lt;html&gt; node.  Children which are
 * still elements are flattened on the way.
 * @return string
 */
public function getOutput() {
	// The outer '<html>....</html>' is deliberately left out.
	$pieces = [];
	foreach ( $this->elements[0]->children as $child ) {
		if ( is_string( $child ) ) {
			$pieces[] = $child;
		} else {
			$pieces[] = $child->flatten( $this->tidyCompat );
		}
	}
	return implode( '', $pieces );
}
683
/**
 * Insert a comment at the appropriate place for inserting a node.
 * The value is wrapped in the comment delimiters and handed to
 * insertText() with the comment flag set.
 * @param string $value Content of the comment.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-comment
 */
public function insertComment( $value ) {
	// A comment is treated as a text node which is exempt from tidy
	// p-wrapping.
	$comment = '<!--' . $value . '-->';
	return $this->insertText( $comment, true );
}
693
/**
 * Insert text at the appropriate place for inserting a node.
 *
 * In foster parent mode, text arriving while the current node is a
 * table, table section or row is foster-parented.  Otherwise, in tidy
 * compatibility mode, non-comment text arriving directly inside a
 * member of BalanceSets::$tidyPWrapSet is first wrapped in a new
 * 'mw:p-wrap' element.
 *
 * @param string $value
 * @param bool $isComment True if $value is a serialized comment, which
 *   is never p-wrapped.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertText( $value, $isComment = false ) {
	if (
		$this->fosterParentMode &&
		$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
	) {
		$this->fosterParent( $value );
	} elseif (
		$this->tidyCompat && !$isComment &&
		$this->currentNode->isA( BalanceSets::$tidyPWrapSet )
	) {
		// Open the wrapper, then retry: the wrapper is now the current
		// node, so the retry takes a different branch.
		$this->insertHTMLElement( 'mw:p-wrap', [] );
		return $this->insertText( $value );
	} else {
		$this->currentNode->appendChild( $value );
	}
}
715
/**
 * Create a BalanceElement in the given namespace and insert it at the
 * appropriate place, pushing it on to the open elements stack.
 * @param string $namespaceURI The element namespace
 * @param string $tag The tag name
 * @param array $attribs Attributes of the element, in array form.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-foreign-element
 */
public function insertForeignElement( $namespaceURI, $tag, $attribs ) {
	$element = new BalanceElement( $namespaceURI, $tag, $attribs );
	return $this->insertElement( $element );
}
730
/**
 * Create an element in the HTML namespace and insert it at the
 * appropriate place, pushing it on to the open elements stack.
 * @param string $tag The tag name
 * @param array $attribs Attributes of the element, in array form.
 * @return BalanceElement
 * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-an-html-element
 */
public function insertHTMLElement( $tag, $attribs ) {
	$namespace = BalanceSets::HTML_NAMESPACE;
	return $this->insertForeignElement( $namespace, $tag, $attribs );
}
744
/**
 * Insert an element at the appropriate place and push it on to the
 * open elements stack, making it the current node.
 *
 * If the current node is an 'mw:p-wrap' element and $elt is not a
 * member of BalanceSets::$tidyInlineSet, the wrapper is popped first.
 * In foster parent mode, an element arriving while the current node is
 * a table, table section or row is foster-parented.
 *
 * @param BalanceElement $elt
 * @return BalanceElement The element which was pushed.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
 */
public function insertElement( BalanceElement $elt ) {
	if (
		$this->currentNode->isHtmlNamed( 'mw:p-wrap' ) &&
		!$elt->isA( BalanceSets::$tidyInlineSet )
	) {
		// Tidy compatibility.
		$this->pop();
	}
	if (
		$this->fosterParentMode &&
		$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
	) {
		$elt = $this->fosterParent( $elt );
	} else {
		$this->currentNode->appendChild( $elt );
	}
	// The messages below interpolate $elt, which serializes the element;
	// build them only when an invariant has actually been violated.
	if ( $elt->parent === null ) {
		Assert::invariant( false, "$elt must be in tree" );
	}
	if ( $elt->parent === 'flat' ) {
		Assert::invariant( false, "$elt must not have been previous flattened" );
	}
	array_push( $this->elements, $elt );
	$this->currentNode = $elt;
	return $elt;
}
774
/**
 * Test whether $tag is in scope on the stack, using
 * BalanceSets::$inScopeSet as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-scope
 */
public function inScope( $tag ) {
	$boundary = BalanceSets::$inScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
784
/**
 * Test whether $tag is in button scope on the stack, using
 * BalanceSets::inButtonScopeSet() as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-button-scope
 */
public function inButtonScope( $tag ) {
	$boundary = BalanceSets::inButtonScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
794
/**
 * Test whether $tag is in list item scope on the stack, using
 * BalanceSets::inListItemScopeSet() as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-list-item-scope
 */
public function inListItemScope( $tag ) {
	$boundary = BalanceSets::inListItemScopeSet();
	return $this->inSpecificScope( $tag, $boundary );
}
804
/**
 * Test whether $tag is in table scope on the stack, using
 * BalanceSets::$inTableScopeSet as the scope boundary.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-table-scope
 */
public function inTableScope( $tag ) {
	$boundary = BalanceSets::$inTableScopeSet;
	return $this->inSpecificScope( $tag, $boundary );
}
814
/**
 * Test whether $tag is in select scope on the stack.  The stack is
 * walked from the current node towards the root; the walk succeeds on
 * the first element matching $tag and fails on the first other
 * element which is not in BalanceSets::$inInvertedSelectScopeSet.
 * @param BalanceElement|array|string $tag
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-select-scope
 */
public function inSelectScope( $tag ) {
	// inSpecificScope() stops at members of a set, whereas here we must
	// stop at everything *except* members of a set, so the walk is
	// written out by hand.
	foreach ( $this->getIterator() as $node ) {
		$matched = $node->isA( $tag );
		if ( $matched || !$node->isA( BalanceSets::$inInvertedSelectScopeSet ) ) {
			return $matched;
		}
	}
	return false;
}
834
/**
 * Test whether $tag is in the scope delimited by $set.  The stack is
 * walked from the current node towards the root; the walk succeeds on
 * the first element matching $tag and fails on the first other
 * element matching $set.
 * @param BalanceElement|array|string $tag
 * @param BalanceElement|array|string $set
 * @return bool
 * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-the-specific-scope
 */
public function inSpecificScope( $tag, $set ) {
	foreach ( $this->getIterator() as $node ) {
		$matched = $node->isA( $tag );
		// A scope boundary only ends the walk if the node is not the target.
		if ( $matched || $node->isA( $set ) ) {
			return $matched;
		}
	}
	return false;
}
853
/**
 * Generate implied end tags: pop elements for as long as the current
 * node is a member of the implied end tag set.
 * @param string|null $butnot Name of an HTML element at which to stop
 *   even though it is in the set, or null for no such exception.
 * @param bool $thorough True if we should generate end tags thoroughly.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#generate-implied-end-tags
 */
public function generateImpliedEndTags( $butnot = null, $thorough = false ) {
	$impliedSet = BalanceSets::$impliedEndTagsSet;
	if ( $thorough ) {
		$impliedSet = BalanceSets::$thoroughImpliedEndTagsSet;
	}
	while ( $this->currentNode ) {
		$node = $this->currentNode;
		$excluded = $butnot !== null && $node->isHtmlNamed( $butnot );
		if ( $excluded || !$node->isA( $impliedSet ) ) {
			return;
		}
		$this->pop();
	}
}
874
/**
 * Return the adjusted current node: the fragment context when one is
 * given and only a single element is on the stack, and the current
 * node otherwise.
 * @param BalanceElement|null $fragmentContext
 * @return BalanceElement|null
 */
public function adjustedCurrentNode( $fragmentContext ) {
	if ( $fragmentContext && count( $this->elements ) === 1 ) {
		return $fragmentContext;
	}
	return $this->currentNode;
}
882
/**
 * Create an iterator which walks the stack in reverse: the current
 * node comes first and the root node last.
 * @return Iterator
 */
public function getIterator() {
	$reversed = new ReverseArrayIterator( $this->elements );
	return $reversed;
}
891
/**
 * Look up the BalanceElement stored at position $idx of the stack.
 * Position 0 is the root element.
 * @param int $idx
 * @return BalanceElement
 */
public function node( $idx ) {
	$element = $this->elements[$idx];
	return $element;
}
901
/**
 * Put $elt in place of the element currently at position $idx in the
 * BalanceStack.  Neither element may have been flattened.  If the top
 * of the stack was replaced, $elt becomes the current node.
 * @param int $idx
 * @param BalanceElement $elt
 */
public function replaceAt( $idx, BalanceElement $elt ) {
	$previous = $this->elements[$idx];
	Assert::precondition(
		$previous->parent !== 'flat',
		'Replaced element should not have already been flattened.'
	);
	Assert::precondition(
		$elt->parent !== 'flat',
		'New element should not have already been flattened.'
	);
	$this->elements[$idx] = $elt;
	$top = count( $this->elements ) - 1;
	if ( $idx === $top ) {
		$this->currentNode = $elt;
	}
}
921
/**
 * Find the position in the BalanceStack of the element nearest the top
 * which matches the given BalanceElement, set, or HTML tag name string.
 * @param BalanceElement|array|string $tag
 * @return int The position, or -1 if nothing on the stack matches.
 */
public function indexOf( $tag ) {
	// Search downwards from the top of the stack.
	$pos = count( $this->elements );
	while ( $pos-- > 0 ) {
		if ( $this->elements[$pos]->isA( $tag ) ) {
			return $pos;
		}
	}
	return -1;
}
936
/**
 * Count the elements currently in the BalanceStack.
 * @return int
 */
public function length() {
	$size = count( $this->elements );
	return $size;
}
944
/**
 * Take the current node off the BalanceStack and make the element
 * below it (or null, if none is left) the current node.  The removed
 * element is flattened unless it is an 'mw:p-wrap' element.
 */
public function pop() {
	$popped = array_pop( $this->elements );
	$remaining = count( $this->elements );
	$this->currentNode = $remaining > 0 ? $this->elements[$remaining - 1] : null;
	if ( $popped->isHtmlNamed( 'mw:p-wrap' ) ) {
		// p-wrappers are not flattened at this point.
		return;
	}
	$popped->flatten( $this->tidyCompat );
}
960
/**
 * Remove all nodes up to and including position $idx from the
 * BalanceStack, flattening them in the process.  Afterwards the stack
 * holds exactly the elements which were below position $idx.
 * @param int $idx
 */
public function popTo( $idx ) {
	// Each pop() shortens the stack by one, so counting down from the
	// initial size keeps $length in step with the real stack size.
	for ( $length = count( $this->elements ); $length > $idx; $length-- ) {
		$this->pop();
	}
}
972
/**
 * Keep popping the current node until a node matching $tag (an
 * element, an HTML tag name, or a set) has itself been popped, or
 * until there is no current node left.
 * @param BalanceElement|array|string $tag
 */
public function popTag( $tag ) {
	while ( $this->currentNode ) {
		// Test before popping: pop() changes the current node.
		$isTarget = $this->currentNode->isA( $tag );
		$this->pop();
		if ( $isTarget ) {
			return;
		}
	}
}
988
/**
 * Pop elements until the current node is a member of $set. The
 * matching node itself is *not* popped, and neither is the element
 * at the bottom of the stack.
 * @param BalanceElement|array|string $set
 */
public function clearToContext( $set ) {
	// Stop while one element is still left: never pop the <html> elt off.
	$remaining = count( $this->elements );
	while ( $remaining > 1 && !$this->currentNode->isA( $set ) ) {
		$this->pop();
		$remaining--;
	}
}
1003
/**
 * Remove the given $elt from the BalanceStack, optionally
 * flattening it in the process.
 *
 * Neither $elt nor its parent may already have been flattened, and
 * $elt must be on the stack. If $elt was the top of the stack, the
 * current node becomes the element below it, or null if the stack is
 * now empty (the same convention pop() uses).
 * @param BalanceElement $elt The element to remove.
 * @param bool $flatten Whether to flatten the removed element.
 */
public function removeElement( BalanceElement $elt, $flatten = true ) {
	Assert::parameter(
		$elt->parent !== 'flat',
		'$elt',
		'$elt should not already have been flattened.'
	);
	Assert::parameter(
		$elt->parent->parent !== 'flat',
		'$elt',
		'The parent of $elt should not already have been flattened.'
	);
	$idx = array_search( $elt, $this->elements, true );
	Assert::parameter( $idx !== false, '$elt', 'must be in stack' );
	array_splice( $this->elements, $idx, 1 );
	if ( $idx === count( $this->elements ) ) {
		// $elt was the top of the stack, so the current node changes.
		// Guard $idx === 0 so we never read the nonexistent offset -1.
		$this->currentNode = $idx > 0 ? $this->elements[$idx - 1] : null;
	}
	if ( $flatten ) {
		// serialize $elt into its parent
		// otherwise, it will eventually serialize when the parent
		// is serialized, we just hold onto the memory for its
		// tree of objects a little longer.
		$elt->flatten( $this->tidyCompat );
	}
	Assert::postcondition(
		array_search( $elt, $this->elements, true ) === false,
		'$elt should no longer be in open elements stack'
	);
}
1039
/**
 * Find $a in the BalanceStack and insert $b after it.
 *
 * If $a is the top of the stack, $b is pushed and becomes the current
 * node; otherwise $b is spliced in directly above $a.
 * @param BalanceElement $a Element that must already be on the stack.
 * @param BalanceElement $b Element to insert.
 */
public function insertAfter( BalanceElement $a, BalanceElement $b ) {
	$idx = $this->indexOf( $a );
	// indexOf() signals "not found" with -1 (never false), so test for
	// that; otherwise a missing $a would silently insert $b at position 0.
	Assert::parameter( $idx >= 0, '$a', 'must be in stack' );
	if ( $idx === count( $this->elements ) - 1 ) {
		array_push( $this->elements, $b );
		$this->currentNode = $b;
	} else {
		array_splice( $this->elements, $idx + 1, 0, [ $b ] );
	}
}
1055
1056 # Fostering and adoption.
1057
/**
 * Foster parent the given $elt (an element, or a string of text) in
 * the stack of open elements.
 *
 * The insertion point is: the last `template` on the stack if it is
 * above the last `table` (or there is no table); otherwise just before
 * the last `table` in that table's parent; otherwise the bottom (html)
 * element of the stack.
 *
 * In tidy-compat mode, fostered text whose parent is in the p-wrap set
 * is inserted through a new mw:p-wrap element instead, and a fostered
 * mw:p-wrap is dropped in favour of an existing mw:p-wrap immediately
 * preceding the insertion point.
 * @param BalanceElement|string $elt
 * @return BalanceElement|string $elt, or the existing p-wrapper that
 *   was re-used in its place.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent
 */
private function fosterParent( $elt ) {
	$tableIdx = $this->indexOf( 'table' );
	$templateIdx = $this->indexOf( 'template' );
	$before = null;

	$templateIsCloser = $templateIdx >= 0 &&
		( $tableIdx < 0 || $templateIdx > $tableIdx );
	if ( $templateIsCloser ) {
		$parent = $this->elements[$templateIdx];
	} elseif ( $tableIdx >= 0 ) {
		$before = $this->elements[$tableIdx];
		$parent = $before->parent;
		# Assume all tables have parents, since we're not running scripts!
		Assert::invariant(
			$parent !== null, "All tables should have parents"
		);
	} else {
		// the `html` element.
		$parent = $this->elements[0];
	}

	if ( $this->tidyCompat ) {
		if ( is_string( $elt ) ) {
			// Fostered text: wrap it if the parent calls for p-wrapping.
			if ( $parent->isA( BalanceSets::$tidyPWrapSet ) ) {
				$this->insertHTMLElement( 'mw:p-wrap', [] );
				$this->insertText( $elt );
				return $elt;
			}
		} elseif ( $elt->isHtmlNamed( 'mw:p-wrap' ) ) {
			// Fostered p-wrapper: merge with an adjacent one if present.
			if ( $before ) {
				$pos = array_search( $before, $parent->children, true );
			} else {
				$pos = count( $parent->children );
			}
			$preceding = $pos > 0 ? $parent->children[$pos - 1] : '';
			if (
				$preceding instanceof BalanceElement &&
				$preceding->isHtmlNamed( 'mw:p-wrap' )
			) {
				// Re-use existing p-wrapper.
				return $preceding;
			}
		}
	}

	if ( $before ) {
		$parent->insertBefore( $before, $elt );
	} else {
		$parent->appendChild( $elt );
	}
	return $elt;
}
1114
/**
 * Run the "adoption agency algorithm" (AAA) for the given subject
 * tag name.
 *
 * The comments in the body quote the steps of the HTML specification
 * that each piece of code implements. The outer loop runs at most
 * eight times.
 * @param string $tag The subject tag name.
 * @param BalanceActiveFormattingElements $afe The current
 *   active formatting elements list.
 * @return bool True if the adoption agency algorithm "did something", false
 *   if more processing is required by the caller.
 * @see https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
 */
public function adoptionAgency( $tag, $afe ) {
	// If the current node is an HTML element whose tag name is subject,
	// and the current node is not in the list of active formatting
	// elements, then pop the current node off the stack of open
	// elements and abort these steps.
	if (
		$this->currentNode->isHtmlNamed( $tag ) &&
		!$afe->isInList( $this->currentNode )
	) {
		$this->pop();
		return true; // no more handling required
	}

	// Let outer loop counter be zero.
	$outer = 0;

	// Outer loop: If outer loop counter is greater than or
	// equal to eight, then abort these steps.
	while ( $outer < 8 ) {
		// Increment outer loop counter by one.
		$outer++;

		// Let the formatting element be the last element in the list
		// of active formatting elements that: is between the end of
		// the list and the last scope marker in the list, if any, or
		// the start of the list otherwise, and has the same tag name
		// as the token.
		$fmtelt = $afe->findElementByTag( $tag );

		// If there is no such node, then abort these steps and instead
		// act as described in the "any other end tag" entry below.
		if ( !$fmtelt ) {
			return false; // false means handle by the default case
		}

		// Otherwise, if there is such a node, but that node is not in
		// the stack of open elements, then this is a parse error;
		// remove the element from the list, and abort these steps.
		$index = $this->indexOf( $fmtelt );
		if ( $index < 0 ) {
			$afe->remove( $fmtelt );
			return true;   // true means no more handling required
		}

		// Otherwise, if there is such a node, and that node is also in
		// the stack of open elements, but the element is not in scope,
		// then this is a parse error; ignore the token, and abort
		// these steps.
		if ( !$this->inScope( $fmtelt ) ) {
			return true;
		}

		// Let the furthest block be the topmost node in the stack of
		// open elements that is lower in the stack than the formatting
		// element, and is an element in the special category. There
		// might not be one.
		$furthestblock = null;
		$furthestblockindex = -1;
		$stacklen = $this->length();
		for ( $i = $index+1; $i < $stacklen; $i++ ) {
			if ( $this->node( $i )->isA( BalanceSets::$specialSet ) ) {
				$furthestblock = $this->node( $i );
				$furthestblockindex = $i;
				break;
			}
		}

		// If there is no furthest block, then the UA must skip the
		// subsequent steps and instead just pop all the nodes from the
		// bottom of the stack of open elements, from the current node
		// up to and including the formatting element, and remove the
		// formatting element from the list of active formatting
		// elements.
		if ( !$furthestblock ) {
			$this->popTag( $fmtelt );
			$afe->remove( $fmtelt );
			return true;
		} else {
			// Let the common ancestor be the element immediately above
			// the formatting element in the stack of open elements.
			$ancestor = $this->node( $index-1 );

			// Let a bookmark note the position of the formatting
			// element in the list of active formatting elements
			// relative to the elements on either side of it in the
			// list.
			$BOOKMARK = new BalanceElement( '[bookmark]', '[bookmark]', [] );
			$afe->insertAfter( $fmtelt, $BOOKMARK );

			// Let node and last node be the furthest block.
			$node = $furthestblock;
			$lastnode = $furthestblock;
			$nodeindex = $furthestblockindex;
			$isAFE = false;

			// Let inner loop counter be zero.
			$inner = 0;

			while ( true ) {

				// Increment inner loop counter by one.
				$inner++;

				// Let node be the element immediately above node in
				// the stack of open elements, or if node is no longer
				// in the stack of open elements (e.g. because it got
				// removed by this algorithm), the element that was
				// immediately above node in the stack of open elements
				// before node was removed.
				$node = $this->node( --$nodeindex );

				// If node is the formatting element, then go
				// to the next step in the overall algorithm.
				if ( $node === $fmtelt ) break;

				// If the inner loop counter is greater than three and node
				// is in the list of active formatting elements, then remove
				// node from the list of active formatting elements.
				$isAFE = $afe->isInList( $node );
				if ( $inner > 3 && $isAFE ) {
					$afe->remove( $node );
					$isAFE = false;
				}

				// If node is not in the list of active formatting
				// elements, then remove node from the stack of open
				// elements and then go back to the step labeled inner
				// loop.
				if ( !$isAFE ) {
					// Don't flatten here, since we're about to relocate
					// parts of this $node.
					$this->removeElement( $node, false );
					continue;
				}

				// Create an element for the token for which the
				// element node was created with common ancestor as
				// the intended parent, replace the entry for node
				// in the list of active formatting elements with an
				// entry for the new element, replace the entry for
				// node in the stack of open elements with an entry for
				// the new element, and let node be the new element.
				$newelt = new BalanceElement(
					$node->namespaceURI, $node->localName, $node->attribs );
				$afe->replace( $node, $newelt );
				$this->replaceAt( $nodeindex, $newelt );
				$node = $newelt;

				// If last node is the furthest block, then move the
				// aforementioned bookmark to be immediately after the
				// new node in the list of active formatting elements.
				if ( $lastnode === $furthestblock ) {
					$afe->remove( $BOOKMARK );
					$afe->insertAfter( $newelt, $BOOKMARK );
				}

				// Insert last node into node, first removing it from
				// its previous parent node if any.
				$node->appendChild( $lastnode );

				// Let last node be node.
				$lastnode = $node;
			}

			// If the common ancestor node is a table, tbody, tfoot,
			// thead, or tr element, then, foster parent whatever last
			// node ended up being in the previous step, first removing
			// it from its previous parent node if any.
			if (
				$this->fosterParentMode &&
				$ancestor->isA( BalanceSets::$tableSectionRowSet )
			) {
				$this->fosterParent( $lastnode );
			} else {
				// Otherwise, append whatever last node ended up being in
				// the previous step to the common ancestor node, first
				// removing it from its previous parent node if any.
				$ancestor->appendChild( $lastnode );
			}

			// Create an element for the token for which the
			// formatting element was created, with furthest block
			// as the intended parent.
			$newelt2 = new BalanceElement(
				$fmtelt->namespaceURI, $fmtelt->localName, $fmtelt->attribs );

			// Take all of the child nodes of the furthest block and
			// append them to the element created in the last step.
			$newelt2->adoptChildren( $furthestblock );

			// Append that new element to the furthest block.
			$furthestblock->appendChild( $newelt2 );

			// Remove the formatting element from the list of active
			// formatting elements, and insert the new element into the
			// list of active formatting elements at the position of
			// the aforementioned bookmark.
			$afe->remove( $fmtelt );
			$afe->replace( $BOOKMARK, $newelt2 );

			// Remove the formatting element from the stack of open
			// elements, and insert the new element into the stack of
			// open elements immediately below the position of the
			// furthest block in that stack.
			$this->removeElement( $fmtelt );
			$this->insertAfter( $furthestblock, $newelt2 );
		}
	}

	return true;
}
1336
/**
 * Return the contents of the open elements stack as a string for
 * debugging: the local names of the elements, bottom of the stack
 * first, separated by single spaces.
 * @return string
 */
public function __toString() {
	$names = [];
	foreach ( $this->elements as $elt ) {
		$names[] = $elt->localName;
	}
	// Separator first: the ( array, separator ) order was removed in PHP 8.0.
	return implode( ' ', $names );
}
1349 }
1350
/**
 * A pseudo-element used as a marker in the list of active formatting elements
 *
 * Markers are linked into the same doubly-linked list as the real
 * formatting elements, so they carry the same link properties.
 *
 * @ingroup Parser
 * @since 1.27
 */
class BalanceMarker {
	/** Next (more recent) entry in the active formatting elements list */
	public $nextAFE;

	/** Previous (less recent) entry in the active formatting elements list */
	public $prevAFE;

	/**
	 * Not used by markers themselves. Declared because
	 * BalanceActiveFormattingElements::__destruct() clears this property
	 * on every list node, markers included; without the declaration that
	 * write would create a dynamic property.
	 */
	public $nextNoah;
}
1361
/**
 * The list of active formatting elements, which is used to handle
 * mis-nested formatting element tags in the HTML5 tree builder
 * specification.
 *
 * The list is a doubly-linked list threaded through the prevAFE and
 * nextAFE properties of its entries (BalanceElement and BalanceMarker
 * objects); $head and $tail point at its two ends.
 *
 * @ingroup Parser
 * @since 1.27
 * @see https://html.spec.whatwg.org/multipage/syntax.html#list-of-active-formatting-elements
 */
class BalanceActiveFormattingElements {
	/** The last (most recent) element in the list */
	private $tail;

	/** The first (least recent) element in the list */
	private $head;

	/**
	 * An array of arrays representing the population of elements in each bucket
	 * according to the Noah's Ark clause. The outer array is stack-like, with each
	 * integer-indexed element representing a segment of the list, bounded by
	 * markers. The first element represents the segment of the list before the
	 * first marker.
	 *
	 * The inner arrays are indexed by "Noah key", which is a string which uniquely
	 * identifies each bucket according to the rules in the spec. The value in
	 * the inner array is the first (least recently inserted) element in the bucket,
	 * and subsequent members of the bucket can be found by iterating through the
	 * singly-linked list via $node->nextNoah.
	 *
	 * This is optimised for the most common case of inserting into a bucket
	 * with zero members, and deleting a bucket containing one member. In the
	 * worst case, iteration through the list is still O(1) in the document
	 * size, since each bucket can have at most 3 members.
	 */
	private $noahTableStack = [ [] ];

	/**
	 * Unlink every entry from its neighbours and drop the list's own
	 * references (presumably to break reference cycles so the entries
	 * can be freed promptly).
	 */
	public function __destruct() {
		for ( $node = $this->head; $node; $node = $next ) {
			$next = $node->nextAFE;
			$node->prevAFE = $node->nextAFE = null;
			// Markers never take part in a Noah bucket; writing nextNoah
			// onto one would create an undeclared (dynamic) property.
			if ( !( $node instanceof BalanceMarker ) ) {
				$node->nextNoah = null;
			}
		}
		$this->head = $this->tail = $this->noahTableStack = null;
	}

	/**
	 * Append a marker to the end of the list and start a fresh Noah
	 * table for the segment of the list that follows it.
	 */
	public function insertMarker() {
		$elt = new BalanceMarker;
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
		$this->noahTableStack[] = [];
	}

	/**
	 * Follow the steps required when the spec requires us to "push onto the
	 * list of active formatting elements".
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException if $elt is already in the list
	 */
	public function push( BalanceElement $elt ) {
		// Must not be in the list already
		if ( $elt->prevAFE !== null || $this->head === $elt ) {
			throw new ParameterAssertionException( '$elt',
				'Cannot insert a node into the AFE list twice' );
		}

		// "Noah's Ark clause" -- if there are already three copies of
		// this element before we encounter a marker, then drop the
		// earliest one (the head of the bucket).
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			$count = 1;
			$head = $tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
				$count++;
			}
			if ( $count >= 3 ) {
				$this->remove( $head );
			}
			$tail->nextNoah = $elt;
		}
		// Add to the main AFE list
		if ( $this->tail ) {
			$this->tail->nextAFE = $elt;
			$elt->prevAFE = $this->tail;
		} else {
			$this->head = $elt;
		}
		$this->tail = $elt;
	}

	/**
	 * Follow the steps required when the spec asks us to "clear the list of
	 * active formatting elements up to the last marker".
	 */
	public function clearToMarker() {
		// Iterate back through the list starting from the tail
		$tail = $this->tail;
		while ( $tail && !( $tail instanceof BalanceMarker ) ) {
			// Unlink the element
			$prev = $tail->prevAFE;
			$tail->prevAFE = null;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail->nextNoah = null;
			$tail = $prev;
		}
		// If we finished on a marker, unlink it and pop it off the Noah table stack
		if ( $tail ) {
			$prev = $tail->prevAFE;
			if ( $prev ) {
				$prev->nextAFE = null;
			}
			$tail = $prev;
			array_pop( $this->noahTableStack );
		} else {
			// No marker: wipe the top-level Noah table (which is the only one)
			$this->noahTableStack[0] = [];
		}
		// If we removed all the elements, clear the head pointer
		if ( !$tail ) {
			$this->head = null;
		}
		$this->tail = $tail;
	}

	/**
	 * Find and return the last element with the specified tag between the
	 * end of the list and the last marker on the list.
	 * Used when parsing &lt;a&gt; "in body mode".
	 * @param string $tag Local name to look for
	 * @return BalanceElement|null The matching element, or null if none
	 */
	public function findElementByTag( $tag ) {
		$elt = $this->tail;
		while ( $elt && !( $elt instanceof BalanceMarker ) ) {
			if ( $elt->localName === $tag ) {
				return $elt;
			}
			$elt = $elt->prevAFE;
		}
		return null;
	}

	/**
	 * Determine whether an element is in the list of formatting elements.
	 * An element is in the list if it is the head or has a predecessor.
	 * @param BalanceElement $elt
	 * @return bool
	 */
	public function isInList( BalanceElement $elt ) {
		return $this->head === $elt || $elt->prevAFE;
	}

	/**
	 * Find the element $elt in the list and remove it.
	 * Used when parsing &lt;a&gt; in body mode.
	 * @param BalanceElement $elt
	 * @throws ParameterAssertionException if $elt is not in the list
	 */
	public function remove( BalanceElement $elt ) {
		if ( $this->head !== $elt && !$elt->prevAFE ) {
			throw new ParameterAssertionException( '$elt',
				"Attempted to remove an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $elt ) {
			$this->head = $elt->nextAFE;
		}
		if ( $this->tail === $elt ) {
			$this->tail = $elt->prevAFE;
		}
		// Update previous element
		if ( $elt->prevAFE ) {
			$elt->prevAFE->nextAFE = $elt->nextAFE;
		}
		// Update next element
		if ( $elt->nextAFE ) {
			$elt->nextAFE->prevAFE = $elt->prevAFE;
		}
		// Clear pointers so that isInList() etc. will work
		$elt->prevAFE = $elt->nextAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $elt );
	}

	/**
	 * Append $elt to the end of its bucket in the Noah table of the
	 * current (last) list segment, creating the bucket if necessary.
	 * @param BalanceElement $elt
	 */
	private function addToNoahList( BalanceElement $elt ) {
		$noahKey = $elt->getNoahKey();
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		if ( !isset( $table[$noahKey] ) ) {
			$table[$noahKey] = $elt;
		} else {
			$tail = $table[$noahKey];
			while ( $tail->nextNoah ) {
				$tail = $tail->nextNoah;
			}
			$tail->nextNoah = $elt;
		}
	}

	/**
	 * Unlink $elt from its bucket in the Noah table of the current
	 * (last) list segment. Does nothing if that table has no bucket
	 * for $elt's key or the bucket does not contain $elt.
	 * @param BalanceElement $elt
	 */
	private function removeFromNoahList( BalanceElement $elt ) {
		$table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ];
		$key = $elt->getNoahKey();
		if ( !isset( $table[$key] ) ) {
			// No bucket for this key in the current segment: nothing to
			// unlink. Bail out rather than read an undefined index and
			// then dereference null below.
			return;
		}
		$noahElt = $table[$key];
		if ( $noahElt === $elt ) {
			if ( $noahElt->nextNoah ) {
				$table[$key] = $noahElt->nextNoah;
				$noahElt->nextNoah = null;
			} else {
				unset( $table[$key] );
			}
		} else {
			do {
				$prevNoahElt = $noahElt;
				$noahElt = $prevNoahElt->nextNoah;
				if ( $noahElt === $elt ) {
					// Found it, unlink
					$prevNoahElt->nextNoah = $elt->nextNoah;
					$elt->nextNoah = null;
					break;
				}
			} while ( $noahElt );
		}
	}

	/**
	 * Find element $a in the list and replace it with element $b
	 * @param BalanceElement $a Element currently in the list
	 * @param BalanceElement $b Element taking its place
	 * @throws ParameterAssertionException if $a is not in the list
	 */
	public function replace( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to replace an element which is not in the AFE list" );
		}
		// Update head and tail pointers
		if ( $this->head === $a ) {
			$this->head = $b;
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		// Update previous element
		if ( $a->prevAFE ) {
			$a->prevAFE->nextAFE = $b;
		}
		// Update next element
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->prevAFE = $a->prevAFE;
		$b->nextAFE = $a->nextAFE;
		$a->nextAFE = $a->prevAFE = null;
		// Update Noah list
		$this->removeFromNoahList( $a );
		$this->addToNoahList( $b );
	}

	/**
	 * Find $a in the list and insert $b after it.
	 * @param BalanceElement $a Element currently in the list
	 * @param BalanceElement $b Element to insert
	 * @throws ParameterAssertionException if $a is not in the list
	 */
	public function insertAfter( BalanceElement $a, BalanceElement $b ) {
		if ( $this->head !== $a && !$a->prevAFE ) {
			throw new ParameterAssertionException( '$a',
				"Attempted to insert after an element which is not in the AFE list" );
		}
		if ( $this->tail === $a ) {
			$this->tail = $b;
		}
		if ( $a->nextAFE ) {
			$a->nextAFE->prevAFE = $b;
		}
		$b->nextAFE = $a->nextAFE;
		$b->prevAFE = $a;
		$a->nextAFE = $b;
		$this->addToNoahList( $b );
	}

	// @codingStandardsIgnoreStart Generic.Files.LineLength.TooLong
	/**
	 * Reconstruct the active formatting elements.
	 * @param BalanceStack $stack The open elements stack
	 * @see https://html.spec.whatwg.org/multipage/syntax.html#reconstruct-the-active-formatting-elements
	 */
	// @codingStandardsIgnoreEnd
	public function reconstruct( $stack ) {
		$entry = $this->tail;
		// If there are no entries in the list of active formatting elements,
		// then there is nothing to reconstruct
		if ( !$entry ) {
			return;
		}
		// If the last is a marker, do nothing.
		if ( $entry instanceof BalanceMarker ) {
			return;
		}
		// Or if it is an open element, do nothing.
		if ( $stack->indexOf( $entry ) >= 0 ) {
			return;
		}

		// Loop backward through the list until we find a marker or an
		// open element
		$foundit = false;
		while ( $entry->prevAFE ) {
			$entry = $entry->prevAFE;
			if ( $entry instanceof BalanceMarker || $stack->indexOf( $entry ) >= 0 ) {
				$foundit = true;
				break;
			}
		}

		// Now loop forward, starting from the element after the current one (or
		// the first element if we didn't find a marker or open element),
		// recreating formatting elements and pushing them back onto the list
		// of open elements.
		if ( $foundit ) {
			$entry = $entry->nextAFE;
		}
		do {
			$newElement = $stack->insertHTMLElement(
				$entry->localName,
				$entry->attribs );
			$this->replace( $entry, $newElement );
			$entry = $newElement->nextAFE;
		} while ( $entry );
	}

	/**
	 * Get a string representation of the AFE list, for debugging
	 * @return string One line per entry, head first; inconsistent
	 *   links or a wrong tail pointer are flagged in the output.
	 */
	public function __toString() {
		$prev = null;
		$s = '';
		for ( $node = $this->head; $node; $prev = $node, $node = $node->nextAFE ) {
			if ( $node instanceof BalanceMarker ) {
				$s .= "MARKER\n";
				continue;
			}
			$s .= $node->localName . '#' . substr( md5( spl_object_hash( $node ) ), 0, 8 );
			if ( $node->nextNoah ) {
				$s .= " (noah sibling: {$node->nextNoah->localName}#" .
					substr( md5( spl_object_hash( $node->nextNoah ) ), 0, 8 ) .
					')';
			}
			if ( $node->nextAFE && $node->nextAFE->prevAFE !== $node ) {
				$s .= " (reverse link is wrong!)";
			}
			$s .= "\n";
		}
		if ( $prev !== $this->tail ) {
			$s .= "(tail pointer is wrong!)\n";
		}
		return $s;
	}
}
1717
1718 /**
1719 * An implementation of the tree building portion of the HTML5 parsing
1720 * spec.
1721 *
1722 * This is used to balance and tidy output so that the result can
1723 * always be cleanly serialized/deserialized by an HTML5 parser. It
1724 * does *not* guarantee "conforming" output -- the HTML5 spec contains
1725 * a number of constraints which are not enforced by the HTML5 parsing
1726 * process. But the result will be free of gross errors: misnested or
 * unclosed tags, for example, and will be unchanged by spec-compliant
1728 * parsing followed by serialization.
1729 *
1730 * The tree building stage is structured as a state machine.
1731 * When comparing the implementation to
1732 * https://www.w3.org/TR/html5/syntax.html#tree-construction
1733 * note that each state is implemented as a function with a
1734 * name ending in `Mode` (because the HTML spec refers to them
1735 * as insertion modes). The current insertion mode is held by
1736 * the $parseMode property.
1737 *
1738 * The following simplifications have been made:
1739 * - We handle body content only (ie, we start `in body`.)
1740 * - The document is never in "quirks mode".
1741 * - All occurrences of < and > have been entity escaped, so we
1742 * can parse tags by simply splitting on those two characters.
1743 * The character < must not appear inside comments.
1744 * Similarly, all attributes have been "cleaned" and are double-quoted
1745 * and escaped.
1746 * - All null characters are assumed to have been removed.
1747 * - We don't alter linefeeds after <pre>/<listing>.
1748 * - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
1749 * <frame>, <plaintext>, <isindex>, <textarea>, <xmp>, <iframe>,
1750 * <noembed>, <noscript>, <script>, <title>. As a result,
1751 * further simplifications can be made:
1752 * - `frameset-ok` is not tracked.
1753 * - `head element pointer` is not tracked (but presumed non-null)
1754 * - Tokenizer has only a single mode.
1755 *
1756 * We generally mark places where we omit cases from the spec due to
1757 * disallowed elements with a comment: `# OMITTED: <element-name>`.
1758 *
1759 * The HTML spec keeps a flag during the parsing process to track
1760 * whether or not a "parse error" has been encountered. We don't
1761 * bother to track that flag, we just implement the error-handling
1762 * process as specified.
1763 *
1764 * @ingroup Parser
1765 * @since 1.27
1766 * @see https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
1767 */
1768 class Balancer {
/** Name of the method implementing the current insertion mode, e.g. 'inBodyMode' */
private $parseMode;
/** Iterator over the pieces of the input text split on '<' */
private $bitsIterator;
/** Value of the 'allowedHtmlElements' configuration option (array or null) */
private $allowedHtmlElements;
/** @var BalanceActiveFormattingElements The list of active formatting elements */
private $afe;
/** @var BalanceStack The stack of open elements */
private $stack;
/** Value of the 'strict' configuration option */
private $strict;
/** Value of the 'tidyCompat' configuration option */
private $tidyCompat;
/** Value of the 'allowComments' configuration option */
private $allowComments;

private $textIntegrationMode = false;
private $pendingTableText;
private $originalInsertionMode;
/** @var BalanceElement The context element of the fragment being balanced */
private $fragmentContext;
/** @var BalanceElement|null The form element pointer, if any */
private $formElementPointer;

/**
 * Callback passed to balance(), stored for the duration of the run.
 * Declared explicitly so that balance() does not create it as a
 * dynamic property; kept public because that is the visibility it
 * had as a dynamic property.
 * @var callable|null
 */
public $processingCallback;
/**
 * Arguments for $processingCallback, as passed to balance().
 * Public for the same reason as $processingCallback.
 * @var array|bool
 */
public $processingArgs;
1783
1784 /**
1785 * Valid HTML5 comments.
1786 * Regex borrowed from Tim Starling's "remex-html" project.
1787 */
1788 const VALID_COMMENT_REGEX = "~ !--
1789 ( # 1. Comment match detector
1790 > | -> | # Invalid short close
1791 ( # 2. Comment contents
1792 (?:
1793 (?! --> )
1794 (?! --!> )
1795 (?! --! \z )
1796 (?! -- \z )
1797 (?! - \z )
1798 .
1799 )*+
1800 )
1801 ( # 3. Comment close
1802 --> | # Normal close
1803 --!> | # Comment end bang
1804 ( # 4. Indicate matches requiring EOF
1805 --! | # EOF in comment end bang state
1806 -- | # EOF in comment end state
1807 - | # EOF in comment end dash state
1808 # EOF in comment state
1809 )
1810 )
1811 )
1812 ([^<]*) \z # 5. Non-tag text after the comment
1813 ~xs";
1814
/**
 * Create a new Balancer.
 * @param array $config Balancer configuration.  Includes:
 *     'strict' : boolean, defaults to false.
 *         When true, enforces syntactic constraints on input:
 *         all non-tag '<' must be escaped, all attributes must be
 *         separated by a single space and double-quoted.  This is
 *         consistent with the output of the Sanitizer.
 *     'allowedHtmlElements' : array, defaults to null.
 *         When present, the keys of this associative array give
 *         the acceptable HTML tag names.  When not present, no
 *         tag sanitization is done.
 *     'tidyCompat' : boolean, defaults to false.
 *         When true, the serialization algorithm is tweaked to
 *         provide historical compatibility with the old "tidy"
 *         program: <p>-wrapping is done to the children of
 *         <body> and <blockquote> elements, and empty elements
 *         are removed.
 *     'allowComments': boolean, defaults to true.
 *         When true, allows HTML comments in the input.
 *         The Sanitizer generally strips all comments, so if you
 *         are running on sanitized output you can set this to
 *         false to get a bit more performance.
 * @throws ParameterAssertionException if 'allowedHtmlElements' names
 *     an element which the balancer does not support.
 */
public function __construct( array $config = [] ) {
	// Caller-supplied options win; missing ones get these defaults.
	$config = $config + [
		'strict' => false,
		'allowedHtmlElements' => null,
		'tidyCompat' => false,
		'allowComments' => true,
	];
	$this->allowedHtmlElements = $config['allowedHtmlElements'];
	$this->strict = $config['strict'];
	$this->tidyCompat = $config['tidyCompat'];
	$this->allowComments = $config['allowComments'];
	if ( $this->allowedHtmlElements !== null ) {
		# Sanity check!
		// Only the keys (tag names) matter, so intersect by key.
		$bad = array_intersect_key(
			$this->allowedHtmlElements,
			BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE]
		);
		if ( count( $bad ) > 0 ) {
			// Separator first: the ( array, separator ) order was
			// removed in PHP 8.0.
			$badstr = implode( ',', array_keys( $bad ) );
			throw new ParameterAssertionException(
				'$config',
				'Balance attempted with sanitization including ' .
				"unsupported elements: {$badstr}"
			);
		}
	}
}
1871
/**
 * Balance the HTML fragment $text and return the resulting markup,
 * subject to the caveats listed in the class description. The result
 * will typically be idempotent -- that is, running the output through
 * the balancer again would result in no change.
 *
 * The fragment is parsed as the contents of a <body> element. All
 * per-run state is set up at the start and released before returning.
 *
 * @param string $text The markup to be balanced
 * @param callable $processingCallback Callback to do any variable or
 *                 parameter replacements in HTML attributes values
 * @param array|bool $processingArgs Arguments for the processing callback
 * @return string The balanced markup
 */
public function balance( $text, $processingCallback = null, $processingArgs = [] ) {
	// Fresh tree-builder state for this run.
	$this->parseMode = 'inBodyMode';
	$this->bitsIterator = new ExplodeIterator( '<', $text );
	$this->afe = new BalanceActiveFormattingElements();
	$this->stack = new BalanceStack();
	$this->stack->tidyCompat = $this->tidyCompat;
	$this->processingCallback = $processingCallback;
	$this->processingArgs = $processingArgs;

	# The stack is constructed with an <html> element already on it.
	# Set this up as a fragment parsed with <body> as the context.
	$this->fragmentContext =
		new BalanceElement( BalanceSets::HTML_NAMESPACE, 'body', [] );
	$this->resetInsertionMode();

	// The form element pointer is the nearest `form` among the context
	// element and its ancestors, if there is one.
	$this->formElementPointer = null;
	$candidate = $this->fragmentContext;
	while ( $candidate != null ) {
		if ( $candidate->isHtmlNamed( 'form' ) ) {
			$this->formElementPointer = $candidate;
			break;
		}
		$candidate = $candidate->parent;
	}

	// Everything before the first '<' is text, not a tag.
	$leadingText = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$this->insertToken( 'text', str_replace( '>', '&gt;', $leadingText ) );
	// Each remaining piece starts with a tag.
	while ( $this->bitsIterator->valid() ) {
		$this->advance();
	}
	$this->insertToken( 'eof', null );
	$balanced = $this->stack->getOutput();

	// Release the per-run state so its memory can be freed.
	$this->bitsIterator = null;
	$this->afe = null;
	$this->stack = null;
	$this->fragmentContext = null;
	$this->formElementPointer = null;
	return $balanced;
}
1924
/**
 * Pass a token to the tree builder.
 *
 * Unsupported tags are rejected, empty text is swallowed, and everything
 * else is dispatched either to insertForeignToken() or to the handler
 * method named by the current parse mode.
 *
 * @param string $token One of the strings "tag", "endtag", "text",
 *   "comment" or "eof"
 * @param string|null $value Tag name for tag tokens, contents for text
 *   and comment tokens, null for "eof"
 * @param array|null $attribs Attributes of a tag token
 * @param bool $selfclose Whether a tag token ended in "/>"
 * @return bool|null False if the tag is unsupported (the caller then
 *   serializes it as text), true for empty text, otherwise whatever the
 *   selected handler returns
 */
private function insertToken( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' && $value === '' ) {
		# Don't actually inject the empty string as a text token.
		return true;
	}
	if (
		( $token === 'tag' || $token === 'endtag' ) &&
		isset( BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE][$value] )
	) {
		# As described in "simplifications" above, these tags are
		# not supported in the balancer.
		Assert::invariant(
			!$this->strict,
			"Unsupported $token <$value> found."
		);
		return false;
	}

	// Decide between the HTML insertion-mode handlers and the
	// foreign-content handler, based on the adjusted current node.
	$adjusted = $this->stack->adjustedCurrentNode( $this->fragmentContext );

	if (
		$this->stack->length() === 0 ||
		$adjusted->isHtml() ||
		$token === 'eof'
	) {
		$useHtmlRules = true;
	} elseif ( $adjusted->isMathmlTextIntegrationPoint() ) {
		// Text, and start tags other than <mglyph>/<malignmark>, are
		// handled as HTML here; nothing else is.
		$useHtmlRules =
			$token === 'text' ||
			( $token === 'tag' && $value !== 'mglyph' && $value !== 'malignmark' );
	} else {
		// <svg> directly inside MathML <annotation-xml> ...
		$svgInAnnotationXml =
			$adjusted->namespaceURI === BalanceSets::MATHML_NAMESPACE &&
			$adjusted->localName === 'annotation-xml' &&
			$token === 'tag' && $value === 'svg';
		// ... or a start tag / text at an HTML integration point.
		$useHtmlRules = $svgInAnnotationXml || (
			$adjusted->isHtmlIntegrationPoint() &&
			( $token === 'tag' || $token === 'text' )
		);
	}

	if ( !$useHtmlRules ) {
		return $this->insertForeignToken( $token, $value, $attribs, $selfclose );
	}
	// The parse mode doubles as the name of the handler method.
	$handler = $this->parseMode;
	return $this->$handler( $token, $value, $attribs, $selfclose );
}
1983
/**
 * Handle a token while the adjusted current node is in foreign
 * (non-HTML) content; called from insertToken().
 *
 * @param string $token "text", "comment", "tag" or "endtag"
 * @param string|null $value Tag name, or text / comment contents
 * @param array|null $attribs Attributes of a tag token
 * @param bool $selfclose Whether a tag token ended in "/>"
 * @return bool|null True once the token is handled, or the result of
 *   reprocessing it with the HTML rules
 */
private function insertForeignToken( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	} elseif ( $token === 'comment' ) {
		// Comments are kept in foreign content too; without this
		// branch they would fall through and be silently dropped.
		$this->stack->insertComment( $value );
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			case 'font':
				// Per the HTML spec only a <font> carrying a color, face
				// or size attribute breaks out of foreign content; a
				// plain <font> is handled as "any other start tag".
				if (
					!isset( $attribs['color'] ) &&
					!isset( $attribs['face'] ) &&
					!isset( $attribs['size'] )
				) {
					break;
				}
				/* otherwise, fall through */
			case 'b':
			case 'big':
			case 'blockquote':
			case 'body':
			case 'br':
			case 'center':
			case 'code':
			case 'dd':
			case 'div':
			case 'dl':
			case 'dt':
			case 'em':
			case 'embed':
			case 'h1':
			case 'h2':
			case 'h3':
			case 'h4':
			case 'h5':
			case 'h6':
			case 'head':
			case 'hr':
			case 'i':
			case 'img':
			case 'li':
			case 'listing':
			case 'menu':
			case 'meta':
			case 'nobr':
			case 'ol':
			case 'p':
			case 'pre':
			case 'ruby':
			case 's':
			case 'small':
			case 'span':
			case 'strong':
			case 'strike':
			case 'sub':
			case 'sup':
			case 'table':
			case 'tt':
			case 'u':
			case 'ul':
			case 'var':
				if ( $this->fragmentContext ) {
					// Fragment case: treat as "any other start tag".
					break;
				}
				// Pop foreign elements until we are back at an integration
				// point or an HTML element, then reprocess the token.
				while ( true ) {
					$this->stack->pop();
					$node = $this->stack->currentNode;
					if (
						$node->isMathmlTextIntegrationPoint() ||
						$node->isHtmlIntegrationPoint() ||
						$node->isHtml()
					) {
						break;
					}
				}
				return $this->insertToken( $token, $value, $attribs, $selfclose );
		}
		// "Any other start tag": the new element takes the namespace of
		// the adjusted current node.
		$adjusted = ( $this->fragmentContext && $this->stack->length() === 1 ) ?
			$this->fragmentContext : $this->stack->currentNode;
		$this->stack->insertForeignElement(
			$adjusted->namespaceURI, $value, $attribs
		);
		if ( $selfclose ) {
			$this->stack->pop();
		}
		return true;
	} elseif ( $token === 'endtag' ) {
		$first = true;
		foreach ( $this->stack as $i => $node ) {
			if ( $node->isHtml() && !$first ) {
				// process the end tag as HTML
				$func = $this->parseMode;
				return $this->$func( $token, $value, $attribs, $selfclose );
			} elseif ( $i === 0 ) {
				// Reached the bottom of the stack without a match.
				return true;
			} elseif ( $node->localName === $value ) {
				$this->stack->popTag( $node );
				return true;
			}
			$first = false;
		}
	}
}
2085
/**
 * Grab the next "token" from $bitsIterator. This is either a open/close
 * tag or text or a comment, depending on whether the Sanitizer approves.
 *
 * The piece taken from the iterator is the input that followed one '<'.
 * A piece that does not form an acceptable tag is emitted as text, with
 * the '<' re-added as "&lt;".
 */
private function advance() {
	$x = $this->bitsIterator->current();
	$this->bitsIterator->next();
	$regs = [];
	# Handle comments. These won't be generated by mediawiki (they
	# are stripped in the Sanitizer) but may be generated by extensions.
	if (
		$this->allowComments &&
		preg_match( Balancer::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
		/* verify EOF condition where necessary */
		// With PREG_OFFSET_CAPTURE a group that did not take part in the
		// match has offset -1. NOTE(review): group 4 presumably marks a
		// comment that is only valid at end of input -- confirm against
		// VALID_COMMENT_REGEX.
		( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
	) {
		$contents = $regs[2][0];
		$rest = $regs[5][0];
		$this->insertToken( 'comment', $contents );
		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
		return;
	}
	# $slash: Does the current element start with a '/'?
	# $t: Current element name
	# $attribStr: String between element name and >
	# $brace: Ending '>' or '/>'
	# $rest: Everything until the next element from the $bitsIterator
	if ( preg_match( Sanitizer::ELEMENT_BITS_REGEX, $x, $regs ) ) {
		list( /* $qbar */, $slash, $t, $attribStr, $brace, $rest ) = $regs;
		$t = strtolower( $t );
		if ( $this->strict ) {
			/* Verify that attributes are all properly double-quoted */
			Assert::invariant(
				preg_match(
					'/^( [:_A-Z0-9][-.:_A-Z0-9]*="[^"]*")*[ ]*$/i', $attribStr
				),
				"Bad attribute string found"
			);
		}
	} else {
		Assert::invariant(
			!$this->strict, "< found which does not start a valid tag"
		);
		$slash = $t = $attribStr = $brace = $rest = null;
	}
	$goodtag = $t;
	$sanitize = $this->allowedHtmlElements !== null;
	if ( $sanitize ) {
		$goodtag = $t && isset( $this->allowedHtmlElements[$t] );
	}
	if ( $goodtag ) {
		if ( is_callable( $this->processingCallback ) ) {
			// $attribStr is passed by reference so the callback can
			// rewrite the attribute string in place.
			call_user_func_array( $this->processingCallback, [ &$attribStr, $this->processingArgs ] );
		}
		if ( $sanitize ) {
			$goodtag = Sanitizer::validateTag( $attribStr, $t );
		}
	}
	if ( $goodtag ) {
		// Attributes are always decoded; they are additionally validated
		// only when sanitizing.
		$attribs = Sanitizer::decodeTagAttributes( $attribStr );
		if ( $sanitize ) {
			$attribs = Sanitizer::validateTagAttributes( $attribs, $t );
		}
		$goodtag = $this->insertToken(
			$slash ? 'endtag' : 'tag', $t, $attribs, $brace === '/>'
		);
	}
	if ( $goodtag ) {
		// A single pass is enough: no '>' is left after escaping.
		$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
	} else {
		# bad tag; serialize entire thing as text.
		$this->insertToken( 'text', '&lt;' . str_replace( '>', '&gt;', $x ) );
	}
}
2163
/**
 * Make $mode the current parse mode.
 *
 * The mode is also the name of the handler method that insertToken()
 * dispatches to, so it must end in "Mode".
 *
 * @param string $mode Name of the new mode
 * @return string The mode that was active before the switch
 */
private function switchMode( $mode ) {
	$suffix = substr( $mode, -4 );
	Assert::parameter( $suffix === 'Mode', '$mode', 'should end in Mode' );

	$previous = $this->parseMode;
	$this->parseMode = $mode;

	return $previous;
}
2172
/**
 * Switch to $mode, then feed the same token through insertToken() again
 * so that it is handled under the new mode.
 *
 * @param string $mode Name of the mode to switch to
 * @param string $token Token type
 * @param string|null $value Token value
 * @param array|null $attribs Attributes of a tag token
 * @param bool $selfclose Whether a tag token ended in "/>"
 * @return bool|null Result of reprocessing the token
 */
private function switchModeAndReprocess( $mode, $token, $value, $attribs, $selfclose ) {
	$this->switchMode( $mode );
	$outcome = $this->insertToken( $token, $value, $attribs, $selfclose );
	return $outcome;
}
2177
/**
 * Choose the parse mode that matches the current stack of open elements.
 *
 * The stack is walked from the current node down to the bottom entry
 * (key 0); the first element with a known mode decides. At the bottom
 * entry the fragment context, if any, stands in for the node, and
 * 'inBodyMode' is the fallback.
 */
private function resetInsertionMode() {
	$last = false;
	foreach ( $this->stack as $i => $node ) {
		if ( $i === 0 ) {
			$last = true;
			if ( $this->fragmentContext ) {
				$node = $this->fragmentContext;
			}
		}
		if ( $node->isHtml() ) {
			switch ( $node->localName ) {
				case 'select':
					// Look at the ancestors of the <select>, nearest first:
					// these are the stack entries *below* index $i. A
					// <template> ends the search; a <table> means we are in
					// a select inside a table. When $last is set $i is 0 and
					// the loop does not run, as the spec requires.
					// NOTE(review): assumes node() takes the same
					// bottom-based index as the iteration key -- confirm
					// against BalanceStack::node().
					for ( $j = $i - 1; $j >= 0; $j-- ) {
						$ancestor = $this->stack->node( $j );
						if ( $ancestor->isHtmlNamed( 'template' ) ) {
							break;
						}
						if ( $ancestor->isHtmlNamed( 'table' ) ) {
							$this->switchMode( 'inSelectInTableMode' );
							return;
						}
					}
					$this->switchMode( 'inSelectMode' );
					return;
				case 'tr':
					$this->switchMode( 'inRowMode' );
					return;
				case 'tbody':
				case 'tfoot':
				case 'thead':
					$this->switchMode( 'inTableBodyMode' );
					return;
				case 'caption':
					$this->switchMode( 'inCaptionMode' );
					return;
				case 'colgroup':
					$this->switchMode( 'inColumnGroupMode' );
					return;
				case 'table':
					$this->switchMode( 'inTableMode' );
					return;
				case 'template':
					// Use the most recently pushed template insertion mode.
					$this->switchMode(
						array_slice( $this->templateInsertionModes, -1 )[0]
					);
					return;
				case 'body':
					$this->switchMode( 'inBodyMode' );
					return;
				# OMITTED: <frameset>
				# OMITTED: <html>
				# OMITTED: <head>
				default:
					if ( !$last ) {
						# OMITTED: <head>
						if ( $node->isA( BalanceSets::$tableCellSet ) ) {
							$this->switchMode( 'inCellMode' );
							return;
						}
					}
			}
		}
		if ( $last ) {
			$this->switchMode( 'inBodyMode' );
			return;
		}
	}
}
2247
/**
 * End the parse: drop the active formatting elements and pop the stack
 * of open elements down to its bottom <html> entry.
 */
private function stopParsing() {
	# Of the spec's "stop parsing" steps only step 2 applies here:
	# "pop all the nodes off the stack of open elements".
	# The top-most <html> element is deliberately left on the stack.

	# The AFE list is released before popping; otherwise the element
	# objects would stay live during serialization, potentially using
	# O(N^2) memory. Popping the stack never needs to reconstruct the
	# active formatting elements, so it is safe to drop the list first.
	$this->afe = null;

	$this->stack->popTo( 1 );
}
2260
/**
 * Open a raw-text element: insert it, remember the current parse mode
 * in $originalInsertionMode, and continue in 'inTextMode'.
 *
 * @param string $value Tag name
 * @param array|null $attribs Attributes of the tag
 * @return bool Always true
 */
private function parseRawText( $value, $attribs = null ) {
	$this->stack->insertHTMLElement( $value, $attribs );
	// XXX switch tokenizer to rawtext state?
	// switchMode() hands back the mode being left, which inTextMode()
	// restores later.
	$modeBefore = $this->switchMode( 'inTextMode' );
	$this->originalInsertionMode = $modeBefore;
	return true;
}
2267
/**
 * Handler for the mode entered by parseRawText().
 *
 * Text is appended to the current element. An end tag or end of input
 * pops the element and returns to the saved mode; end of input is then
 * reprocessed under that mode. Every other token is ignored.
 *
 * @param string $token Token type
 * @param string|null $value Token value
 * @param array|null $attribs Attributes of a tag token
 * @param bool $selfclose Whether a tag token ended in "/>"
 * @return bool|null True, or the result of reprocessing "eof"
 */
private function inTextMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	}
	if ( $token === 'eof' || $token === 'endtag' ) {
		// Either way the raw-text element is finished.
		$this->stack->pop();
		if ( $token === 'endtag' ) {
			$this->switchMode( $this->originalInsertionMode );
			return true;
		}
		return $this->switchModeAndReprocess(
			$this->originalInsertionMode, $token, $value, $attribs, $selfclose
		);
	}
	// Anything else is silently ignored in this mode.
	return true;
}
2284
/**
 * Handle a token using the "in head" rules.
 *
 * Within the visible code this is only called directly by other mode
 * handlers (for head-ish tags and for </template>).
 *
 * @param string $token Token type
 * @param string|null $value Token value
 * @param array|null $attribs Attributes of a tag token
 * @param bool $selfclose Whether a tag token ended in "/>"
 * @return bool|null True once handled, or the result of reprocessing
 */
private function inHeadMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}

	if ( $token === 'text' ) {
		// Leading whitespace is inserted here; only a non-whitespace
		// remainder (if any) continues to the bottom of the function.
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $leading ) ) {
			$this->stack->insertText( $leading[0] );
			$value = substr( $value, strlen( $leading[0] ) );
		}
		if ( strlen( $value ) === 0 ) {
			return true; // All text handled.
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			# OMITTED: in a full HTML parser, <meta> might change the
			# encoding.
			# OMITTED: <html>
			case 'meta':
			case 'base':
			case 'basefont':
			case 'bgsound':
			case 'link':
				// Inserted and immediately closed.
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->pop();
				return true;
			# OMITTED: <title>
			# OMITTED: <noscript>
			# OMITTED: <script>
			case 'noframes':
			case 'style':
				return $this->parseRawText( $value, $attribs );
			case 'template':
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->afe->insertMarker();
				# OMITTED: frameset_ok
				$this->switchMode( 'inTemplateMode' );
				// Record the mode just entered as the template's mode.
				$this->templateInsertionModes[] = $this->parseMode;
				return true;
			# OMITTED: <head>
		}
	} elseif ( $token === 'endtag' ) {
		# OMITTED: <head>
		# OMITTED: <body>
		# OMITTED: <html>
		if ( $value === 'template' ) {
			if ( $this->stack->indexOf( $value ) < 0 ) {
				return true; // Ignore the token.
			}
			$this->stack->generateImpliedEndTags( null, true /* thorough */ );
			$this->stack->popTag( $value );
			$this->afe->clearToMarker();
			array_pop( $this->templateInsertionModes );
			$this->resetInsertionMode();
			return true;
		}
		if ( $value !== 'br' ) {
			// ignore any other end tag
			return true;
		}
		// </br> is handled at the bottom of the function.
	}

	// If not handled above: act as if a </head> had been seen (in this
	// method that end tag is itself simply ignored) ...
	$this->inHeadMode( 'endtag', 'head' ); // synthetic </head>
	// ... then redo this one
	return $this->insertToken( $token, $value, $attribs, $selfclose );
}
2354
/**
 * Handle a token using the "in body" rules. This is the mode balance()
 * starts in, and several other mode handlers delegate to it.
 *
 * Tags marked "OMITTED" in the comments below are deliberately not
 * handled here.
 *
 * @param string $token "text", "eof", "comment", "tag" or "endtag";
 *   anything else trips an assertion
 * @param string|null $value Tag name, or text / comment contents
 * @param array|null $attribs Attributes of a tag token
 * @param bool $selfclose Whether a tag token ended in "/>"
 * @return bool|null True once handled, or the result of the handler the
 *   token was delegated to
 */
private function inBodyMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		$this->afe->reconstruct( $this->stack );
		$this->stack->insertText( $value );
		return true;
	}

	if ( $token === 'eof' ) {
		if ( !empty( $this->templateInsertionModes ) ) {
			return $this->inTemplateMode( $token, $value, $attribs, $selfclose );
		}
		$this->stopParsing();
		return true;
	}

	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}

	if ( $token === 'tag' ) {
		switch ( $value ) {
			# OMITTED: <html>
			# OMITTED: <script>
			# OMITTED: <title>
			case 'base':
			case 'basefont':
			case 'bgsound':
			case 'link':
			case 'meta':
			case 'noframes':
			case 'style':
			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfclose );
			# OMITTED: <body>
			# OMITTED: <frameset>

			// Block-level containers: close an open <p>, then insert.
			case 'address':
			case 'article':
			case 'aside':
			case 'blockquote':
			case 'center':
			case 'details':
			case 'dialog':
			case 'dir':
			case 'div':
			case 'dl':
			case 'fieldset':
			case 'figcaption':
			case 'figure':
			case 'footer':
			case 'header':
			case 'hgroup':
			case 'main':
			case 'menu':
			case 'nav':
			case 'ol':
			case 'p':
			case 'section':
			case 'summary':
			case 'ul':
			# For <pre> and <listing>, as described in "simplifications"
			# above:
			# 1. We don't touch the next token, even if it's a linefeed.
			# 2. OMITTED: frameset_ok
			case 'pre':
			case 'listing':
				if ( $this->stack->inButtonScope( 'p' ) ) {
					$this->inBodyMode( 'endtag', 'p' );
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			case 'h1':
			case 'h2':
			case 'h3':
			case 'h4':
			case 'h5':
			case 'h6':
				if ( $this->stack->inButtonScope( 'p' ) ) {
					$this->inBodyMode( 'endtag', 'p' );
				}
				// Headings do not nest: drop an open one first.
				if ( $this->stack->currentNode->isA( BalanceSets::$headingSet ) ) {
					$this->stack->pop();
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			case 'form':
				$outsideTemplate = $this->stack->indexOf( 'template' ) < 0;
				if ( $this->formElementPointer && $outsideTemplate ) {
					return true; // in a form, not in a template.
				}
				if ( $this->stack->inButtonScope( "p" ) ) {
					$this->inBodyMode( 'endtag', 'p' );
				}
				$formElement = $this->stack->insertHTMLElement( $value, $attribs );
				if ( $this->stack->indexOf( 'template' ) < 0 ) {
					$this->formElementPointer = $formElement;
				}
				return true;

			case 'li':
				# OMITTED: frameset_ok
				// Close an open <li>, unless a special element other than
				// address/div/p is found first.
				foreach ( $this->stack as $open ) {
					if ( $open->isHtmlNamed( 'li' ) ) {
						$this->inBodyMode( 'endtag', 'li' );
						break;
					}
					if (
						$open->isA( BalanceSets::$specialSet ) &&
						!$open->isA( BalanceSets::$addressDivPSet )
					) {
						break;
					}
				}
				if ( $this->stack->inButtonScope( 'p' ) ) {
					$this->inBodyMode( 'endtag', 'p' );
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			case 'dd':
			case 'dt':
				# OMITTED: frameset_ok
				// Same idea as <li>, for an open <dd> or <dt>.
				foreach ( $this->stack as $open ) {
					if ( $open->isHtmlNamed( 'dd' ) ) {
						$this->inBodyMode( 'endtag', 'dd' );
						break;
					}
					if ( $open->isHtmlNamed( 'dt' ) ) {
						$this->inBodyMode( 'endtag', 'dt' );
						break;
					}
					if (
						$open->isA( BalanceSets::$specialSet ) &&
						!$open->isA( BalanceSets::$addressDivPSet )
					) {
						break;
					}
				}
				if ( $this->stack->inButtonScope( 'p' ) ) {
					$this->inBodyMode( 'endtag', 'p' );
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			# OMITTED: <plaintext>

			case 'button':
				if ( $this->stack->inScope( 'button' ) ) {
					// Close the open button, then start over with this tag.
					$this->inBodyMode( 'endtag', 'button' );
					return $this->insertToken( $token, $value, $attribs, $selfclose );
				}
				$this->afe->reconstruct( $this->stack );
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			case 'a':
				$openAnchor = $this->afe->findElementByTag( 'a' );
				if ( $openAnchor ) {
					$this->inBodyMode( 'endtag', 'a' );
					if ( $this->afe->isInList( $openAnchor ) ) {
						$this->afe->remove( $openAnchor );
						// Don't flatten here, since when we fall
						// through below we might foster parent
						// the new <a> tag inside this one.
						$this->stack->removeElement( $openAnchor, false );
					}
				}
				/* Falls through */
			case 'b':
			case 'big':
			case 'code':
			case 'em':
			case 'font':
			case 'i':
			case 's':
			case 'small':
			case 'strike':
			case 'strong':
			case 'tt':
			case 'u':
				$this->afe->reconstruct( $this->stack );
				$this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ), $attribs );
				return true;

			case 'nobr':
				$this->afe->reconstruct( $this->stack );
				if ( $this->stack->inScope( 'nobr' ) ) {
					$this->inBodyMode( 'endtag', 'nobr' );
					$this->afe->reconstruct( $this->stack );
				}
				$this->afe->push( $this->stack->insertHTMLElement( $value, $attribs ), $attribs );
				return true;

			case 'applet':
			case 'marquee':
			case 'object':
				$this->afe->reconstruct( $this->stack );
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->afe->insertMarker();
				# OMITTED: frameset_ok
				return true;

			case 'table':
				# The document is never in "quirks mode"; see simplifications
				# above.
				if ( $this->stack->inButtonScope( 'p' ) ) {
					$this->inBodyMode( 'endtag', 'p' );
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				# OMITTED: frameset_ok
				$this->switchMode( 'inTableMode' );
				return true;

			// Void elements inserted after reconstructing the active
			// formatting elements.
			# OMITTED: frameset_ok
			# (hence we don't need to examine <input>'s "type" attribute)
			case 'area':
			case 'br':
			case 'embed':
			case 'img':
			case 'keygen':
			case 'wbr':
			case 'input':
				$this->afe->reconstruct( $this->stack );
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->pop();
				return true;

			// Void elements inserted as-is.
			case 'menuitem':
			case 'param':
			case 'source':
			case 'track':
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->pop();
				return true;

			case 'hr':
				if ( $this->stack->inButtonScope( 'p' ) ) {
					$this->inBodyMode( 'endtag', 'p' );
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->pop();
				return true;

			case 'image':
				# warts!
				return $this->inBodyMode( $token, 'img', $attribs, $selfclose );

			# OMITTED: <isindex>
			# OMITTED: <textarea>
			# OMITTED: <xmp>
			# OMITTED: <iframe>
			# OMITTED: <noembed>
			# OMITTED: <noscript>

			case 'select':
				$this->afe->reconstruct( $this->stack );
				$this->stack->insertHTMLElement( $value, $attribs );
				// The follow-up mode depends on whether we are currently
				// in one of the table-related modes.
				switch ( $this->parseMode ) {
					case 'inTableMode':
					case 'inCaptionMode':
					case 'inTableBodyMode':
					case 'inRowMode':
					case 'inCellMode':
						$this->switchMode( 'inSelectInTableMode' );
						break;
					default:
						$this->switchMode( 'inSelectMode' );
				}
				return true;

			case 'optgroup':
			case 'option':
				if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
					$this->inBodyMode( 'endtag', 'option' );
				}
				$this->afe->reconstruct( $this->stack );
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			case 'rb':
			case 'rtc':
				if ( $this->stack->inScope( 'ruby' ) ) {
					$this->stack->generateImpliedEndTags();
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			case 'rp':
			case 'rt':
				if ( $this->stack->inScope( 'ruby' ) ) {
					$this->stack->generateImpliedEndTags( 'rtc' );
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			case 'math':
			case 'svg':
				$this->afe->reconstruct( $this->stack );
				# We skip the spec's "adjust MathML attributes" / "adjust SVG
				# attributes" and "adjust foreign attributes" steps, since
				# the browser will do this later when it parses the output
				# and it doesn't affect balancing.
				$foreignNamespace = ( $value === 'math' )
					? BalanceSets::MATHML_NAMESPACE
					: BalanceSets::SVG_NAMESPACE;
				$this->stack->insertForeignElement(
					$foreignNamespace, $value, $attribs
				);
				if ( $selfclose ) {
					# emit explicit </math> or </svg> tag.
					$this->stack->pop();
				}
				return true;

			# OMITTED: <frame>
			case 'caption':
			case 'col':
			case 'colgroup':
			case 'head':
			case 'tbody':
			case 'td':
			case 'tfoot':
			case 'th':
			case 'thead':
			case 'tr':
				// Ignore table tags if we're not inTableMode
				return true;
		}

		// Handle any other start tag here
		$this->afe->reconstruct( $this->stack );
		$this->stack->insertHTMLElement( $value, $attribs );
		return true;
	}

	if ( $token === 'endtag' ) {
		switch ( $value ) {
			# </body>,</html> are unsupported.

			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfclose );

			case 'address':
			case 'article':
			case 'aside':
			case 'blockquote':
			case 'button':
			case 'center':
			case 'details':
			case 'dialog':
			case 'dir':
			case 'div':
			case 'dl':
			case 'fieldset':
			case 'figcaption':
			case 'figure':
			case 'footer':
			case 'header':
			case 'hgroup':
			case 'listing':
			case 'main':
			case 'menu':
			case 'nav':
			case 'ol':
			case 'pre':
			case 'section':
			case 'summary':
			case 'ul':
				// Ignore if there is not a matching open tag
				if ( !$this->stack->inScope( $value ) ) {
					return true;
				}
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( $value );
				return true;

			case 'form':
				if ( $this->stack->indexOf( 'template' ) < 0 ) {
					$pendingForm = $this->formElementPointer;
					$this->formElementPointer = null;
					if ( !$pendingForm || !$this->stack->inScope( $pendingForm ) ) {
						return true;
					}
					$this->stack->generateImpliedEndTags();
					// Don't flatten yet if we're removing a <form> element
					// out-of-order. (eg. `<form><div></form>`)
					$flatten = ( $this->stack->currentNode === $pendingForm );
					$this->stack->removeElement( $pendingForm, $flatten );
					return true;
				}
				// Inside a template the form element pointer is not used.
				if ( !$this->stack->inScope( 'form' ) ) {
					return true;
				}
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( 'form' );
				return true;

			case 'p':
				if ( !$this->stack->inButtonScope( 'p' ) ) {
					// No <p> to close: open an empty one, then redo </p>.
					$this->inBodyMode( 'tag', 'p', [] );
					return $this->insertToken( $token, $value, $attribs, $selfclose );
				}
				$this->stack->generateImpliedEndTags( $value );
				$this->stack->popTag( $value );
				return true;

			case 'li':
			case 'dd':
			case 'dt':
				// </li> is checked in list item scope, </dd> and </dt> in
				// the regular scope.
				$hasOpenTag = ( $value === 'li' )
					? $this->stack->inListItemScope( $value )
					: $this->stack->inScope( $value );
				if ( !$hasOpenTag ) {
					return true; # ignore
				}
				$this->stack->generateImpliedEndTags( $value );
				$this->stack->popTag( $value );
				return true;

			case 'h1':
			case 'h2':
			case 'h3':
			case 'h4':
			case 'h5':
			case 'h6':
				// Any open heading is closed, whatever its level.
				if ( !$this->stack->inScope( BalanceSets::$headingSet ) ) {
					return true; # ignore
				}
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( BalanceSets::$headingSet );
				return true;

			case 'sarcasm':
				# Take a deep breath, then:
				break;

			case 'a':
			case 'b':
			case 'big':
			case 'code':
			case 'em':
			case 'font':
			case 'i':
			case 'nobr':
			case 's':
			case 'small':
			case 'strike':
			case 'strong':
			case 'tt':
			case 'u':
				if ( $this->stack->adoptionAgency( $value, $this->afe ) ) {
					return true; # If we did something, we're done.
				}
				break; # Go to the "any other end tag" case.

			case 'applet':
			case 'marquee':
			case 'object':
				if ( !$this->stack->inScope( $value ) ) {
					return true; # ignore
				}
				$this->stack->generateImpliedEndTags();
				$this->stack->popTag( $value );
				$this->afe->clearToMarker();
				return true;

			case 'br':
				# Turn </br> into <br>
				return $this->inBodyMode( 'tag', $value, [] );
		}

		// Any other end tag goes here
		foreach ( $this->stack as $depth => $open ) {
			if ( $open->isHtmlNamed( $value ) ) {
				$this->stack->generateImpliedEndTags( $value );
				$this->stack->popTo( $depth ); # including $depth
				break;
			}
			if ( $open->isA( BalanceSets::$specialSet ) ) {
				return true; // ignore this close token.
			}
		}
		return true;
	}

	Assert::invariant( false, "Bad token type: $token" );
}
2864
/**
 * Handle a token using the "in table" rules.
 *
 * Tokens without a table-specific rule are handled by inBodyMode() with
 * foster parenting switched on for the duration of the call.
 *
 * @param string $token Token type
 * @param string|null $value Tag name, or text / comment contents
 * @param array|null $attribs Attributes of a tag token
 * @param bool $selfclose Whether a tag token ended in "/>"
 * @return bool|null True once handled, or the result of the handler the
 *   token was delegated to
 */
private function inTableMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		if ( $this->textIntegrationMode ) {
			return $this->inBodyMode( $token, $value, $attribs, $selfclose );
		} elseif ( $this->stack->currentNode->isA( BalanceSets::$tableSectionRowSet ) ) {
			// Collect the text in inTableTextMode, which decides what
			// to do with it once a non-text token arrives.
			$this->pendingTableText = '';
			$this->originalInsertionMode = $this->parseMode;
			return $this->switchModeAndReprocess( 'inTableTextMode', $token, $value, $attribs, $selfclose );
		}
		// fall through to default case.
	} elseif ( $token === 'eof' ) {
		$this->stopParsing();
		return true;
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			case 'caption':
				// Clear back to the table context first, as is done for
				// <colgroup> and <tbody> below; otherwise an element left
				// open on the stack would become the caption's parent.
				$this->stack->clearToContext( BalanceSets::$tableContextSet );
				$this->afe->insertMarker();
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->switchMode( 'inCaptionMode' );
				return true;
			case 'colgroup':
				$this->stack->clearToContext( BalanceSets::$tableContextSet );
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->switchMode( 'inColumnGroupMode' );
				return true;
			case 'col':
				// Imply a <colgroup>, then redo the <col>.
				$this->inTableMode( 'tag', 'colgroup', [] );
				return $this->insertToken( $token, $value, $attribs, $selfclose );
			case 'tbody':
			case 'tfoot':
			case 'thead':
				$this->stack->clearToContext( BalanceSets::$tableContextSet );
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->switchMode( 'inTableBodyMode' );
				return true;
			case 'td':
			case 'th':
			case 'tr':
				// Imply a <tbody>, then redo the tag.
				$this->inTableMode( 'tag', 'tbody', [] );
				return $this->insertToken( $token, $value, $attribs, $selfclose );
			case 'table':
				if ( !$this->stack->inTableScope( $value ) ) {
					return true; // Ignore this tag.
				}
				// Close the open table, then redo the <table>.
				$this->inTableMode( 'endtag', $value );
				return $this->insertToken( $token, $value, $attribs, $selfclose );

			case 'style':
			# OMITTED: <script>
			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfclose );

			case 'input':
				if ( !isset( $attribs['type'] ) || strcasecmp( $attribs['type'], 'hidden' ) !== 0 ) {
					break; // Handle this as "everything else"
				}
				// A hidden input is inserted in place, not foster-parented.
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->pop();
				return true;

			case 'form':
				if (
					$this->formElementPointer ||
					$this->stack->indexOf( 'template' ) >= 0
				) {
					return true; // ignore this token
				}
				// The form is inserted and closed again right away.
				$this->formElementPointer =
					$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->popTag( $this->formElementPointer );
				return true;
		}
		// Fall through for "anything else" clause.
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'table':
				if ( !$this->stack->inTableScope( $value ) ) {
					return true; // Ignore.
				}
				$this->stack->popTag( $value );
				$this->resetInsertionMode();
				return true;
			# OMITTED: <body>
			case 'caption':
			case 'col':
			case 'colgroup':
			# OMITTED: <html>
			case 'tbody':
			case 'td':
			case 'tfoot':
			case 'th':
			case 'thead':
			case 'tr':
				return true; // Ignore the token.
			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
		// Fall through for "anything else" clause.
	} elseif ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	}
	// This is the "anything else" case:
	$this->stack->fosterParentMode = true;
	$this->inBodyMode( $token, $value, $attribs, $selfclose );
	$this->stack->fosterParentMode = false;
	return true;
}
2973
/**
 * Collect text seen inside a table until a non-text token arrives.
 *
 * The collected text is then flushed: if it contains anything other
 * than whitespace it goes through inBodyMode() with foster parenting
 * on, otherwise it is inserted in place. After that the saved mode is
 * restored and the non-text token is reprocessed.
 *
 * @param string $token Token type
 * @param string|null $value Token value
 * @param array|null $attribs Attributes of a tag token
 * @param bool $selfclose Whether a tag token ended in "/>"
 * @return bool|null True for text, otherwise the result of reprocessing
 */
private function inTableTextMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'text' ) {
		$this->pendingTableText .= $value;
		return true;
	}

	// Non-text token: take the buffered text and empty the buffer.
	$buffered = $this->pendingTableText;
	$this->pendingTableText = '';

	$whitespaceOnly = !preg_match( '/[^\x09\x0A\x0C\x0D\x20]/', $buffered );
	if ( $whitespaceOnly ) {
		// Pending text is just whitespace.
		$this->stack->insertText( $buffered );
	} else {
		// This should match the "anything else" case inTableMode
		$this->stack->fosterParentMode = true;
		$this->inBodyMode( 'text', $buffered );
		$this->stack->fosterParentMode = false;
	}

	return $this->switchModeAndReprocess(
		$this->originalInsertionMode, $token, $value, $attribs, $selfclose
	);
}
2995
2996 // helper for inCaptionMode
/**
 * Close the currently open <caption>, if one is in table scope.
 *
 * On success this generates implied end tags, pops the stack through
 * 'caption', clears the active formatting elements list back to its
 * last marker, and switches to 'inTableMode'.
 *
 * @return bool false if no caption is in table scope (nothing was
 *   changed), true otherwise
 */
private function endCaption() {
	if ( $this->stack->inTableScope( 'caption' ) ) {
		$this->stack->generateImpliedEndTags();
		$this->stack->popTag( 'caption' );
		$this->afe->clearToMarker();
		$this->switchMode( 'inTableMode' );
		return true;
	}
	return false;
}
3007
/**
 * Token handler for the "in caption" insertion mode.
 *
 * @param string $token Token type; 'tag' and 'endtag' get special handling
 * @param string $value Tag name for 'tag'/'endtag' tokens
 * @param mixed $attribs Passed through unchanged to insertToken()/inBodyMode()
 * @param bool $selfclose Passed through unchanged to insertToken()/inBodyMode()
 * @return mixed true when the token is handled or ignored here, otherwise
 *   whatever inBodyMode() returns
 */
private function inCaptionMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'body':
			case 'col':
			case 'colgroup':
			# OMITTED: <html>
			case 'tbody':
			case 'td':
			case 'tfoot':
			case 'th':
			case 'thead':
			case 'tr':
				// These end tags are simply dropped.
				return true;

			case 'table':
				// Close the caption first; reprocess the token only if
				// a caption was actually closed.
				$closed = $this->endCaption();
				if ( $closed ) {
					$this->insertToken( $token, $value, $attribs, $selfclose );
				}
				return true;

			case 'caption':
				$this->endCaption();
				return true;
		}
		// Any other end tag: "anything else" below.
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			case 'caption':
			case 'col':
			case 'colgroup':
			case 'tbody':
			case 'td':
			case 'tfoot':
			case 'th':
			case 'thead':
			case 'tr':
				// Table-structure start tags close the caption and
				// are then reprocessed.
				$closed = $this->endCaption();
				if ( $closed ) {
					$this->insertToken( $token, $value, $attribs, $selfclose );
				}
				return true;
		}
		// Any other start tag: "anything else" below.
	}

	// Anything else is handled as in body mode.
	return $this->inBodyMode( $token, $value, $attribs, $selfclose );
}
3054
/**
 * Token handler for the "in column group" insertion mode.
 *
 * @param string $token Token type ('text', 'tag', 'endtag', 'eof' and
 *   'comment' are distinguished here)
 * @param string $value Tag name, text or comment payload, depending on $token
 * @param mixed $attribs Attributes, used when inserting a <col> and
 *   otherwise passed through
 * @param bool $selfclose Passed through unchanged
 * @return mixed true when handled or ignored here, otherwise the result
 *   of the handler the token was delegated to
 */
private function inColumnGroupMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	} elseif ( $token === 'eof' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfclose );
	} elseif ( $token === 'text' ) {
		// Leading whitespace is inserted as-is; only the remainder
		// (if any) goes through the "anything else" path below.
		if ( preg_match( '/^[\x09\x0A\x0C\x0D\x20]+/', $value, $leading ) ) {
			$this->stack->insertText( $leading[0] );
			$value = substr( $value, strlen( $leading[0] ) );
		}
		if ( strlen( $value ) === 0 ) {
			// The whole token was whitespace.
			return true;
		}
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfclose );
			case 'col':
				// Ignore the token.
				return true;
			case 'colgroup':
				if ( $this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
					$this->stack->pop();
					$this->switchMode( 'inTableMode' );
				}
				// Otherwise the token is ignored.
				return true;
		}
		// Other end tags: "anything else".
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			# OMITTED: <html>
			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfclose );
			case 'col':
				// Insert the element and pop it again right away.
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->stack->pop();
				return true;
		}
		// Other start tags: "anything else".
	}

	// Anything else: close the colgroup (if it is the current node) by
	// feeding ourselves an end tag, then reprocess the token.
	if ( $this->stack->currentNode->isHtmlNamed( 'colgroup' ) ) {
		$this->inColumnGroupMode( 'endtag', 'colgroup' );
		return $this->insertToken( $token, $value, $attribs, $selfclose );
	}
	// Ignore the token.
	return true;
}
3105
3106 // Helper function for inTableBodyMode
/**
 * Close the open table section (tbody, thead or tfoot), if any of
 * them is in table scope.
 *
 * On success the stack is cleared back to the table body context,
 * the current node is popped, and the mode becomes 'inTableMode'.
 *
 * @return bool false if none of the three is in table scope (nothing
 *   was changed), true otherwise
 */
private function endSection() {
	$sectionInScope =
		$this->stack->inTableScope( 'tbody' ) ||
		$this->stack->inTableScope( 'thead' ) ||
		$this->stack->inTableScope( 'tfoot' );
	if ( $sectionInScope ) {
		$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
		$this->stack->pop();
		$this->switchMode( 'inTableMode' );
		return true;
	}
	return false;
}
/**
 * Token handler for the "in table body" insertion mode.
 *
 * @param string $token Token type; 'tag' and 'endtag' get special handling
 * @param string $value Tag name for 'tag'/'endtag' tokens
 * @param mixed $attribs Attributes, used when inserting a <tr> and
 *   otherwise passed through
 * @param bool $selfclose Passed through unchanged
 * @return mixed true when handled or ignored here, otherwise whatever
 *   inTableMode() returns
 */
private function inTableBodyMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'endtag' ) {
		switch ( $value ) {
			# OMITTED: <body>
			case 'caption':
			case 'col':
			case 'colgroup':
			# OMITTED: <html>
			case 'td':
			case 'th':
			case 'tr':
				// Ignore the token.
				return true;

			case 'tbody':
			case 'tfoot':
			case 'thead':
				// Only close the section if this particular section
				// element is in table scope.
				if ( $this->stack->inTableScope( $value ) ) {
					$this->endSection();
				}
				return true;

			case 'table':
				// Close the section, then let the token be reprocessed.
				$closed = $this->endSection();
				if ( $closed ) {
					$this->insertToken( $token, $value, $attribs, $selfclose );
				}
				return true;
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			case 'caption':
			case 'col':
			case 'colgroup':
			case 'tbody':
			case 'tfoot':
			case 'thead':
				// Close the section, then let the token be reprocessed.
				$closed = $this->endSection();
				if ( $closed ) {
					$this->insertToken( $token, $value, $attribs, $selfclose );
				}
				return true;

			case 'th':
			case 'td':
				// Act as if an attribute-less <tr> had been seen, then
				// reprocess the cell tag.
				$this->inTableBodyMode( 'tag', 'tr', [] );
				$this->insertToken( $token, $value, $attribs, $selfclose );
				return true;

			case 'tr':
				$this->stack->clearToContext( BalanceSets::$tableBodyContextSet );
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->switchMode( 'inRowMode' );
				return true;
		}
	}

	// Anything else is handled as in table mode.
	return $this->inTableMode( $token, $value, $attribs, $selfclose );
}
3172
3173 // Helper function for inRowMode
/**
 * Close the open <tr>, if one is in table scope.
 *
 * On success the stack is cleared back to the table row context,
 * the current node is popped, and the mode becomes 'inTableBodyMode'.
 *
 * @return bool false if no tr is in table scope (nothing was changed),
 *   true otherwise
 */
private function endRow() {
	if ( $this->stack->inTableScope( 'tr' ) ) {
		$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
		$this->stack->pop();
		$this->switchMode( 'inTableBodyMode' );
		return true;
	}
	return false;
}
/**
 * Token handler for the "in row" insertion mode.
 *
 * @param string $token Token type; 'tag' and 'endtag' get special handling
 * @param string $value Tag name for 'tag'/'endtag' tokens
 * @param mixed $attribs Attributes, used when inserting a cell and
 *   otherwise passed through
 * @param bool $selfclose Passed through unchanged
 * @return mixed true when handled or ignored here, otherwise whatever
 *   inTableMode() returns
 */
private function inRowMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'endtag' ) {
		switch ( $value ) {
			# OMITTED: <body>
			case 'caption':
			case 'col':
			case 'colgroup':
			# OMITTED: <html>
			case 'td':
			case 'th':
				// Ignore the token.
				return true;

			case 'tbody':
			case 'tfoot':
			case 'thead':
				// The section must be in table scope, and a row must
				// actually get closed, before the token is reprocessed.
				if ( $this->stack->inTableScope( $value ) ) {
					if ( $this->endRow() ) {
						$this->insertToken( $token, $value, $attribs, $selfclose );
					}
				}
				return true;

			case 'table':
				$closed = $this->endRow();
				if ( $closed ) {
					$this->insertToken( $token, $value, $attribs, $selfclose );
				}
				return true;

			case 'tr':
				$this->endRow();
				return true;
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			case 'caption':
			case 'col':
			case 'colgroup':
			case 'tbody':
			case 'tfoot':
			case 'thead':
			case 'tr':
				// Close the row, then let the token be reprocessed.
				$closed = $this->endRow();
				if ( $closed ) {
					$this->insertToken( $token, $value, $attribs, $selfclose );
				}
				return true;

			case 'th':
			case 'td':
				// Open a new cell and put a marker on the active
				// formatting elements list.
				$this->stack->clearToContext( BalanceSets::$tableRowContextSet );
				$this->stack->insertHTMLElement( $value, $attribs );
				$this->switchMode( 'inCellMode' );
				$this->afe->insertMarker();
				return true;
		}
	}

	// Anything else is handled as in table mode.
	return $this->inTableMode( $token, $value, $attribs, $selfclose );
}
3238
3239 // Helper for inCellMode
/**
 * Close the open table cell by sending inCellMode() a synthetic end
 * tag: 'td' is tried first, then 'th'.
 *
 * @return bool true if a td or th was in table scope and the end tag
 *   was dispatched, false if neither was in scope
 */
private function endCell() {
	foreach ( [ 'td', 'th' ] as $cellName ) {
		if ( $this->stack->inTableScope( $cellName ) ) {
			$this->inCellMode( 'endtag', $cellName );
			return true;
		}
	}
	return false;
}
/**
 * Token handler for the "in cell" insertion mode.
 *
 * @param string $token Token type; 'tag' and 'endtag' get special handling
 * @param string $value Tag name for 'tag'/'endtag' tokens
 * @param mixed $attribs Passed through unchanged to insertToken()/inBodyMode()
 * @param bool $selfclose Passed through unchanged to insertToken()/inBodyMode()
 * @return mixed true when handled or ignored here, otherwise whatever
 *   inBodyMode() returns
 */
private function inCellMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'endtag' ) {
		switch ( $value ) {
			# OMITTED: <body>
			case 'caption':
			case 'col':
			case 'colgroup':
			# OMITTED: <html>
				// Ignore the token.
				return true;

			case 'table':
			case 'tbody':
			case 'tfoot':
			case 'thead':
			case 'tr':
				// If the named element is in table scope, close the
				// cell and reprocess the token; otherwise ignore it.
				if ( $this->stack->inTableScope( $value ) ) {
					$this->stack->generateImpliedEndTags();
					$this->stack->popTag( BalanceSets::$tableCellSet );
					$this->afe->clearToMarker();
					$this->switchMode( 'inRowMode' );
					$this->insertToken( $token, $value, $attribs, $selfclose );
				}
				return true;

			case 'td':
			case 'th':
				// Close this cell if it is in table scope; otherwise
				// ignore the token.
				if ( $this->stack->inTableScope( $value ) ) {
					$this->stack->generateImpliedEndTags();
					$this->stack->popTag( $value );
					$this->afe->clearToMarker();
					$this->switchMode( 'inRowMode' );
				}
				return true;
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			case 'caption':
			case 'col':
			case 'colgroup':
			case 'tbody':
			case 'td':
			case 'tfoot':
			case 'th':
			case 'thead':
			case 'tr':
				// Close the cell first; reprocess the token only if
				// a cell was actually closed.
				$closed = $this->endCell();
				if ( $closed ) {
					$this->insertToken( $token, $value, $attribs, $selfclose );
				}
				return true;
		}
	}

	// Anything else is handled as in body mode.
	return $this->inBodyMode( $token, $value, $attribs, $selfclose );
}
3304
/**
 * Token handler for the "in select" insertion mode.
 *
 * @param string $token Token type ('text', 'comment', 'eof', 'tag' and
 *   'endtag' are distinguished here)
 * @param string $value Tag name, text or comment payload, depending on $token
 * @param mixed $attribs Attributes, used when inserting option/optgroup
 *   and otherwise passed through
 * @param bool $selfclose Passed through unchanged
 * @return mixed true when handled or ignored here, otherwise the result
 *   of the handler the token was delegated to
 */
private function inSelectMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'comment' ) {
		$this->stack->insertComment( $value );
		return true;
	} elseif ( $token === 'text' ) {
		$this->stack->insertText( $value );
		return true;
	} elseif ( $token === 'eof' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfclose );
	} elseif ( $token === 'endtag' ) {
		switch ( $value ) {
			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfclose );

			case 'select':
				if ( $this->stack->inSelectScope( $value ) ) {
					$this->stack->popTag( $value );
					$this->resetInsertionMode();
				}
				// Not in select scope: fragment case, token ignored.
				return true;

			case 'option':
				if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
					$this->stack->pop();
				}
				return true;

			case 'optgroup':
				// An open option sitting directly inside an optgroup
				// is closed first ...
				if (
					$this->stack->currentNode->isHtmlNamed( 'option' ) &&
					$this->stack->length() >= 2 &&
					$this->stack->node( $this->stack->length() - 2 )->isHtmlNamed( 'optgroup' )
				) {
					$this->stack->pop();
				}
				// ... and then the optgroup itself, if it is now current.
				if ( $this->stack->currentNode->isHtmlNamed( 'optgroup' ) ) {
					$this->stack->pop();
				}
				return true;
		}
	} elseif ( $token === 'tag' ) {
		switch ( $value ) {
			# OMITTED: <html>
			case 'option':
			case 'optgroup':
				// Both implicitly close an open option; a new optgroup
				// additionally closes an open optgroup.
				if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
					$this->stack->pop();
				}
				if (
					$value === 'optgroup' &&
					$this->stack->currentNode->isHtmlNamed( 'optgroup' )
				) {
					$this->stack->pop();
				}
				$this->stack->insertHTMLElement( $value, $attribs );
				return true;

			case 'select':
				// A nested <select> start tag is treated like </select>.
				$this->inSelectMode( 'endtag', $value );
				return true;

			case 'input':
			case 'keygen':
			case 'textarea':
				if ( $this->stack->inSelectScope( 'select' ) ) {
					// Close the select, then reprocess the token.
					$this->inSelectMode( 'endtag', 'select' );
					return $this->insertToken( $token, $value, $attribs, $selfclose );
				}
				// Ignore the token (fragment case).
				return true;

			case 'script':
			case 'template':
				return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
	}

	// Anything else: just ignore the token.
	return true;
}
3380
/**
 * Token handler for the "in select in table" insertion mode.
 *
 * Table-structure start tags, and table-structure end tags whose element
 * is in table scope, close the select (via a synthetic </select>) and
 * are then reprocessed. Everything else is handled by inSelectMode().
 *
 * @param string $token Token type; 'tag' and 'endtag' get special handling
 * @param string $value Tag name for 'tag'/'endtag' tokens
 * @param mixed $attribs Passed through unchanged
 * @param bool $selfclose Passed through unchanged
 * @return mixed true when an out-of-scope end tag is ignored, otherwise
 *   the result of insertToken() or inSelectMode()
 */
private function inSelectInTableMode( $token, $value, $attribs = null, $selfclose = false ) {
	switch ( $value ) {
		case 'caption':
		case 'table':
		case 'tbody':
		case 'tfoot':
		case 'thead':
		case 'tr':
		case 'td':
		case 'th':
			$isStartTag = ( $token === 'tag' );
			if ( $isStartTag || $token === 'endtag' ) {
				// An end tag only counts if its element is in table scope.
				if ( !$isStartTag && !$this->stack->inTableScope( $value ) ) {
					return true;
				}
				$this->inSelectInTableMode( 'endtag', 'select' );
				return $this->insertToken( $token, $value, $attribs, $selfclose );
			}
			// Other token types that merely carry one of these values
			// fall out of the switch.
			break;
	}

	// Anything else is handled as in select mode.
	return $this->inSelectMode( $token, $value, $attribs, $selfclose );
}
3405
/**
 * Token handler for the "in template" insertion mode.
 *
 * @param string $token Token type ('text', 'comment', 'eof', 'tag' or
 *   'endtag'); any other type trips an invariant assertion
 * @param string $value Tag name, text or comment payload, depending on $token
 * @param mixed $attribs Passed through unchanged
 * @param bool $selfclose Passed through unchanged
 * @return mixed true when handled or ignored here, otherwise the result
 *   of the handler the token was delegated to
 */
private function inTemplateMode( $token, $value, $attribs = null, $selfclose = false ) {
	if ( $token === 'tag' ) {
		switch ( $value ) {
			case 'base':
			case 'basefont':
			case 'bgsound':
			case 'link':
			case 'meta':
			case 'noframes':
			# OMITTED: <script>
			case 'style':
			case 'template':
			# OMITTED: <title>
				return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}

		// Table-related start tags pick the matching table insertion
		// mode; every other start tag is reprocessed in body mode.
		$tableModes = [
			'caption' => 'inTableMode',
			'colgroup' => 'inTableMode',
			'tbody' => 'inTableMode',
			'tfoot' => 'inTableMode',
			'thead' => 'inTableMode',
			'col' => 'inColumnGroupMode',
			'tr' => 'inTableBodyMode',
			'td' => 'inRowMode',
			'th' => 'inRowMode',
		];
		$nextMode = isset( $tableModes[$value] ) ? $tableModes[$value] : 'inBodyMode';
		return $this->switchModeAndReprocess(
			$nextMode, $token, $value, $attribs, $selfclose
		);
	} elseif ( $token === 'endtag' ) {
		if ( $value === 'template' ) {
			return $this->inHeadMode( $token, $value, $attribs, $selfclose );
		}
		// Every other end tag is ignored.
		return true;
	} elseif ( $token === 'eof' ) {
		if ( $this->stack->indexOf( 'template' ) < 0 ) {
			// No template left on the stack.
			$this->stopParsing();
			return true;
		}
		// Close the open template, drop its insertion mode, and hand
		// the EOF to whatever mode applies afterwards.
		$this->stack->popTag( 'template' );
		$this->afe->clearToMarker();
		array_pop( $this->templateInsertionModes );
		$this->resetInsertionMode();
		$this->insertToken( $token, $value, $attribs, $selfclose );
		return true;
	} elseif ( $token === 'text' || $token === 'comment' ) {
		return $this->inBodyMode( $token, $value, $attribs, $selfclose );
	} else {
		Assert::invariant( false, "Bad token type: $token" );
	}
}
3472 }