use \ReverseArrayIterator;
use \Sanitizer;
-# A note for future librarization[1] -- this file is a good candidate
-# for splitting into an independent library, except that it is currently
-# highly optimized for MediaWiki use. It only implements the portions
-# of the HTML5 tree builder used by tags supported by MediaWiki, and
-# does not contain a true tokenizer pass, instead relying on
-# comment stripping, attribute normalization, and escaping done by
-# the MediaWiki Sanitizer. It also deliberately avoids building
-# a true DOM in memory, instead serializing elements to an output string
-# as soon as possible (usually as soon as the tag is closed) to reduce
-# its memory footprint.
-
-# We've been gradually lifting some of these restrictions to handle
-# non-sanitized output generated by extensions, but we shortcut the tokenizer
-# for speed (primarily by splitting on `<`) and so rely on syntactic
-# well-formedness.
-
-# On the other hand, I've been pretty careful to note with comments in the
-# code the places where this implementation omits features of the spec or
-# depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
-# implement the missing pieces and make this a standalone PHP HTML5 parser.
-# In order to do so, some sort of MediaWiki-specific API will need
-# to be added to (a) allow the Balancer to bypass the tokenizer,
-# and (b) support on-the-fly flattening instead of DOM node creation.
-
-# [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki
+// A note for future librarization[1] -- this file is a good candidate
+// for splitting into an independent library, except that it is currently
+// highly optimized for MediaWiki use. It only implements the portions
+// of the HTML5 tree builder used by tags supported by MediaWiki, and
+// does not contain a true tokenizer pass, instead relying on
+// comment stripping, attribute normalization, and escaping done by
+// the MediaWiki Sanitizer. It also deliberately avoids building
+// a true DOM in memory, instead serializing elements to an output string
+// as soon as possible (usually as soon as the tag is closed) to reduce
+// its memory footprint.
+
+// We've been gradually lifting some of these restrictions to handle
+// non-sanitized output generated by extensions, but we shortcut the tokenizer
+// for speed (primarily by splitting on `<`) and so rely on syntactic
+// well-formedness.
+
+// On the other hand, I've been pretty careful to note with comments in the
+// code the places where this implementation omits features of the spec or
+// depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
+// implement the missing pieces and make this a standalone PHP HTML5 parser.
+// In order to do so, some sort of MediaWiki-specific API will need
+// to be added to (a) allow the Balancer to bypass the tokenizer,
+// and (b) support on-the-fly flattening instead of DOM node creation.
+
+// [1]: https://www.mediawiki.org/wiki/Library_infrastructure_for_MediaWiki
/**
* Utility constants and sets for the HTML5 tree building algorithm.
$flat = "{$this}";
}
$this->parent->children[$idx] = $flat;
- $this->parent = 'flat'; # for assertion checking
+ $this->parent = 'flat'; // for assertion checking
return $flat;
}
return $out;
}
- # Utility functions on BalanceElements.
+ // Utility functions on BalanceElements.
/**
* Determine if $this represents a specific HTML tag, is a member of
return isset( $set[$this->namespaceURI] ) &&
isset( $set[$this->namespaceURI][$this->localName] );
} else {
- # assume this is an HTML element name.
+ // assume this is an HTML element name.
return $this->isHtml() && $this->localName === $set;
}
}
* representing the root <html> node.
*/
public function __construct() {
- # always a root <html> element on the stack
+ // always a root <html> element on the stack
array_push(
$this->elements,
new BalanceElement( BalanceSets::HTML_NAMESPACE, 'html', [] )
}
}
- # Fostering and adoption.
+ // Fostering and adoption.
/**
* Foster parent the given $elt in the stack of open elements.
$parent = $this->elements[$lastTemplate];
} elseif ( $lastTable >= 0 ) {
$parent = $this->elements[$lastTable]->parent;
- # Assume all tables have parents, since we're not running scripts!
+ // Assume all tables have parents, since we're not running scripts!
Assert::invariant(
$parent !== null, "All tables should have parents"
);
* <style>/<noframes> want RAWTEXT modes which we only loosely emulate.)
*
* We generally mark places where we omit cases from the spec due to
- * disallowed elements with a comment: `# OMITTED: <element-name>`.
+ * disallowed elements with a comment: `// OMITTED: <element-name>`.
*
* The HTML spec keeps a flag during the parsing process to track
* whether or not a "parse error" has been encountered. We don't
$this->tidyCompat = $config['tidyCompat'];
$this->allowComments = $config['allowComments'];
if ( $this->allowedHtmlElements !== null ) {
- # Sanity check!
+ // Sanity check!
$bad = array_uintersect_assoc(
$this->allowedHtmlElements,
BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE],
$this->inRCDATA =
$this->inRAWTEXT = false;
- # The stack is constructed with an <html> element already on it.
- # Set this up as a fragment parsed with <body> as the context.
+ // The stack is constructed with an <html> element already on it.
+ // Set this up as a fragment parsed with <body> as the context.
$this->fragmentContext =
new BalanceElement( BalanceSets::HTML_NAMESPACE, 'body', [] );
$this->resetInsertionMode();
// validate tags against $unsupportedSet
if ( $token === 'tag' || $token === 'endtag' ) {
if ( isset( BalanceSets::$unsupportedSet[BalanceSets::HTML_NAMESPACE][$value] ) ) {
- # As described in "simplifications" above, these tags are
- # not supported in the balancer.
+ // As described in "simplifications" above, these tags are
+ // not supported in the balancer.
Assert::invariant(
!$this->strict,
"Unsupported $token <$value> found."
return false;
}
} elseif ( $token === 'text' && $value === '' ) {
- # Don't actually inject the empty string as a text token.
+ // Don't actually inject the empty string as a text token.
return true;
}
// Support pre/listing/textarea by suppressing initial linefeed
if ( $token === 'text' ) {
if ( $value[0] === "\n" ) {
if ( $value === "\n" ) {
- # Nothing would be left, don't inject the empty string.
+ // Nothing would be left, don't inject the empty string.
return true;
}
$value = substr( $value, 1 );
) {
break;
}
- /* otherwise, fall through */
+ // otherwise, fall through
case 'b':
case 'big':
case 'blockquote':
$x = $this->bitsIterator->current();
$this->bitsIterator->next();
$regs = [];
- # Handle comments. These won't be generated by mediawiki (they
- # are stripped in the Sanitizer) but may be generated by extensions.
+ // Handle comments. These won't be generated by mediawiki (they
+ // are stripped in the Sanitizer) but may be generated by extensions.
if (
$this->allowComments &&
!( $this->inRCDATA || $this->inRAWTEXT ) &&
preg_match( Balancer::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
- /* verify EOF condition where necessary */
+ // verify EOF condition where necessary
( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
) {
$contents = $regs[2][0];
// Escape any bare '>' so the trailing text is emitted as character data;
// replacing '>' with itself (as before) was a no-op.
$this->insertToken( 'text', str_replace( '>', '&gt;', $rest ) );
return;
}
- # $slash: Does the current element start with a '/'?
- # $t: Current element name
- # $attribStr: String between element name and >
- # $brace: Ending '>' or '/>'
- # $rest: Everything until the next element from the $bitsIterator
+ // $slash: Does the current element start with a '/'?
+ // $t: Current element name
+ // $attribStr: String between element name and >
+ // $brace: Ending '>' or '/>'
+ // $rest: Everything until the next element from the $bitsIterator
if ( preg_match( Sanitizer::ELEMENT_BITS_REGEX, $x, $regs ) ) {
list( /* $qbar */, $slash, $t, $attribStr, $brace, $rest ) = $regs;
$t = strtolower( $t );
if ( $this->strict ) {
- /* Verify that attributes are all properly double-quoted */
+ // Verify that attributes are all properly double-quoted
Assert::invariant(
preg_match(
'/^( [:_A-Z0-9][-.:_A-Z0-9]*="[^"]*")*[ ]*$/i', $attribStr
} elseif ( $this->inRAWTEXT ) {
$this->insertToken( 'text', "<$x" );
} else {
- # bad tag; serialize entire thing as text.
+ // bad tag; serialize entire thing as text.
// Entity-escape both angle brackets so the malformed tag is really
// serialized as text rather than being re-read as markup downstream.
$this->insertToken( 'text', '&lt;' . str_replace( '>', '&gt;', $x ) );
}
}
case 'body':
$this->switchMode( 'inBodyMode' );
return;
- # OMITTED: <frameset>
- # OMITTED: <html>
- # OMITTED: <head>
+ // OMITTED: <frameset>
+ // OMITTED: <html>
+ // OMITTED: <head>
default:
if ( !$last ) {
- # OMITTED: <head>
+ // OMITTED: <head>
if ( $node->isA( BalanceSets::$tableCellSet ) ) {
$this->switchMode( 'inCellMode' );
return;
}
private function stopParsing() {
// Wind down the parse: release the active formatting elements list, then
// empty the stack of open elements except for its root.
-# Most of the spec methods are inapplicable, other than step 2:
-# "pop all the nodes off the stack of open elements".
-# We're going to keep the top-most <html> element on the stack, though.
-
-# Clear the AFE list first, otherwise the element objects will stay live
-# during serialization, potentially using O(N^2) memory. Note that
-# popping the stack will never result in reconstructing the active
-# formatting elements.
+// Most of the spec methods are inapplicable, other than step 2:
+// "pop all the nodes off the stack of open elements".
+// We're going to keep the top-most <html> element on the stack, though.
+
+// Clear the AFE list first, otherwise the element objects will stay live
+// during serialization, potentially using O(N^2) memory. Note that
+// popping the stack will never result in reconstructing the active
+// formatting elements.
$this->afe = null;
// Pop everything above the root <html> element, which stays on the stack.
$this->stack->popTo( 1 );
}
} elseif ( $token === 'tag' ) {
switch ( $value ) {
case 'meta':
- # OMITTED: in a full HTML parser, this might change the encoding.
- /* falls through */
- # OMITTED: <html>
+ // OMITTED: in a full HTML parser, this might change the encoding.
+ // falls through
+ // OMITTED: <html>
case 'base':
case 'basefont':
case 'bgsound':
$this->stack->insertHTMLElement( $value, $attribs );
$this->stack->pop();
return true;
- # OMITTED: <title>
- # OMITTED: <noscript>
+ // OMITTED: <title>
+ // OMITTED: <noscript>
case 'noframes':
case 'style':
return $this->parseRawText( $value, $attribs );
- # OMITTED: <script>
+ // OMITTED: <script>
case 'template':
$this->stack->insertHTMLElement( $value, $attribs );
$this->afe->insertMarker();
- # OMITTED: frameset_ok
+ // OMITTED: frameset_ok
$this->switchMode( 'inTemplateMode' );
$this->templateInsertionModes[] = $this->parseMode;
return true;
- # OMITTED: <head>
+ // OMITTED: <head>
}
} elseif ( $token === 'endtag' ) {
switch ( $value ) {
- # OMITTED: <head>
- # OMITTED: <body>
- # OMITTED: <html>
+ // OMITTED: <head>
+ // OMITTED: <body>
+ // OMITTED: <html>
case 'br':
break; // handle at the bottom of the function
case 'template':
return true;
} elseif ( $token === 'tag' ) {
switch ( $value ) {
- # OMITTED: <html>
+ // OMITTED: <html>
case 'base':
case 'basefont':
case 'bgsound':
case 'link':
case 'meta':
case 'noframes':
- # OMITTED: <script>
+ // OMITTED: <script>
case 'style':
case 'template':
- # OMITTED: <title>
+ // OMITTED: <title>
return $this->inHeadMode( $token, $value, $attribs, $selfclose );
- # OMITTED: <body>
- # OMITTED: <frameset>
+ // OMITTED: <body>
+ // OMITTED: <frameset>
case 'address':
case 'article':
}
$this->stack->insertHTMLElement( $value, $attribs );
$this->ignoreLinefeed = true;
- # OMITTED: frameset_ok
+ // OMITTED: frameset_ok
return true;
case 'form':
return true;
case 'li':
- # OMITTED: frameset_ok
+ // OMITTED: frameset_ok
foreach ( $this->stack as $node ) {
if ( $node->isHtmlNamed( 'li' ) ) {
$this->inBodyMode( 'endtag', 'li' );
case 'dd':
case 'dt':
- # OMITTED: frameset_ok
+ // OMITTED: frameset_ok
foreach ( $this->stack as $node ) {
if ( $node->isHtmlNamed( 'dd' ) ) {
$this->inBodyMode( 'endtag', 'dd' );
$this->stack->insertHTMLElement( $value, $attribs );
return true;
- # OMITTED: <plaintext>
+ // OMITTED: <plaintext>
case 'button':
if ( $this->stack->inScope( 'button' ) ) {
$this->stack->removeElement( $activeElement, false );
}
}
- /* Falls through */
+ // Falls through
case 'b':
case 'big':
case 'code':
$this->afe->reconstruct( $this->stack );
$this->stack->insertHTMLElement( $value, $attribs );
$this->afe->insertMarker();
- # OMITTED: frameset_ok
+ // OMITTED: frameset_ok
return true;
case 'table':
- # The document is never in "quirks mode"; see simplifications
- # above.
+ // The document is never in "quirks mode"; see simplifications
+ // above.
if ( $this->stack->inButtonScope( 'p' ) ) {
$this->inBodyMode( 'endtag', 'p' );
}
$this->stack->insertHTMLElement( $value, $attribs );
- # OMITTED: frameset_ok
+ // OMITTED: frameset_ok
$this->switchMode( 'inTableMode' );
return true;
$this->afe->reconstruct( $this->stack );
$this->stack->insertHTMLElement( $value, $attribs );
$this->stack->pop();
- # OMITTED: frameset_ok
+ // OMITTED: frameset_ok
return true;
case 'input':
$this->afe->reconstruct( $this->stack );
$this->stack->insertHTMLElement( $value, $attribs );
$this->stack->pop();
- # OMITTED: frameset_ok
- # (hence we don't need to examine the tag's "type" attribute)
+ // OMITTED: frameset_ok
+ // (hence we don't need to examine the tag's "type" attribute)
return true;
case 'menuitem':
return true;
case 'image':
- # warts!
+ // warts!
return $this->inBodyMode( $token, 'img', $attribs, $selfclose );
- # OMITTED: <isindex>
+ // OMITTED: <isindex>
case 'textarea':
$this->stack->insertHTMLElement( $value, $attribs );
$this->ignoreLinefeed = true;
$this->inRCDATA = $value; // emulate rcdata tokenizer mode
- # OMITTED: frameset_ok
+ // OMITTED: frameset_ok
return true;
- # OMITTED: <xmp>
- # OMITTED: <iframe>
- # OMITTED: <noembed>
- # OMITTED: <noscript>
+ // OMITTED: <xmp>
+ // OMITTED: <iframe>
+ // OMITTED: <noembed>
+ // OMITTED: <noscript>
case 'select':
$this->afe->reconstruct( $this->stack );
case 'math':
$this->afe->reconstruct( $this->stack );
- # We skip the spec's "adjust MathML attributes" and
- # "adjust foreign attributes" steps, since the browser will
- # do this later when it parses the output and it doesn't affect
- # balancing.
+ // We skip the spec's "adjust MathML attributes" and
+ // "adjust foreign attributes" steps, since the browser will
+ // do this later when it parses the output and it doesn't affect
+ // balancing.
$this->stack->insertForeignElement(
BalanceSets::MATHML_NAMESPACE, $value, $attribs
);
if ( $selfclose ) {
- # emit explicit </math> tag.
+ // emit explicit </math> tag.
$this->stack->pop();
}
return true;
case 'svg':
$this->afe->reconstruct( $this->stack );
- # We skip the spec's "adjust SVG attributes" and
- # "adjust foreign attributes" steps, since the browser will
- # do this later when it parses the output and it doesn't affect
- # balancing.
+ // We skip the spec's "adjust SVG attributes" and
+ // "adjust foreign attributes" steps, since the browser will
+ // do this later when it parses the output and it doesn't affect
+ // balancing.
$this->stack->insertForeignElement(
BalanceSets::SVG_NAMESPACE, $value, $attribs
);
if ( $selfclose ) {
- # emit explicit </svg> tag.
+ // emit explicit </svg> tag.
$this->stack->pop();
}
return true;
case 'caption':
case 'col':
case 'colgroup':
- # OMITTED: <frame>
+ // OMITTED: <frame>
case 'head':
case 'tbody':
case 'td':
return true;
} elseif ( $token === 'endtag' ) {
switch ( $value ) {
- # </body>,</html> are unsupported.
+ // </body>,</html> are unsupported.
case 'template':
return $this->inHeadMode( $token, $value, $attribs, $selfclose );
case 'li':
if ( !$this->stack->inListItemScope( $value ) ) {
- return true; # ignore
+ return true; // ignore
}
$this->stack->generateImpliedEndTags( $value );
$this->stack->popTag( $value );
case 'dd':
case 'dt':
if ( !$this->stack->inScope( $value ) ) {
- return true; # ignore
+ return true; // ignore
}
$this->stack->generateImpliedEndTags( $value );
$this->stack->popTag( $value );
case 'h5':
case 'h6':
if ( !$this->stack->inScope( BalanceSets::$headingSet ) ) {
- return true; # ignore
+ return true; // ignore
}
$this->stack->generateImpliedEndTags();
$this->stack->popTag( BalanceSets::$headingSet );
return true;
case 'sarcasm':
- # Take a deep breath, then:
+ // Take a deep breath, then:
break;
case 'a':
case 'tt':
case 'u':
if ( $this->stack->adoptionAgency( $value, $this->afe ) ) {
- return true; # If we did something, we're done.
+ return true; // If we did something, we're done.
}
- break; # Go to the "any other end tag" case.
+ break; // Go to the "any other end tag" case.
case 'applet':
case 'marquee':
case 'object':
if ( !$this->stack->inScope( $value ) ) {
- return true; # ignore
+ return true; // ignore
}
$this->stack->generateImpliedEndTags();
$this->stack->popTag( $value );
return true;
case 'br':
- # Turn </br> into <br>
+ // Turn </br> into <br>
return $this->inBodyMode( 'tag', $value, [] );
}
foreach ( $this->stack as $i => $node ) {
if ( $node->isHtmlNamed( $value ) ) {
$this->stack->generateImpliedEndTags( $value );
- $this->stack->popTo( $i ); # including $i
+ $this->stack->popTo( $i ); // including $i
break;
} elseif ( $node->isA( BalanceSets::$specialSet ) ) {
return true; // ignore this close token.
} elseif ( $this->stack->currentNode->isA( BalanceSets::$tableSectionRowSet ) ) {
$this->pendingTableText = '';
$this->originalInsertionMode = $this->parseMode;
- return $this->switchModeAndReprocess( 'inTableTextMode', $token, $value, $attribs, $selfclose );
+ return $this->switchModeAndReprocess( 'inTableTextMode',
+ $token, $value, $attribs, $selfclose );
}
// fall through to default case.
} elseif ( $token === 'eof' ) {
return $this->insertToken( $token, $value, $attribs, $selfclose );
case 'style':
- # OMITTED: <script>
+ // OMITTED: <script>
case 'template':
return $this->inHeadMode( $token, $value, $attribs, $selfclose );
$this->stack->popTag( $value );
$this->resetInsertionMode();
return true;
- # OMITTED: <body>
+ // OMITTED: <body>
case 'caption':
case 'col':
case 'colgroup':
- # OMITTED: <html>
+ // OMITTED: <html>
case 'tbody':
case 'td':
case 'tfoot':
case 'body':
case 'col':
case 'colgroup':
- # OMITTED: <html>
+ // OMITTED: <html>
case 'tbody':
case 'td':
case 'tfoot':
// Fall through to handle non-whitespace below.
} elseif ( $token === 'tag' ) {
switch ( $value ) {
- # OMITTED: <html>
+ // OMITTED: <html>
case 'col':
$this->stack->insertHTMLElement( $value, $attribs );
$this->stack->pop();
$this->endSection();
}
return true;
- # OMITTED: <body>
+ // OMITTED: <body>
case 'caption':
case 'col':
case 'colgroup':
- # OMITTED: <html>
+ // OMITTED: <html>
case 'td':
case 'th':
case 'tr':
$this->insertToken( $token, $value, $attribs, $selfclose );
}
return true;
- # OMITTED: <body>
+ // OMITTED: <body>
case 'caption':
case 'col':
case 'colgroup':
- # OMITTED: <html>
+ // OMITTED: <html>
case 'td':
case 'th':
return true; // Ignore the token.
$this->switchMode( 'inRowMode' );
}
return true;
- # OMITTED: <body>
+ // OMITTED: <body>
case 'caption':
case 'col':
case 'colgroup':
- # OMITTED: <html>
+ // OMITTED: <html>
return true;
case 'table':
return $this->inBodyMode( $token, $value, $attribs, $selfclose );
} elseif ( $token === 'tag' ) {
switch ( $value ) {
- # OMITTED: <html>
+ // OMITTED: <html>
case 'option':
if ( $this->stack->currentNode->isHtmlNamed( 'option' ) ) {
$this->stack->pop();
case 'link':
case 'meta':
case 'noframes':
- # OMITTED: <script>
+ // OMITTED: <script>
case 'style':
case 'template':
- # OMITTED: <title>
+ // OMITTED: <title>
return $this->inHeadMode( $token, $value, $attribs, $selfclose );
case 'caption':