# as soon as possible (usually as soon as the tag is closed) to reduce
# its memory footprint.
+# We've been gradually lifting some of these restrictions to handle
+# non-sanitized output generated by extensions, but we shortcut the tokenizer
+# for speed (primarily by splitting on `<`) and so rely on syntactic
+# well-formedness.
+
# On the other hand, I've been pretty careful to note with comments in the
# code the places where this implementation omits features of the spec or
# depends on the MediaWiki Sanitizer. Perhaps in the future we'll want to
return $out;
}
+ /**
+ * Insert a comment at the appropriate place for inserting a node.
+ * @param string $value Content of the comment.
+ * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-comment
+ */
+ public function insertComment( $value ) {
+ // Just another type of text node, except for tidy p-wrapping.
+ return $this->insertText( '<!--' . $value . '-->', true );
+ }
+
/**
* Insert text at the appropriate place for inserting a node.
* @param string $value
* @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node
*/
- public function insertText( $value ) {
+ public function insertText( $value, $isComment = false ) {
if (
$this->fosterParentMode &&
$this->currentNode->isA( BalanceSets::$tableSectionRowSet )
) {
$this->fosterParent( $value );
} elseif (
- $this->tidyCompat &&
+ $this->tidyCompat && !$isComment &&
$this->currentNode->isA( BalanceSets::$tidyPWrapSet )
) {
$this->insertHTMLELement( 'mw:p-wrap', [] );
* - The document is never in "quirks mode".
* - All occurrences of < and > have been entity escaped, so we
* can parse tags by simply splitting on those two characters.
+ * The character < must not appear inside comments.
* Similarly, all attributes have been "cleaned" and are double-quoted
* and escaped.
- * - All comments and null characters are assumed to have been removed.
+ * - All null characters are assumed to have been removed.
* - We don't alter linefeeds after <pre>/<listing>.
* - The following elements are disallowed: <html>, <head>, <body>, <frameset>,
* <frame>, <plaintext>, <isindex>, <textarea>, <xmp>, <iframe>,
private $stack;
private $strict;
private $tidyCompat;
+ private $allowComments;
private $textIntegrationMode = false;
private $pendingTableText;
private $fragmentContext;
private $formElementPointer;
+ /**
+ * Valid HTML5 comments.
+ * Regex borrowed from Tim Starling's "remex-html" project.
+ */
+ const VALID_COMMENT_REGEX = "~ !--
+ ( # 1. Comment match detector
+ > | -> | # Invalid short close
+ ( # 2. Comment contents
+ (?:
+ (?! --> )
+ (?! --!> )
+ (?! --! \z )
+ (?! -- \z )
+ (?! - \z )
+ .
+ )*+
+ )
+ ( # 3. Comment close
+ --> | # Normal close
+ --!> | # Comment end bang
+ ( # 4. Indicate matches requiring EOF
+ --! | # EOF in comment end bang state
+ -- | # EOF in comment end state
+ - | # EOF in comment end dash state
+ # EOF in comment state
+ )
+ )
+ )
+ ([^<]*) \z # 5. Non-tag text after the comment
+ ~xs";
+
/**
* Create a new Balancer.
* @param array $config Balancer configuration. Includes:
* program: <p>-wrapping is done to the children of
* <body> and <blockquote> elements, and empty elements
* are removed.
+ * 'allowComments': boolean, defaults to true.
+ * When true, allows HTML comments in the input.
+ * The Sanitizer generally strips all comments, so if you
+ * are running on sanitized output you can set this to
+ * false to get a bit more performance.
*/
public function __construct( array $config = [] ) {
$config = $config + [
'strict' => false,
'allowedHtmlElements' => null,
'tidyCompat' => false,
+ 'allowComments' => true,
];
$this->allowedHtmlElements = $config['allowedHtmlElements'];
$this->strict = $config['strict'];
$this->tidyCompat = $config['tidyCompat'];
+ $this->allowComments = $config['allowComments'];
if ( $this->allowedHtmlElements !== null ) {
# Sanity check!
$bad = array_uintersect_assoc(
/**
* Grab the next "token" from $bitsIterator. This is either a open/close
- * tag or text, depending on whether the Sanitizer approves.
+ * tag or text or a comment, depending on whether the Sanitizer approves.
*/
private function advance() {
$x = $this->bitsIterator->current();
$this->bitsIterator->next();
$regs = [];
+ # Handle comments. These won't be generated by mediawiki (they
+ # are stripped in the Sanitizer) but may be generated by extensions.
+ if (
+ $this->allowComments &&
+ preg_match( Balancer::VALID_COMMENT_REGEX, $x, $regs, PREG_OFFSET_CAPTURE ) &&
+ /* verify EOF condition where necessary */
+ ( $regs[4][1] < 0 || !$this->bitsIterator->valid() )
+ ) {
+ $contents = $regs[2][0];
+ $rest = $regs[5][0];
+ $this->insertToken( 'comment', $contents );
+ $this->insertToken( 'text', str_replace( '>', '>', $rest ) );
+ return;
+ }
# $slash: Does the current element start with a '/'?
# $t: Current element name
# $attribStr: String between element name and >
// ignore any other end tag
return true;
}
+ } elseif ( $token === 'comment' ) {
+ $this->stack->insertComment( $value );
+ return true;
}
// If not handled above
}
}
return true;
+ } elseif ( $token === 'comment' ) {
+ $this->stack->insertComment( $value );
+ return true;
} else {
Assert::invariant( false, "Bad token type: $token" );
}
return $this->inHeadMode( $token, $value, $attribs, $selfclose );
}
// Fall through for "anything else" clause.
+ } elseif ( $token === 'comment' ) {
+ $this->stack->insertComment( $value );
+ return true;
}
// This is the "anything else" case:
$this->stack->fosterParentMode = true;
// Fall through for "anything else".
} elseif ( $token === 'eof' ) {
return $this->inBodyMode( $token, $value, $attribs, $selfclose );
+ } elseif ( $token === 'comment' ) {
+ $this->stack->insertComment( $value );
+ return true;
}
// Anything else
case 'template':
return $this->inHeadMode( $token, $value, $attribs, $selfclose );
}
+ } elseif ( $token === 'comment' ) {
+ $this->stack->insertComment( $value );
+ return true;
}
// anything else: just ignore the token
return true;
}
private function inTemplateMode( $token, $value, $attribs = null, $selfclose = false ) {
- if ( $token === 'text' ) {
+ if ( $token === 'text' || $token === 'comment' ) {
return $this->inBodyMode( $token, $value, $attribs, $selfclose );
} elseif ( $token === 'eof' ) {
if ( $this->stack->indexOf( 'template' ) < 0 ) {