'RefreshLinks' => __DIR__ . '/maintenance/refreshLinks.php',
'RefreshLinksJob' => __DIR__ . '/includes/jobqueue/jobs/RefreshLinksJob.php',
'RegexlikeReplacer' => __DIR__ . '/includes/libs/replacers/RegexlikeReplacer.php',
+ 'RemexStripTagHandler' => __DIR__ . '/includes/parser/RemexStripTagHandler.php',
'RemoveInvalidEmails' => __DIR__ . '/maintenance/removeInvalidEmails.php',
'RemoveUnusedAccounts' => __DIR__ . '/maintenance/removeUnusedAccounts.php',
'RenameDbPrefix' => __DIR__ . '/maintenance/renameDbPrefix.php',
--- /dev/null
+<?php
+
+use RemexHtml\Tokenizer\Attributes;
+use RemexHtml\Tokenizer\TokenHandler;
+use RemexHtml\Tokenizer\Tokenizer;
+
+/**
+ * Token handler that keeps only the character data of a tokenized document.
+ *
+ * Every callback except characters() is a no-op, so start tags, end tags,
+ * comments, doctypes and tokenizer errors contribute nothing to the result.
+ * The text buffer is never reset (startDocument() does nothing), so use a
+ * fresh instance for each document to be stripped.
+ *
+ * @internal
+ */
+class RemexStripTagHandler implements TokenHandler {
+	/** @var string Character data collected so far */
+	private $text = '';
+
+	/**
+	 * Get the text accumulated from all characters() calls so far.
+	 *
+	 * @return string
+	 */
+	public function getResult() {
+		return $this->text;
+	}
+
+	/**
+	 * @param Tokenizer $t Unused
+	 * @param mixed $fns Unused
+	 * @param mixed $fn Unused
+	 */
+	public function startDocument( Tokenizer $t, $fns, $fn ) {
+		// Do nothing.
+	}
+
+	/**
+	 * @param mixed $pos Unused
+	 */
+	public function endDocument( $pos ) {
+		// Do nothing.
+	}
+
+	/**
+	 * Errors are deliberately ignored; they never affect the result.
+	 *
+	 * @param mixed $text Unused
+	 * @param mixed $pos Unused
+	 */
+	public function error( $text, $pos ) {
+		// Do nothing.
+	}
+
+	/**
+	 * Append one run of character data to the result.
+	 *
+	 * Only the slice of $text described by $start and $length is taken.
+	 *
+	 * @param string $text String containing the character data
+	 * @param int $start Byte offset of the slice within $text
+	 * @param int $length Byte length of the slice
+	 * @param mixed $sourceStart Unused
+	 * @param mixed $sourceLength Unused
+	 */
+	public function characters( $text, $start, $length, $sourceStart, $sourceLength ) {
+		$this->text .= substr( $text, $start, $length );
+	}
+
+	/**
+	 * Start tags are dropped, including their attributes.
+	 *
+	 * @param mixed $name Unused
+	 * @param Attributes $attrs Unused
+	 * @param mixed $selfClose Unused
+	 * @param mixed $sourceStart Unused
+	 * @param mixed $sourceLength Unused
+	 */
+	public function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) {
+		// Do nothing.
+	}
+
+	/**
+	 * End tags are dropped.
+	 *
+	 * @param mixed $name Unused
+	 * @param mixed $sourceStart Unused
+	 * @param mixed $sourceLength Unused
+	 */
+	public function endTag( $name, $sourceStart, $sourceLength ) {
+		// Do nothing.
+	}
+
+	/**
+	 * Doctype declarations are dropped.
+	 *
+	 * @param mixed $name Unused
+	 * @param mixed $public Unused
+	 * @param mixed $system Unused
+	 * @param mixed $quirks Unused
+	 * @param mixed $sourceStart Unused
+	 * @param mixed $sourceLength Unused
+	 */
+	public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
+		// Do nothing.
+	}
+
+	/**
+	 * Comments are dropped.
+	 *
+	 * @param mixed $text Unused
+	 * @param mixed $sourceStart Unused
+	 * @param mixed $sourceLength Unused
+	 */
+	public function comment( $text, $sourceStart, $sourceLength ) {
+		// Do nothing.
+	}
+}
* Warning: this return value must be further escaped for literal
* inclusion in HTML output as of 1.10!
*
- * @param string $text HTML fragment
+ * @param string $html HTML fragment
* @return string
*/
- static function stripAllTags( $text ) {
- # Actual <tags>
- $text = StringUtils::delimiterReplace( '<', '>', '', $text );
+ static function stripAllTags( $html ) {
+ // Tokenize $html with RemexHtml and keep only its character data, instead of cutting out everything between '<' and '>'
+ $handler = new RemexStripTagHandler;
+ $tokenizer = new RemexHtml\Tokenizer\Tokenizer( $handler, $html, [ // NOTE(review): option semantics are defined by RemexHtml - confirm against its Tokenizer docs
+ 'ignoreErrors' => true,
+ // Char refs are deliberately NOT ignored: the tokenizer decodes them, which replaces the former decodeCharReferences() call
+ 'ignoreNulls' => true,
+ 'skipPreprocess' => true,
+ ] );
+ $tokenizer->execute(); // run the tokenizer over $html with $handler attached
+ $text = $handler->getResult(); // character data accumulated by the handler
- # Normalize &entities and whitespace
- $text = self::decodeCharReferences( $text );
$text = self::normalizeWhitespace( $text );
-
return $text;
}
[ '<p id="one">Foo</p><p id="two">Bar</p>', 'FooBar' ],
[ "<p>Foo</p>\n<p>Bar</p>", 'Foo Bar' ],
[ '<p>Hello <strong> world café</p>', 'Hello <strong> world cafĂ©' ],
- // This one is broken, see T179978
- //[
- // '<p><small data-foo=\'bar"<baz>quux\'><a href="./Foo">Bar</a></small> Whee!</p>',
- // 'Bar Whee!'
- //],
+ [
+ '<p><small data-foo=\'bar"<baz>quux\'><a href="./Foo">Bar</a></small> Whee!</p>',
+ 'Bar Whee!'
+ ],
[ '1<span class="<?php">2</span>3', '123' ],
[ '1<span class="<?">2</span>3', '123' ],
];