From: jenkins-bot Date: Thu, 16 Nov 2017 20:34:31 +0000 (+0000) Subject: Merge "Use Remex in Sanitizer::stripAllTags()" X-Git-Tag: 1.31.0-rc.0~1480 X-Git-Url: http://git.cyclocoop.org/%22%20.%20generer_url_ecrire%28%22suivi_revisions%22%29%20.%20%22?a=commitdiff_plain;h=2f15b22b91c0426b06e9ece9ffd2aa14fbdaa621;hp=0321c2cc3b813269e4eb65d3cba8610514e00eb3;p=lhc%2Fweb%2Fwiklou.git Merge "Use Remex in Sanitizer::stripAllTags()" --- diff --git a/autoload.php b/autoload.php index ee47eac26d..aef7e5e2c2 100644 --- a/autoload.php +++ b/autoload.php @@ -1219,6 +1219,7 @@ $wgAutoloadLocalClasses = [ 'RefreshLinks' => __DIR__ . '/maintenance/refreshLinks.php', 'RefreshLinksJob' => __DIR__ . '/includes/jobqueue/jobs/RefreshLinksJob.php', 'RegexlikeReplacer' => __DIR__ . '/includes/libs/replacers/RegexlikeReplacer.php', + 'RemexStripTagHandler' => __DIR__ . '/includes/parser/RemexStripTagHandler.php', 'RemoveInvalidEmails' => __DIR__ . '/maintenance/removeInvalidEmails.php', 'RemoveUnusedAccounts' => __DIR__ . '/maintenance/removeUnusedAccounts.php', 'RenameDbPrefix' => __DIR__ . '/maintenance/renameDbPrefix.php', diff --git a/includes/parser/RemexStripTagHandler.php b/includes/parser/RemexStripTagHandler.php new file mode 100644 index 0000000000..2839147d4f --- /dev/null +++ b/includes/parser/RemexStripTagHandler.php @@ -0,0 +1,40 @@ +text; + } + + function startDocument( Tokenizer $t, $fns, $fn ) { + // Do nothing. + } + function endDocument( $pos ) { + // Do nothing. + } + function error( $text, $pos ) { + // Do nothing. + } + function characters( $text, $start, $length, $sourceStart, $sourceLength ) { + $this->text .= substr( $text, $start, $length ); + } + function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) { + // Do nothing. + } + function endTag( $name, $sourceStart, $sourceLength ) { + // Do nothing. + } + function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) { + // Do nothing. + } + function comment( $text, $sourceStart, $sourceLength ) { + // Do nothing. + } +} diff --git a/includes/parser/Sanitizer.php b/includes/parser/Sanitizer.php index 4c996771e8..7c9f56326b 100644 --- a/includes/parser/Sanitizer.php +++ b/includes/parser/Sanitizer.php @@ -1967,17 +1967,22 @@ class Sanitizer { * Warning: this return value must be further escaped for literal * inclusion in HTML output as of 1.10! * - * @param string $text HTML fragment + * @param string $html HTML fragment * @return string */ - static function stripAllTags( $text ) { - # Actual - $text = StringUtils::delimiterReplace( '<', '>', '', $text ); + static function stripAllTags( $html ) { + // Use RemexHtml to tokenize $html and extract the text + $handler = new RemexStripTagHandler; + $tokenizer = new RemexHtml\Tokenizer\Tokenizer( $handler, $html, [ + 'ignoreErrors' => true, + // don't ignore char refs, we want them to be decoded + 'ignoreNulls' => true, + 'skipPreprocess' => true, + ] ); + $tokenizer->execute(); + $text = $handler->getResult(); - # Normalize &entities and whitespace - $text = self::decodeCharReferences( $text ); $text = self::normalizeWhitespace( $text ); - return $text; } diff --git a/tests/phpunit/includes/parser/SanitizerTest.php b/tests/phpunit/includes/parser/SanitizerTest.php index 269575b24e..d7e72e164b 100644 --- a/tests/phpunit/includes/parser/SanitizerTest.php +++ b/tests/phpunit/includes/parser/SanitizerTest.php @@ -530,11 +530,10 @@ class SanitizerTest extends MediaWikiTestCase { [ '

Foo

Bar

', 'FooBar' ], [ "

Foo

\n

Bar

", 'Foo Bar' ], [ '

Hello <strong> world café

', 'Hello world café' ], - // This one is broken, see T179978 - //[ - // '

quux\'>Bar Whee!

', - // 'Bar Whee!' - //], + [ + '

quux\'>Bar Whee!

', + 'Bar Whee!' + ], [ '123', '123' ], [ '123', '123' ], ];