public static function removeHTMLtags( $text, $processCallback = null,
$args = [], $extratags = [], $removetags = [], $warnCallback = null
) {
- extract( self::getRecognizedTagData( $extratags, $removetags ) );
+ $tagData = self::getRecognizedTagData( $extratags, $removetags );
+ $htmlpairs = $tagData['htmlpairs'];
+ $htmlsingle = $tagData['htmlsingle'];
+ $htmlsingleonly = $tagData['htmlsingleonly'];
+ $htmlnest = $tagData['htmlnest'];
+ $tabletags = $tagData['tabletags'];
+ $htmllist = $tagData['htmllist'];
+ $listtags = $tagData['listtags'];
+ $htmlsingleallowed = $tagData['htmlsingleallowed'];
+ $htmlelements = $tagData['htmlelements'];
# Remove HTML comments
$text = self::removeHTMLcomments( $text );
$badtag = true;
} elseif ( $slash ) {
# Closing a tag... is it the one we just opened?
- MediaWiki\suppressWarnings();
+ Wikimedia\suppressWarnings();
$ot = array_pop( $tagstack );
- MediaWiki\restoreWarnings();
+ Wikimedia\restoreWarnings();
if ( $ot != $t ) {
if ( isset( $htmlsingleallowed[$ot] ) ) {
# and see if we find a match below them
$optstack = [];
array_push( $optstack, $ot );
- MediaWiki\suppressWarnings();
+ Wikimedia\suppressWarnings();
$ot = array_pop( $tagstack );
- MediaWiki\restoreWarnings();
+ Wikimedia\restoreWarnings();
while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
array_push( $optstack, $ot );
- MediaWiki\suppressWarnings();
+ Wikimedia\suppressWarnings();
$ot = array_pop( $tagstack );
- MediaWiki\restoreWarnings();
+ Wikimedia\restoreWarnings();
}
if ( $t != $ot ) {
# No match. Push the optional elements back again
$badtag = true;
- MediaWiki\suppressWarnings();
+ Wikimedia\suppressWarnings();
$ot = array_pop( $optstack );
- MediaWiki\restoreWarnings();
+ Wikimedia\restoreWarnings();
while ( $ot ) {
array_push( $tagstack, $ot );
- MediaWiki\suppressWarnings();
+ Wikimedia\suppressWarnings();
$ot = array_pop( $optstack );
- MediaWiki\restoreWarnings();
+ Wikimedia\restoreWarnings();
}
}
} else {
- MediaWiki\suppressWarnings();
+ Wikimedia\suppressWarnings();
array_push( $tagstack, $ot );
- MediaWiki\restoreWarnings();
+ Wikimedia\restoreWarnings();
# <li> can be nested in <ul> or <ol>, skip those cases:
if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
'{' => '{',
'}' => '}', // prevent unpaired language conversion syntax
'[' => '[',
+ ']' => ']',
"''" => '''',
'ISBN' => 'ISBN',
'RFC' => 'RFC',
# Stupid hack
$encValue = preg_replace_callback(
'/((?i)' . wfUrlProtocols() . ')/',
- [ 'Sanitizer', 'armorLinksCallback' ],
+ function ( $matches ) {
+ return str_replace( ':', ':', $matches[1] );
+ },
$encValue );
return $encValue;
}
return $html;
}
- /**
- * Regex replace callback for armoring links against further processing.
- * @param array $matches
- * @return string
- */
- private static function armorLinksCallback( $matches ) {
- return str_replace( ':', ':', $matches[1] );
- }
-
/**
* Return an associative array of attribute names and values from
* a partial tag string. Attribute names are forced to lowercase,
static function normalizeCharReferences( $text ) {
return preg_replace_callback(
self::CHAR_REFS_REGEX,
- [ 'Sanitizer', 'normalizeCharReferencesCallback' ],
+ [ self::class, 'normalizeCharReferencesCallback' ],
$text );
}
public static function decodeCharReferences( $text ) {
return preg_replace_callback(
self::CHAR_REFS_REGEX,
- [ 'Sanitizer', 'decodeCharReferencesCallback' ],
+ [ self::class, 'decodeCharReferencesCallback' ],
$text );
}
global $wgContLang;
$text = preg_replace_callback(
self::CHAR_REFS_REGEX,
- [ 'Sanitizer', 'decodeCharReferencesCallback' ],
+ [ self::class, 'decodeCharReferencesCallback' ],
$text,
-1, //limit
$count
* Warning: this return value must be further escaped for literal
* inclusion in HTML output as of 1.10!
*
- * @param string $text HTML fragment
+ * @param string $html HTML fragment
* @return string
*/
- static function stripAllTags( $text ) {
- # Actual <tags>
- $text = StringUtils::delimiterReplace( '<', '>', '', $text );
+ static function stripAllTags( $html ) {
+ // Use RemexHtml to tokenize $html and extract the text
+ $handler = new RemexStripTagHandler;
+ $tokenizer = new RemexHtml\Tokenizer\Tokenizer( $handler, $html, [
+ 'ignoreErrors' => true,
+ // don't ignore char refs, we want them to be decoded
+ 'ignoreNulls' => true,
+ 'skipPreprocess' => true,
+ ] );
+ $tokenizer->execute();
+ $text = $handler->getResult();
- # Normalize &entities and whitespace
- $text = self::decodeCharReferences( $text );
$text = self::normalizeWhitespace( $text );
-
return $text;
}