Merge "Chinese Conversion Table Update 2017-6"
[lhc/web/wiklou.git] / includes / parser / Sanitizer.php
index 4c99677..b570a43 100644 (file)
@@ -477,7 +477,16 @@ class Sanitizer {
        public static function removeHTMLtags( $text, $processCallback = null,
                $args = [], $extratags = [], $removetags = [], $warnCallback = null
        ) {
-               extract( self::getRecognizedTagData( $extratags, $removetags ) );
+               $tagData = self::getRecognizedTagData( $extratags, $removetags );
+               $htmlpairs = $tagData['htmlpairs'];
+               $htmlsingle = $tagData['htmlsingle'];
+               $htmlsingleonly = $tagData['htmlsingleonly'];
+               $htmlnest = $tagData['htmlnest'];
+               $tabletags = $tagData['tabletags'];
+               $htmllist = $tagData['htmllist'];
+               $listtags = $tagData['listtags'];
+               $htmlsingleallowed = $tagData['htmlsingleallowed'];
+               $htmlelements = $tagData['htmlelements'];
 
                # Remove HTML comments
                $text = self::removeHTMLcomments( $text );
@@ -1150,6 +1159,7 @@ class Sanitizer {
                        '{'    => '&#123;',
                        '}'    => '&#125;', // prevent unpaired language conversion syntax
                        '['    => '&#91;',
+                       ']'    => '&#93;',
                        "''"   => '&#39;&#39;',
                        'ISBN' => 'ISBN&#32;',
                        'RFC'  => 'RFC&#32;',
@@ -1967,17 +1977,22 @@ class Sanitizer {
         * Warning: this return value must be further escaped for literal
         * inclusion in HTML output as of 1.10!
         *
-        * @param string $text HTML fragment
+        * @param string $html HTML fragment
         * @return string
         */
-       static function stripAllTags( $text ) {
-               # Actual <tags>
-               $text = StringUtils::delimiterReplace( '<', '>', '', $text );
+       static function stripAllTags( $html ) {
+               // Use RemexHtml to tokenize $html and extract the text
+               $handler = new RemexStripTagHandler;
+               $tokenizer = new RemexHtml\Tokenizer\Tokenizer( $handler, $html, [
+                       'ignoreErrors' => true,
+                       // don't ignore char refs, we want them to be decoded
+                       'ignoreNulls' => true,
+                       'skipPreprocess' => true,
+               ] );
+               $tokenizer->execute();
+               $text = $handler->getResult();
 
-               # Normalize &entities and whitespace
-               $text = self::decodeCharReferences( $text );
                $text = self::normalizeWhitespace( $text );
-
                return $text;
        }