Merge "Chinese Conversion Table Update 2017-6"

[lhc/web/wiklou.git] / includes / parser / Sanitizer.php
diff --git a/includes/parser/Sanitizer.php b/includes/parser/Sanitizer.php

index 4c99677..b570a43 100644 (file)
--- a/includes/parser/Sanitizer.php
+++ b/includes/parser/Sanitizer.php
@@ -477,7 +477,16 @@ class Sanitizer {
         public static function removeHTMLtags( $text, $processCallback = null,
                 $args = [], $extratags = [], $removetags = [], $warnCallback = null
         ) {
-               extract( self::getRecognizedTagData( $extratags, $removetags ) );
+               $tagData = self::getRecognizedTagData( $extratags, $removetags );
+               $htmlpairs = $tagData['htmlpairs'];
+               $htmlsingle = $tagData['htmlsingle'];
+               $htmlsingleonly = $tagData['htmlsingleonly'];
+               $htmlnest = $tagData['htmlnest'];
+               $tabletags = $tagData['tabletags'];
+               $htmllist = $tagData['htmllist'];
+               $listtags = $tagData['listtags'];
+               $htmlsingleallowed = $tagData['htmlsingleallowed'];
+               $htmlelements = $tagData['htmlelements'];
  
                 # Remove HTML comments
                 $text = self::removeHTMLcomments( $text );
@@ -1150,6 +1159,7 @@ class Sanitizer {
                         '{'    => '&#123;',
                         '}'    => '&#125;', // prevent unpaired language conversion syntax
                         '['    => '&#91;',
+                       ']'    => '&#93;',
                         "''"   => '&#39;&#39;',
                         'ISBN' => '&#73;SBN',
                         'RFC'  => '&#82;FC',
@@ -1967,17 +1977,22 @@ class Sanitizer {
          * Warning: this return value must be further escaped for literal
          * inclusion in HTML output as of 1.10!
          *
-        * @param string $text HTML fragment
+        * @param string $html HTML fragment
          * @return string
          */
-       static function stripAllTags( $text ) {
-               # Actual <tags>
-               $text = StringUtils::delimiterReplace( '<', '>', '', $text );
+       static function stripAllTags( $html ) {
+               // Use RemexHtml to tokenize $html and extract the text
+               $handler = new RemexStripTagHandler;
+               $tokenizer = new RemexHtml\Tokenizer\Tokenizer( $handler, $html, [
+                       'ignoreErrors' => true,
+                       // don't ignore char refs, we want them to be decoded
+                       'ignoreNulls' => true,
+                       'skipPreprocess' => true,
+               ] );
+               $tokenizer->execute();
+               $text = $handler->getResult();
  
-               # Normalize &entities and whitespace
-               $text = self::decodeCharReferences( $text );
                 $text = self::normalizeWhitespace( $text );
-
                 return $text;
         }