Don't percent-encode HTML5 IDs

[lhc/web/wiklou.git] / includes / Sanitizer.php
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php

index b08bc69..7d17cd1 100644 (file)
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -56,6 +56,21 @@ class Sanitizer {
         const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
         const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
  
+       /**
+        * Tells escapeIdForAttribute() to encode the ID using the wiki's primary encoding.
+        *
+        * @since 1.30
+        */
+       const ID_PRIMARY = 0;
+
+       /**
+        * Tells escapeIdForAttribute() to encode the ID using the fallback encoding, or return false
+        * if no fallback is configured.
+        *
+        * @since 1.30
+        */
+       const ID_FALLBACK = 1;
+
         /**
          * List of all named character entities defined in HTML 4.01
          * https://www.w3.org/TR/html4/sgml/entities.html
@@ -465,7 +480,7 @@ class Sanitizer {
                 extract( self::getRecognizedTagData( $extratags, $removetags ) );
  
                 # Remove HTML comments
-               $text = Sanitizer::removeHTMLcomments( $text );
+               $text = self::removeHTMLcomments( $text );
                 $bits = explode( '<', $text );
                 $text = str_replace( '>', '&gt;', array_shift( $bits ) );
                 if ( !MWTidy::isEnabled() ) {
@@ -583,12 +598,12 @@ class Sanitizer {
                                                         call_user_func_array( $processCallback, [ &$params, $args ] );
                                                 }
  
-                                               if ( !Sanitizer::validateTag( $params, $t ) ) {
+                                               if ( !self::validateTag( $params, $t ) ) {
                                                         $badtag = true;
                                                 }
  
                                                 # Strip non-approved attributes from the tag
-                                               $newparams = Sanitizer::fixTagAttributes( $params, $t );
+                                               $newparams = self::fixTagAttributes( $params, $t );
                                         }
                                         if ( !$badtag ) {
                                                 $rest = str_replace( '>', '&gt;', $rest );
@@ -629,11 +644,11 @@ class Sanitizer {
                                                                 call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
                                                         }
                                                 }
-                                               if ( !Sanitizer::validateTag( $params, $t ) ) {
+                                               if ( !self::validateTag( $params, $t ) ) {
                                                         $badtag = true;
                                                 }
  
-                                               $newparams = Sanitizer::fixTagAttributes( $params, $t );
+                                               $newparams = self::fixTagAttributes( $params, $t );
                                                 if ( !$badtag ) {
                                                         if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
                                                                 # Interpret self-closing tags as empty tags even when
@@ -710,7 +725,7 @@ class Sanitizer {
          * @return bool
          */
         static function validateTag( $params, $element ) {
-               $params = Sanitizer::decodeTagAttributes( $params );
+               $params = self::decodeTagAttributes( $params );
  
                 if ( $element == 'meta' || $element == 'link' ) {
                         if ( !isset( $params['itemprop'] ) ) {
@@ -746,8 +761,8 @@ class Sanitizer {
          * @todo Check for unique id attribute :P
          */
         static function validateTagAttributes( $attribs, $element ) {
-               return Sanitizer::validateAttributes( $attribs,
-                       Sanitizer::attributeWhitelist( $element ) );
+               return self::validateAttributes( $attribs,
+                       self::attributeWhitelist( $element ) );
         }
  
         /**
@@ -795,12 +810,12 @@ class Sanitizer {
                         # Strip javascript "expression" from stylesheets.
                         # https://msdn.microsoft.com/en-us/library/ms537634.aspx
                         if ( $attribute == 'style' ) {
-                               $value = Sanitizer::checkCss( $value );
+                               $value = self::checkCss( $value );
                         }
  
                         # Escape HTML id attributes
                         if ( $attribute === 'id' ) {
-                               $value = Sanitizer::escapeId( $value, 'noninitial' );
+                               $value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
                         }
  
                         # Escape HTML id reference lists
@@ -809,7 +824,7 @@ class Sanitizer {
                                 || $attribute === 'aria-labelledby'
                                 || $attribute === 'aria-owns'
                         ) {
-                               $value = Sanitizer::escapeIdReferenceList( $value, 'noninitial' );
+                               $value = self::escapeIdReferenceList( $value, 'noninitial' );
                         }
  
                         // RDFa and microdata properties allow URLs, URIs and/or CURIs.
@@ -907,7 +922,7 @@ class Sanitizer {
          */
         public static function normalizeCss( $value ) {
                 // Decode character references like &#123;
-               $value = Sanitizer::decodeCharReferences( $value );
+               $value = self::decodeCharReferences( $value );
  
                 // Decode escape sequences and line continuation
                 // See the grammar in the CSS 2 spec, appendix D.
@@ -1087,14 +1102,14 @@ class Sanitizer {
                         return '';
                 }
  
-               $decoded = Sanitizer::decodeTagAttributes( $text );
-               $stripped = Sanitizer::validateTagAttributes( $decoded, $element );
+               $decoded = self::decodeTagAttributes( $text );
+               $stripped = self::validateTagAttributes( $decoded, $element );
  
                 if ( $sorted ) {
                         ksort( $stripped );
                 }
  
-               return Sanitizer::safeEncodeTagAttributes( $stripped );
+               return self::safeEncodeTagAttributes( $stripped );
         }
  
         /**
@@ -1124,7 +1139,7 @@ class Sanitizer {
          * @return string HTML-encoded text fragment
          */
         static function safeEncodeAttribute( $text ) {
-               $encValue = Sanitizer::encodeAttribute( $text );
+               $encValue = self::encodeAttribute( $text );
  
                 # Templates and links may be expanded in later parsing,
                 # creating invalid or dangerous output. Suppress this.
@@ -1164,6 +1179,8 @@ class Sanitizer {
          * ambiguous if it's part of something that looks like a percent escape
          * (which don't work reliably in fragments cross-browser).
          *
+        * @deprecated since 1.30, use one of this class' escapeIdFor*() functions
+        *
          * @see https://www.w3.org/TR/html401/types.html#type-name Valid characters
          *   in the id and name attributes
          * @see https://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with
@@ -1186,7 +1203,7 @@ class Sanitizer {
                 global $wgExperimentalHtmlIds;
                 $options = (array)$options;
  
-               $id = Sanitizer::decodeCharReferences( $id );
+               $id = self::decodeCharReferences( $id );
  
                 if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
                         $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
@@ -1215,21 +1232,128 @@ class Sanitizer {
                 return $id;
         }
  
+       /**
+        * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
+        * a valid HTML id attribute.
+        *
+        * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe,
+        * be sure to use proper escaping.
+        *
+        * @param string $id String to escape
+        * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding
+        *     should be used.
+        * @return string|bool Escaped ID or false if fallback encoding is requested but it's not
+        *     configured.
+        *
+        * @since 1.30
+        */
+       public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
+               global $wgFragmentMode;
+
+               // Happy path: the requested slot (primary or fallback) is configured.
+               if ( isset( $wgFragmentMode[$mode] ) ) {
+                       return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
+               }
+
+               // A missing fallback is legitimate; a missing primary mode is a configuration error.
+               if ( $mode !== self::ID_PRIMARY ) {
+                       return false;
+               }
+               throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
+       }
+
+       /**
+        * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
+        * a valid URL fragment.
+        *
+        * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe,
+        * be sure to use proper escaping.
+        *
+        * @param string $id String to escape
+        * @return string Escaped ID
+        *
+        * @since 1.30
+        */
+       public static function escapeIdForLink( $id ) {
+               global $wgFragmentMode;
+
+               // Only the primary encoding is consulted here, so it has to be configured.
+               $hasPrimary = isset( $wgFragmentMode[self::ID_PRIMARY] );
+               if ( !$hasPrimary ) {
+                       throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
+               }
+
+               $primaryMode = $wgFragmentMode[self::ID_PRIMARY];
+
+               return self::escapeIdInternal( $id, $primaryMode );
+       }
+
+       /**
+        * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
+        * a valid URL fragment for external interwikis.
+        *
+        * @param string $id String to escape
+        * @return string Escaped ID
+        *
+        * @since 1.30
+        */
+       public static function escapeIdForExternalInterwiki( $id ) {
+               global $wgExternalInterwikiFragmentMode;
+
+               // The mode comes from its own setting rather than from $wgFragmentMode.
+               $externalMode = $wgExternalInterwikiFragmentMode;
+               return self::escapeIdInternal( $id, $externalMode );
+       }
+
+       /**
+        * Helper for escapeIdFor*() functions. Performs most of the actual escaping.
+        *
+        * @param string $id String to escape
+        * @param string $mode One of modes from $wgFragmentMode
+        * @return string
+        */
+       private static function escapeIdInternal( $id, $mode ) {
+               // Decode character references first so the escaping below sees literal characters.
+               $id = self::decodeCharReferences( $id );
+               switch ( $mode ) {
+                       case 'html5':
+                               $id = str_replace( ' ', '_', $id );
+                               break;
+                       case 'legacy':
+                               // This corresponds to 'noninitial' mode of the old escapeId()
+                               static $replace = [
+                                       '%3A' => ':',
+                                       '%' => '.'
+                               ];
+                               // Percent-encode, then keep ':' literal and turn every other '%' into '.'.
+                               $id = urlencode( str_replace( ' ', '_', $id ) );
+                               $id = strtr( $id, $replace );
+                               break;
+                       case 'html5-legacy':
+                               $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
+                               $id = trim( $id, '_' );
+                               if ( $id === '' ) {
+                                       // Must have been all whitespace to start with.
+                                       $id = '_';
+                               }
+                               break;
+                       default:
+                               throw new InvalidArgumentException( "Invalid mode '$mode' passed to " . __METHOD__ );
+               }
+
+               return $id;
+       }
+
         /**
          * Given a string containing a space delimited list of ids, escape each id
          * to match ids escaped by the escapeId() function.
          *
+        * @todo wfDeprecated() uses of $options in 1.31, remove completely in 1.32
+        *
          * @since 1.27
          *
          * @param string $referenceString Space delimited list of ids
-        * @param string|array $options String or array of strings (default is array()):
-        *   'noninitial': This is a non-initial fragment of an id, not a full id,
-        *       so don't pay attention if the first character isn't valid at the
-        *       beginning of an id.  Only matters if $wgExperimentalHtmlIds is
-        *       false.
-        *   'legacy': Behave the way the old HTML 4-based ID escaping worked even
-        *       if $wgExperimentalHtmlIds is used, so we can generate extra
-        *       anchors and links won't break.
+        * @param string|array $options Deprecated and does nothing.
          * @return string
          */
         static function escapeIdReferenceList( $referenceString, $options = [] ) {
@@ -1238,7 +1362,7 @@ class Sanitizer {
  
                 # Escape each token as an id
                 foreach ( $references as &$ref ) {
-                       $ref = Sanitizer::escapeId( $ref, $options );
+                       $ref = self::escapeIdForAttribute( $ref );
                 }
  
                 # Merge the array back to a space delimited list string
@@ -1275,7 +1399,7 @@ class Sanitizer {
          * @return string Escaped input
          */
         static function escapeHtmlAllowEntities( $html ) {
-               $html = Sanitizer::decodeCharReferences( $html );
+               $html = self::decodeCharReferences( $html );
                 # It seems wise to escape ' as well as ", as a matter of course.  Can't
                 # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
                 # don't cause the entire string to disappear.
@@ -1317,14 +1441,14 @@ class Sanitizer {
  
                 foreach ( $pairs as $set ) {
                         $attribute = strtolower( $set[1] );
-                       $value = Sanitizer::getTagAttributeCallback( $set );
+                       $value = self::getTagAttributeCallback( $set );
  
                         // Normalize whitespace
                         $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
                         $value = trim( $value );
  
                         // Decode character references
-                       $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
+                       $attribs[$attribute] = self::decodeCharReferences( $value );
                 }
                 return $attribs;
         }
@@ -1340,7 +1464,7 @@ class Sanitizer {
                 $attribs = [];
                 foreach ( $assoc_array as $attribute => $value ) {
                         $encAttribute = htmlspecialchars( $attribute );
-                       $encValue = Sanitizer::safeEncodeAttribute( $value );
+                       $encValue = self::safeEncodeAttribute( $value );
  
                         $attribs[] = "$encAttribute=\"$encValue\"";
                 }
@@ -1427,11 +1551,11 @@ class Sanitizer {
         static function normalizeCharReferencesCallback( $matches ) {
                 $ret = null;
                 if ( $matches[1] != '' ) {
-                       $ret = Sanitizer::normalizeEntity( $matches[1] );
+                       $ret = self::normalizeEntity( $matches[1] );
                 } elseif ( $matches[2] != '' ) {
-                       $ret = Sanitizer::decCharReference( $matches[2] );
+                       $ret = self::decCharReference( $matches[2] );
                 } elseif ( $matches[3] != '' ) {
-                       $ret = Sanitizer::hexCharReference( $matches[3] );
+                       $ret = self::hexCharReference( $matches[3] );
                 }
                 if ( is_null( $ret ) ) {
                         return htmlspecialchars( $matches[0] );
@@ -1468,7 +1592,7 @@ class Sanitizer {
          */
         static function decCharReference( $codepoint ) {
                 $point = intval( $codepoint );
-               if ( Sanitizer::validateCodepoint( $point ) ) {
+               if ( self::validateCodepoint( $point ) ) {
                         return sprintf( '&#%d;', $point );
                 } else {
                         return null;
@@ -1481,7 +1605,7 @@ class Sanitizer {
          */
         static function hexCharReference( $codepoint ) {
                 $point = hexdec( $codepoint );
-               if ( Sanitizer::validateCodepoint( $point ) ) {
+               if ( self::validateCodepoint( $point ) ) {
                         return sprintf( '&#x%x;', $point );
                 } else {
                         return null;
@@ -1535,7 +1659,10 @@ class Sanitizer {
                 $text = preg_replace_callback(
                         self::CHAR_REFS_REGEX,
                         [ 'Sanitizer', 'decodeCharReferencesCallback' ],
-                       $text, /* limit */ -1, $count );
+                       $text,
+                       -1, //limit
+                       $count
+               );
  
                 if ( $count ) {
                         return $wgContLang->normalize( $text );
@@ -1550,11 +1677,11 @@ class Sanitizer {
          */
         static function decodeCharReferencesCallback( $matches ) {
                 if ( $matches[1] != '' ) {
-                       return Sanitizer::decodeEntity( $matches[1] );
+                       return self::decodeEntity( $matches[1] );
                 } elseif ( $matches[2] != '' ) {
-                       return Sanitizer::decodeChar( intval( $matches[2] ) );
+                       return self::decodeChar( intval( $matches[2] ) );
                 } elseif ( $matches[3] != '' ) {
-                       return Sanitizer::decodeChar( hexdec( $matches[3] ) );
+                       return self::decodeChar( hexdec( $matches[3] ) );
                 }
                 # Last case should be an ampersand by itself
                 return $matches[0];
@@ -1568,7 +1695,7 @@ class Sanitizer {
          * @private
          */
         static function decodeChar( $codepoint ) {
-               if ( Sanitizer::validateCodepoint( $codepoint ) ) {
+               if ( self::validateCodepoint( $codepoint ) ) {
                         return UtfNormal\Utils::codepointToUtf8( $codepoint );
                 } else {
                         return UtfNormal\Constants::UTF8_REPLACEMENT;
@@ -1601,7 +1728,7 @@ class Sanitizer {
          * @return array
          */
         static function attributeWhitelist( $element ) {
-               $list = Sanitizer::setupAttributeWhitelist();
+               $list = self::setupAttributeWhitelist();
                 return isset( $list[$element] )
                         ? $list[$element]
                         : [];
@@ -1772,7 +1899,7 @@ class Sanitizer {
                         # Not usually allowed, but may be used for extension-style hooks
                         # such as <math> when it is rasterized, or if $wgAllowImageTag is
                         # true
-                       'img'        => array_merge( $common, [ 'alt', 'src', 'width', 'height' ] ),
+                       'img'        => array_merge( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
  
                         'video'      => array_merge( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
                         'source'     => array_merge( $common, [ 'type', 'src' ] ),
@@ -1809,6 +1936,10 @@ class Sanitizer {
                         # https://www.w3.org/TR/REC-MathML/
                         'math'       => [ 'class', 'style', 'id', 'title' ],
  
+                       // HTML 5 section 4.5
+                       'figure'     => $common,
+                       'figcaption' => $common,
+
                         # HTML 5 section 4.6
                         'bdi' => $common,
  
@@ -1824,7 +1955,7 @@ class Sanitizer {
                         // (ie: validateTag rejects tags missing the attributes needed for Microdata)
                         // So we don't bother including $common attributes that have no purpose.
                         'meta' => [ 'itemprop', 'content' ],
-                       'link' => [ 'itemprop', 'href' ],
+                       'link' => [ 'itemprop', 'href', 'title' ],
                 ];
  
                 return $whitelist;
@@ -1876,7 +2007,7 @@ class Sanitizer {
         static function cleanUrl( $url ) {
                 # Normalize any HTML entities in input. They will be
                 # re-escaped by makeExternalLink().
-               $url = Sanitizer::decodeCharReferences( $url );
+               $url = self::decodeCharReferences( $url );
  
                 # Escape any control characters introduced by the above step
                 $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',