const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
/**
 * Regular expression matching attribute names of the form "xmlns:<suffix>", where the
 * suffix is one or more ASCII letters, digits, ':', '_', '-' or '.' characters.
 */
const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
+ /**
+ * Tells escapeIdForAttribute() to encode the ID using the wiki's primary encoding.
+ *
+ * @since 1.30
+ */
+ const ID_PRIMARY = 0;
+
+ /**
+ * Tells escapeIdForAttribute() to encode the ID using the fallback encoding, or return false
+ * if no fallback is configured.
+ *
+ * @since 1.30
+ */
+ const ID_FALLBACK = 1;
+
/**
* List of all named character entities defined in HTML 4.01
* https://www.w3.org/TR/html4/sgml/entities.html
extract( self::getRecognizedTagData( $extratags, $removetags ) );
# Remove HTML comments
- $text = Sanitizer::removeHTMLcomments( $text );
+ $text = self::removeHTMLcomments( $text );
$bits = explode( '<', $text );
$text = str_replace( '>', '>', array_shift( $bits ) );
if ( !MWTidy::isEnabled() ) {
call_user_func_array( $processCallback, [ &$params, $args ] );
}
- if ( !Sanitizer::validateTag( $params, $t ) ) {
+ if ( !self::validateTag( $params, $t ) ) {
$badtag = true;
}
# Strip non-approved attributes from the tag
- $newparams = Sanitizer::fixTagAttributes( $params, $t );
+ $newparams = self::fixTagAttributes( $params, $t );
}
if ( !$badtag ) {
$rest = str_replace( '>', '>', $rest );
call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
}
}
- if ( !Sanitizer::validateTag( $params, $t ) ) {
+ if ( !self::validateTag( $params, $t ) ) {
$badtag = true;
}
- $newparams = Sanitizer::fixTagAttributes( $params, $t );
+ $newparams = self::fixTagAttributes( $params, $t );
if ( !$badtag ) {
if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
# Interpret self-closing tags as empty tags even when
* @return bool
*/
static function validateTag( $params, $element ) {
- $params = Sanitizer::decodeTagAttributes( $params );
+ $params = self::decodeTagAttributes( $params );
if ( $element == 'meta' || $element == 'link' ) {
if ( !isset( $params['itemprop'] ) ) {
* @todo Check for unique id attribute :P
*/
static function validateTagAttributes( $attribs, $element ) {
// Validate the attributes against the whitelist returned by attributeWhitelist() for this element.
- return Sanitizer::validateAttributes( $attribs,
- Sanitizer::attributeWhitelist( $element ) );
+ return self::validateAttributes( $attribs,
+ self::attributeWhitelist( $element ) );
}
/**
# Strip javascript "expression" from stylesheets.
# https://msdn.microsoft.com/en-us/library/ms537634.aspx
if ( $attribute == 'style' ) {
- $value = Sanitizer::checkCss( $value );
+ $value = self::checkCss( $value );
}
# Escape HTML id attributes
if ( $attribute === 'id' ) {
- $value = Sanitizer::escapeId( $value, 'noninitial' );
+ $value = self::escapeIdForAttribute( $value, self::ID_PRIMARY );
}
# Escape HTML id reference lists
|| $attribute === 'aria-labelledby'
|| $attribute === 'aria-owns'
) {
- $value = Sanitizer::escapeIdReferenceList( $value, 'noninitial' );
+ $value = self::escapeIdReferenceList( $value, 'noninitial' );
}
// RDFa and microdata properties allow URLs, URIs and/or CURIs.
*/
public static function normalizeCss( $value ) {
// Decode character references like {
- $value = Sanitizer::decodeCharReferences( $value );
+ $value = self::decodeCharReferences( $value );
// Decode escape sequences and line continuation
// See the grammar in the CSS 2 spec, appendix D.
return '';
}
- $decoded = Sanitizer::decodeTagAttributes( $text );
- $stripped = Sanitizer::validateTagAttributes( $decoded, $element );
+ $decoded = self::decodeTagAttributes( $text );
+ $stripped = self::validateTagAttributes( $decoded, $element );
if ( $sorted ) {
ksort( $stripped );
}
- return Sanitizer::safeEncodeTagAttributes( $stripped );
+ return self::safeEncodeTagAttributes( $stripped );
}
/**
* @return string HTML-encoded text fragment
*/
static function safeEncodeAttribute( $text ) {
- $encValue = Sanitizer::encodeAttribute( $text );
+ $encValue = self::encodeAttribute( $text );
# Templates and links may be expanded in later parsing,
# creating invalid or dangerous output. Suppress this.
* ambiguous if it's part of something that looks like a percent escape
* (which don't work reliably in fragments cross-browser).
*
+ * @deprecated since 1.30, use one of this class' escapeIdFor*() functions
+ *
* @see https://www.w3.org/TR/html401/types.html#type-name Valid characters
* in the id and name attributes
* @see https://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with
global $wgExperimentalHtmlIds;
$options = (array)$options;
- $id = Sanitizer::decodeCharReferences( $id );
+ $id = self::decodeCharReferences( $id );
if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
return $id;
}
+ /**
+ * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
+ * a valid HTML id attribute.
+ *
+ * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe,
+ * be sure to use proper escaping.
+ *
+ * @param string $id String to escape
+ * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding
+ * should be used.
+ * @return string|bool Escaped ID or false if fallback encoding is requested but it's not
+ * configured.
+ *
+ * @since 1.30
+ */
+ public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
+ 	global $wgFragmentMode;
+
+ 	// Normal case: the requested slot of $wgFragmentMode is configured.
+ 	if ( isset( $wgFragmentMode[$mode] ) ) {
+ 		return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
+ 	}
+
+ 	// A missing non-primary (fallback) encoding is reported to the caller as false.
+ 	if ( $mode !== self::ID_PRIMARY ) {
+ 		return false;
+ 	}
+
+ 	// A missing primary encoding is treated as a configuration error.
+ 	throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
+ }
+
+ /**
+ * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
+ * a valid URL fragment.
+ *
+ * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe,
+ * be sure to use proper escaping.
+ *
+ * @param string $id String to escape
+ * @return string Escaped ID
+ *
+ * @since 1.30
+ */
+ public static function escapeIdForLink( $id ) {
+ 	global $wgFragmentMode;
+
+ 	// Links are always escaped with the primary encoding from $wgFragmentMode.
+ 	if ( isset( $wgFragmentMode[self::ID_PRIMARY] ) ) {
+ 		return self::escapeIdInternal( $id, $wgFragmentMode[self::ID_PRIMARY] );
+ 	}
+
+ 	// Without a primary encoding there is nothing to escape with.
+ 	throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
+ }
+
+ /**
+ * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
+ * a valid URL fragment for external interwikis.
+ *
+ * @param string $id String to escape
+ * @return string Escaped ID
+ *
+ * @since 1.30
+ */
+ public static function escapeIdForExternalInterwiki( $id ) {
+ 	global $wgExternalInterwikiFragmentMode;
+
+ 	// The mode comes from $wgExternalInterwikiFragmentMode rather than $wgFragmentMode.
+ 	return self::escapeIdInternal( $id, $wgExternalInterwikiFragmentMode );
+ }
+
+ /**
+ * Helper for escapeIdFor*() functions. Performs most of the actual escaping.
+ *
+ * @param string $id String to escape
+ * @param string $mode One of the modes from $wgFragmentMode ('html5', 'legacy' or 'html5-legacy')
+ * @return string
+ */
+ private static function escapeIdInternal( $id, $mode ) {
+ 	// Decode character references before applying the mode-specific escaping.
+ 	$id = self::decodeCharReferences( $id );
+
+ 	switch ( $mode ) {
+ 		case 'html5':
+ 			// Only spaces are replaced; every other character is passed through unchanged.
+ 			$id = str_replace( ' ', '_', $id );
+ 			break;
+ 		case 'legacy':
+ 			// This corresponds to 'noninitial' mode of the old escapeId().
+ 			// strtr() tries the longest key first, so an encoded colon ('%3A') is restored
+ 			// to ':' before the remaining '%' signs are turned into '.'.
+ 			static $replace = [
+ 				'%3A' => ':',
+ 				'%' => '.'
+ 			];
+
+ 			$id = urlencode( str_replace( ' ', '_', $id ) );
+ 			$id = strtr( $id, $replace );
+ 			break;
+ 		case 'html5-legacy':
+ 			// Collapse each run of whitespace, '_', quotes, '&', '#' and '%' into one underscore.
+ 			$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
+ 			$id = trim( $id, '_' );
+ 			if ( $id === '' ) {
+ 				// The input was empty or consisted only of replaced characters;
+ 				// fall back to a single underscore.
+ 				$id = '_';
+ 			}
+ 			break;
+ 		default:
+ 			// Both quoted values are closed so the message reads cleanly in logs.
+ 			throw new InvalidArgumentException(
+ 				"Invalid mode '$mode' passed to '" . __METHOD__ . "'"
+ 			);
+ 	}
+
+ 	return $id;
+ }
+
/**
* Given a string containing a space delimited list of ids, escape each id
* to match ids escaped by the escapeIdForAttribute() function.
*
+ * @todo wfDeprecated() uses of $options in 1.31, remove completely in 1.32
+ *
* @since 1.27
*
* @param string $referenceString Space delimited list of ids
- * @param string|array $options String or array of strings (default is array()):
- * 'noninitial': This is a non-initial fragment of an id, not a full id,
- * so don't pay attention if the first character isn't valid at the
- * beginning of an id. Only matters if $wgExperimentalHtmlIds is
- * false.
- * 'legacy': Behave the way the old HTML 4-based ID escaping worked even
- * if $wgExperimentalHtmlIds is used, so we can generate extra
- * anchors and links won't break.
+ * @param string|array $options Deprecated and does nothing.
* @return string
*/
static function escapeIdReferenceList( $referenceString, $options = [] ) {
# Escape each token as an id
foreach ( $references as &$ref ) {
- $ref = Sanitizer::escapeId( $ref, $options );
+ $ref = self::escapeIdForAttribute( $ref );
}
# Merge the array back to a space delimited list string
* @return string Escaped input
*/
static function escapeHtmlAllowEntities( $html ) {
- $html = Sanitizer::decodeCharReferences( $html );
+ $html = self::decodeCharReferences( $html );
# It seems wise to escape ' as well as ", as a matter of course. Can't
# hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
# don't cause the entire string to disappear.
foreach ( $pairs as $set ) {
$attribute = strtolower( $set[1] );
- $value = Sanitizer::getTagAttributeCallback( $set );
+ $value = self::getTagAttributeCallback( $set );
// Normalize whitespace
$value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
$value = trim( $value );
// Decode character references
- $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
+ $attribs[$attribute] = self::decodeCharReferences( $value );
}
return $attribs;
}
$attribs = [];
foreach ( $assoc_array as $attribute => $value ) {
$encAttribute = htmlspecialchars( $attribute );
- $encValue = Sanitizer::safeEncodeAttribute( $value );
+ $encValue = self::safeEncodeAttribute( $value );
$attribs[] = "$encAttribute=\"$encValue\"";
}
static function normalizeCharReferencesCallback( $matches ) {
$ret = null;
if ( $matches[1] != '' ) {
- $ret = Sanitizer::normalizeEntity( $matches[1] );
+ $ret = self::normalizeEntity( $matches[1] );
} elseif ( $matches[2] != '' ) {
- $ret = Sanitizer::decCharReference( $matches[2] );
+ $ret = self::decCharReference( $matches[2] );
} elseif ( $matches[3] != '' ) {
- $ret = Sanitizer::hexCharReference( $matches[3] );
+ $ret = self::hexCharReference( $matches[3] );
}
if ( is_null( $ret ) ) {
return htmlspecialchars( $matches[0] );
*/
static function decCharReference( $codepoint ) {
$point = intval( $codepoint );
- if ( Sanitizer::validateCodepoint( $point ) ) {
+ if ( self::validateCodepoint( $point ) ) {
return sprintf( '&#%d;', $point );
} else {
return null;
*/
static function hexCharReference( $codepoint ) {
$point = hexdec( $codepoint );
- if ( Sanitizer::validateCodepoint( $point ) ) {
+ if ( self::validateCodepoint( $point ) ) {
return sprintf( '&#x%x;', $point );
} else {
return null;
$text = preg_replace_callback(
self::CHAR_REFS_REGEX,
[ 'Sanitizer', 'decodeCharReferencesCallback' ],
- $text, /* limit */ -1, $count );
+ $text,
+ -1, //limit
+ $count
+ );
if ( $count ) {
return $wgContLang->normalize( $text );
*/
static function decodeCharReferencesCallback( $matches ) {
if ( $matches[1] != '' ) {
- return Sanitizer::decodeEntity( $matches[1] );
+ return self::decodeEntity( $matches[1] );
} elseif ( $matches[2] != '' ) {
- return Sanitizer::decodeChar( intval( $matches[2] ) );
+ return self::decodeChar( intval( $matches[2] ) );
} elseif ( $matches[3] != '' ) {
- return Sanitizer::decodeChar( hexdec( $matches[3] ) );
+ return self::decodeChar( hexdec( $matches[3] ) );
}
# Last case should be an ampersand by itself
return $matches[0];
* @private
*/
static function decodeChar( $codepoint ) {
- if ( Sanitizer::validateCodepoint( $codepoint ) ) {
+ if ( self::validateCodepoint( $codepoint ) ) {
return UtfNormal\Utils::codepointToUtf8( $codepoint );
} else {
return UtfNormal\Constants::UTF8_REPLACEMENT;
* @return array
*/
static function attributeWhitelist( $element ) {
- $list = Sanitizer::setupAttributeWhitelist();
+ $list = self::setupAttributeWhitelist();
return isset( $list[$element] )
? $list[$element]
: [];
# Not usually allowed, but may be used for extension-style hooks
# such as <math> when it is rasterized, or if $wgAllowImageTag is
# true
- 'img' => array_merge( $common, [ 'alt', 'src', 'width', 'height' ] ),
+ 'img' => array_merge( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ),
'video' => array_merge( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
'source' => array_merge( $common, [ 'type', 'src' ] ),
# https://www.w3.org/TR/REC-MathML/
'math' => [ 'class', 'style', 'id', 'title' ],
+ // HTML 5 section 4.5
+ 'figure' => $common,
+ 'figcaption' => $common,
+
# HTML 5 section 4.6
'bdi' => $common,
// (ie: validateTag rejects tags missing the attributes needed for Microdata)
// So we don't bother including $common attributes that have no purpose.
'meta' => [ 'itemprop', 'content' ],
- 'link' => [ 'itemprop', 'href' ],
+ 'link' => [ 'itemprop', 'href', 'title' ],
];
return $whitelist;
static function cleanUrl( $url ) {
# Normalize any HTML entities in input. They will be
# re-escaped by makeExternalLink().
- $url = Sanitizer::decodeCharReferences( $url );
+ $url = self::decodeCharReferences( $url );
# Escape any control characters introduced by the above step
$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',