* @access private
* @static
*/
- function extractTags($tag, $text, &$content, $uniq_prefix = ''){
+ function extractTagsAndParams($tag, $text, &$content, &$tags, &$params, $uniq_prefix = ''){
+ # Replaces every <$tag ...>...</$tag> section of $text (or every HTML
+ # comment when $tag is STRIP_COMMENTS) with a unique marker and returns
+ # the resulting text. Filled in per marker, by reference:
+ # $content - the text between the opening and closing tag
+ # $tags - the opening tag including its attribute text
+ # $params - the attributes as decoded by Sanitizer::decodeTagAttributes
$rnd = $uniq_prefix . '-' . $tag . Parser::getRandomString();
if ( !$content ) {
$content = array( );
}
$n = 1;
$stripped = '';
+
+ if ( !$tags ) {
+ $tags = array( );
+ }
+
+ if ( !$params ) {
+ $params = array( );
+ }
+
+ if( $tag == STRIP_COMMENTS ) {
+ # The empty capture group keeps the split result in the same
+ # three-part shape (before, attributes, rest) as for real tags.
+ $start = '/<!--()/';
+ $end = '/-->/';
+ } else {
+ # NOTE(review): $tag is interpolated into the pattern unescaped.
+ $start = "/<$tag([^>]*)>/i";
+ $end = "/<\\/$tag\\s*>/i";
+ }
while ( '' != $text ) {
- if($tag==STRIP_COMMENTS) {
- $p = preg_split( '/<!--/', $text, 2 );
- } else {
- $p = preg_split( "/<\\s*$tag\\s*>/i", $text, 2 );
- }
+ $p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
$stripped .= $p[0];
- if ( ( count( $p ) < 2 ) || ( '' == $p[1] ) ) {
- $text = '';
+ if( count( $p ) < 3 ) {
+ break;
+ }
+ $attributes = $p[1];
+ $inside = $p[2];
+
+ $marker = $rnd . sprintf('%08X', $n++);
+ $stripped .= $marker;
+
+ $tags[$marker] = "<$tag$attributes>";
+ $params[$marker] = Sanitizer::decodeTagAttributes( $attributes );
+
+ $q = preg_split( $end, $inside, 2 );
+ $content[$marker] = $q[0];
+ # preg_split always returns at least one element; a single element
+ # means the closing tag was not found.
+ if( count( $q ) < 2 ) {
+ # No end tag -- let it run out to the end of the text.
+ break;
} else {
- if($tag==STRIP_COMMENTS) {
- $q = preg_split( '/-->/i', $p[1], 2 );
- } else {
- $q = preg_split( "/<\\/\\s*$tag\\s*>/i", $p[1], 2 );
- }
- $marker = $rnd . sprintf('%08X', $n++);
- $content[$marker] = $q[0];
- $stripped .= $marker;
$text = $q[1];
}
}
return $stripped;
}
+	/**
+	 * Convenience wrapper around extractTagsAndParams for callers that
+	 * need neither $tags nor $params, i.e. for tags that will never
+	 * have params, like <nowiki>.
+	 *
+	 * @access private
+	 * @static
+	 */
+	function extractTags( $tag, $text, &$content, $uniq_prefix = '' ) {
+		# Throwaway receivers for the by-reference outputs we do not need.
+		$unused_tags = array();
+		$unused_params = array();
+
+		$stripped = Parser::extractTagsAndParams(
+			$tag, $text, $content, $unused_tags, $unused_params, $uniq_prefix );
+		return $stripped;
+	}
+
/**
* Strips and renders nowiki, pre, math, hiero
* If $render is set, performs necessary rendering operations on plugins
$pre_content = array();
$comment_content = array();
$ext_content = array();
+ $ext_tags = array();
+ $ext_params = array();
$gallery_content = array();
# Replace any instances of the placeholders
# Extensions
foreach ( $this->mTagHooks as $tag => $callback ) {
$ext_content[$tag] = array();
- $text = Parser::extractTags( $tag, $text, $ext_content[$tag], $uniq_prefix );
+ $text = Parser::extractTagsAndParams( $tag, $text, $ext_content[$tag],
+ $ext_tags[$tag], $ext_params[$tag], $uniq_prefix );
foreach( $ext_content[$tag] as $marker => $content ) {
+ $full_tag = $ext_tags[$tag][$marker];
+ $params = $ext_params[$tag][$marker];
if ( $render ) {
- $ext_content[$tag][$marker] = $callback( $content );
+ $ext_content[$tag][$marker] = $callback( $content, $params );
} else {
- $ext_content[$tag][$marker] = "<$tag>$content</$tag>";
+ $ext_content[$tag][$marker] = "$full_tag$content</$tag>";
}
}
}
|&\#X([0-9A-Za-z]+);
|(&)/x' );
+/**
+ * Regular expression to match HTML/XML attribute pairs within a tag.
+ * Allows some... latitude.
+ * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
+ *
+ * Capture groups of each match:
+ *  1 - the attribute name
+ *  2 - the optional "= value" part, including the equals sign
+ *  3 - the value when double-quoted (quotes excluded)
+ *  4 - the value when single-quoted (quotes excluded)
+ *  5 - the value when unquoted
+ *  6 - the value when written as an unquoted #hex colour
+ */
+$attrib = '[A-Za-z0-9]';
+$space = '[\x09\x0a\x0d\x20]';
+define( 'MW_ATTRIBS_REGEX',
+ "/(?:^|$space)($attrib+)
+ ($space*=$space*
+ (?:
+ # The attribute value: quoted or alone
+ \"([^<\"]*)\"
+ | '([^<']*)'
+ | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
+ | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
+ # colors are specified like this.
+ # We'll be normalizing it.
+ )
+ )?(?=$space|\$)/sx" );
+
/**
* List of all named character entities defined in HTML 4.01
* http://www.w3.org/TR/html4/sgml/entities.html
# Unquoted attribute
# Since we quote this later, this can be anything distinguishable
# from the end of the attribute
- $attrib = '[A-Za-z0-9]';
- $space = '[\x09\x0a\x0d\x20]';
if( !preg_match_all(
- "/(?:^|$space)($attrib+)
- ($space*=$space*
- (?:
- # The attribute value: quoted or alone
- \"([^<\"]*)\"
- | '([^<']*)'
- | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
- | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
- # colors are specified like this.
- # We'll be normalizing it.
- )
- )?(?=$space|\$)/sx",
+ MW_ATTRIBS_REGEX,
$text,
$pairs,
PREG_SET_ORDER ) ) {
$attribute = strtolower( $set[1] );
if( !isset( $whitelist[$attribute] ) ) {
continue;
- } elseif( isset( $set[6] ) ) {
- # Illegal #XXXXXX color with no quotes.
- $value = Sanitizer::normalizeAttributeValue( $set[6] );
- } elseif( isset( $set[5] ) ) {
- # No quotes.
- $value = Sanitizer::normalizeAttributeValue( $set[5] );
- } elseif( isset( $set[4] ) ) {
- # Single-quoted
- $value = str_replace( '"', '"',
- Sanitizer::normalizeAttributeValue( $set[4] ) );
- } elseif( isset( $set[3] ) ) {
- # Double-quoted
- $value = Sanitizer::normalizeAttributeValue( $set[3] );
- } elseif( !isset( $set[2] ) ) {
- # In XHTML, attributes must have a value.
- $value = $set[1];
- } else {
- wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
}
+ $raw = Sanitizer::getTagAttributeCallback( $set );
+ $value = Sanitizer::normalizeAttributeValue( $raw );
+
# Strip javascript "expression" from stylesheets.
# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
if( $attribute == 'style' && preg_match(
}
}
+	/**
+	 * Build an associative array of attribute name => value from the
+	 * attribute portion of a tag. Attribute names are forced to
+	 * lowercase; each value is passed through
+	 * Sanitizer::decodeCharReferences. When a name occurs more than
+	 * once, the last occurrence wins.
+	 *
+	 * @param string $text partial tag string holding the attributes
+	 * @return array empty when $text is blank or contains no attributes
+	 */
+	function decodeTagAttributes( $text ) {
+		$decoded = array();
+
+		# Nothing but whitespace: no attributes to report.
+		if( trim( $text ) == '' ) {
+			return $decoded;
+		}
+
+		$found = preg_match_all( MW_ATTRIBS_REGEX, $text, $matches, PREG_SET_ORDER );
+		if( $found ) {
+			foreach( $matches as $match ) {
+				$name = strtolower( $match[1] );
+				$raw = Sanitizer::getTagAttributeCallback( $match );
+				$decoded[$name] = Sanitizer::decodeCharReferences( $raw );
+			}
+		}
+		return $decoded;
+	}
+
+	/**
+	 * Select the attribute's raw value out of one MW_ATTRIBS_REGEX
+	 * match set.
+	 *
+	 * The value alternatives are probed from the highest capture group
+	 * down: 6 (illegal unquoted #XXXXXX color), 5 (no quotes),
+	 * 4 (single-quoted), 3 (double-quoted). If none of them is set and
+	 * group 2 is absent as well, the attribute was written without a
+	 * value; since XHTML attributes must have one, the attribute name
+	 * (group 1) is returned instead.
+	 *
+	 * @param array $set
+	 * @return string
+	 * @access private
+	 */
+	function getTagAttributeCallback( $set ) {
+		foreach( array( 6, 5, 4, 3 ) as $group ) {
+			if( isset( $set[$group] ) ) {
+				return $set[$group];
+			}
+		}
+
+		if( !isset( $set[2] ) ) {
+			# 'Reduced' form: return explicitly the attribute name here.
+			return $set[1];
+		}
+
+		wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
+	}
+
/**
* Normalize whitespace and character references in an XML source-
* encoded text for an attribute value.
* @access private
*/
function normalizeAttributeValue( $text ) {
- return preg_replace(
- '/\r\n|[\x20\x0d\x0a\x09]/',
- ' ',
- Sanitizer::normalizeCharReferences( $text ) );
+ # After normalizing character references and turning each whitespace
+ # character (or CRLF pair) into a single space, encode literal double
+ # quotes as &quot; -- a value taken from a single-quoted attribute
+ # may contain them.
+ return str_replace( '"', '&quot;',
+ preg_replace(
+ '/\r\n|[\x20\x0d\x0a\x09]/',
+ ' ',
+ Sanitizer::normalizeCharReferences( $text ) ) );
}
/**