From 13eb618dd9f5cee7963f0119057d7173c1488f1d Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Fri, 3 Jun 2005 08:12:48 +0000 Subject: [PATCH] * (bug 684) Accept an attribute parameter array on parser hook tags Some parts of http://bugzilla.wikimedia.org/attachment.cgi?id=96&action=view with heavy modification; using tag matching in the style we accept regular HTML elements, and decode attribute values to proper strings. --- RELEASE-NOTES | 1 + includes/Parser.php | 78 +++++++++++++++++++------ includes/Sanitizer.php | 127 +++++++++++++++++++++++++++++------------ 3 files changed, 151 insertions(+), 55 deletions(-) diff --git a/RELEASE-NOTES b/RELEASE-NOTES index 4cac32b865..923c14ba17 100644 --- a/RELEASE-NOTES +++ b/RELEASE-NOTES @@ -239,6 +239,7 @@ Various bugfixes, small features, and a few experimental things: * (bug 2173) Fatal error when removing an article with an empty title from the watchlist * Removed -f parameter from mail() usage, likely to cause failures and bounces. * (bug 2130) Fixed interwiki links with fragments +* (bug 684) Accept an attribute parameter array on parser hook tags === Caveats === diff --git a/includes/Parser.php b/includes/Parser.php index 7d32701bcc..d9fa2af7e7 100644 --- a/includes/Parser.php +++ b/includes/Parser.php @@ -258,38 +258,73 @@ class Parser * @access private * @static */ - function extractTags($tag, $text, &$content, $uniq_prefix = ''){ + function extractTagsAndParams($tag, $text, &$content, &$tags, &$params, $uniq_prefix = ''){ $rnd = $uniq_prefix . '-' . $tag . 
Parser::getRandomString(); if ( !$content ) { $content = array( ); } $n = 1; $stripped = ''; + + if ( !$tags ) { + $tags = array( ); + } + + if ( !$params ) { + $params = array( ); + } + + if( $tag == STRIP_COMMENTS ) { + $start = '//'; + } else { + $start = "/<$tag([^>]*)>/i"; + $end = "/<\\/$tag\\s*>/i"; + } while ( '' != $text ) { - if($tag==STRIP_COMMENTS) { - $p = preg_split( '//i', $p[1], 2 ); - } else { - $q = preg_split( "/<\\/\\s*$tag\\s*>/i", $p[1], 2 ); - } - $marker = $rnd . sprintf('%08X', $n++); - $content[$marker] = $q[0]; - $stripped .= $marker; $text = $q[1]; } } return $stripped; } + /** + * Wrapper function for extractTagsAndParams + * for cases where $tags and $params isn't needed + * i.e. where tags will never have params, like + * + * @access private + * @static + */ + function extractTags( $tag, $text, &$content, $uniq_prefix = '' ) { + $dummy_tags = array(); + $dummy_params = array(); + + return Parser::extractTagsAndParams( $tag, $text, $content, + $dummy_tags, $dummy_params, $uniq_prefix ); + } + /** * Strips and renders nowiki, pre, math, hiero * If $render is set, performs necessary rendering operations on plugins @@ -311,6 +346,8 @@ class Parser $pre_content = array(); $comment_content = array(); $ext_content = array(); + $ext_tags = array(); + $ext_params = array(); $gallery_content = array(); # Replace any instances of the placeholders @@ -387,12 +424,15 @@ class Parser # Extensions foreach ( $this->mTagHooks as $tag => $callback ) { $ext_content[$tag] = array(); - $text = Parser::extractTags( $tag, $text, $ext_content[$tag], $uniq_prefix ); + $text = Parser::extractTagsAndParams( $tag, $text, $ext_content[$tag], + $ext_tags[$tag], $ext_params[$tag], $uniq_prefix ); foreach( $ext_content[$tag] as $marker => $content ) { + $full_tag = $ext_tags[$tag][$marker]; + $params = $ext_params[$tag][$marker]; if ( $render ) { - $ext_content[$tag][$marker] = $callback( $content ); + $ext_content[$tag][$marker] = $callback( $content, $params ); } 
else { - $ext_content[$tag][$marker] = "<$tag>$content</$tag>"; + $ext_content[$tag][$marker] = "$full_tag$content</$tag>"; } } } diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index d28ed93af0..40016d93b2 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -36,6 +36,27 @@ define( 'MW_CHAR_REFS_REGEX', |&\#X([0-9A-Za-z]+); |(&)/x' ); +/** + * Regular expression to match HTML/XML attribute pairs within a tag. + * Allows some... latitude. + * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes + */ +$attrib = '[A-Za-z0-9]'; +$space = '[\x09\x0a\x0d\x20]'; +define( 'MW_ATTRIBS_REGEX', + "/(?:^|$space)($attrib+) + ($space*=$space* + (?: + # The attribute value: quoted or alone + \"([^<\"]*)\" + | '([^<']*)' + | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+) + | (\#[0-9a-fA-F]+) # Technically wrong, but lots of + # colors are specified like this. + # We'll be normalizing it. + ) + )?(?=$space|\$)/sx" ); + /** * List of all named character entities defined in HTML 4.01 * http://www.w3.org/TR/html4/sgml/entities.html @@ -490,21 +511,8 @@ class Sanitizer { # Unquoted attribute # Since we quote this later, this can be anything distinguishable # from the end of the attribute - $attrib = '[A-Za-z0-9]'; - $space = '[\x09\x0a\x0d\x20]'; if( !preg_match_all( - "/(?:^|$space)($attrib+) - ($space*=$space* - (?: - # The attribute value: quoted or alone - \"([^<\"]*)\" - | '([^<']*)' - | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+) - | (\#[0-9a-fA-F]+) # Technically wrong, but lots of - # colors are specified like this. - # We'll be normalizing it. - ) - )?(?=$space|\$)/sx", + MW_ATTRIBS_REGEX, $text, $pairs, PREG_SET_ORDER ) ) { @@ -517,26 +525,11 @@ class Sanitizer { $attribute = strtolower( $set[1] ); if( !isset( $whitelist[$attribute] ) ) { continue; - } elseif( isset( $set[6] ) ) { - # Illegal #XXXXXX color with no quotes. - $value = Sanitizer::normalizeAttributeValue( $set[6] ); - } elseif( isset( $set[5] ) ) { - # No quotes. 
- $value = Sanitizer::normalizeAttributeValue( $set[5] ); - } elseif( isset( $set[4] ) ) { - # Single-quoted - $value = str_replace( '"', '&quot;', - Sanitizer::normalizeAttributeValue( $set[4] ) ); - } elseif( isset( $set[3] ) ) { - # Double-quoted - $value = Sanitizer::normalizeAttributeValue( $set[3] ); - } elseif( !isset( $set[2] ) ) { - # In XHTML, attributes must have a value. - $value = $set[1]; - } else { - wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." ); } + $raw = Sanitizer::getTagAttributeCallback( $set ); + $value = Sanitizer::normalizeAttributeValue( $raw ); + # Strip javascript "expression" from stylesheets. # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp if( $attribute == 'style' && preg_match( @@ -557,6 +550,67 @@ class Sanitizer { } } + /** + * Return an associative array of attribute names and values from + * a partial tag string. Attribute names are forced to lowercase, + * character references are decoded to UTF-8 text. + * + * @param string + * @return array + */ + function decodeTagAttributes( $text ) { + $attribs = array(); + + if( trim( $text ) == '' ) { + return $attribs; + } + + if( !preg_match_all( + MW_ATTRIBS_REGEX, + $text, + $pairs, + PREG_SET_ORDER ) ) { + return $attribs; + } + + foreach( $pairs as $set ) { + $attribute = strtolower( $set[1] ); + $value = Sanitizer::getTagAttributeCallback( $set ); + $attribs[$attribute] = Sanitizer::decodeCharReferences( $value ); + } + return $attribs; + } + + /** + * Pick the appropriate attribute value from a match set from the + * MW_ATTRIBS_REGEX matches. + * + * @param array $set + * @return string + * @access private + */ + function getTagAttributeCallback( $set ) { + if( isset( $set[6] ) ) { + # Illegal #XXXXXX color with no quotes. + return $set[6]; + } elseif( isset( $set[5] ) ) { + # No quotes. 
+ return $set[5]; + } elseif( isset( $set[4] ) ) { + # Single-quoted + return $set[4]; + } elseif( isset( $set[3] ) ) { + # Double-quoted + return $set[3]; + } elseif( !isset( $set[2] ) ) { + # In XHTML, attributes must have a value. + # For 'reduced' form, return explicitly the attribute name here. + return $set[1]; + } else { + wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." ); + } + } + /** * Normalize whitespace and character references in an XML source- * encoded text for an attribute value. @@ -570,10 +624,11 @@ class Sanitizer { * @access private */ function normalizeAttributeValue( $text ) { - return preg_replace( - '/\r\n|[\x20\x0d\x0a\x09]/', - ' ', - Sanitizer::normalizeCharReferences( $text ) ); + return str_replace( '"', '&quot;', + preg_replace( + '/\r\n|[\x20\x0d\x0a\x09]/', + ' ', + Sanitizer::normalizeCharReferences( $text ) ) ); } /** -- 2.20.1