* (bug 684) Accept an attribute parameter array on parser hook tags
author Brion Vibber <brion@users.mediawiki.org>
Fri, 3 Jun 2005 08:12:48 +0000 (08:12 +0000)
committer Brion Vibber <brion@users.mediawiki.org>
Fri, 3 Jun 2005 08:12:48 +0000 (08:12 +0000)
Some parts of http://bugzilla.wikimedia.org/attachment.cgi?id=96&action=view
with heavy modification; using tag matching in the style we accept regular
HTML elements, and decode attribute values to proper strings.

RELEASE-NOTES
includes/Parser.php
includes/Sanitizer.php

diff --git a/RELEASE-NOTES b/RELEASE-NOTES
index 4cac32b..923c14b 100644
--- a/RELEASE-NOTES
+++ b/RELEASE-NOTES
@@ -239,6 +239,7 @@ Various bugfixes, small features, and a few experimental things:
 * (bug 2173) Fatal error when removing an article with an empty title from the watchlist
 * Removed -f parameter from mail() usage, likely to cause failures and bounces.
 * (bug 2130) Fixed interwiki links with fragments
+* (bug 684) Accept an attribute parameter array on parser hook tags
 
 
 === Caveats ===
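diff --git a/includes/Parser.php b/includes/Parser.php
index 7d32701..d9fa2af 100644
--- a/includes/Parser.php
+++ b/includes/Parser.php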
@@ -258,38 +258,73 @@ class Parser
         * @access private
         * @static
         */
-       function extractTags($tag, $text, &$content, $uniq_prefix = ''){
+       function extractTagsAndParams($tag, $text, &$content, &$tags, &$params, $uniq_prefix = ''){
                $rnd = $uniq_prefix . '-' . $tag . Parser::getRandomString();
                if ( !$content ) {
                        $content = array( );
                }
                $n = 1;
                $stripped = '';
+       
+               if ( !$tags ) {
+                       $tags = array( );
+               }
+               
+               if ( !$params ) {
+                       $params = array( );
+               }
+
+               if( $tag == STRIP_COMMENTS ) {
+                       $start = '/<!--()/';
+                       $end   = '/-->/';
+               } else {
+                       $start = "/<$tag([^>]*)>/i";
+                       $end   = "/<\\/$tag\\s*>/i";
+               }
 
                while ( '' != $text ) {
-                       if($tag==STRIP_COMMENTS) {
-                               $p = preg_split( '/<!--/', $text, 2 );
-                       } else {
-                               $p = preg_split( "/<\\s*$tag\\s*>/i", $text, 2 );
-                       }
+                       $p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
                        $stripped .= $p[0];
-                       if ( ( count( $p ) < 2 ) || ( '' == $p[1] ) ) {
-                               $text = '';
+                       if( count( $p ) < 3 ) {
+                               break;
+                       }
+                       $attributes = $p[1];
+                       $inside     = $p[2];
+                       
+                       $marker = $rnd . sprintf('%08X', $n++);
+                       $stripped .= $marker;
+                       
+                       $tags[$marker] = "<$tag$attributes>";
+                       $params[$marker] = Sanitizer::decodeTagAttributes( $attributes );
+                       
+                       $q = preg_split( $end, $inside, 2 );
+                       $content[$marker] = $q[0];
+                       if( count( $q ) < 1 ) {
+                               # No end tag -- let it run out to the end of the text.
+                               break;
                        } else {
-                               if($tag==STRIP_COMMENTS) {
-                                       $q = preg_split( '/-->/i', $p[1], 2 );
-                               } else {
-                                       $q = preg_split( "/<\\/\\s*$tag\\s*>/i", $p[1], 2 );
-                               }
-                               $marker = $rnd . sprintf('%08X', $n++);
-                               $content[$marker] = $q[0];
-                               $stripped .= $marker;
                                $text = $q[1];
                        }
                }
                return $stripped;
        }
 
+       /**
+        * Wrapper function for extractTagsAndParams
+        * for cases where $tags and $params isn't needed
+        * i.e. where tags will never have params, like <nowiki>
+        *
+        * @access private
+        * @static
+        */
+       function extractTags( $tag, $text, &$content, $uniq_prefix = '' ) {
+               $dummy_tags = array();
+               $dummy_params = array();
+               
+               return Parser::extractTagsAndParams( $tag, $text, $content,
+                       $dummy_tags, $dummy_params, $uniq_prefix );
+       }
+       
        /**
         * Strips and renders nowiki, pre, math, hiero
         * If $render is set, performs necessary rendering operations on plugins
@@ -311,6 +346,8 @@ class Parser
                $pre_content = array();
                $comment_content = array();
                $ext_content = array();
+               $ext_tags = array();
+               $ext_params = array();
                $gallery_content = array();
 
                # Replace any instances of the placeholders
@@ -387,12 +424,15 @@ class Parser
                # Extensions
                foreach ( $this->mTagHooks as $tag => $callback ) {
                        $ext_content[$tag] = array();
-                       $text = Parser::extractTags( $tag, $text, $ext_content[$tag], $uniq_prefix );
+                       $text = Parser::extractTagsAndParams( $tag, $text, $ext_content[$tag],
+                               $ext_tags[$tag], $ext_params[$tag], $uniq_prefix );
                        foreach( $ext_content[$tag] as $marker => $content ) {
+                               $full_tag = $ext_tags[$tag][$marker];
+                               $params = $ext_params[$tag][$marker];
                                if ( $render ) {
-                                       $ext_content[$tag][$marker] = $callback( $content );
+                                       $ext_content[$tag][$marker] = $callback( $content, $params );
                                } else {
-                                       $ext_content[$tag][$marker] = "<$tag>$content</$tag>";
+                                       $ext_content[$tag][$marker] = "$full_tag$content</$tag>";
                                }
                        }
                }
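diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index d28ed93..40016d9 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php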
@@ -36,6 +36,27 @@ define( 'MW_CHAR_REFS_REGEX',
         |&\#X([0-9A-Za-z]+);
         |(&)/x' );
 
+/**
+ * Regular expression to match HTML/XML attribute pairs within a tag.
+ * Allows some... latitude.
+ * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
+ */
+$attrib = '[A-Za-z0-9]'; 
+$space = '[\x09\x0a\x0d\x20]';
+define( 'MW_ATTRIBS_REGEX',
+       "/(?:^|$space)($attrib+)
+         ($space*=$space*
+               (?:
+                # The attribute value: quoted or alone
+                 \"([^<\"]*)\"
+                | '([^<']*)'
+                |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
+                |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
+                                                        # colors are specified like this.
+                                                        # We'll be normalizing it.
+               )
+          )?(?=$space|\$)/sx" );
+
 /**
  * List of all named character entities defined in HTML 4.01
  * http://www.w3.org/TR/html4/sgml/entities.html
@@ -490,21 +511,8 @@ class Sanitizer {
                # Unquoted attribute
                # Since we quote this later, this can be anything distinguishable 
                # from the end of the attribute
-               $attrib = '[A-Za-z0-9]'; 
-               $space = '[\x09\x0a\x0d\x20]';
                if( !preg_match_all(
-                       "/(?:^|$space)($attrib+)
-                         ($space*=$space*
-                           (?:
-                            # The attribute value: quoted or alone
-                             \"([^<\"]*)\"
-                            | '([^<']*)'
-                            |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
-                            |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
-                                                # colors are specified like this.
-                                                # We'll be normalizing it.
-                           )
-                          )?(?=$space|\$)/sx",
+                       MW_ATTRIBS_REGEX,
                        $text,
                        $pairs,
                        PREG_SET_ORDER ) ) {
@@ -517,26 +525,11 @@ class Sanitizer {
                        $attribute = strtolower( $set[1] );
                        if( !isset( $whitelist[$attribute] ) ) {
                                continue;
-                       } elseif( isset( $set[6] ) ) {
-                               # Illegal #XXXXXX color with no quotes.
-                               $value = Sanitizer::normalizeAttributeValue( $set[6] );
-                       } elseif( isset( $set[5] ) ) {
-                               # No quotes.
-                               $value = Sanitizer::normalizeAttributeValue( $set[5] );
-                       } elseif( isset( $set[4] ) ) {
-                               # Single-quoted
-                               $value = str_replace( '"', '&quot;',
-                                       Sanitizer::normalizeAttributeValue( $set[4] ) );
-                       } elseif( isset( $set[3] ) ) {
-                               # Double-quoted
-                               $value = Sanitizer::normalizeAttributeValue( $set[3] );
-                       } elseif( !isset( $set[2] ) ) {
-                               # In XHTML, attributes must have a value.
-                               $value = $set[1];
-                       } else {
-                               wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
                        }
                        
+                       $raw   = Sanitizer::getTagAttributeCallback( $set );
+                       $value = Sanitizer::normalizeAttributeValue( $raw );
+                       
                        # Strip javascript "expression" from stylesheets.
                        # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
                        if( $attribute == 'style' && preg_match(
@@ -557,6 +550,67 @@ class Sanitizer {
                }
        }
        
+       /**
+        * Return an associative array of attribute names and values from
+        * a partial tag string. Attribute names are forces to lowercase,
+        * character references are decoded to UTF-8 text.
+        *
+        * @param string
+        * @return array
+        */
+       function decodeTagAttributes( $text ) {
+               $attribs = array();
+               
+               if( trim( $text ) == '' ) {
+                       return $attribs;
+               }
+               
+               if( !preg_match_all(
+                       MW_ATTRIBS_REGEX,
+                       $text,
+                       $pairs,
+                       PREG_SET_ORDER ) ) {
+                       return $attribs;
+               }
+
+               foreach( $pairs as $set ) {
+                       $attribute = strtolower( $set[1] );
+                       $value = Sanitizer::getTagAttributeCallback( $set );
+                       $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
+               }
+               return $attribs;
+       }
+       
+       /**
+        * Pick the appropriate attribute value from a match set from the
+        * MW_ATTRIBS_REGEX matches.
+        *
+        * @param array $set
+        * @return string
+        * @access private
+        */
+       function getTagAttributeCallback( $set ) {
+               if( isset( $set[6] ) ) {
+                       # Illegal #XXXXXX color with no quotes.
+                       return $set[6];
+               } elseif( isset( $set[5] ) ) {
+                       # No quotes.
+                       return $set[5];
+               } elseif( isset( $set[4] ) ) {
+                       # Single-quoted
+                       return $set[4];
+               } elseif( isset( $set[3] ) ) {
+                       # Double-quoted
+                       return $set[3];
+               } elseif( !isset( $set[2] ) ) {
+                       # In XHTML, attributes must have a value.
+                       # For 'reduced' form, return explicitly the attribute name here.
+                       return $set[1];
+               } else {
+                       wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
+               }
+       }
+       
        /**
         * Normalize whitespace and character references in an XML source-
         * encoded text for an attribute value.
@@ -570,10 +624,11 @@ class Sanitizer {
         * @access private
         */
        function normalizeAttributeValue( $text ) {
-               return preg_replace(
-                       '/\r\n|[\x20\x0d\x0a\x09]/',
-                       ' ',
-                       Sanitizer::normalizeCharReferences( $text ) );
+               return str_replace( '"', '&quot;',
+                       preg_replace(
+                               '/\r\n|[\x20\x0d\x0a\x09]/',
+                               ' ',
+                               Sanitizer::normalizeCharReferences( $text ) ) );
        }
        
        /**