* (bug 5021) Transcluding the same special page twice now works
[lhc/web/wiklou.git] / includes / Parser.php
index 6dacd60..4afa23e 100644 (file)
@@ -59,6 +59,16 @@ define( 'EXT_IMAGE_REGEX',
        '('.EXT_IMAGE_FNAME_CLASS.'+)\\.((?i)'.EXT_IMAGE_EXTENSIONS.')$/S' # Filename
 );
 
+// State constants for the definition list colon extraction (Parser::findColonNoLinks; some of its case labels hard-code these numbers, keep in sync)
+define( 'MW_COLON_STATE_TEXT', 0 ); // Plain text outside any tag markup; the only state in which ':' is matched
+define( 'MW_COLON_STATE_TAG', 1 ); // Inside an opening <tag ...>; '>' opens one nesting level
+define( 'MW_COLON_STATE_TAGSTART', 2 ); // Just consumed '<'; next char decides between tag, close tag and comment
+define( 'MW_COLON_STATE_CLOSETAG', 3 ); // Inside a closing </tag>; '>' closes one nesting level
+define( 'MW_COLON_STATE_TAGSLASH', 4 ); // Saw '/' inside a tag; a following '>' means self-closed <tag/>
+define( 'MW_COLON_STATE_COMMENT', 5 ); // Entered after '<!'; waiting for the first '-' of a terminator
+define( 'MW_COLON_STATE_COMMENTDASH', 6 ); // Saw one '-' inside a comment
+define( 'MW_COLON_STATE_COMMENTDASHDASH', 7 ); // Saw '--' inside a comment; '>' ends the comment
+
 /**
  * PHP Parser
  *
@@ -311,20 +321,20 @@ class Parser
        function getOptions() { return $this->mOptions; }
 
        /**
-        * Replaces all occurrences of <$tag>content</$tag> in the text
-        * with a random marker and returns the new text. the output parameter
-        * $content will be an associative array filled with data on the form
-        * $unique_marker => content.
+        * Replaces all occurrences of HTML-style comments and the given tags
+        * in the text with a random marker and returns the new text. The output
+        * parameter $matches will be an associative array filled with data in
+        * the form:
+        *   'UNIQ-xxxxx' => array(
+        *     'element',
+        *     'tag content',
+        *     array( 'param' => 'x' ),
+        *     '<element param="x">tag content</element>' )
         *
-        * If $content is already set, the additional entries will be appended
-        * If $tag is set to STRIP_COMMENTS, the function will extract
-        * <!-- HTML comments -->
+        * @param $elements list of element names. Comments are always extracted.
+        * @param $text Source text string.
+        * @param $uniq_prefix
         *
-        * $output: array( 'UNIQ-xxxxx' => array(
-        *    'element',
-        *    'tag content',
-        *    array( 'param' => 'x' ),
-        *    '<element param="x">' ) )
         * @private
         * @static
         */
@@ -334,58 +344,59 @@ class Parser
                $stripped = '';
                $matches = array();
 
-               if( $elements == STRIP_COMMENTS ) {
-                       $start = '/<!--()()/';
-               } else {
-                       $taglist = implode( '|', $elements );
-                       $start = "/<($taglist)(\\s+[^>]*|\\s*\/?)>/i";
-               }
+               $taglist = implode( '|', $elements );
+               $start = "/<($taglist)(\\s+[^>]*?|\\s*?)(\/?>)|<(!--)/i";
 
                while ( '' != $text ) {
                        $p = preg_split( $start, $text, 2, PREG_SPLIT_DELIM_CAPTURE );
                        $stripped .= $p[0];
-                       if( count( $p ) < 4 ) {
+                       if( count( $p ) < 5 ) {
                                break;
                        }
-                       $element    = $p[1];
-                       $attributes = $p[2];
-                       $inside     = $p[3];
-
-                       // If $attributes ends with '/', we have an empty element tag, <tag />
-                       if( $element != '' && substr( $attributes, -1 ) == '/' ) {
-                               $attributes = substr( $attributes, 0, -1);
-                               $empty = '/';
+                       if( count( $p ) > 5 ) {
+                               // comment
+                               $element    = $p[4];
+                               $attributes = '';
+                               $close      = '';
+                               $inside     = $p[5];
                        } else {
-                               $empty = '';
+                               // tag
+                               $element    = $p[1];
+                               $attributes = $p[2];
+                               $close      = $p[3];
+                               $inside     = $p[4];
                        }
 
                        $marker = "$uniq_prefix-$element-$rand" . sprintf('%08X', $n++);
                        $stripped .= $marker;
 
-                       if ( $empty === '/' ) {
+                       if ( $close === '/>' ) {
                                // Empty element tag, <tag />
                                $content = null;
                                $text = $inside;
+                               $tail = null;
                        } else {
-                               if( $element ) {
-                                       $end = "/<\\/$element\\s*>/i";
+                               if( $element == '!--' ) {
+                                       $end = '/(-->)/';
                                } else {
-                                       $end = '/-->/';
+                                       $end = "/(<\\/$element\\s*>)/i";
                                }
-                               $q = preg_split( $end, $inside, 2 );
+                               $q = preg_split( $end, $inside, 2, PREG_SPLIT_DELIM_CAPTURE );
                                $content = $q[0];
-                               if( count( $q ) < 2 ) {
+                               if( count( $q ) < 3 ) {
                                        # No end tag -- let it run out to the end of the text.
-                                       break;
+                                       $tail = '';
+                                       $text = '';
                                } else {
-                                       $text = $q[1];
+                                       $tail = $q[1];
+                                       $text = $q[2];
                                }
                        }
                        
                        $matches[$marker] = array( $element,
                                $content,
                                Sanitizer::decodeTagAttributes( $attributes ),
-                               "<$element$attributes$empty>" );
+                               "<$element$attributes$close$content$tail" );
                }
                return $stripped;
        }
@@ -409,6 +420,7 @@ class Parser
                # Replace any instances of the placeholders
                $uniq_prefix = $this->mUniqPrefix;
                #$text = str_replace( $uniq_prefix, wfHtmlEscapeFirst( $uniq_prefix ), $text );
+               $commentState = array();
                
                $elements = array_merge(
                        array( 'nowiki', 'pre', 'gallery' ),
@@ -422,26 +434,24 @@ class Parser
                }
                
 
-               // Strip comments in a first pass.
-               // This saves us from needlessly rendering extensions in comment text
-               $text = Parser::extractTagsAndParams(STRIP_COMMENTS, $text, $comment_matches, $uniq_prefix);
-               $commentState = array();
-               foreach( $comment_matches as $marker => $data ){
-                       list( $element, $content, $params, $tag ) = $data;
-                       $commentState[$marker] = '<!--' . $content . '-->';
-               }
-               
                $matches = array();
                $text = Parser::extractTagsAndParams( $elements, $text, $matches, $uniq_prefix );
                
                foreach( $matches as $marker => $data ) {
                        list( $element, $content, $params, $tag ) = $data;
-                       // Restore any comments; the extension can deal with them.
-                       if( $content !== null) {
-                               $content = strtr( $content, $commentState );
-                       }
                        if( $render ) {
-                               switch( $element ) {
+                               $tagName = strtolower( $element );
+                               switch( $tagName ) {
+                               case '!--':
+                                       // Comment
+                                       if( substr( $tag, -3 ) == '-->' ) {
+                                               $output = $tag;
+                                       } else {
+                                               // Unclosed comment in input.
+                                               // Close it so later stripping can remove it
+                                               $output = "$tag-->";
+                                       }
+                                       break;
                                case 'html':
                                        if( $wgRawHtml ) {
                                                $output = $content;
@@ -463,7 +473,6 @@ class Parser
                                        $output = $this->renderImageGallery( $content );
                                        break;
                                default:
-                                       $tagName = strtolower( $element );
                                        if( isset( $this->mTagHooks[$tagName] ) ) {
                                                $output = call_user_func_array( $this->mTagHooks[$tagName],
                                                        array( $content, $params, $this ) );
@@ -473,25 +482,20 @@ class Parser
                                }
                        } else {
                                // Just stripping tags; keep the source
-                               if( $content === null ) {
-                                       $output = $tag;
-                               } else {
-                                       $output = "$tag$content</$element>";
-                               }
+                               $output = $tag;
+                       }
+                       if( !$stripcomments && $element == '!--' ) {
+                               $commentState[$marker] = $output;
+                       } else {
+                               $state[$element][$marker] = $output;
                        }
-                       $state[$element][$marker] = $output;
                }
 
                # Unstrip comments unless explicitly told otherwise.
                # (The comments are always stripped prior to this point, so as to
                # not invoke any extension tags / parser hooks contained within
                # a comment.)
-               if ( $stripcomments ) {
-                       // Add remaining comments to the state array
-                       foreach( $commentState as $marker => $content ) {
-                               $state['comment'][$marker] = $content;
-                       }
-               } else {
+               if ( !$stripcomments ) {
                        // Put them all back and forget them
                        $text = strtr( $text, $commentState );
                }
@@ -1892,10 +1896,10 @@ class Parser
                                wfProfileIn( "$fname-paragraph" );
                                # No prefix (not in list)--go to paragraph mode
                                // XXX: use a stack for nestable elements like span, table and div
-                               $openmatch = preg_match('/(<table|<blockquote|<h1|<h2|<h3|<h4|<h5|<h6|<pre|<tr|<p|<ul|<li|<\\/tr|<\\/td|<\\/th)/iS', $t );
+                               $openmatch = preg_match('/(<table|<blockquote|<h1|<h2|<h3|<h4|<h5|<h6|<pre|<tr|<p|<ul|<ol|<li|<\\/tr|<\\/td|<\\/th)/iS', $t );
                                $closematch = preg_match(
                                        '/(<\\/table|<\\/blockquote|<\\/h1|<\\/h2|<\\/h3|<\\/h4|<\\/h5|<\\/h6|'.
-                                       '<td|<th|<div|<\\/div|<hr|<\\/pre|<\\/p|'.$this->mUniqPrefix.'-pre|<\\/li|<\\/ul)/iS', $t );
+                                       '<td|<th|<div|<\\/div|<hr|<\\/pre|<\\/p|'.$this->mUniqPrefix.'-pre|<\\/li|<\\/ul|<\\/ol)/iS', $t );
                                if ( $openmatch or $closematch ) {
                                        $paragraphStack = false;
                                        # TODO bug 5718: paragraph closed
@@ -1969,43 +1973,167 @@ class Parser
        }
 
        /**
-        * Split up a string on ':', ignoring any occurences inside
-        * <a>..</a> or <span>...</span>
+        * Split up a string on ':', ignoring any occurences inside tags
+        * to prevent illegal overlapping.
         * @param string $str the string to split
         * @param string &$before set to everything before the ':'
         * @param string &$after set to everything after the ':'
         * return string the position of the ':', or false if none found
         */
        function findColonNoLinks($str, &$before, &$after) {
-               # I wonder if we should make this count all tags, not just <a>
-               # and <span>. That would prevent us from matching a ':' that
-               # comes in the middle of italics other such formatting....
-               # -- Wil
                $fname = 'Parser::findColonNoLinks';
                wfProfileIn( $fname );
-               $pos = 0;
-               do {
-                       $colon = strpos($str, ':', $pos);
-
-                       if ($colon !== false) {
-                               $before = substr($str, 0, $colon);
-                               $after = substr($str, $colon + 1);
-
-                               # Skip any ':' within <a> or <span> pairs
-                               $a = substr_count($before, '<a');
-                               $s = substr_count($before, '<span');
-                               $ca = substr_count($before, '</a>');
-                               $cs = substr_count($before, '</span>');
-
-                               if ($a <= $ca and $s <= $cs) {
-                                       # Tags are balanced before ':'; ok
+               
+               $pos = strpos( $str, ':' );
+               if( $pos === false ) {
+                       // Nothing to find!
+                       wfProfileOut( $fname );
+                       return false;
+               }
+               
+               $lt = strpos( $str, '<' );
+               if( $lt === false || $lt > $pos ) {
+                       // Easy; no tag nesting to worry about
+                       $before = substr( $str, 0, $pos );
+                       $after = substr( $str, $pos+1 );
+                       wfProfileOut( $fname );
+                       return $pos;
+               }
+               
+               // Ugly state machine to walk through avoiding tags.
+               $state = MW_COLON_STATE_TEXT;
+               $stack = 0;
+               $len = strlen( $str );
+               for( $i = 0; $i < $len; $i++ ) {
+                       $c = $str{$i};
+                       
+                       switch( $state ) {
+                       // (Using the number is a performance hack for common cases)
+                       case 0: // MW_COLON_STATE_TEXT:
+                               switch( $c ) {
+                               case "<":
+                                       // Could be either a <start> tag or an </end> tag
+                                       $state = MW_COLON_STATE_TAGSTART;
+                                       break;
+                               case ":":
+                                       if( $stack == 0 ) {
+                                               // We found it!
+                                               $before = substr( $str, 0, $i );
+                                               $after = substr( $str, $i + 1 );
+                                               wfProfileOut( $fname );
+                                               return $i;
+                                       }
+                                       // Embedded in a tag; don't break it.
                                        break;
+                               default:
+                                       // Skip ahead looking for something interesting
+                                       $colon = strpos( $str, ':', $i );
+                                       if( $colon === false ) {
+                                               // Nothing else interesting
+                                               wfProfileOut( $fname );
+                                               return false;
+                                       }
+                                       $lt = strpos( $str, '<', $i );
+                                       if( $stack === 0 ) {
+                                               if( $lt === false || $colon < $lt ) {
+                                                       // We found it!
+                                                       $before = substr( $str, 0, $colon );
+                                                       $after = substr( $str, $colon + 1 );
+                                                       wfProfileOut( $fname );
+                                                       return $i;
+                                               }
+                                       }
+                                       if( $lt === false ) {
+                                               // Nothing else interesting to find; abort!
+                                               // We're nested, but there's no close tags left. Abort!
+                                               break 2;
+                                       }
+                                       // Skip ahead to next tag start
+                                       $i = $lt;
+                                       $state = MW_COLON_STATE_TAGSTART;
                                }
-                               $pos = $colon + 1;
+                               break;
+                       case 1: // MW_COLON_STATE_TAG:
+                               // In a <tag>
+                               switch( $c ) {
+                               case ">":
+                                       $stack++;
+                                       $state = MW_COLON_STATE_TEXT;
+                                       break;
+                               case "/":
+                                       // Slash may be followed by >?
+                                       $state = MW_COLON_STATE_TAGSLASH;
+                                       break;
+                               default:
+                                       // ignore
+                               }
+                               break;
+                       case 2: // MW_COLON_STATE_TAGSTART:
+                               switch( $c ) {
+                               case "/":
+                                       $state = MW_COLON_STATE_CLOSETAG;
+                                       break;
+                               case "!":
+                                       $state = MW_COLON_STATE_COMMENT;
+                                       break;
+                               case ">":
+                                       // Illegal early close? This shouldn't happen D:
+                                       $state = MW_COLON_STATE_TEXT;
+                                       break;
+                               default:
+                                       $state = MW_COLON_STATE_TAG;
+                               }
+                               break;
+                       case 3: // MW_COLON_STATE_CLOSETAG:
+                               // In a </tag>
+                               if( $c == ">" ) {
+                                       $stack--;
+                                       if( $stack < 0 ) {
+                                               wfDebug( "Invalid input in $fname; too many close tags\n" );
+                                               wfProfileOut( $fname );
+                                               return false;
+                                       }
+                                       $state = MW_COLON_STATE_TEXT;
+                               }
+                               break;
+                       case MW_COLON_STATE_TAGSLASH:
+                               if( $c == ">" ) {
+                                       // Yes, a self-closed tag <blah/>
+                                       $state = MW_COLON_STATE_TEXT;
+                               } else {
+                                       // Probably we're jumping the gun, and this is an attribute
+                                       $state = MW_COLON_STATE_TAG;
+                               }
+                               break;
+                       case 5: // MW_COLON_STATE_COMMENT:
+                               if( $c == "-" ) {
+                                       $state = MW_COLON_STATE_COMMENTDASH;
+                               }
+                               break;
+                       case MW_COLON_STATE_COMMENTDASH:
+                               if( $c == "-" ) {
+                                       $state = MW_COLON_STATE_COMMENTDASHDASH;
+                               } else {
+                                       $state = MW_COLON_STATE_COMMENT;
+                               }
+                               break;
+                       case MW_COLON_STATE_COMMENTDASHDASH:
+                               if( $c == ">" ) {
+                                       $state = MW_COLON_STATE_TEXT;
+                               } else {
+                                       $state = MW_COLON_STATE_COMMENT;
+                               }
+                               break;
+                       default:
+                               wfDebugDieBacktrace( "State machine error in $fname" );
                        }
-               } while ($colon !== false);
+               }
+               if( $stack > 0 ) {
+                       wfDebug( "Invalid input in $fname; not enough close tags (stack $stack, state $state)\n" );
+                       return false;
+               }
                wfProfileOut( $fname );
-               return $colon;
+               return false;
        }
 
        /**
@@ -2790,7 +2918,11 @@ class Parser
                                # Use the original $piece['title'] not the mangled $part1, so that
                                # modifiers such as RAW: produce separate cache entries
                                if( $found ) {
-                                       $this->mTemplates[$piece['title']] = $text;
+                                       if( $isHTML ) {
+                                               // A special page; don't store it in the template cache.
+                                       } else {
+                                               $this->mTemplates[$piece['title']] = $text;
+                                       }
                                        $text = $linestart . $text;
                                }
                        }
@@ -3664,6 +3796,7 @@ class Parser
         * @return The old value of the mTagHooks array associated with the hook
         */
        function setHook( $tag, $callback ) {
+               $tag = strtolower( $tag );
                $oldVal = @$this->mTagHooks[$tag];
                $this->mTagHooks[$tag] = $callback;
 
@@ -3909,10 +4042,6 @@ class Parser
         * 'A tree'.
         */
        function renderImageGallery( $text ) {
-               # Setup the parser
-               $parserOptions = new ParserOptions;
-               $localParser = new Parser();
-
                $ig = new ImageGallery();
                $ig->setShowBytes( false );
                $ig->setShowFilename( false );
@@ -3938,7 +4067,12 @@ class Parser
                                $label = '';
                        }
 
-                       $pout = $localParser->parse( $label , $this->mTitle, $parserOptions );
+                       $pout = $this->parse( $label,
+                               $this->mTitle,
+                               $this->mOptions,
+                               false, // Strip whitespace...?
+                               false  // Don't clear state!
+                       );
                        $html = $pout->getText();
 
                        $ig->add( new Image( $nt ), $html );
@@ -3947,13 +4081,6 @@ class Parser
                        if ( $nt->getNamespace() == NS_IMAGE ) {
                                $this->mOutput->addImage( $nt->getDBkey() );
                        }
-                       
-                       # Register links with the parent parser
-                       foreach( $pout->getLinks() as $ns => $keys ) {
-                               foreach( $keys as $dbk => $id )
-                                       $this->mOutput->addLink( Title::makeTitle( $ns, $dbk ), $id );
-                       }
-                       
                }
                return $ig->toHTML();
        }