* Make lc and uc parser functions skip strip markers
author Tim Starling <tstarling@users.mediawiki.org>
Thu, 24 Jan 2008 09:07:47 +0000 (09:07 +0000)
committer Tim Starling <tstarling@users.mediawiki.org>
Thu, 24 Jan 2008 09:07:47 +0000 (09:07 +0000)
* Made ==foo==<!----> create a valid section edit link
* Changed header processing heuristics -- now double-equals signs are generally respected as header starts, and will break template invocations, and single equals signs are respected as header syntax but might not generate a section edit link.

includes/CoreParserFunctions.php
includes/Parser.php
includes/Preprocessor_DOM.php
maintenance/parserTests.txt

index 670676b..4b83688 100644 (file)
@@ -51,12 +51,20 @@ class CoreParserFunctions {
 
        static function lc( $parser, $s = '' ) {
                global $wgContLang;
-               return $wgContLang->lc( $s );
+               if ( is_callable( array( $parser, 'markerSkipCallback' ) ) ) {
+                       return $parser->markerSkipCallback( $s, array( $wgContLang, 'lc' ) );
+               } else {
+                       return $wgContLang->lc( $s );
+               }
        }
 
        static function uc( $parser, $s = '' ) {
                global $wgContLang;
-               return $wgContLang->uc( $s );
+               if ( is_callable( array( $parser, 'markerSkipCallback' ) ) ) {
+                       return $parser->markerSkipCallback( $s, array( $wgContLang, 'uc' ) );
+               } else {
+                       return $wgContLang->uc( $s );
+               }
        }
 
        static function localurl( $parser, $s = '', $arg = null ) { return self::urlFunction( 'getLocalURL', $s, $arg ); }
index d5354cb..5f84062 100644 (file)
@@ -4815,6 +4815,30 @@ class Parser
                }
                return $this->testSrvus( $text, $title, $options, self::OT_PREPROCESS );
        }
+
+       function markerSkipCallback( $s, $callback ) {
+               $i = 0;
+               $out = '';
+               while ( $i < strlen( $s ) ) {
+                       $markerStart = strpos( $s, $this->mUniqPrefix, $i );
+                       if ( $markerStart === false ) {
+                               $out .= call_user_func( $callback, substr( $s, $i ) );
+                               break;
+                       } else {
+                               $out .= call_user_func( $callback, substr( $s, $i, $markerStart - $i ) );
+                               $markerEnd = strpos( $s, $this->mMarkerSuffix, $markerStart );
+                               if ( $markerEnd === false ) {
+                                       $out .= substr( $s, $markerStart );
+                                       break;
+                               } else {
+                                       $markerEnd += strlen( $this->mMarkerSuffix );
+                                       $out .= substr( $s, $markerStart, $markerEnd - $markerStart );
+                                       $i = $markerEnd;
+                               }
+                       }
+               }
+               return $out;
+       }
 }
 
 /**
index 3a712b0..f248d00 100644 (file)
@@ -99,7 +99,7 @@ class Preprocessor_DOM implements Preprocessor {
        
                $stack = new PPDStack;
 
-               $searchBase = '[{<'; #}
+               $searchBase = "[{<\n"; #}
                $revText = strrev( $text ); // For fast reverse searches
 
                $i = 0;                     # Input pointer, starts out pointing to a pseudo-newline before the start
@@ -148,17 +148,6 @@ class Preprocessor_DOM implements Preprocessor {
                                if ( $findEquals ) {
                                        // First equals will be for the template
                                        $search .= '=';
-                               } else {
-                                       // Look for headings
-                                       // We can't look for headings when $findEquals is true, because the ambiguity 
-                                       // between template name/value separators and heading starts would be unresolved
-                                       // until the closing double-brace is found. This would mean either infinite 
-                                       // backtrack, or creating and updating two separate tree structures until the
-                                       // end of the ambiguity -- one tree structure assuming a heading, and the other 
-                                       // assuming a template argument.
-                                       //
-                                       // Easier to just break some section edit links.
-                                       $search .= "\n";
                                }
                                $rule = null;
                                # Output literal section, advance input counter
@@ -240,7 +229,7 @@ class Preprocessor_DOM implements Preprocessor {
                                                $wsEnd = $endPos + 2 + strspn( $text, ' ', $endPos + 3 );
                                                // Eat the line if possible
                                                // TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at 
-                                               // the overall start. That's not how Sanitizer::removeHTMLcomments() does it, but 
+                                               // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but 
                                                // it's a possible beneficial b/c break.
                                                if ( $wsStart > 0 && substr( $text, $wsStart - 1, 1 ) == "\n" 
                                                        && substr( $text, $wsEnd + 1, 1 ) == "\n" )
@@ -253,28 +242,24 @@ class Preprocessor_DOM implements Preprocessor {
                                                        if ( $wsLength > 0 && substr( $accum, -$wsLength ) === str_repeat( ' ', $wsLength ) ) {
                                                                $accum = substr( $accum, 0, -$wsLength );
                                                        }
-                                                       // Do a line-start run next time to look for headings after the comment,
-                                                       // but only if stack->top===false, because headings don't exist at deeper levels.
-                                                       if ( $stack->top === false ) {
-                                                               $fakeLineStart = true;
-                                                       }
+                                                       // Do a line-start run next time to look for headings after the comment
+                                                       $fakeLineStart = true;
                                                } else {
                                                        // No line to eat, just take the comment itself
                                                        $startPos = $i;
                                                        $endPos += 2;
                                                }
 
-                                               /*
                                                if ( $stack->top ) {
-                                                       if ( $stack->top->commentEndPos !== false && $stack->top->commentEndPos == $wsStart ) {
+                                                       $part = $stack->top->getCurrentPart();
+                                                       if ( isset( $part->commentEnd ) && $part->commentEnd == $wsStart - 1 ) {
                                                                // Comments abutting, no change in visual end
-                                                               $stack->top->commentEndPos = $wsEnd;
+                                                               $part->commentEnd = $wsEnd;
                                                        } else {
-                                                               $stack->top->visualEndPos = $wsStart;
-                                                               $stack->top->commentEndPos = $wsEnd;
+                                                               $part->visualEnd = $wsStart;
+                                                               $part->commentEnd = $endPos;
                                                        }
                                                }
-                                                */
                                                $i = $endPos + 1;
                                                $inner = substr( $text, $startPos, $endPos - $startPos + 1 );
                                                $accum .= '<comment>' . htmlspecialchars( $inner ) . '</comment>';
@@ -356,7 +341,11 @@ class Preprocessor_DOM implements Preprocessor {
                                }
                                
                                $count = strspn( $text, '=', $i, 6 );
-                               if ( $count > 0 ) {
+                               if ( $count == 1 && $findEquals ) {
+                                       // DWIM: This looks kind of like a name/value separator
+                                       // Let's let the equals handler have it and break the potential heading
+                                       // This is heuristic, but AFAICT the methods for completely correct disambiguation are very complex.
+                               } elseif ( $count > 0 ) {
                                        $piece = array(
                                                'open' => "\n",
                                                'close' => "\n",
@@ -374,23 +363,32 @@ class Preprocessor_DOM implements Preprocessor {
                                $piece = $stack->top;
                                // A heading must be open, otherwise \n wouldn't have been in the search list
                                assert( $piece->open == "\n" );
+                               $part = $piece->getCurrentPart();
                                // Search back through the input to see if it has a proper close
                                // Do this using the reversed string since the other solutions (end anchor, etc.) are inefficient
-                               $m = false;
+                               $wsLength = strspn( $revText, " \t", strlen( $text ) - $i );
+                               $searchStart = $i - $wsLength;
+                               if ( isset( $part->commentEnd ) && $searchStart - 1 == $part->commentEnd ) {
+                                       // Comment found at line end
+                                       // Search for equals signs before the comment
+                                       $searchStart = $part->visualEnd;
+                                       $searchStart -= strspn( $revText, " \t", strlen( $text ) - $searchStart );
+                               }
                                $count = $piece->count;
-                               if ( preg_match( "/\s*(=+)/A", $revText, $m, 0, strlen( $text ) - $i ) ) {
-                                       if ( $i - strlen( $m[0] ) == $piece->startPos ) {
+                               $equalsLength = strspn( $revText, '=', strlen( $text ) - $searchStart );
+                               if ( $equalsLength > 0 ) {
+                                       if ( $i - $equalsLength == $piece->startPos ) {
                                                // This is just a single string of equals signs on its own line
                                                // Replicate the doHeadings behaviour /={count}(.+)={count}/
                                                // First find out how many equals signs there really are (don't stop at 6)
-                                               $count = strlen( $m[1] );
+                                               $count = $equalsLength;
                                                if ( $count < 3 ) {
                                                        $count = 0;
                                                } else {
                                                        $count = min( 6, intval( ( $count - 1 ) / 2 ) );
                                                }
                                        } else {
-                                               $count = min( strlen( $m[1] ), $count );
+                                               $count = min( $equalsLength, $count );
                                        }
                                        if ( $count > 0 ) {
                                                // Normal match, output <h>
@@ -869,13 +867,6 @@ class PPFrame_DOM implements PPFrame {
                        } elseif ( is_array( $contextNode ) || $contextNode instanceof DOMNodeList ) {
                                $newIterator = $contextNode;
                        } elseif ( $contextNode instanceof DOMNode ) {
-                               /*
-                               print str_repeat( '&nbsp;', count( debug_backtrace() ) ) . $contextNode->nodeName;
-                               if ( $contextNode->nodeName == 'title' ) {
-                                       print ' = ' . $contextNode->textContent;
-                               }
-                               print "<br/>\n";
-                                */
                                if ( $contextNode->nodeType == XML_TEXT_NODE ) {
                                        $out .= $contextNode->nodeValue;
                                } elseif ( $contextNode->nodeName == 'template' ) {
index 6e91419..fea30ca 100644 (file)
@@ -5348,11 +5348,10 @@ Section extraction test with comment after heading (section 1)
 section=1
 !! input
 ==a==
-==unmarked== <!-- an unmarked section -->
-==b==
+==b== <!-- -->
+==c==
 !! result
 ==a==
-==unmarked== <!-- an unmarked section -->
 !! end
 
 !! test
@@ -5361,10 +5360,10 @@ Section extraction test with comment after heading (section 2)
 section=2
 !! input
 ==a==
-==unmarked== <!-- an unmarked section -->
-==b==
+==b== <!-- -->
+==c==
 !! result
-==b==
+==b== <!-- -->
 !! end
 
 !! test
@@ -6712,56 +6711,26 @@ Morwen/13: Unclosed link followed by heading
 !! end
 
 !! test
-HHP1: Heuristics for headings in preprocessor parenthetical structures
-!! input
-{{foo
-==heading==
-!! result
-<p>{{foo
-</p>
-<a name="heading"></a><h2><span class="editsection">[<a href="/index.php?title=Parser_test&amp;action=edit&amp;section=1" title="Edit section: heading">edit</a>]</span> <span class="mw-headline">heading</span></h2>
-
-!! end
-
-!! test
-HHP2: Heuristics for headings in preprocessor parenthetical structures
+HHP2.1: Heuristics for headings in preprocessor parenthetical structures
 !! input
 {{foo|
-==heading==
+=heading=
 !! result
 <p>{{foo|
 </p>
-<a name="heading"></a><h2> <span class="mw-headline">heading</span></h2>
+<a name="heading"></a><h1> <span class="mw-headline">heading</span></h1>
 
 !! end
 
 !! test
-HHP3: Heuristics for headings in preprocessor parenthetical structures
+HHP2.2: Heuristics for headings in preprocessor parenthetical structures
 !! input
 {{foo|
-==heading 1==
-==heading 2==
+==heading==
 !! result
 <p>{{foo|
 </p>
-<a name="heading_1"></a><h2> <span class="mw-headline">heading 1</span></h2>
-<a name="heading_2"></a><h2><span class="editsection">[<a href="/index.php?title=Parser_test&amp;action=edit&amp;section=1" title="Edit section: heading 2">edit</a>]</span> <span class="mw-headline">heading 2</span></h2>
-
-!! end
-
-# Note that heading 2 is counted, so heading 3 gets section=2 not section=1
-!! test
-HHP4: Heuristics for headings in preprocessor parenthetical structures
-!! input
-{{foo|
-==heading 1==
-==heading 2==
-}}
-==heading 3==
-!! result
-<p>FOO
-</p>
-<a name="heading_3"></a><h2><span class="editsection">[<a href="/index.php?title=Parser_test&amp;action=edit&amp;section=2" title="Edit section: heading 3">edit</a>]</span> <span class="mw-headline">heading 3</span></h2>
+<a name="heading"></a><h2><span class="editsection">[<a href="/index.php?title=Parser_test&amp;action=edit&amp;section=1" title="Edit section: heading">edit</a>]</span> <span class="mw-headline">heading</span></h2>
 
 !! end