From f748d710ee0ac0663a8041cb9d2de727190268ed Mon Sep 17 00:00:00 2001
From: Tim Starling <tstarling@users.mediawiki.org>
Date: Thu, 24 Jan 2008 09:07:47 +0000
Subject: [PATCH] * Make lc and uc parser functions skip strip markers * Made
 ==foo==<!----> create a valid section edit link * Changed header processing
 heuristics -- now double-equals signs are generally respected as header
 starts, and will break template invocations, and single equals signs are
 respected as header syntax but might not generate a section edit link.

---
 includes/CoreParserFunctions.php | 12 +++++-
 includes/Parser.php              | 24 ++++++++++++
 includes/Preprocessor_DOM.php    | 65 ++++++++++++++------------------
 maintenance/parserTests.txt      | 53 ++++++--------------------
 4 files changed, 73 insertions(+), 81 deletions(-)

diff --git a/includes/CoreParserFunctions.php b/includes/CoreParserFunctions.php
index 670676b6e9..4b836887a6 100644
--- a/includes/CoreParserFunctions.php
+++ b/includes/CoreParserFunctions.php
@@ -51,12 +51,20 @@ class CoreParserFunctions {
 
 	static function lc( $parser, $s = '' ) {
 		global $wgContLang;
-		return $wgContLang->lc( $s );
+		// Plain case folding is only the fallback for parsers without markerSkipCallback()
+		if ( !is_callable( array( $parser, 'markerSkipCallback' ) ) ) {
+			return $wgContLang->lc( $s );
+		}
+		return $parser->markerSkipCallback( $s, array( $wgContLang, 'lc' ) );
 	}
 
 	static function uc( $parser, $s = '' ) {
 		global $wgContLang;
-		return $wgContLang->uc( $s );
+		// Plain case folding is only the fallback for parsers without markerSkipCallback()
+		if ( !is_callable( array( $parser, 'markerSkipCallback' ) ) ) {
+			return $wgContLang->uc( $s );
+		}
+		return $parser->markerSkipCallback( $s, array( $wgContLang, 'uc' ) );
 	}
 
 	static function localurl( $parser, $s = '', $arg = null ) { return self::urlFunction( 'getLocalURL', $s, $arg ); }
diff --git a/includes/Parser.php b/includes/Parser.php
index d5354cb21a..5f84062a5f 100644
--- a/includes/Parser.php
+++ b/includes/Parser.php
@@ -4815,6 +4815,30 @@ class Parser
 		}
 		return $this->testSrvus( $text, $title, $options, self::OT_PREPROCESS );
 	}
+
+	// Run $callback over every stretch of $s lying outside a strip marker and return the
+	// reassembled string; markers (mUniqPrefix ... mMarkerSuffix) are copied through verbatim.
+	function markerSkipCallback( $s, $callback ) {
+		$out = '';
+		$length = strlen( $s ); // $s never changes, so measure it once
+		for ( $i = 0; $i < $length; $i = $markerEnd ) {
+			$markerStart = strpos( $s, $this->mUniqPrefix, $i );
+			if ( $markerStart === false ) {
+				// No further markers: transform the tail and finish
+				return $out . call_user_func( $callback, substr( $s, $i ) );
+			}
+			$out .= call_user_func( $callback, substr( $s, $i, $markerStart - $i ) );
+			$markerEnd = strpos( $s, $this->mMarkerSuffix, $markerStart );
+			if ( $markerEnd === false ) {
+				// Unterminated marker: pass the remainder through untouched
+				return $out . substr( $s, $markerStart );
+			}
+			// Copy the whole marker, suffix included, then resume scanning after it
+			$markerEnd += strlen( $this->mMarkerSuffix );
+			$out .= substr( $s, $markerStart, $markerEnd - $markerStart );
+		}
+		return $out;
+	}
 }
 
 /**
diff --git a/includes/Preprocessor_DOM.php b/includes/Preprocessor_DOM.php
index 3a712b0109..f248d005e4 100644
--- a/includes/Preprocessor_DOM.php
+++ b/includes/Preprocessor_DOM.php
@@ -99,7 +99,7 @@ class Preprocessor_DOM implements Preprocessor {
 	
 		$stack = new PPDStack;
 
-		$searchBase = '[{<'; #}
+		$searchBase = "[{<\n"; #}
 		$revText = strrev( $text ); // For fast reverse searches
 
 		$i = 0;                     # Input pointer, starts out pointing to a pseudo-newline before the start
@@ -148,17 +148,6 @@ class Preprocessor_DOM implements Preprocessor {
 				if ( $findEquals ) {
 					// First equals will be for the template
 					$search .= '=';
-				} else {
-					// Look for headings
-					// We can't look for headings when $findEquals is true, because the ambiguity 
-					// between template name/value separators and heading starts would be unresolved
-					// until the closing double-brace is found. This would mean either infinite 
-					// backtrack, or creating and updating two separate tree structures until the
-					// end of the ambiguity -- one tree structure assuming a heading, and the other 
-					// assuming a template argument.
-					//
-					// Easier to just break some section edit links.
-					$search .= "\n";
 				}
 				$rule = null;
 				# Output literal section, advance input counter
@@ -240,7 +229,7 @@ class Preprocessor_DOM implements Preprocessor {
 						$wsEnd = $endPos + 2 + strspn( $text, ' ', $endPos + 3 );
 						// Eat the line if possible
 						// TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at 
-						// the overall start. That's not how Sanitizer::removeHTMLcomments() does it, but 
+						// the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but 
 						// it's a possible beneficial b/c break.
 						if ( $wsStart > 0 && substr( $text, $wsStart - 1, 1 ) == "\n" 
 							&& substr( $text, $wsEnd + 1, 1 ) == "\n" )
@@ -253,28 +242,24 @@ class Preprocessor_DOM implements Preprocessor {
 							if ( $wsLength > 0 && substr( $accum, -$wsLength ) === str_repeat( ' ', $wsLength ) ) {
 								$accum = substr( $accum, 0, -$wsLength );
 							}
-							// Do a line-start run next time to look for headings after the comment,
-							// but only if stack->top===false, because headings don't exist at deeper levels.
-							if ( $stack->top === false ) {
-								$fakeLineStart = true;
-							}
+							// Do a line-start run next time to look for headings after the comment
+							$fakeLineStart = true;
 						} else {
 							// No line to eat, just take the comment itself
 							$startPos = $i;
 							$endPos += 2;
 						}
 
-						/*
 						if ( $stack->top ) {
-							if ( $stack->top->commentEndPos !== false && $stack->top->commentEndPos == $wsStart ) {
+							$part = $stack->top->getCurrentPart();
+							if ( isset( $part->commentEnd ) && $part->commentEnd == $wsStart - 1 ) {
 								// Comments abutting, no change in visual end
-								$stack->top->commentEndPos = $wsEnd;
+								$part->commentEnd = $endPos; // not $wsEnd: the heading-close check expects the comment's own end index
 							} else {
-								$stack->top->visualEndPos = $wsStart;
-								$stack->top->commentEndPos = $wsEnd;
+								$part->visualEnd = $wsStart;
+								$part->commentEnd = $endPos;
 							}
 						}
-						 */
 						$i = $endPos + 1;
 						$inner = substr( $text, $startPos, $endPos - $startPos + 1 );
 						$accum .= '<comment>' . htmlspecialchars( $inner ) . '</comment>';
@@ -356,7 +341,11 @@ class Preprocessor_DOM implements Preprocessor {
 				}
 				
 				$count = strspn( $text, '=', $i, 6 );
-				if ( $count > 0 ) {
+				if ( $count == 1 && $findEquals ) {
+					// DWIM: This looks kind of like a name/value separator
+					// Let's let the equals handler have it and break the potential heading
+					// This is heuristic, but AFAICT the methods for completely correct disambiguation are very complex.
+				} elseif ( $count > 0 ) {
 					$piece = array(
 						'open' => "\n",
 						'close' => "\n",
@@ -374,23 +363,32 @@ class Preprocessor_DOM implements Preprocessor {
 				$piece = $stack->top;
 				// A heading must be open, otherwise \n wouldn't have been in the search list
 				assert( $piece->open == "\n" );
+				$part = $piece->getCurrentPart();
 				// Search back through the input to see if it has a proper close
 				// Do this using the reversed string since the other solutions (end anchor, etc.) are inefficient
-				$m = false;
+				$wsLength = strspn( $revText, " \t", strlen( $text ) - $i );
+				$searchStart = $i - $wsLength;
+				if ( isset( $part->commentEnd ) && $searchStart - 1 == $part->commentEnd ) {
+					// Comment found at line end
+					// Search for equals signs before the comment
+					$searchStart = $part->visualEnd;
+					$searchStart -= strspn( $revText, " \t", strlen( $text ) - $searchStart );
+				}
 				$count = $piece->count;
-				if ( preg_match( "/\s*(=+)/A", $revText, $m, 0, strlen( $text ) - $i ) ) {
-					if ( $i - strlen( $m[0] ) == $piece->startPos ) {
+				$equalsLength = strspn( $revText, '=', strlen( $text ) - $searchStart );
+				if ( $equalsLength > 0 ) {
+					if ( $searchStart - $equalsLength == $piece->startPos ) { // measure from $searchStart, not $i: whitespace or a comment may trail the equals run
 						// This is just a single string of equals signs on its own line
 						// Replicate the doHeadings behaviour /={count}(.+)={count}/
 						// First find out how many equals signs there really are (don't stop at 6)
-						$count = strlen( $m[1] );
+						$count = $equalsLength;
 						if ( $count < 3 ) {
 							$count = 0;
 						} else {
 							$count = min( 6, intval( ( $count - 1 ) / 2 ) );
 						}
 					} else {
-						$count = min( strlen( $m[1] ), $count );
+						$count = min( $equalsLength, $count );
 					}
 					if ( $count > 0 ) {
 						// Normal match, output <h>
@@ -869,13 +867,6 @@ class PPFrame_DOM implements PPFrame {
 			} elseif ( is_array( $contextNode ) || $contextNode instanceof DOMNodeList ) {
 				$newIterator = $contextNode;
 			} elseif ( $contextNode instanceof DOMNode ) {
-				/*
-				print str_repeat( '&nbsp;', count( debug_backtrace() ) ) . $contextNode->nodeName;
-				if ( $contextNode->nodeName == 'title' ) {
-					print ' = ' . $contextNode->textContent;
-				}
-				print "<br/>\n";
-				 */
 				if ( $contextNode->nodeType == XML_TEXT_NODE ) {
 					$out .= $contextNode->nodeValue;
 				} elseif ( $contextNode->nodeName == 'template' ) {
diff --git a/maintenance/parserTests.txt b/maintenance/parserTests.txt
index 6e914191bf..fea30ca9b3 100644
--- a/maintenance/parserTests.txt
+++ b/maintenance/parserTests.txt
@@ -5348,11 +5348,10 @@ Section extraction test with comment after heading (section 1)
 section=1
 !! input
 ==a==
-==unmarked== <!-- an unmarked section -->
-==b==
+==b== <!-- -->
+==c==
 !! result
 ==a==
-==unmarked== <!-- an unmarked section -->
 !! end
 
 !! test
@@ -5361,10 +5360,10 @@ Section extraction test with comment after heading (section 2)
 section=2
 !! input
 ==a==
-==unmarked== <!-- an unmarked section -->
-==b==
+==b== <!-- -->
+==c==
 !! result
-==b==
+==b== <!-- -->
 !! end
 
 !! test
@@ -6712,56 +6711,26 @@ Morwen/13: Unclosed link followed by heading
 !! end
 
 !! test
-HHP1: Heuristics for headings in preprocessor parenthetical structures
-!! input
-{{foo
-==heading==
-!! result
-<p>{{foo
-</p>
-<a name="heading"></a><h2><span class="editsection">[<a href="/index.php?title=Parser_test&amp;action=edit&amp;section=1" title="Edit section: heading">edit</a>]</span> <span class="mw-headline">heading</span></h2>
-
-!! end
-
-!! test
-HHP2: Heuristics for headings in preprocessor parenthetical structures
+HHP2.1: Heuristics for headings in preprocessor parenthetical structures
 !! input
 {{foo|
-==heading==
+=heading=
 !! result
 <p>{{foo|
 </p>
-<a name="heading"></a><h2> <span class="mw-headline">heading</span></h2>
+<a name="heading"></a><h1> <span class="mw-headline">heading</span></h1>
 
 !! end
 
 !! test
-HHP3: Heuristics for headings in preprocessor parenthetical structures
+HHP2.2: Heuristics for headings in preprocessor parenthetical structures
 !! input
 {{foo|
-==heading 1==
-==heading 2==
+==heading==
 !! result
 <p>{{foo|
 </p>
-<a name="heading_1"></a><h2> <span class="mw-headline">heading 1</span></h2>
-<a name="heading_2"></a><h2><span class="editsection">[<a href="/index.php?title=Parser_test&amp;action=edit&amp;section=1" title="Edit section: heading 2">edit</a>]</span> <span class="mw-headline">heading 2</span></h2>
-
-!! end
-
-# Note that heading 2 is counted, so heading 3 gets section=2 not section=1
-!! test
-HHP4: Heuristics for headings in preprocessor parenthetical structures
-!! input
-{{foo|
-==heading 1==
-==heading 2==
-}}
-==heading 3==
-!! result
-<p>FOO
-</p>
-<a name="heading_3"></a><h2><span class="editsection">[<a href="/index.php?title=Parser_test&amp;action=edit&amp;section=2" title="Edit section: heading 3">edit</a>]</span> <span class="mw-headline">heading 3</span></h2>
+<a name="heading"></a><h2><span class="editsection">[<a href="/index.php?title=Parser_test&amp;action=edit&amp;section=1" title="Edit section: heading">edit</a>]</span> <span class="mw-headline">heading</span></h2>
 
 !! end
 
-- 
2.20.1