From dfecc217abfb45c3a77100c4c4b58dac7b01d6e0 Mon Sep 17 00:00:00 2001
From: Erik Moeller <erik@users.mediawiki.org>
Date: Wed, 28 Apr 2004 04:50:35 +0000
Subject: [PATCH] fix longstanding bug with section editing where section
 headers within <nowiki> or <!-- .. --> tags would cause it to miscount
 section numbers. Parser::extractTags modified to allow stripping HTML
 comments as well. Note: Presently HTML comments are completely removed from
 the output using preg_replace. Should they ever be rendered instead,
 Parser::extractTags should be used.

---
 includes/Article.php | 65 +++++++++++++++++++++++++++-----------
 includes/Parser.php  | 74 +++++++++++++++++++++++++++++---------------
 2 files changed, 96 insertions(+), 43 deletions(-)
diff --git a/includes/Article.php b/includes/Article.php
index 3289dd14b0..ef285db046 100644
--- a/includes/Article.php
+++ b/includes/Article.php
@@ -70,7 +70,7 @@ class Article {
 		$action = $wgRequest->getText( 'action', 'view' );
 		$section = $wgRequest->getText( 'section' );
 
-		$fname =  "Article::getContent"; 
+		$fname =  "Article::getContent";
 		wfProfileIn( $fname );
 
 		if ( 0 == $this->getID() ) {
@@ -82,34 +82,50 @@ class Article {
 			return wfMsg( "noarticletext" );
 		} else {
 			$this->loadContent( $noredir );
-						
+
 			if(
 				# check if we're displaying a [[User talk:x.x.x.x]] anonymous talk page
 				( $this->mTitle->getNamespace() == Namespace::getTalk( Namespace::getUser()) ) &&
 				  preg_match("/^\d{1,3}\.\d{1,3}.\d{1,3}\.\d{1,3}$/",$this->mTitle->getText()) &&
 				  $action=="view"
-				) 
+				)
 				{
 				wfProfileOut( $fname );
 				return $this->mContent . "\n" .wfMsg("anontalkpagetext"); }
-			else {				
+			else {
 				if($action=="edit") {
 					if($section!="") {
-						if($section=="new") { 
+						if($section=="new") {
 							wfProfileOut( $fname );
-							return ""; 
+							return "";
 						}
 
-						$secs=preg_split("/(^=+.*?=+|^<h[1-6].*?>.*?<\/h[1-6].*?>)/mi",
-						 $this->mContent, -1,
-						 PREG_SPLIT_DELIM_CAPTURE);
+						# strip NOWIKI etc. to avoid confusion (passing true as the third
+						# parameter causes HTML comments to be stripped as well)
+						$striparray=array();
+						$parser=new Parser();
+						$parser->mOutputType=OT_WIKI;
+						$striptext=$parser->strip($this->mContent, $striparray, true);
+
+						# now that we can be sure that no pseudo-sections are in the source,
+						# split it up by section
+						$secs =
+						  preg_split(
+						  "/(^=+.*?=+|^<h[1-6].*?>.*?<\/h[1-6].*?>)/mi",
+						  $striptext, -1,
+						  PREG_SPLIT_DELIM_CAPTURE);
+
 						if($section==0) {
-							wfProfileOut( $fname );
-							return trim($secs[0]);
+							$rv=$secs[0];
 						} else {
-							wfProfileOut( $fname );
-							return trim($secs[$section*2-1] . $secs[$section*2]);
+							$rv=$secs[$section*2-1] . $secs[$section*2];
 						}
+
+						# reinsert stripped tags
+						$rv=$parser->unstrip($rv,$striparray);
+						$rv=trim($rv);
+						wfProfileOut( $fname );
+						return $rv;
 					}
 				}
 				wfProfileOut( $fname );
@@ -117,12 +133,12 @@ class Article {
 			}
 		}
 	}
-	
+
 	# Load the revision (including cur_text) into this object
 	function loadContent( $noredir = false )
 	{
 		global $wgOut, $wgMwRedir, $wgRequest;
-		
+
 		# Query variables :P
 		$oldid = $wgRequest->getVal( 'oldid' );
 		$redirect = $wgRequest->getVal( 'redirect' );
@@ -131,12 +147,12 @@ class Article {
 		$fname = "Article::loadContent";
 		
 		# Pre-fill content with error message so that if something 	 
-		# fails we'll have something telling us what we intended. 	 
+		# fails we'll have something telling us what we intended.
 
 		$t = $this->mTitle->getPrefixedText(); 	 
 		if ( isset( $oldid ) ) { 	 
 			$oldid = IntVal( $oldid ); 	 
-			$t .= ",oldid={$oldid}"; 	 
+			$t .= ",oldid={$oldid}";
 		} 	 
 		if ( isset( $redirect ) ) { 	 
 			$redirect = ($redirect == "no") ? "no" : "yes"; 	 
@@ -558,11 +574,24 @@ class Article {
 				if($summary) $subject="== {$summary} ==\n\n";
 				$text=$oldtext."\n\n".$subject.$text;
 			} else {
+
+				# strip NOWIKI etc. to avoid confusion (passing true as the third
+				# parameter causes HTML comments to be stripped as well)
+				$striparray=array();
+				$parser=new Parser();
+				$parser->mOutputType=OT_WIKI;
+				$oldtext=$parser->strip($oldtext, $striparray, true);
+
+				# now that we can be sure that no pseudo-sections are in the source,
+				# split it up
 				$secs=preg_split("/(^=+.*?=+|^<h[1-6].*?>.*?<\/h[1-6].*?>)/mi",
 				  $oldtext,-1,PREG_SPLIT_DELIM_CAPTURE);
 				$secs[$section*2]=$text."\n\n"; // replace with edited
 				if($section) { $secs[$section*2-1]=""; } // erase old headline
-				$text=join("",$secs);		
+				$text=join("",$secs);
+
+				# reinsert the stuff that we stripped out earlier
+				$text=$parser->unstrip($text,$striparray,true);
 			}
 		}
 		return $text;
diff --git a/includes/Parser.php b/includes/Parser.php
index 5f1043a007..942621c1eb 100644
--- a/includes/Parser.php
+++ b/includes/Parser.php
@@ -44,6 +44,12 @@ define( "OT_HTML", 1 );
 define( "OT_WIKI", 2 );
 define( "OT_MSG", 3 );
 
+# string parameter for extractTags which will cause it
+# to strip <!-- HTML comments --> instead of a regular
+# <XML>-style tag. This should not be anything we
+# may want to use in wikisyntax
+define( "STRIP_COMMENTS", "HTMLCommentStrip" );
+
 # prefix for escaping, used in two functions at least
 define( "UNIQ_PREFIX", "NaodW29");
 
@@ -127,6 +133,9 @@ class Parser
 
 	# If $content is already set, the additional entries will be appended
 
+	# If $tag is set to STRIP_COMMENTS, the function will extract
+	# <!-- HTML comments -->
+
 	/* static */ function extractTags($tag, $text, &$content, $uniq_prefix = ""){
 		$rnd = $uniq_prefix . '-' . $tag . Parser::getRandomString();
 		if ( !$content ) {
@@ -136,12 +145,20 @@ class Parser
 		$stripped = "";
 
 		while ( "" != $text ) {
-			$p = preg_split( "/<\\s*$tag\\s*>/i", $text, 2 );
+			if($tag==STRIP_COMMENTS) {
+				$p = preg_split( "/<!--/i", $text, 2 );
+			} else {
+				$p = preg_split( "/<\\s*$tag\\s*>/i", $text, 2 );
+			}
 			$stripped .= $p[0];
 			if ( ( count( $p ) < 2 ) || ( "" == $p[1] ) ) {
 				$text = "";
 			} else {
-				$q = preg_split( "/<\\/\\s*$tag\\s*>/i", $p[1], 2 );
+				if($tag==STRIP_COMMENTS) {
+					$q = preg_split( "/-->/i", $p[1], 2 );
+				} else {
+					$q = preg_split( "/<\\/\\s*$tag\\s*>/i", $p[1], 2 );
+				}
 				$marker = $rnd . sprintf("%08X", $n++);
 				$content[$marker] = $q[0];
 				$stripped .= $marker;
@@ -151,18 +168,23 @@ class Parser
 		return $stripped;
 	}
 
-	# Strips <nowiki>, <pre> and <math>
+	# Strips and renders <nowiki>, <pre>, <math>, <hiero>
+	# If $render is set, performs necessary rendering operations on plugins
 	# Returns the text, and fills an array with data needed in unstrip()
 	# If the $state is already a valid strip state, it adds to the state
-	#
-	function strip( $text, &$state )
+
+	# When $stripcomments is set, HTML comments <!-- like this -->
+	# will be stripped in addition to other tags. This is important
+	# for section editing, where these comments cause confusion when
+	# counting the sections in the wikisource
+	function strip( $text, &$state, $stripcomments = false )
 	{
 		$render = ($this->mOutputType == OT_HTML);
 		$nowiki_content = array();
 		$hiero_content = array();
 		$math_content = array();
 		$pre_content = array();
-		$item_content = array();
+		$comment_content = array();
 
 		# Replace any instances of the placeholders
 		$uniq_prefix = UNIQ_PREFIX;
@@ -177,25 +199,21 @@ class Parser
 			}
 		}
 
-		if( $GLOBALS['wgUseWikiHiero'] ){
-			$text = Parser::extractTags("hiero", $text, $hiero_content, $uniq_prefix);
-			foreach( $hiero_content as $marker => $content ){
-				if( $render ){
-					$hiero_content[$marker] = WikiHiero( $content, WH_MODE_HTML);
-				} else {
-					$hiero_content[$marker] = "<hiero>$content</hiero>";
-				}
+		$text = Parser::extractTags("hiero", $text, $hiero_content, $uniq_prefix);
+		foreach( $hiero_content as $marker => $content ){
+			if( $render && $GLOBALS['wgUseWikiHiero']){
+				$hiero_content[$marker] = WikiHiero( $content, WH_MODE_HTML);
+			} else {
+				$hiero_content[$marker] = "<hiero>$content</hiero>";
 			}
 		}
 
-		if( $this->mOptions->getUseTeX() ){
-			$text = Parser::extractTags("math", $text, $math_content, $uniq_prefix);
-			foreach( $math_content as $marker => $content ){
-				if( $render ){
-					$math_content[$marker] = renderMath( $content );
-				} else {
-					$math_content[$marker] = "<math>$content</math>";
-				}
+		$text = Parser::extractTags("math", $text, $math_content, $uniq_prefix);
+		foreach( $math_content as $marker => $content ){
+			if( $render && $this->mOptions->getUseTeX() ){
+				$math_content[$marker] = renderMath( $content );
+			} else {
+				$math_content[$marker] = "<math>$content</math>";
 			}
 		}
 
@@ -207,6 +225,12 @@ class Parser
 				$pre_content[$marker] = "<pre>$content</pre>";
 			}
 		}
+		if($stripcomments) {
+			$text = Parser::extractTags(STRIP_COMMENTS, $text, $comment_content, $uniq_prefix);
+			foreach( $comment_content as $marker => $content ){
+				$comment_content[$marker] = "<!--$content-->";
+			}
+		}
 
 		# Merge state with the pre-existing state, if there is one
 		if ( $state ) {
@@ -214,13 +238,14 @@ class Parser
 			$state['hiero'] = $state['hiero'] + $hiero_content;
 			$state['math'] = $state['math'] + $math_content;
 			$state['pre'] = $state['pre'] + $pre_content;
+			$state['comment'] = $state['comment'] + $comment_content;
 		} else {
 			$state = array(
 			  'nowiki' => $nowiki_content,
 			  'hiero' => $hiero_content,
 			  'math' => $math_content,
 			  'pre' => $pre_content,
-			  'item' => $item_content
+			  'comment' => $comment_content
 			);
 		}
 		return $text;
@@ -251,8 +276,7 @@ class Parser
 			  'nowiki' => array(),
 			  'hiero' => array(),
 			  'math' => array(),
-			  'pre' => array(),
-			  'item' => array()
+			  'pre' => array()
 			);
 		}
 		$state['item'][$rnd] = $text;
-- 
2.20.1