From badf11ffe60edccaa294a912294beee382d9d1ef Mon Sep 17 00:00:00 2001
From: Tim Starling <tstarling@users.mediawiki.org>
Date: Sat, 7 Aug 2004 18:24:12 +0000
Subject: [PATCH] Rewrote external link handler, using preg_split(). Passes all
 external link test cases on [[m:Parser testing]]. Fixes some parts of bug
 #583234: fixes URL inside URL problem and incorrectly rejected character
 problem

---
 includes/Parser.php | 224 +++++++++++++++++++++++++-------------------
 1 file changed, 126 insertions(+), 98 deletions(-)
diff --git a/includes/Parser.php b/includes/Parser.php
index a0e8649dd4..d6c7c222bc 100644
--- a/includes/Parser.php
+++ b/includes/Parser.php
@@ -42,10 +42,29 @@ define( "OT_MSG", 3 );
 # to strip HTML comments in addition to regular
 # <XML>-style tags. This should not be anything we
 # may want to use in wikisyntax
-define( "STRIP_COMMENTS", "HTMLCommentStrip" );
+define( 'STRIP_COMMENTS', 'HTMLCommentStrip' );
 
 # prefix for escaping, used in two functions at least
-define( "UNIQ_PREFIX", "NaodW29");
+define( 'UNIQ_PREFIX', 'NaodW29');
+
+
+# Constants needed for external link processing
+
+define( 'URL_PROTOCOLS', 'http|https|ftp|irc|gopher|news|mailto' );
+define( 'HTTP_PROTOCOLS', 'http|https' );
+# URL characters: everything except closing bracket, space, or control characters
+define( 'EXT_LINK_URL_CLASS', '[^]\\x00-\\x20\\x7F]' );
+define( 'INVERSE_EXT_LINK_URL_CLASS', '[\]\\x00-\\x20\\x7F]' );
+# Link text characters: as for URLs, but including space
+define( 'EXT_LINK_TEXT_CLASS', '[^\]\\x00-\\x1F\\x7F]' );
+define( 'EXT_IMAGE_FNAME_CLASS', '[A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF]' );
+define( 'EXT_IMAGE_EXTENSIONS', 'gif|png|jpg|jpeg' );
+define( 'EXT_LINK_BRACKETED',  '/\[(('.URL_PROTOCOLS.'):'.EXT_LINK_URL_CLASS.'+) *('.EXT_LINK_TEXT_CLASS.'*?)\]/S' );
+define( 'EXT_IMAGE_REGEX',
+	'/^((?:'.HTTP_PROTOCOLS.'):)'.  # Protocol; inner group makes ':' apply to every alternative
+	'('.EXT_LINK_URL_CLASS.'+)\\/'.  # Hostname and path
+	'('.EXT_IMAGE_FNAME_CLASS.'+)\\.((?i)'.EXT_IMAGE_EXTENSIONS.')$/S' # Filename
+);
 
 class Parser
 {
@@ -373,7 +392,7 @@ class Parser
 
 	# This method generates the list of subcategories and pages for a category
 	function oldCategoryMagic () {
-		global $wgLang , $wgUser ;
+		global $wgLang ;
 		$fname = 'Parser::oldCategoryMagic';
 
 		if ( !$this->mOptions->getUseCategoryMagic() ) return ; # Doesn't use categories at all
@@ -383,7 +402,7 @@ class Parser
 		$r = "<br style=\"clear:both;\"/>\n";
 
 
-		$sk =& $wgUser->getSkin() ;
+		$sk =& $this->mOptions->getSkin() ;
 
 		$articles = array() ;
 		$children = array() ;
@@ -436,7 +455,7 @@ class Parser
 
 
 	function newCategoryMagic () {
-		global $wgLang , $wgUser ;
+		global $wgLang;
 		if ( !$this->mOptions->getUseCategoryMagic() ) return ; # Doesn't use categories at all
 
 		if ( $this->mTitle->getNamespace() != NS_CATEGORY ) return '' ; # This ain't a category page
@@ -444,7 +463,7 @@ class Parser
 		$r = "<br style=\"clear:both;\"/>\n";
 
 
-		$sk =& $wgUser->getSkin() ;
+		$sk =& $this->mOptions->getSkin() ;
 
 		$articles = array() ;
 		$articles_start_char = array();
@@ -844,7 +863,7 @@ class Parser
 			$text .= $this->categoryMagic () ;
 			$this->categoryMagicDone = true ;
 		}
-
+		
 		wfProfileOut( $fname );
 		return $text;
 	}
@@ -1058,115 +1077,124 @@ class Parser
 		$fname = 'Parser::replaceExternalLinks';
 		wfProfileIn( $fname );
 
-		$text = $this->subReplaceExternalLinks( $text, 'http', true );
-		$text = $this->subReplaceExternalLinks( $text, 'https', true );
-		$text = $this->subReplaceExternalLinks( $text, 'ftp', false );
-		$text = $this->subReplaceExternalLinks( $text, 'irc', false );
-		$text = $this->subReplaceExternalLinks( $text, 'gopher', false );
-		$text = $this->subReplaceExternalLinks( $text, 'news', false );
-		$text = $this->subReplaceExternalLinks( $text, 'mailto', false );
-		wfProfileOut( $fname );
-		return $text;
-	}
-
-	# Replaces all external links with a given protocol
-	/* private */ function subReplaceExternalLinks( $s, $protocol, $autonumber ) {
-		$unique = '4jzAfzB8hNvf4sqyO9Edd8pSmk9rE2in0Tgw3';
-
-		# URL character class
-		$uc = "A-Za-z0-9_\\/~%\\-+&*#?!=()@\\x80-\\xFF";
-
-		# this is  the list of separators that should be ignored if they
-		# are the last character of an URL but that should be included
-		# if they occur within the URL, e.g. "go to www.foo.com, where .."
-		# in this case, the last comma should not become part of the URL,
-		# but in "www.foo.com/123,2342,32.htm" it should.
-		$sep = ",;\.:";
-
-		# File name character class, used for identifying images
-		$fnc = 'A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF';
-		# Recognised image extensions
-		$images = 'gif|png|jpg|jpeg';
+		$sk =& $this->mOptions->getSkin();
+		$linktrail = wfMsg('linktrail');
+		$bits = preg_split( EXT_LINK_BRACKETED, $text, -1, PREG_SPLIT_DELIM_CAPTURE );
 
-		# PLEASE NOTE: The curly braces { } are not part of the regex,
-		# they are interpreted as part of the string (used to tell PHP
-		# that the content of the string should be inserted there).
+		$s = $this->replaceFreeExternalLinks( array_shift( $bits ) );
 
-		# Regexp for matching image URLs
-		$rxImageURL = "/(^|[^\\[])({$protocol}:)([{$uc}{$sep}]+)\\/([{$fnc}]+)\\." .
-		  "((?i){$images})([^{$uc}]|$)/";
+		$i = 0;
+		while ( $i<count( $bits ) ) {
+			$url = $bits[$i++];
+			$protocol = $bits[$i++];
+			$text = $bits[$i++];
+			$trail = $bits[$i++];
+
+			# If the link text is an image URL, replace it with an <img> tag
+			# This happened by accident in the original parser, but some people used it extensively
+			$img = $this->maybeMakeImageLink( $text );
+			if ( $img !== false ) {
+				$text = $img;
+			}
+
+			$dtrail = '';
+
+			# No link text, e.g. [http://domain.tld/some.link]
+			if ( $text == '' ) {
+				# Autonumber if allowed
+				if ( strpos( HTTP_PROTOCOLS, $protocol ) !== false ) {
+					$text = "[" . ++$this->mAutonumber . "]";
+				} else {
+					# Otherwise just use the URL
+					$text = wfEscapeHTML( $url );
+				}
+			} else {
+				# Have link text, e.g. [http://domain.tld/some.link text]s
+				# Check for trail
+				if ( preg_match( $linktrail, $trail, $m2 ) ) {
+					$dtrail = $m2[1];
+					$trail = $m2[2];
+				}
+			}
+
+			$encUrl = htmlspecialchars( $url );
+			# Bit in parentheses showing the URL for the printable version
+			if( $url == $text || preg_match( "!$protocol://" . preg_quote( $text, '!' ) . "/?$!", $url ) ) {
+				$paren = '';
+			} else {
+				# Expand the URL for printable version ($encUrl is already escaped; do not escape twice)
+				$paren = "<span class='urlexpansion'> (<i>" . $encUrl . "</i>)</span>";
+			}
 
-		# Regexp for matching non-delimited URLs
-		$rxFreeURL = "/(^|[^\\[])({$protocol}:)(([".$uc."]|[".$sep."][".$uc."])+)([^". $uc . $sep. "]|[".$sep."]|$)/";
-		$sk =& $this->mOptions->getSkin();
+			# Process the trail (i.e. everything after this link up until start of the next link),
+			# replacing any non-bracketed links
+			$trail = $this->replaceFreeExternalLinks( $trail );
 
-		# Replace image URLs with <img> tags, but only for HTTP and HTTPS ($autonumber=true)
-		if ( $autonumber and $this->mOptions->getAllowExternalImages() ) {
-			$s = preg_replace( $rxImageURL, '\\1' . $sk->makeImage( "{$unique}:\\3" .
-			  '/\\4.\\5', '\\4.\\5' ) . '\\6', $s );
+			$la = $sk->getExternalLinkAttributes( $url, $text );
+
+			# Use the HTML-encoded URL in the href, so that characters such as & and "
+			# pasted directly into the text cannot break out of the attribute.
+			# Funny characters like &ouml; aren't valid in URLs anyway
+			# This was changed in August 2004
+			$s .= "<a href=\"{$encUrl}\" {$la}>{$text}</a>{$dtrail}{$paren}{$trail}";
 		}
 
-		# Replace free URLs
-		$s = preg_replace( $rxFreeURL, '\\1' . "<a href=\"{$unique}:\\3\"" .
-		  $sk->getExternalLinkAttributes( "{$unique}:\\3", wfEscapeHTML(
-		  "{$unique}:\\3" ) ) . ">" . wfEscapeHTML( "{$unique}:\\3" ) .
-		  '</a>\\5', $s );
-		$s = str_replace( $unique, $protocol, $s );
+		wfProfileOut( $fname );
+		return $s;
+	}
 
-		# Search for external links with square brackets by splitting with explode()
-		$a = explode( "[{$protocol}:", " " . $s );
-		$s = array_shift( $a );
-		$s = substr( $s, 1 );
+	# Replace anything that looks like a free (unbracketed) URL with a link, or an image where allowed
+	function replaceFreeExternalLinks( $text ) {
+		$bits = preg_split( '/((?:'.URL_PROTOCOLS.'):)/', $text, -1, PREG_SPLIT_DELIM_CAPTURE );
+		$s = array_shift( $bits );
+		$i = 0;
 
-		# Regexp for URL in square brackets
-		$rxWithoutText = "/^([{$uc}{$sep}]+)\\](.*)\$/sD";
-		# Regexp for URL with link text in square brackets
-		$rxWithText = "/^([{$uc}{$sep}]+)\\s+([^\\]]+)\\](.*)\$/sD";
+		# Characters which may occur in the middle of a URL, but not at the end (strspn mask, not a regex)
+		$sep = ',;.:';
 
-		# Now interpret each instance of [protocol:
-		foreach ( $a as $line ) {
+		$sk =& $this->mOptions->getSkin();
 
-			# CASE 1: Link in square brackets, e.g.
-			# some text [http://domain.tld/some.link] more text
-			if ( preg_match( $rxWithoutText, $line, $m ) ) {
-				$link = "{$protocol}:{$m[1]}";
-				$trail = $m[2];
-				if ( $autonumber ) { $text = "[" . ++$this->mAutonumber . "]"; }
-				else { $text = wfEscapeHTML( $link ); }
-			}
+		while ( $i < count( $bits ) ){
+			$protocol = $bits[$i++];
+			$remainder = $bits[$i++];
 
-			# CASE 2: Link with link text and text directly following it, e.g.
-			# This is a collection of [http://domain.tld/some.link link]s
-			else if ( preg_match( $rxWithText, $line, $m ) ) {
-				$link = "{$protocol}:{$m[1]}";
-				$text = $m[2];
-				$dtrail = '';
-				$trail = $m[3];
-				if ( preg_match( wfMsg ('linktrail'), $trail, $m2 ) ) {
-					$dtrail = $m2[1];
-					$trail = $m2[2];
+			if ( preg_match( '/^('.EXT_LINK_URL_CLASS.'+)(.*)$/s', $remainder, $m ) ) {
+				# Found some characters after the protocol that look promising
+				$url = $protocol . $m[1];
+				$trail = $m[2];
+
+				# Move trailing characters in $sep from the end of $url to the start of $trail
+				$numSepChars = strspn( strrev( $url ), $sep );
+				if ( $numSepChars ) {
+					$trail = substr( $url, -$numSepChars ) . $trail;
+					$url = substr( $url, 0, -$numSepChars );
 				}
-			}
 
-			# CASE 3: Nothing matches, just output the source text
-			else {
-				$s .= "[{$protocol}:" . $line;
-				continue;
-			}
-
-			if( $link == $text || preg_match( "!$protocol://" . preg_quote( $text, "/" ) . "/?$!", $link ) ) {
-				$paren = '';
+				# Is this an external image?
+				$text = $this->maybeMakeImageLink( $url );
+				if ( $text === false ) {
+					# Not an image, make a link
+					$text = $sk->makeExternalLink( $url, $url );
+				}
+				$s .= $text . $trail;
 			} else {
-				# Expand the URL for printable version
-				$paren = "<span class='urlexpansion'> (<i>" . htmlspecialchars ( $link ) . "</i>)</span>";
+				$s .= $protocol . $remainder;
 			}
-			$la = $sk->getExternalLinkAttributes( $link, $text );
-			$s .= "<a href='{$link}'{$la}>{$text}</a>{$dtrail}{$paren}{$trail}";
-
 		}
 		return $s;
 	}
-
+				
+	function maybeMakeImageLink( $url ) {
+		# Returns the skin's image markup for $url, or false when external images
+		# are disabled or $url does not match EXT_IMAGE_REGEX
+		$skin =& $this->mOptions->getSkin();
+		$allowed = $this->mOptions->getAllowExternalImages();
+		if ( $allowed && preg_match( EXT_IMAGE_REGEX, $url ) ) {
+			# Image found
+			return $skin->makeImage( $url );
+		}
+		return false;
+	}
 
 	/* private */ function replaceInternalLinks( $s ) {
 		global $wgLang, $wgLinkCache;
-- 
2.20.1