X-Git-Url: http://git.cyclocoop.org/?a=blobdiff_plain;f=includes%2Fparser%2FParser.php;h=84bb224300dfe9a362fb08886ab7bef49a9c9ad6;hb=9fa902583cc8e1c2a04b76a58c3fa517df06a013;hp=61fffc5564851dede1e8e7c8f04c6bde6d17182f;hpb=eea755d3babc7148de76efc0a74f700020ceaae2;p=lhc%2Fweb%2Fwiklou.git

diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php
index 61fffc5564..84bb224300 100644
--- a/includes/parser/Parser.php
+++ b/includes/parser/Parser.php
@@ -1402,7 +1402,7 @@ class Parser {
 				$this->getExternalLinkAttribs( $url ) );
 			# Register it in the output object...
 			# Replace unnecessary URL escape codes with their equivalent characters
-			$pasteurized = self::replaceUnusualEscapes( $url );
+			$pasteurized = self::normalizeLinkUrl( $url );
 			$this->mOutput->addExternalLink( $pasteurized );
 		}
 		wfProfileOut( __METHOD__ );
@@ -1710,7 +1710,7 @@ class Parser {
 			# Register link in the output object.
 			# Replace unnecessary URL escape codes with the referenced character
 			# This prevents spammers from hiding links from the filters
-			$pasteurized = self::replaceUnusualEscapes( $url );
+			$pasteurized = self::normalizeLinkUrl( $url );
 			$this->mOutput->addExternalLink( $pasteurized );
 		}
 
@@ -1759,40 +1759,75 @@ class Parser {
 	}
 
 	/**
-	 * Replace unusual URL escape codes with their equivalent characters
+	 * Replace unusual escape codes in a URL with their equivalent characters
 	 *
+	 * @deprecated since 1.24, use normalizeLinkUrl
 	 * @param string $url
 	 * @return string
-	 *
-	 * @todo This can merge genuinely required bits in the path or query string,
-	 *       breaking legit URLs. A proper fix would treat the various parts of
-	 *       the URL differently; as a workaround, just use the output for
-	 *       statistical records, not for actual linking/output.
 	 */
 	public static function replaceUnusualEscapes( $url ) {
-		return preg_replace_callback( '/%[0-9A-Fa-f]{2}/',
-			array( __CLASS__, 'replaceUnusualEscapesCallback' ), $url );
+		wfDeprecated( __METHOD__, '1.24' );
+		return self::normalizeLinkUrl( $url );
 	}
 
 	/**
-	 * Callback function used in replaceUnusualEscapes().
-	 * Replaces unusual URL escape codes with their equivalent character
+	 * Replace unusual escape codes in a URL with their equivalent characters
 	 *
-	 * @param array $matches
+	 * This generally follows the syntax defined in RFC 3986, with special
+	 * consideration for HTTP query strings.
 	 *
+	 * @param string $url
 	 * @return string
 	 */
-	private static function replaceUnusualEscapesCallback( $matches ) {
-		$char = urldecode( $matches[0] );
-		$ord = ord( $char );
-		# Is it an unsafe or HTTP reserved character according to RFC 1738?
-		if ( $ord > 32 && $ord < 127 && strpos( '<>"#{}|\^~[]`;/?', $char ) === false ) {
-			# No, shouldn't be escaped
-			return $char;
-		} else {
-			# Yes, leave it escaped
-			return $matches[0];
+	public static function normalizeLinkUrl( $url ) {
+		# First, make sure unsafe characters are encoded
+		$url = preg_replace_callback( '/[\x00-\x20"<>\[\\\\\]^`{|}\x7F-\xFF]/',
+			function ( $m ) {
+				return rawurlencode( $m[0] );
+			},
+			$url
+		);
+
+		$ret = '';
+		$end = strlen( $url );
+
+		# Fragment part - 'fragment'
+		$start = strpos( $url, '#' );
+		if ( $start !== false && $start < $end ) {
+			$ret = self::normalizeUrlComponent(
+				substr( $url, $start, $end - $start ), '"#%<>[\]^`{|}' ) . $ret;
+			$end = $start;
+		}
+
+		# Query part - RFC 3986 'query', except that escaped &=+; stay escaped
+		$start = strpos( $url, '?' );
+		if ( $start !== false && $start < $end ) {
+			$ret = self::normalizeUrlComponent(
+				substr( $url, $start, $end - $start ), '"#%<>[\]^`{|}&=+;' ) . $ret;
+			$end = $start;
 		}
+
+		# Scheme, host and path part - 'pchar', so escaped / and ? stay escaped
+		# (we assume no userinfo or encoded colons in the host)
+		$ret = self::normalizeUrlComponent(
+			substr( $url, 0, $end ), '"#%<>[\]^`{|}/?' ) . $ret;
+
+		return $ret;
+	}
+
+	private static function normalizeUrlComponent( $component, $unsafe ) {
+		$callback = function ( $matches ) use ( $unsafe ) {
+			$char = urldecode( $matches[0] );
+			$ord = ord( $char );
+			if ( $ord > 32 && $ord < 127 && strpos( $unsafe, $char ) === false ) {
+				# Visible ASCII that is not unsafe in this component: unescape it
+				return $char;
+			} else {
+				# Leave it escaped, but use uppercase for a-f
+				return strtoupper( $matches[0] );
+			}
+		};
+		return preg_replace_callback( '/%[0-9A-Fa-f]{2}/', $callback, $component );
 	}
 
 	/**
@@ -4597,13 +4632,13 @@ class Parser {
 				if ( $isTemplate ) {
 					# Put a T flag in the section identifier, to indicate to extractSections()
 					# that sections inside <includeonly> should be counted.
-					$editlinkArgs = array( $titleText, "T-$sectionIndex"/*, null */ );
+					$editsectionPage = $titleText;
+					$editsectionSection = "T-$sectionIndex";
+					$editsectionContent = null;
 				} else {
-					$editlinkArgs = array(
-						$this->mTitle->getPrefixedText(),
-						$sectionIndex,
-						$headlineHint
-					);
+					$editsectionPage = $this->mTitle->getPrefixedText();
+					$editsectionSection = $sectionIndex;
+					$editsectionContent = $headlineHint;
 				}
 				// We use a bit of pesudo-xml for editsection markers. The
 				// language converter is run later on. Using a UNIQ style marker
@@ -4616,10 +4651,11 @@ class Parser {
 				// important bits of data, but put the headline hint inside a
 				// content block because the language converter is supposed to
 				// be able to convert that piece of data.
-				$editlink = '<mw:editsection page="' . htmlspecialchars( $editlinkArgs[0] );
-				$editlink .= '" section="' . htmlspecialchars( $editlinkArgs[1] ) . '"';
-				if ( isset( $editlinkArgs[2] ) ) {
-					$editlink .= '>' . $editlinkArgs[2] . '</mw:editsection>';
+				// This marker is replaced with HTML in ParserOutput::getText().
+				$editlink = '<mw:editsection page="' . htmlspecialchars( $editsectionPage );
+				$editlink .= '" section="' . htmlspecialchars( $editsectionSection ) . '"';
+				if ( $editsectionContent !== null ) {
+					$editlink .= '>' . $editsectionContent . '</mw:editsection>';
 				} else {
 					$editlink .= '/>';
 				}