X-Git-Url: http://git.cyclocoop.org/?a=blobdiff_plain;f=includes%2Fparser%2FParser.php;h=84bb224300dfe9a362fb08886ab7bef49a9c9ad6;hb=9fa902583cc8e1c2a04b76a58c3fa517df06a013;hp=8bf400a0cfaa9da8f73070d30f6536499f724b25;hpb=5f8b92578167107e91e2b10b631de0cada0b4854;p=lhc%2Fweb%2Fwiklou.git diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php index 8bf400a0cf..84bb224300 100644 --- a/includes/parser/Parser.php +++ b/includes/parser/Parser.php @@ -1402,7 +1402,7 @@ class Parser { $this->getExternalLinkAttribs( $url ) ); # Register it in the output object... # Replace unnecessary URL escape codes with their equivalent characters - $pasteurized = self::replaceUnusualEscapes( $url ); + $pasteurized = self::normalizeLinkUrl( $url ); $this->mOutput->addExternalLink( $pasteurized ); } wfProfileOut( __METHOD__ ); @@ -1710,7 +1710,7 @@ class Parser { # Register link in the output object. # Replace unnecessary URL escape codes with the referenced character # This prevents spammers from hiding links from the filters - $pasteurized = self::replaceUnusualEscapes( $url ); + $pasteurized = self::normalizeLinkUrl( $url ); $this->mOutput->addExternalLink( $pasteurized ); } @@ -1759,40 +1759,75 @@ class Parser { } /** - * Replace unusual URL escape codes with their equivalent characters + * Replace unusual escape codes in a URL with their equivalent characters * + * @deprecated since 1.24, use normalizeLinkUrl * @param string $url * @return string - * - * @todo This can merge genuinely required bits in the path or query string, - * breaking legit URLs. A proper fix would treat the various parts of - * the URL differently; as a workaround, just use the output for - * statistical records, not for actual linking/output. */ public static function replaceUnusualEscapes( $url ) { - return preg_replace_callback( '/%[0-9A-Fa-f]{2}/', - array( __CLASS__, 'replaceUnusualEscapesCallback' ), $url ); + wfDeprecated( __METHOD__, '1.24' ); + return self::normalizeLinkUrl( $url ); } /** - * Callback function used in replaceUnusualEscapes(). - * Replaces unusual URL escape codes with their equivalent character + * Replace unusual escape codes in a URL with their equivalent characters * - * @param array $matches + * This generally follows the syntax defined in RFC 3986, with special + * consideration for HTTP query strings. * + * @param string $url * @return string */ - private static function replaceUnusualEscapesCallback( $matches ) { - $char = urldecode( $matches[0] ); - $ord = ord( $char ); - # Is it an unsafe or HTTP reserved character according to RFC 1738? - if ( $ord > 32 && $ord < 127 && strpos( '<>"#{}|\^~[]`;/?', $char ) === false ) { - # No, shouldn't be escaped - return $char; - } else { - # Yes, leave it escaped - return $matches[0]; + public static function normalizeLinkUrl( $url ) { + # First, make sure unsafe characters are encoded + $url = preg_replace_callback( '/[\x00-\x20"<>\[\\\\\]^`{|}\x7F-\xFF]/', + function ( $m ) { + return rawurlencode( $m[0] ); + }, + $url + ); + + $ret = ''; + $end = strlen( $url ); + + # Fragment part - 'fragment' + $start = strpos( $url, '#' ); + if ( $start !== false && $start < $end ) { + $ret = self::normalizeUrlComponent( + substr( $url, $start, $end - $start ), '"#%<>[\]^`{|}' ) . $ret; + $end = $start; + } + + # Query part - 'query' minus &=+; + $start = strpos( $url, '?' ); + if ( $start !== false && $start < $end ) { + $ret = self::normalizeUrlComponent( + substr( $url, $start, $end - $start ), '"#%<>[\]^`{|}&=+;' ) . $ret; + $end = $start; } + + # Scheme and path part - 'pchar' + # (we assume no userinfo or encoded colons in the host) + $ret = self::normalizeUrlComponent( + substr( $url, 0, $end ), '"#%<>[\]^`{|}/?' ) . $ret; + + return $ret; + } + + private static function normalizeUrlComponent( $component, $unsafe ) { + $callback = function ( $matches ) use ( $unsafe ) { + $char = urldecode( $matches[0] ); + $ord = ord( $char ); + if ( $ord > 32 && $ord < 127 && strpos( $unsafe, $char ) === false ) { + # Unescape it + return $char; + } else { + # Leave it escaped, but use uppercase for a-f + return strtoupper( $matches[0] ); + } + }; + return preg_replace_callback( '/%[0-9A-Fa-f]{2}/', $callback, $component ); } /**