Improve/rename Parser::replaceUnusualEscapes

author Brad Jorsch <bjorsch@wikimedia.org>

Sat, 21 Dec 2013 02:14:48 +0000 (21:14 -0500)

committer Tim Starling <tstarling@wikimedia.org>

Tue, 16 Sep 2014 23:00:16 +0000 (23:00 +0000)
author Brad Jorsch <bjorsch@wikimedia.org>
Sat, 21 Dec 2013 02:14:48 +0000 (21:14 -0500)
committer Tim Starling <tstarling@wikimedia.org>
Tue, 16 Sep 2014 23:00:16 +0000 (23:00 +0000)
diff --git a/RELEASE-NOTES-1.24 b/RELEASE-NOTES-1.24

index 20aaad6..0cd1131 100644 (file)
--- a/RELEASE-NOTES-1.24
+++ b/RELEASE-NOTES-1.24
@@ -216,6 +216,8 @@ production.
  * (bug 69789) Title::getContentModel() now loads from the database when
    necessary instead of incorrectly returning the default content model.
  * (bug 69249) wfBaseConvert() now works around PHP Bug #50175 when using GMP.
+* (bug 57909) URLs in the externallinks table will no longer have certain
+  characters decoded in the query string.
  
  === Action API changes in 1.24 ===
  * action=parse API now supports prop=modules, which provides the list of
diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php

index 8bf400a..84bb224 100644 (file)
--- a/includes/parser/Parser.php
+++ b/includes/parser/Parser.php
@@ -1402,7 +1402,7 @@ class Parser {
                                 $this->getExternalLinkAttribs( $url ) );
                         # Register it in the output object...
                         # Replace unnecessary URL escape codes with their equivalent characters
-                       $pasteurized = self::replaceUnusualEscapes( $url );
+                       $pasteurized = self::normalizeLinkUrl( $url );
                         $this->mOutput->addExternalLink( $pasteurized );
                 }
                 wfProfileOut( __METHOD__ );
@@ -1710,7 +1710,7 @@ class Parser {
                         # Register link in the output object.
                         # Replace unnecessary URL escape codes with the referenced character
                         # This prevents spammers from hiding links from the filters
-                       $pasteurized = self::replaceUnusualEscapes( $url );
+                       $pasteurized = self::normalizeLinkUrl( $url );
                         $this->mOutput->addExternalLink( $pasteurized );
                 }
  
@@ -1759,40 +1759,75 @@ class Parser {
         }
  
         /**
-        * Replace unusual URL escape codes with their equivalent characters
+        * Replace unusual escape codes in a URL with their equivalent characters
          *
+        * @deprecated since 1.24, use normalizeLinkUrl
          * @param string $url
          * @return string
-        *
-        * @todo This can merge genuinely required bits in the path or query string,
-        *       breaking legit URLs. A proper fix would treat the various parts of
-        *       the URL differently; as a workaround, just use the output for
-        *       statistical records, not for actual linking/output.
          */
         public static function replaceUnusualEscapes( $url ) {
-               return preg_replace_callback( '/%[0-9A-Fa-f]{2}/',
-                       array( __CLASS__, 'replaceUnusualEscapesCallback' ), $url );
+               wfDeprecated( __METHOD__, '1.24' );
+               return self::normalizeLinkUrl( $url );
         }
  
         /**
-        * Callback function used in replaceUnusualEscapes().
-        * Replaces unusual URL escape codes with their equivalent character
+        * Replace unusual escape codes in a URL with their equivalent characters
          *
-        * @param array $matches
+        * This generally follows the syntax defined in RFC 3986, with special
+        * consideration for HTTP query strings.
          *
+        * @param string $url
          * @return string
          */
-       private static function replaceUnusualEscapesCallback( $matches ) {
-               $char = urldecode( $matches[0] );
-               $ord = ord( $char );
-               # Is it an unsafe or HTTP reserved character according to RFC 1738?
-               if ( $ord > 32 && $ord < 127 && strpos( '<>"#{}|\^~[]`;/?', $char ) === false ) {
-                       # No, shouldn't be escaped
-                       return $char;
-               } else {
-                       # Yes, leave it escaped
-                       return $matches[0];
+       public static function normalizeLinkUrl( $url ) {
+               # First, make sure unsafe characters are encoded
+               $url = preg_replace_callback( '/[\x00-\x20"<>\[\\\\\]^`{|}\x7F-\xFF]/',
+                       function ( $m ) {
+                               return rawurlencode( $m[0] );
+                       },
+                       $url
+               );
+
+               $ret = '';
+               $end = strlen( $url );
+
+               # Fragment part - 'fragment'
+               $start = strpos( $url, '#' );
+               if ( $start !== false && $start < $end ) {
+                       $ret = self::normalizeUrlComponent(
+                               substr( $url, $start, $end - $start ), '"#%<>[\]^`{|}' ) . $ret;
+                       $end = $start;
+               }
+
+               # Query part - 'query' minus &=+;
+               $start = strpos( $url, '?' );
+               if ( $start !== false && $start < $end ) {
+                       $ret = self::normalizeUrlComponent(
+                               substr( $url, $start, $end - $start ), '"#%<>[\]^`{|}&=+;' ) . $ret;
+                       $end = $start;
                 }
+
+               # Scheme and path part - 'pchar'
+               # (we assume no userinfo or encoded colons in the host)
+               $ret = self::normalizeUrlComponent(
+                       substr( $url, 0, $end ), '"#%<>[\]^`{|}/?' ) . $ret;
+
+               return $ret;
+       }
+
+       private static function normalizeUrlComponent( $component, $unsafe ) {
+               $callback = function ( $matches ) use ( $unsafe ) {
+                       $char = urldecode( $matches[0] );
+                       $ord = ord( $char );
+                       if ( $ord > 32 && $ord < 127 && strpos( $unsafe, $char ) === false ) {
+                               # Unescape it
+                               return $char;
+                       } else {
+                               # Leave it escaped, but use uppercase for a-f
+                               return strtoupper( $matches[0] );
+                       }
+               };
+               return preg_replace_callback( '/%[0-9A-Fa-f]{2}/', $callback, $component );
         }
  
         /**
diff --git a/tests/phpunit/includes/parser/ParserMethodsTest.php b/tests/phpunit/includes/parser/ParserMethodsTest.php

index cbf4803..1790086 100644 (file)
--- a/tests/phpunit/includes/parser/ParserMethodsTest.php
+++ b/tests/phpunit/includes/parser/ParserMethodsTest.php
@@ -147,6 +147,41 @@ class ParserMethodsTest extends MediaWikiLangTestCase {
                         ),
                 ), $out->getSections(), 'getSections() with proper value when <h2> is used' );
         }
+
+       /**
+        * @dataProvider provideNormalizeLinkUrl
+        * @covers Parser::normalizeLinkUrl
+        * @covers Parser::normalizeUrlComponent
+        */
+       public function testNormalizeLinkUrl( $explanation, $url, $expected ) {
+               $this->assertEquals( $expected, Parser::normalizeLinkUrl( $url ), $explanation );
+       }
+
+       public static function provideNormalizeLinkUrl() {
+               return array(
+                       array(
+                               'Escaping of unsafe characters',
+                               'http://example.org/foo bar?param[]="value"&param[]=valüe',
+                               'http://example.org/foo%20bar?param%5B%5D=%22value%22&param%5B%5D=val%C3%BCe',
+                       ),
+                       array(
+                               'Case normalization of percent-encoded characters',
+                               'http://example.org/%ab%cD%Ef%FF',
+                               'http://example.org/%AB%CD%EF%FF',
+                       ),
+                       array(
+                               'Unescaping of safe characters',
+                               'http://example.org/%3C%66%6f%6F%3E?%3C%66%6f%6F%3E#%3C%66%6f%6F%3E',
+                               'http://example.org/%3Cfoo%3E?%3Cfoo%3E#%3Cfoo%3E',
+                       ),
+                       array(
+                               'Context-sensitive replacement of sometimes-safe characters',
+                               'http://example.org/%23%2F%3F%26%3D%2B%3B?%23%2F%3F%26%3D%2B%3B#%23%2F%3F%26%3D%2B%3B',
+                               'http://example.org/%23%2F%3F&=+;?%23/?%26%3D%2B%3B#%23/?&=+;',
+                       ),
+               );
+       }
+
         // @todo Add tests for cleanSig() / cleanSigInSig(), getSection(),
         // replaceSection(), getPreloadText()
  }
author	Brad Jorsch <bjorsch@wikimedia.org>
	Sat, 21 Dec 2013 02:14:48 +0000 (21:14 -0500)
committer	Tim Starling <tstarling@wikimedia.org>
	Tue, 16 Sep 2014 23:00:16 +0000 (23:00 +0000)
RELEASE-NOTES-1.24		patch \| blob \| history
includes/parser/Parser.php		patch \| blob \| history
tests/phpunit/includes/parser/ParserMethodsTest.php		patch \| blob \| history