* Pre-strip characters ignored in IDNs from URLs so they can't be used to break the blacklists for regular URLs
author Brion Vibber <brion@users.mediawiki.org>
Tue, 11 Jul 2006 19:54:20 +0000 (19:54 +0000)
committer Brion Vibber <brion@users.mediawiki.org>
Tue, 11 Jul 2006 19:54:20 +0000 (19:54 +0000)
Moved shared behavior of URL fixing from two Parser functions into Sanitizer::cleanUrl
This would be a good place to add some validation...

RELEASE-NOTES
includes/Parser.php
includes/Sanitizer.php
maintenance/parserTests.txt

index 473b55d..c65edd2 100644 (file)
@@ -52,6 +52,9 @@ it from source control: http://www.mediawiki.org/wiki/Download_from_SVN
 * Removed forced dereferencements (new() returns a reference in PHP5)
 * Note about $wgUploadSizeWarning using byte
 * (bug 6592) Add most viewed pages summary to Special:Statistics
+* Pre-strip characters ignored in IDNs from URLs so they can't be used
+  to break the blacklists for regular URLs
+
 
 == Languages updated ==
 
index 194e31c..2144288 100644 (file)
@@ -1197,13 +1197,8 @@ class Parser
                        }
 
                        $text = $wgContLang->markNoConversion($text);
-
-                       # Normalize any HTML entities in input. They will be
-                       # re-escaped by makeExternalLink().
-                       $url = Sanitizer::decodeCharReferences( $url );
-
-                       # Escape any control characters introduced by the above step
-                       $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
+                       
+                       $url = Sanitizer::cleanUrl( $url );
 
                        # Process the trail (i.e. everything after this link up until start of the next link),
                        # replacing any non-bracketed links
@@ -1284,12 +1279,7 @@ class Parser
                                        $url = substr( $url, 0, -$numSepChars );
                                }
 
-                               # Normalize any HTML entities in input. They will be
-                               # re-escaped by makeExternalLink() or maybeMakeExternalImage()
-                               $url = Sanitizer::decodeCharReferences( $url );
-
-                               # Escape any control characters introduced by the above step
-                               $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
+                               $url = Sanitizer::cleanUrl( $url );
 
                                # Is this an external image?
                                $text = $this->maybeMakeExternalImage( $url );
index 6fd262e..a4ab662 100644 (file)
@@ -1181,6 +1181,47 @@ class Sanitizer {
                $out .= "]>\n";
                return $out;
        }
+       
+       static function cleanUrl( $url, $hostname=true ) {
+               # Normalize any HTML entities in input. They will be
+               # re-escaped by makeExternalLink() or maybeMakeExternalImage().
+               $url = Sanitizer::decodeCharReferences( $url );
+
+               # Escape any control characters introduced by the above step
+               $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
+               
+               # Validate hostname portion
+               if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
+                       list( $whole, $protocol, $host, $rest ) = $matches;
+                       
+                       // Characters that will be ignored in IDNs.
+                       // http://tools.ietf.org/html/rfc3454#section-3.1
+                       // Strip them before further processing so blacklists and such work.
+                       $strip = "/
+                               \\s|          # general whitespace
+                               \xc2\xad|     # 00ad SOFT HYPHEN
+                               \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
+                               \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
+                               \xe2\x81\xa0| # 2060 WORD JOINER
+                               \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
+                               \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
+                               \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
+                               \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
+                               \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
+                               \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
+                               \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
+                               [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1 through -16
+                               /xuD";
+                       
+                       $host = preg_replace( $strip, '', $host );
+                       
+                       // @fixme: validate hostnames here
+                       
+                       return $protocol . $host . $rest;
+               } else {
+                       return $url;
+               }
+       }
 
 }
 
index bce9866..1fb856c 100644 (file)
@@ -714,11 +714,20 @@ External links: [encoded equals] (bug 6102)
 !! end
 
 !! test
-External links: [illegal character reference in hostname]
+External links: [IDN ignored character reference in hostname; strip it right off]
 !! input
 [http://e&zwnj;xample.com/]
 !! result
-<p>[http://e&zwnj;xample.com/]
+<p><a href="http://example.com/" class="external autonumber" title="http://example.com/" rel="nofollow">[1]</a>
+</p>
+!! end
+
+!! test
+External links: IDN ignored character reference in hostname; strip it right off
+!! input
+http://e&zwnj;xample.com/
+!! result
+<p><a href="http://example.com/" class="external free" title="http://example.com/" rel="nofollow">http://example.com/</a>
 </p>
 !! end