* Pre-strip characters ignored in IDNs from URLs so they can't be used to break the blacklists for regular URLs

author Brion Vibber <brion@users.mediawiki.org>

Tue, 11 Jul 2006 19:54:20 +0000 (19:54 +0000)

committer Brion Vibber <brion@users.mediawiki.org>

Tue, 11 Jul 2006 19:54:20 +0000 (19:54 +0000)
author Brion Vibber <brion@users.mediawiki.org>
Tue, 11 Jul 2006 19:54:20 +0000 (19:54 +0000)
committer Brion Vibber <brion@users.mediawiki.org>
Tue, 11 Jul 2006 19:54:20 +0000 (19:54 +0000)
diff --git a/RELEASE-NOTES b/RELEASE-NOTES

index 473b55d..c65edd2 100644 (file)
--- a/RELEASE-NOTES
+++ b/RELEASE-NOTES
@@ -52,6 +52,9 @@ it from source control: http://www.mediawiki.org/wiki/Download_from_SVN
  * Removed forced dereferencements (new() returns a reference in PHP5)
  * Note about $wgUploadSizeWarning using byte
  * (bug 6592) Add most viewed pages summary to Special:Statistics
+* Pre-strip characters ignored in IDNs from URLs so they can't be used
+  to break the blacklists for regular URLs
+
  
  == Languages updated ==
  
diff --git a/includes/Parser.php b/includes/Parser.php

index 194e31c..2144288 100644 (file)
--- a/includes/Parser.php
+++ b/includes/Parser.php
@@ -1197,13 +1197,8 @@ class Parser
                         }
  
                         $text = $wgContLang->markNoConversion($text);
-
-                       # Normalize any HTML entities in input. They will be
-                       # re-escaped by makeExternalLink().
-                       $url = Sanitizer::decodeCharReferences( $url );
-
-                       # Escape any control characters introduced by the above step
-                       $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
+                       
+                       $url = Sanitizer::cleanUrl( $url );
  
                         # Process the trail (i.e. everything after this link up until start of the next link),
                         # replacing any non-bracketed links
@@ -1284,12 +1279,7 @@ class Parser
                                         $url = substr( $url, 0, -$numSepChars );
                                 }
  
-                               # Normalize any HTML entities in input. They will be
-                               # re-escaped by makeExternalLink() or maybeMakeExternalImage()
-                               $url = Sanitizer::decodeCharReferences( $url );
-
-                               # Escape any control characters introduced by the above step
-                               $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
+                               $url = Sanitizer::cleanUrl( $url );
  
                                 # Is this an external image?
                                 $text = $this->maybeMakeExternalImage( $url );
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php

index 6fd262e..a4ab662 100644 (file)
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -1181,6 +1181,47 @@ class Sanitizer {
                 $out .= "]>\n";
                 return $out;
         }
+       
+       static function cleanUrl( $url, $hostname=true ) {
+               # Normalize any HTML entities in input. They will be
+               # re-escaped by makeExternalLink().
+               $url = Sanitizer::decodeCharReferences( $url );
+
+               # Escape any control characters introduced by the above step
+               $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
+               
+               # Validate hostname portion
+               if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
+                       list( $whole, $protocol, $host, $rest ) = $matches;
+                       
+                       // Characters that will be ignored in IDNs.
+                       // http://tools.ietf.org/html/rfc3454#section-3.1
+                       // Strip them before further processing so blacklists and such work.
+                       $strip = "/
+                               \\s|          # general whitespace
+                               \xc2\xad|     # 00ad SOFT HYPHEN
+                               \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
+                               \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
+                               \xe2\x81\xa0| # 2060 WORD JOINER
+                               \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
+                               \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
+                               \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
+                               \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
+                               \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
+                               \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
+                               \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
+                               [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
+                               /xuD";
+                       
+                       $host = preg_replace( $strip, '', $host );
+                       
+                       // @fixme: validate hostnames here
+                       
+                       return $protocol . $host . $rest;
+               } else {
+                       return $url;
+               }
+       }
  
  }
  
diff --git a/maintenance/parserTests.txt b/maintenance/parserTests.txt

index bce9866..1fb856c 100644 (file)
--- a/maintenance/parserTests.txt
+++ b/maintenance/parserTests.txt
@@ -714,11 +714,20 @@ External links: [encoded equals] (bug 6102)
  !! end
  
  !! test
-External links: [illegal character reference in hostname]
+External links: [IDN ignored character reference in hostname; strip it right off]
  !! input
  [http://e&zwnj;xample.com/]
  !! result
-<p>[http://e&zwnj;xample.com/]
+<p><a href="http://example.com/" class="external autonumber" title="http://example.com/" rel="nofollow">[1]</a>
+</p>
+!! end
+
+!! test
+External links: IDN ignored character reference in hostname; strip it right off
+!! input
+http://e&zwnj;xample.com/
+!! result
+<p><a href="http://example.com/" class="external free" title="http://example.com/" rel="nofollow">http://example.com/</a>
  </p>
  !! end
author	Brion Vibber <brion@users.mediawiki.org>
	Tue, 11 Jul 2006 19:54:20 +0000 (19:54 +0000)
committer	Brion Vibber <brion@users.mediawiki.org>
	Tue, 11 Jul 2006 19:54:20 +0000 (19:54 +0000)
RELEASE-NOTES		patch | blob | history
includes/Parser.php		patch | blob | history
includes/Sanitizer.php		patch | blob | history
maintenance/parserTests.txt		patch | blob | history