From: Brion Vibber Date: Tue, 11 Jul 2006 19:54:20 +0000 (+0000) Subject: * Pre-strip characters ignored in IDNs from URLs so they can't be used to break the... X-Git-Tag: 1.31.0-rc.0~56288 X-Git-Url: http://git.cyclocoop.org/fichier?a=commitdiff_plain;h=843f9a8eb78a1614b33a4e3459ec186320749de6;p=lhc%2Fweb%2Fwiklou.git * Pre-strip characters ignored in IDNs from URLs so they can't be used to break the blacklists for regular URLs Moved shared behavior of URL fixing from two Parser functions into Sanitizer::cleanUrl This would be a good place to add some validation... --- diff --git a/RELEASE-NOTES b/RELEASE-NOTES index 473b55d0c6..c65edd2368 100644 --- a/RELEASE-NOTES +++ b/RELEASE-NOTES @@ -52,6 +52,9 @@ it from source control: http://www.mediawiki.org/wiki/Download_from_SVN * Removed forced dereferencements (new() returns a reference in PHP5) * Note about $wgUploadSizeWarning using byte * (bug 6592) Add most viewed pages summary to Special:Statistics +* Pre-strip characters ignored in IDNs from URLs so they can't be used + to break the blacklists for regular URLs + == Languages updated == diff --git a/includes/Parser.php b/includes/Parser.php index 194e31c4ac..2144288af9 100644 --- a/includes/Parser.php +++ b/includes/Parser.php @@ -1197,13 +1197,8 @@ class Parser } $text = $wgContLang->markNoConversion($text); - - # Normalize any HTML entities in input. They will be - # re-escaped by makeExternalLink(). - $url = Sanitizer::decodeCharReferences( $url ); - - # Escape any control characters introduced by the above step - $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url ); + + $url = Sanitizer::cleanUrl( $url ); # Process the trail (i.e. everything after this link up until start of the next link), # replacing any non-bracketed links @@ -1284,12 +1279,7 @@ class Parser $url = substr( $url, 0, -$numSepChars ); } - # Normalize any HTML entities in input. 
They will be - # re-escaped by makeExternalLink() or maybeMakeExternalImage() - $url = Sanitizer::decodeCharReferences( $url ); - - # Escape any control characters introduced by the above step - $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url ); + $url = Sanitizer::cleanUrl( $url ); # Is this an external image? $text = $this->maybeMakeExternalImage( $url ); diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index 6fd262e96c..a4ab66229a 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -1181,6 +1181,47 @@ class Sanitizer { $out .= "]>\n"; return $out; } + + static function cleanUrl( $url, $hostname=true ) { + # Normalize any HTML entities in input. They will be + # re-escaped by makeExternalLink(). + $url = Sanitizer::decodeCharReferences( $url ); + + # Escape any control characters introduced by the above step + $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url ); + + # Validate hostname portion + if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) { + list( $whole, $protocol, $host, $rest ) = $matches; + + // Characters that will be ignored in IDNs. + // http://tools.ietf.org/html/3454#section-3.1 + // Strip them before further processing so blacklists and such work. 
+ $strip = "/ + \\s| # general whitespace + \xc2\xad| # 00ad SOFT HYPHEN + \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN + \xe2\x80\x8b| # 200b ZERO WIDTH SPACE + \xe2\x81\xa0| # 2060 WORD JOINER + \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE + \xcd\x8f| # 034f COMBINING GRAPHEME JOINER + \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE + \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO + \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE + \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER + \xe2\x80\x8d| # 200d ZERO WIDTH JOINER + [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16 + /xuD"; + + $host = preg_replace( $strip, '', $host ); + + // @fixme: validate hostnames here + + return $protocol . $host . $rest; + } else { + return $url; + } + } } diff --git a/maintenance/parserTests.txt b/maintenance/parserTests.txt index bce9866d85..1fb856cdb5 100644 --- a/maintenance/parserTests.txt +++ b/maintenance/parserTests.txt @@ -714,11 +714,20 @@ External links: [encoded equals] (bug 6102) !! end !! test -External links: [illegal character reference in hostname] +External links: [IDN ignored character reference in hostname; strip it right off] !! input [http://e‌xample.com/] !! result -

[http://e‌xample.com/] +

[1] +

+!! end + +!! test +External links: IDN ignored character reference in hostname; strip it right off +!! input +http://e‌xample.com/ +!! result +

http://example.com/

!! end