Moved shared behavior of URL fixing from two Parser functions into Sanitizer::cleanUrl
This would be a good place to add some validation...
* Removed forced dereferencing (new() returns a reference in PHP5)
* Note about $wgUploadSizeWarning using byte
* (bug 6592) Add most viewed pages summary to Special:Statistics
+* Pre-strip characters ignored in IDNs from URLs so they can't be used
+ to break the blacklists for regular URLs
+
== Languages updated ==
}
$text = $wgContLang->markNoConversion($text);
-
- # Normalize any HTML entities in input. They will be
- # re-escaped by makeExternalLink().
- $url = Sanitizer::decodeCharReferences( $url );
-
- # Escape any control characters introduced by the above step
- $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
+
+ $url = Sanitizer::cleanUrl( $url );
# Process the trail (i.e. everything after this link up until start of the next link),
# replacing any non-bracketed links
$url = substr( $url, 0, -$numSepChars );
}
- # Normalize any HTML entities in input. They will be
- # re-escaped by makeExternalLink() or maybeMakeExternalImage()
- $url = Sanitizer::decodeCharReferences( $url );
-
- # Escape any control characters introduced by the above step
- $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
+ $url = Sanitizer::cleanUrl( $url );
# Is this an external image?
$text = $this->maybeMakeExternalImage( $url );
$out .= "]>\n";
return $out;
}
+
+ static function cleanUrl( $url, $hostname=true ) {
+ # Normalize any HTML entities in input. They will be
+ # re-escaped by makeExternalLink().
+ $url = Sanitizer::decodeCharReferences( $url );
+
+ # Escape any control characters introduced by the above step
+ $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
+
+ # Validate hostname portion
+ if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
+ list( $whole, $protocol, $host, $rest ) = $matches;
+
+ // Characters that will be ignored in IDNs.
+			// http://tools.ietf.org/html/rfc3454#section-3.1
+ // Strip them before further processing so blacklists and such work.
+ $strip = "/
+ \\s| # general whitespace
+ \xc2\xad| # 00ad SOFT HYPHEN
+ \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
+ \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
+ \xe2\x81\xa0| # 2060 WORD JOINER
+ \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
+ \xcd\x8f| # 034f COMBINING GRAPHEME JOINER
+ \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
+ \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
+ \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
+ \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
+ \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
+				[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1 through VARIATION SELECTOR-16
+ /xuD";
+
+ $host = preg_replace( $strip, '', $host );
+
+ // @fixme: validate hostnames here
+
+ return $protocol . $host . $rest;
+ } else {
+ return $url;
+ }
+ }
}
!! end
!! test
-External links: [illegal character reference in hostname]
+External links: [IDN ignored character reference in hostname; strip it right off]
!! input
[http://e‌xample.com/]
!! result
-<p>[http://e‌xample.com/]
+<p><a href="http://example.com/" class="external autonumber" title="http://example.com/" rel="nofollow">[1]</a>
+</p>
+!! end
+
+!! test
+External links: IDN ignored character reference in hostname; strip it right off (free link)
+!! input
+http://e‌xample.com/
+!! result
+<p><a href="http://example.com/" class="external free" title="http://example.com/" rel="nofollow">http://example.com/</a>
</p>
!! end