$this->getExternalLinkAttribs( $url ) );
# Register it in the output object...
# Replace unnecessary URL escape codes with their equivalent characters
- $pasteurized = self::replaceUnusualEscapes( $url );
+ $pasteurized = self::normalizeLinkUrl( $url );
$this->mOutput->addExternalLink( $pasteurized );
}
wfProfileOut( __METHOD__ );
# Register link in the output object.
# Replace unnecessary URL escape codes with the referenced character
# This prevents spammers from hiding links from the filters
- $pasteurized = self::replaceUnusualEscapes( $url );
+ $pasteurized = self::normalizeLinkUrl( $url );
$this->mOutput->addExternalLink( $pasteurized );
}
}
/**
- * Replace unusual URL escape codes with their equivalent characters
+ * Replace unusual escape codes in a URL with their equivalent characters
*
+ * @deprecated since 1.24, use normalizeLinkUrl
* @param string $url
* @return string
- *
- * @todo This can merge genuinely required bits in the path or query string,
- * breaking legit URLs. A proper fix would treat the various parts of
- * the URL differently; as a workaround, just use the output for
- * statistical records, not for actual linking/output.
*/
public static function replaceUnusualEscapes( $url ) {
- return preg_replace_callback( '/%[0-9A-Fa-f]{2}/',
- array( __CLASS__, 'replaceUnusualEscapesCallback' ), $url );
+ wfDeprecated( __METHOD__, '1.24' );
+ return self::normalizeLinkUrl( $url );
}
/**
- * Callback function used in replaceUnusualEscapes().
- * Replaces unusual URL escape codes with their equivalent character
+ * Replace unusual escape codes in a URL with their equivalent characters
*
- * @param array $matches
+ * This generally follows the syntax defined in RFC 3986, with special
+ * consideration for HTTP query strings.
*
+ * @param string $url
* @return string
*/
- private static function replaceUnusualEscapesCallback( $matches ) {
- $char = urldecode( $matches[0] );
- $ord = ord( $char );
- # Is it an unsafe or HTTP reserved character according to RFC 1738?
- if ( $ord > 32 && $ord < 127 && strpos( '<>"#{}|\^~[]`;/?', $char ) === false ) {
- # No, shouldn't be escaped
- return $char;
- } else {
- # Yes, leave it escaped
- return $matches[0];
+ public static function normalizeLinkUrl( $url ) {
+ # First, make sure unsafe characters are encoded
+ $url = preg_replace_callback( '/[\x00-\x20"<>\[\\\\\]^`{|}\x7F-\xFF]/',
+ function ( $m ) {
+ return rawurlencode( $m[0] );
+ },
+ $url
+ );
+
+ $ret = '';
+ $end = strlen( $url );
+
+ # Fragment part - 'fragment'
+ $start = strpos( $url, '#' );
+ if ( $start !== false && $start < $end ) {
+ $ret = self::normalizeUrlComponent(
+ substr( $url, $start, $end - $start ), '"#%<>[\]^`{|}' ) . $ret;
+ $end = $start;
+ }
+
+ # Query part - 'query' minus &=+;
+ $start = strpos( $url, '?' );
+ if ( $start !== false && $start < $end ) {
+ $ret = self::normalizeUrlComponent(
+ substr( $url, $start, $end - $start ), '"#%<>[\]^`{|}&=+;' ) . $ret;
+ $end = $start;
}
+
+ # Scheme and path part - 'pchar'
+ # (we assume no userinfo or encoded colons in the host)
+ $ret = self::normalizeUrlComponent(
+ substr( $url, 0, $end ), '"#%<>[\]^`{|}/?' ) . $ret;
+
+ return $ret;
+ }
+
+ private static function normalizeUrlComponent( $component, $unsafe ) {
+ $callback = function ( $matches ) use ( $unsafe ) {
+ $char = urldecode( $matches[0] );
+ $ord = ord( $char );
+ if ( $ord > 32 && $ord < 127 && strpos( $unsafe, $char ) === false ) {
+ # Unescape it
+ return $char;
+ } else {
+ # Leave it escaped, but use uppercase for a-f
+ return strtoupper( $matches[0] );
+ }
+ };
+ return preg_replace_callback( '/%[0-9A-Fa-f]{2}/', $callback, $component );
}
/**
),
), $out->getSections(), 'getSections() with proper value when <h2> is used' );
}
+
+ /**
+ * @dataProvider provideNormalizeLinkUrl
+ * @covers Parser::normalizeLinkUrl
+ * @covers Parser::normalizeUrlComponent
+ */
+ public function testNormalizeLinkUrl( $explanation, $url, $expected ) {
+ $this->assertEquals( $expected, Parser::normalizeLinkUrl( $url ), $explanation );
+ }
+
+ public static function provideNormalizeLinkUrl() {
+ return array(
+ array(
+ 'Escaping of unsafe characters',
+ 'http://example.org/foo bar?param[]="value"¶m[]=valüe',
+ 'http://example.org/foo%20bar?param%5B%5D=%22value%22¶m%5B%5D=val%C3%BCe',
+ ),
+ array(
+ 'Case normalization of percent-encoded characters',
+ 'http://example.org/%ab%cD%Ef%FF',
+ 'http://example.org/%AB%CD%EF%FF',
+ ),
+ array(
+ 'Unescaping of safe characters',
+ 'http://example.org/%3C%66%6f%6F%3E?%3C%66%6f%6F%3E#%3C%66%6f%6F%3E',
+ 'http://example.org/%3Cfoo%3E?%3Cfoo%3E#%3Cfoo%3E',
+ ),
+ array(
+ 'Context-sensitive replacement of sometimes-safe characters',
+ 'http://example.org/%23%2F%3F%26%3D%2B%3B?%23%2F%3F%26%3D%2B%3B#%23%2F%3F%26%3D%2B%3B',
+ 'http://example.org/%23%2F%3F&=+;?%23/?%26%3D%2B%3B#%23/?&=+;',
+ ),
+ );
+ }
+
// @todo Add tests for cleanSig() / cleanSigInSig(), getSection(),
// replaceSection(), getPreloadText()
}