From d4d6819a75f45de4c0234bf84563e5f98ca13700 Mon Sep 17 00:00:00 2001 From: mrbluesky Date: Sun, 21 Oct 2012 15:01:56 +0200 Subject: [PATCH] (bug 40588) LinkSearch cannot search with a port in the url Special:LinkSearch doesn't handle urls with a port correctly. The detection of the protocol to search for (in LinkSearchPage::execute()) cant handle the extra colon a port introduces in the url. Fixed by making LinkSearchPage::execute use wfParseUrl() to detect the protocol. LinkFilter::makeLikeArray didn't handle the port correctly too, putting it in the domain part of the url. Fixed by rewriting makeLikeArray to make use of wfParseUrl(). This also fixes the problem of makeLikeArray not removing username and pass from the search pattern. Allow queries like 'ftp://*' or 'mailto:*' to find all links with specific protocols Bug: 40588 Change-Id: Id3dd31993456bf6cbba4cf17962cf0083b612bed --- includes/LinkFilter.php | 101 +++++--- includes/specials/SpecialLinkSearch.php | 54 +++-- tests/phpunit/includes/LinkFilterTest.php | 274 ++++++++++++++++++++++ 3 files changed, 364 insertions(+), 65 deletions(-) create mode 100644 tests/phpunit/includes/LinkFilterTest.php diff --git a/includes/LinkFilter.php b/includes/LinkFilter.php index d552c69615..97834fe9d0 100644 --- a/includes/LinkFilter.php +++ b/includes/LinkFilter.php @@ -84,16 +84,28 @@ class LinkFilter { * * Asterisks in any other location are considered invalid. * - * @param string $filterEntry domainparts - * @param $prot String: protocol + * This function does the same as wfMakeUrlIndexes(), except it also takes care + * of adding wildcards + * + * @param String $filterEntry domainparts + * @param String $protocol protocol (default http://) * @return Array to be passed to DatabaseBase::buildLike() or false on error */ - public static function makeLikeArray( $filterEntry, $prot = 'http://' ) { + public static function makeLikeArray( $filterEntry, $protocol = 'http://' ) { $db = wfGetDB( DB_MASTER ); - if ( substr( $filterEntry, 0, 2 ) == '*.' ) { + + $target = $protocol . $filterEntry; + $bits = wfParseUrl( $target ); + + if ( $bits == false ) { + // Unknown protocol? + return false; + } + + if ( substr( $bits['host'], 0, 2 ) == '*.' ) { $subdomains = true; - $filterEntry = substr( $filterEntry, 2 ); - if ( $filterEntry == '' ) { + $bits['host'] = substr( $bits['host'], 2 ); + if ( $bits['host'] == '' ) { // We don't want to make a clause that will match everything, // that could be dangerous return false; @@ -101,52 +113,63 @@ class LinkFilter { } else { $subdomains = false; } - // No stray asterisks, that could cause confusion - // It's not simple or efficient to handle it properly so we don't - // handle it at all. - if ( strpos( $filterEntry, '*' ) !== false ) { - return false; - } - $slash = strpos( $filterEntry, '/' ); - if ( $slash !== false ) { - $path = substr( $filterEntry, $slash ); - $host = substr( $filterEntry, 0, $slash ); - } else { - $path = '/'; - $host = $filterEntry; - } + // Reverse the labels in the hostname, convert to lower case // For emails reverse domainpart only - if ( $prot == 'mailto:' && strpos( $host, '@' ) ) { + if ( $bits['scheme'] === 'mailto' && strpos( $bits['host'], '@' ) ) { // complete email address - $mailparts = explode( '@', $host ); + $mailparts = explode( '@', $bits['host'] ); $domainpart = strtolower( implode( '.', array_reverse( explode( '.', $mailparts[1] ) ) ) ); - $host = $domainpart . '@' . $mailparts[0]; - $like = array( "$prot$host", $db->anyString() ); - } elseif ( $prot == 'mailto:' ) { - // domainpart of email address only. do not add '.' - $host = strtolower( implode( '.', array_reverse( explode( '.', $host ) ) ) ); - $like = array( "$prot$host", $db->anyString() ); + $bits['host'] = $domainpart . '@' . $mailparts[0]; + } elseif ( $bits['scheme'] === 'mailto' ) { + // domainpart of email address only, do not add '.' + $bits['host'] = strtolower( implode( '.', array_reverse( explode( '.', $bits['host']) ) ) ); } else { - $host = strtolower( implode( '.', array_reverse( explode( '.', $host ) ) ) ); - if ( substr( $host, -1, 1 ) !== '.' ) { - $host .= '.'; + $bits['host'] = strtolower( implode( '.', array_reverse( explode( '.', $bits['host'] ) ) ) ); + if ( substr( $bits['host'], -1, 1 ) !== '.' ) { + $bits['host'] .= '.'; } - $like = array( "$prot$host" ); + } - if ( $subdomains ) { - $like[] = $db->anyString(); - } - if ( !$subdomains || $path !== '/' ) { - $like[] = $path; - $like[] = $db->anyString(); + $like[] = $bits['scheme'] . $bits['delimiter'] . $bits['host']; + + if ( $subdomains ) { + $like[] = $db->anyString(); + } + + if ( isset( $bits['port'] ) ) { + $like[] = ':' . $bits['port']; + } + if ( isset( $bits['path'] ) ) { + $like[] = $bits['path']; + } elseif ( !$subdomains ) { + $like[] = '/'; + } + if ( isset( $bits['query'] ) ) { + $like[] = '?' . $bits['query']; + } + if ( isset( $bits['fragment'] ) ) { + $like[] = '#' . $bits['fragment']; + } + + // Check for stray asterisks: asterisk only allowed at the start of the domain + foreach ( $like as $likepart ) { + if ( !( $likepart instanceof LikeMatch ) && strpos( $likepart, '*' ) !== false ) { + return false; } } + + if ( !( $like[count( $like ) - 1] instanceof LikeMatch ) ) { + // Add wildcard at the end if there isn't one already + $like[] = $db->anyString(); + } + return $like; } /** - * Filters an array returned by makeLikeArray(), removing everything past first pattern placeholder. + * Filters an array returned by makeLikeArray(), removing everything past first + * pattern placeholder. * * @param array $arr array to filter * @return array filtered array diff --git a/includes/specials/SpecialLinkSearch.php b/includes/specials/SpecialLinkSearch.php index ed6e2a4bbb..5c121bab99 100644 --- a/includes/specials/SpecialLinkSearch.php +++ b/includes/specials/SpecialLinkSearch.php @@ -62,28 +62,19 @@ class LinkSearchPage extends QueryPage { } $target2 = $target; - $protocol = ''; - $pr_sl = strpos( $target2, '//' ); - $pr_cl = strpos( $target2, ':' ); - if ( $pr_sl ) { - // For protocols with '//' - $protocol = substr( $target2, 0, $pr_sl + 2 ); - $target2 = substr( $target2, $pr_sl + 2 ); - } elseif ( !$pr_sl && $pr_cl ) { - // For protocols without '//' like 'mailto:' - $protocol = substr( $target2, 0, $pr_cl + 1 ); - $target2 = substr( $target2, $pr_cl + 1 ); - } elseif ( $target2 != '' ) { - // default - $protocol = 'http://'; - } - if ( $protocol != '' && !in_array( $protocol, $protocols_list ) ) { - // Unsupported protocol, show original search request - $target2 = $target; - // Since links with unsupported protocols don't end up in - // externallinks, assume $protocol is actually part of a link - // containing ':' or '//' and default to http as above. - $protocol = 'http://'; + // Get protocol, default is http:// + $protocol = 'http://'; + $bits = wfParseUrl( $target ); + if ( isset( $bits['scheme'] ) && isset( $bits['delimiter'] ) ) { + $protocol = $bits['scheme'] . $bits['delimiter']; + // Make sure wfParseUrl() didn't make some well-intended correction in the + // protocol + if ( strcasecmp( $protocol, substr( $target, 0, strlen( $protocol ) ) ) === 0 ) { + $target2 = substr( $target, strlen( $protocol ) ); + } else { + // If it did, let LinkFilter::makeLikeArray() handle this + $protocol = ''; + } } $out->addWikiMsg( @@ -148,18 +139,26 @@ class LinkSearchPage extends QueryPage { /** * Return an appropriately formatted LIKE query and the clause * - * @param string $query - * @param string $prot + * @param String $query Search pattern to search for + * @param String $prot Protocol, e.g. 'http://' + * * @return array */ static function mungeQuery( $query, $prot ) { $field = 'el_index'; - $rv = LinkFilter::makeLikeArray( $query, $prot ); + $dbr = wfGetDB( DB_SLAVE ); + + if ( $query === '*' && $prot !== '' ) { + // Allow queries like 'ftp://*' to find all ftp links + $rv = array( $prot, $dbr->anyString() ); + } else { + $rv = LinkFilter::makeLikeArray( $query, $prot ); + } + if ( $rv === false ) { // LinkFilter doesn't handle wildcard in IP, so we'll have to munge here. $pattern = '/^(:?[0-9]{1,3}\.)+\*\s*$|^(:?[0-9]{1,3}\.){3}[0-9]{1,3}:[0-9]*\*\s*$/'; if ( preg_match( $pattern, $query ) ) { - $dbr = wfGetDB( DB_SLAVE ); $rv = array( $prot . rtrim( $query, " \t*" ), $dbr->anyString() ); $field = 'el_to'; } @@ -230,6 +229,9 @@ class LinkSearchPage extends QueryPage { /** * Override to check query validity. + * + * @param mixed $offset Numerical offset or false for no offset + * @param mixed $limit Numerical limit or false for no limit */ function doQuery( $offset = false, $limit = false ) { list( $this->mMungedQuery, ) = LinkSearchPage::mungeQuery( $this->mQuery, $this->mProt ); diff --git a/tests/phpunit/includes/LinkFilterTest.php b/tests/phpunit/includes/LinkFilterTest.php new file mode 100644 index 0000000000..1ca8e9f065 --- /dev/null +++ b/tests/phpunit/includes/LinkFilterTest.php @@ -0,0 +1,274 @@ +setMwGlobals( 'wgUrlProtocols', array( + 'http://', + 'https://', + 'ftp://', + 'irc://', + 'ircs://', + 'gopher://', + 'telnet://', + 'nntp://', + 'worldwind://', + 'mailto:', + 'news:', + 'svn://', + 'git://', + 'mms://', + '//', + ) ); + + } + + /** + * createRegexFromLike($like) + * + * Takes an array as created by LinkFilter::makeLikeArray() and creates a regex from it + * + * @param Array $like Array as created by LinkFilter::makeLikeArray() + * @return string Regex + */ + function createRegexFromLIKE( $like ) { + + $regex = '!^'; + + foreach ( $like as $item ) { + + if ( $item instanceof LikeMatch ) { + if ( $item->toString() == '%' ) { + $regex .= '.*'; + } elseif ( $item->toString() == '_' ) { + $regex .= '.'; + } + } else { + $regex .= preg_quote( $item, '!' ); + } + + } + + $regex .= '$!'; + + return $regex; + + } + + /** + * provideValidPatterns() + * + * @return array + */ + public static function provideValidPatterns() { + + return array( + // Protocol, Search pattern, URL which matches the pattern + array( 'http://' , '*.test.com' , 'http://www.test.com' ), + array( 'http://' , 'test.com:8080/dir/file' , 'http://name:pass@test.com:8080/dir/file' ), + array( 'https://' , '*.com' , 'https://s.s.test..com:88/dir/file?a=1&b=2' ), + array( 'https://' , '*.com' , 'https://name:pass@secure.com/index.html' ), + array( 'http://' , 'name:pass@test.com' , 'http://test.com' ), + array( 'http://' , 'test.com' , 'http://name:pass@test.com' ), + array( 'http://' , '*.test.com' , 'http://a.b.c.test.com/dir/dir/file?a=6'), + array( null , 'http://*.test.com' , 'http://www.test.com' ), + array( 'mailto:' , 'name@mail.test123.com' , 'mailto:name@mail.test123.com' ), + array( '' , + 'http://name:pass@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg' , + 'http://name:pass@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg' + ), + array( '' , 'http://name:pass@*.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg' , + 'http://name:pass@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg' ), + array( '' , 'http://name:wrongpass@*.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]' , + 'http://name:pass@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg' ), + array( 'http://' , 'name:pass@*.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg' , + 'http://name:pass@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg' ), + array( '' , 'http://name:pass@www.test.com:12345' , + 'http://name:pass@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg' ), + array( 'ftp://', 'user:pass@ftp.test.com:1233/home/user/file;type=efw', + 'ftp://user:pass@ftp.test.com:1233/home/user/file;type=efw' ), + array( null , 'ftp://otheruser:otherpass@ftp.test.com:1233/home/user/file;type=', + 'ftp://user:pass@ftp.test.com:1233/home/user/file;type=efw' ), + array( null , 'ftp://@ftp.test.com:1233/home/user/file;type=', + 'ftp://user:pass@ftp.test.com:1233/home/user/file;type=efw' ), + array( null , 'ftp://ftp.test.com/', + 'ftp://user:pass@ftp.test.com/home/user/file;type=efw' ), + array( null , 'ftp://ftp.test.com/', + 'ftp://user:pass@ftp.test.com/home/user/file;type=efw' ), + array( null , 'ftp://*.test.com:222/', + 'ftp://user:pass@ftp.test.com:222/home' ), + array( 'irc://', '*.myserver:6667/', 'irc://test.myserver:6667/' ), + array( 'irc://', 'name:pass@*.myserver/', 'irc://test.myserver:6667/' ), + array( 'irc://', 'name:pass@*.myserver/', 'irc://other:@test.myserver:6667/' ), + array( '', 'irc://test/name,string,abc?msg=t', 'irc://test/name,string,abc?msg=test' ), + array( '', 'https://gerrit.wikimedia.org/r/#/q/status:open,n,z', + 'https://gerrit.wikimedia.org/r/#/q/status:open,n,z' ), + array( '', 'https://gerrit.wikimedia.org', + 'https://gerrit.wikimedia.org/r/#/q/status:open,n,z' ), + array( 'mailto:', '*.test.com', 'mailto:name@pop3.test.com' ), + array( 'mailto:', 'test.com', 'mailto:name@test.com' ), + array( 'news:', 'test.1234afc@news.test.com', 'news:test.1234afc@news.test.com' ), + array( 'news:', '*.test.com', 'news:test.1234afc@news.test.com' ), + array( '' , 'news:4df8kh$iagfewewf(at)newsbf02aaa.news.aol.com', + 'news:4df8kh$iagfewewf(at)newsbf02aaa.news.aol.com' ), + array( '' , 'news:*.aol.com', + 'news:4df8kh$iagfewewf(at)newsbf02aaa.news.aol.com' ), + array( '' , 'git://github.com/prwef/abc-def.git' , 'git://github.com/prwef/abc-def.git' ), + array( 'git://' , 'github.com/' , 'git://github.com/prwef/abc-def.git' ), + array( 'git://' , '*.github.com/' , 'git://a.b.c.d.e.f.github.com/prwef/abc-def.git' ), + array( '' , 'gopher://*.test.com/' , 'gopher://gopher.test.com/0/v2/vstat'), + array( 'telnet://' , '*.test.com' , 'telnet://shell.test.com/~home/'), + + // + // The following only work in PHP >= 5.3.7, due to a bug in parse_url which eats + // the path from the url (https://bugs.php.net/bug.php?id=54180) + // + // array( '', 'http://test.com', 'http://test.com/index?arg=1' ), + // array( 'http://', '*.test.com', 'http://www.test.com/index?arg=1' ), + // array( '' , + // 'http://xx23124:__ffdfdef__@www.test.com:12345/dir' , + // 'http://name:pass@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg' + // ), + // + + // + // Tests for false positives + // + array( 'http://' , 'test.com' , 'http://www.test.com', false ), + array( 'http://' , 'www1.test.com' , 'http://www.test.com', false ), + array( 'http://' , '*.test.com' , 'http://www.test.t.com', false ), + array( '' , 'http://test.com:8080' , 'http://www.test.com:8080', false ), + array( '' , 'https://test.com' , 'http://test.com', false ), + array( '' , 'http://test.com' , 'https://test.com', false ), + array( 'http://', 'http://test.com', 'http://test.com', false ), + array( null, 'http://www.test.com', 'http://www.test.com:80', false ), + array( null, 'http://www.test.com:80', 'http://www.test.com', false ), + array( null, 'http://*.test.com:80', 'http://www.test.com', false ), + array( '', 'https://gerrit.wikimedia.org/r/#/XXX/status:open,n,z', + 'https://gerrit.wikimedia.org/r/#/q/status:open,n,z', false ), + array( '', 'https://*.wikimedia.org/r/#/q/status:open,n,z', + 'https://gerrit.wikimedia.org/r/#/XXX/status:open,n,z', false ), + array( 'mailto:', '@test.com', '@abc.test.com', false ), + array( 'mailto:', 'mail@test.com', 'mail2@test.com', false ), + array( '', 'mailto:mail@test.com', 'mail2@test.com', false ), + array( '', 'mailto:@test.com', '@abc.test.com', false ), + array( 'ftp://', '*.co', 'ftp://www.co.uk', false ), + array( 'ftp://', '*.co', 'ftp://www.co.m', false ), + array( 'ftp://', '*.co/dir/', 'ftp://www.co/dir2/', false ), + array( 'ftp://', 'www.co/dir/', 'ftp://www.co/dir2/', false ), + array( 'ftp://', 'test.com/dir/', 'ftp://test.com/', false ), + array( '', 'http://test.com:8080/dir/', 'http://test.com:808/dir/', false ), + array( '', 'http://test.com/dir/index.html', 'http://test.com/dir/index.php', false ), + + // + // These are false positives too and ideally shouldn't match, but that + // would require using regexes and RLIKE instead of LIKE + // + // array( null, 'http://*.test.com', 'http://www.test.com:80', false ), + // array( '', 'https://*.wikimedia.org/r/#/q/status:open,n,z', + // 'https://gerrit.wikimedia.org/XXX/r/#/q/status:open,n,z', false ), + ); + + } + + /** + * testMakeLikeArrayWithValidPatterns() + * + * Tests whether the LIKE clause produced by LinkFilter::makeLikeArray($pattern, $protocol) + * will find one of the URL indexes produced by wfMakeUrlIndexes($url) + * + * @dataProvider provideValidPatterns + * + * @param String $protocol Protocol, e.g. 'http://' or 'mailto:' + * @param String $pattern Search pattern to feed to LinkFilter::makeLikeArray + * @param String $url URL to feed to wfMakeUrlIndexes + * @param bool $shouldBeFound Should the URL be found? (defaults true) + */ + function testMakeLikeArrayWithValidPatterns( $protocol, $pattern, $url, $shouldBeFound = true ) { + + $indexes = wfMakeUrlIndexes( $url ); + $likeArray = LinkFilter::makeLikeArray( $pattern, $protocol ); + + $this->assertTrue( $likeArray !== false, + "LinkFilter::makeLikeArray('$pattern', '$protocol') returned false on a valid pattern" + ); + + $regex = $this->createRegexFromLIKE( $likeArray ); + $debugmsg = "Regex: '" . $regex . "'\n"; + $debugmsg .= count( $indexes ) . " index(es) created by wfMakeUrlIndexes():\n"; + + $matches = 0; + + foreach( $indexes as $index ) { + $matches += preg_match( $regex, $index ); + $debugmsg .= "\t'$index'\n"; + } + + if ( $shouldBeFound ) { + $this->assertTrue( + $matches > 0, + "Search pattern '$protocol$pattern' does not find url '$url' \n$debugmsg" + ); + } else { + $this->assertFalse( + $matches > 0, + "Search pattern '$protocol$pattern' should not find url '$url' \n$debugmsg" + ); + } + + } + + /** + * provideInvalidPatterns() + * + * @return array + */ + public static function provideInvalidPatterns() { + + return array( + array( '' ), + array( '*' ), + array( 'http://*' ), + array( 'http://*/' ), + array( 'http://*/dir/file' ), + array( 'test.*.com' ), + array( 'http://test.*.com' ), + array( 'test.*.com' ), + array( 'http://*.test.*' ), + array( 'http://*test.com' ), + array( 'https://*' ), + array( '*://test.com'), + array( 'mailto:name:pass@t*est.com' ), + array( 'http://*:888/'), + array( '*http://'), + array( 'test.com/*/index' ), + array( 'test.com/dir/index?arg=*' ), + ); + + } + + /** + * testMakeLikeArrayWithInvalidPatterns() + * + * Tests whether LinkFilter::makeLikeArray($pattern) will reject invalid search patterns + * + * @dataProvider provideInvalidPatterns + * + * @param $pattern string: Invalid search pattern + */ + function testMakeLikeArrayWithInvalidPatterns( $pattern ) { + + $this->assertFalse( + LinkFilter::makeLikeArray( $pattern ), + "'$pattern' is not a valid pattern and should be rejected" + ); + + } + +} -- 2.20.1