(bug 40588) LinkSearch cannot search with a port in the url
authormrbluesky <mrbluesky@wikipedia.be>
Sun, 21 Oct 2012 13:01:56 +0000 (15:01 +0200)
committerBrian Wolff <bawolff+wn@gmail.com>
Sun, 1 Dec 2013 19:10:14 +0000 (15:10 -0400)
Special:LinkSearch doesn't handle urls with a port correctly.

The detection of the protocol to search for (in LinkSearchPage::execute()) cant handle
the extra colon a port introduces in the url. Fixed by making LinkSearchPage::execute
use wfParseUrl() to detect the protocol.

LinkFilter::makeLikeArray didn't handle the port correctly too, putting it in the
domain part of the url. Fixed by rewriting makeLikeArray to make use of wfParseUrl().
This also fixes the problem of makeLikeArray not removing username and pass from
the search pattern.

Allow queries like 'ftp://*' or 'mailto:*' to find all links with specific protocols

Bug: 40588
Change-Id: Id3dd31993456bf6cbba4cf17962cf0083b612bed

includes/LinkFilter.php
includes/specials/SpecialLinkSearch.php
tests/phpunit/includes/LinkFilterTest.php [new file with mode: 0644]

index d552c69..97834fe 100644 (file)
@@ -84,16 +84,28 @@ class LinkFilter {
         *
         * Asterisks in any other location are considered invalid.
         *
-        * @param string $filterEntry domainparts
-        * @param $prot        String: protocol
+        * This function does the same as wfMakeUrlIndexes(), except it also takes care
+        * of adding wildcards
+        *
+        * @param String $filterEntry domainparts
+        * @param String $protocol protocol (default http://)
         * @return Array to be passed to DatabaseBase::buildLike() or false on error
         */
-       public static function makeLikeArray( $filterEntry, $prot = 'http://' ) {
+       public static function makeLikeArray( $filterEntry, $protocol = 'http://' ) {
                $db = wfGetDB( DB_MASTER );
-               if ( substr( $filterEntry, 0, 2 ) == '*.' ) {
+
+               $target = $protocol . $filterEntry;
+               $bits = wfParseUrl( $target );
+
+               if ( $bits == false ) {
+                       // Unknown protocol?
+                       return false;
+               }
+
+               if ( substr( $bits['host'], 0, 2 ) == '*.' ) {
                        $subdomains = true;
-                       $filterEntry = substr( $filterEntry, 2 );
-                       if ( $filterEntry == '' ) {
+                       $bits['host'] = substr( $bits['host'], 2 );
+                       if ( $bits['host'] == '' ) {
                                // We don't want to make a clause that will match everything,
                                // that could be dangerous
                                return false;
@@ -101,52 +113,63 @@ class LinkFilter {
                } else {
                        $subdomains = false;
                }
-               // No stray asterisks, that could cause confusion
-               // It's not simple or efficient to handle it properly so we don't
-               // handle it at all.
-               if ( strpos( $filterEntry, '*' ) !== false ) {
-                       return false;
-               }
-               $slash = strpos( $filterEntry, '/' );
-               if ( $slash !== false ) {
-                       $path = substr( $filterEntry, $slash );
-                       $host = substr( $filterEntry, 0, $slash );
-               } else {
-                       $path = '/';
-                       $host = $filterEntry;
-               }
+
                // Reverse the labels in the hostname, convert to lower case
                // For emails reverse domainpart only
-               if ( $prot == 'mailto:' && strpos( $host, '@' ) ) {
+               if ( $bits['scheme'] === 'mailto' && strpos( $bits['host'], '@' ) ) {
                        // complete email address
-                       $mailparts = explode( '@', $host );
+                       $mailparts = explode( '@', $bits['host'] );
                        $domainpart = strtolower( implode( '.', array_reverse( explode( '.', $mailparts[1] ) ) ) );
-                       $host = $domainpart . '@' . $mailparts[0];
-                       $like = array( "$prot$host", $db->anyString() );
-               } elseif ( $prot == 'mailto:' ) {
-                       // domainpart of email address only. do not add '.'
-                       $host = strtolower( implode( '.', array_reverse( explode( '.', $host ) ) ) );
-                       $like = array( "$prot$host", $db->anyString() );
+                       $bits['host'] = $domainpart . '@' . $mailparts[0];
+               } elseif ( $bits['scheme'] === 'mailto' ) {
+                       // domainpart of email address only, do not add '.'
+                       $bits['host'] = strtolower( implode( '.', array_reverse( explode( '.', $bits['host']) ) ) );
                } else {
-                       $host = strtolower( implode( '.', array_reverse( explode( '.', $host ) ) ) );
-                       if ( substr( $host, -1, 1 ) !== '.' ) {
-                               $host .= '.';
+                       $bits['host'] = strtolower( implode( '.', array_reverse( explode( '.', $bits['host'] ) ) ) );
+                       if ( substr( $bits['host'], -1, 1 ) !== '.' ) {
+                               $bits['host'] .= '.';
                        }
-                       $like = array( "$prot$host" );
+               }
 
-                       if ( $subdomains ) {
-                               $like[] = $db->anyString();
-                       }
-                       if ( !$subdomains || $path !== '/' ) {
-                               $like[] = $path;
-                               $like[] = $db->anyString();
+               $like[] = $bits['scheme'] . $bits['delimiter'] . $bits['host'];
+
+               if ( $subdomains ) {
+                       $like[] = $db->anyString();
+               }
+
+               if ( isset( $bits['port'] ) ) {
+                       $like[] = ':' . $bits['port'];
+               }
+               if ( isset( $bits['path'] ) ) {
+                       $like[] = $bits['path'];
+               } elseif ( !$subdomains ) {
+                       $like[] = '/';
+               }
+               if ( isset( $bits['query'] ) ) {
+                       $like[] = '?' . $bits['query'];
+               }
+               if ( isset( $bits['fragment'] ) ) {
+                       $like[] = '#' . $bits['fragment'];
+               }
+
+               // Check for stray asterisks: asterisk only allowed at the start of the domain
+               foreach ( $like as $likepart ) {
+                       if ( !( $likepart instanceof LikeMatch ) && strpos( $likepart, '*' ) !== false ) {
+                               return false;
                        }
                }
+
+               if ( !( $like[count( $like ) - 1] instanceof LikeMatch ) ) {
+                       // Add wildcard at the end if there isn't one already
+                       $like[] = $db->anyString();
+               }
+
                return $like;
        }
 
        /**
-        * Filters an array returned by makeLikeArray(), removing everything past first pattern placeholder.
+        * Filters an array returned by makeLikeArray(), removing everything past first
+        * pattern placeholder.
         *
         * @param array $arr array to filter
         * @return array filtered array
index ed6e2a4..5c121ba 100644 (file)
@@ -62,28 +62,19 @@ class LinkSearchPage extends QueryPage {
                }
 
                $target2 = $target;
-               $protocol = '';
-               $pr_sl = strpos( $target2, '//' );
-               $pr_cl = strpos( $target2, ':' );
-               if ( $pr_sl ) {
-                       // For protocols with '//'
-                       $protocol = substr( $target2, 0, $pr_sl + 2 );
-                       $target2 = substr( $target2, $pr_sl + 2 );
-               } elseif ( !$pr_sl && $pr_cl ) {
-                       // For protocols without '//' like 'mailto:'
-                       $protocol = substr( $target2, 0, $pr_cl + 1 );
-                       $target2 = substr( $target2, $pr_cl + 1 );
-               } elseif ( $target2 != '' ) {
-                       // default
-                       $protocol = 'http://';
-               }
-               if ( $protocol != '' && !in_array( $protocol, $protocols_list ) ) {
-                       // Unsupported protocol, show original search request
-                       $target2 = $target;
-                       // Since links with unsupported protocols don't end up in
-                       // externallinks, assume $protocol is actually part of a link
-                       // containing ':' or '//' and default to http as above.
-                       $protocol = 'http://';
+               // Get protocol, default is http://
+               $protocol = 'http://';
+               $bits = wfParseUrl( $target );
+               if ( isset( $bits['scheme'] ) && isset( $bits['delimiter'] ) ) {
+                       $protocol = $bits['scheme'] . $bits['delimiter'];
+                       // Make sure wfParseUrl() didn't make some well-intended correction in the
+                       // protocol
+                       if ( strcasecmp( $protocol, substr( $target, 0, strlen( $protocol ) ) ) === 0 ) {
+                               $target2 = substr( $target, strlen( $protocol ) );
+                       } else {
+                               // If it did, let LinkFilter::makeLikeArray() handle this
+                               $protocol = '';
+                       }
                }
 
                $out->addWikiMsg(
@@ -148,18 +139,26 @@ class LinkSearchPage extends QueryPage {
        /**
         * Return an appropriately formatted LIKE query and the clause
         *
-        * @param string $query
-        * @param string $prot
+        * @param String $query Search pattern to search for
+        * @param String $prot Protocol, e.g. 'http://'
+        *
         * @return array
         */
        static function mungeQuery( $query, $prot ) {
                $field = 'el_index';
-               $rv = LinkFilter::makeLikeArray( $query, $prot );
+               $dbr = wfGetDB( DB_SLAVE );
+
+               if ( $query === '*' && $prot !== '' ) {
+                       // Allow queries like 'ftp://*' to find all ftp links
+                       $rv = array( $prot, $dbr->anyString() );
+               } else {
+                       $rv = LinkFilter::makeLikeArray( $query, $prot );
+               }
+
                if ( $rv === false ) {
                        // LinkFilter doesn't handle wildcard in IP, so we'll have to munge here.
                        $pattern = '/^(:?[0-9]{1,3}\.)+\*\s*$|^(:?[0-9]{1,3}\.){3}[0-9]{1,3}:[0-9]*\*\s*$/';
                        if ( preg_match( $pattern, $query ) ) {
-                               $dbr = wfGetDB( DB_SLAVE );
                                $rv = array( $prot . rtrim( $query, " \t*" ), $dbr->anyString() );
                                $field = 'el_to';
                        }
@@ -230,6 +229,9 @@ class LinkSearchPage extends QueryPage {
 
        /**
         * Override to check query validity.
+        *
+        * @param mixed $offset Numerical offset or false for no offset
+        * @param mixed $limit Numerical limit or false for no limit
         */
        function doQuery( $offset = false, $limit = false ) {
                list( $this->mMungedQuery, ) = LinkSearchPage::mungeQuery( $this->mQuery, $this->mProt );
diff --git a/tests/phpunit/includes/LinkFilterTest.php b/tests/phpunit/includes/LinkFilterTest.php
new file mode 100644 (file)
index 0000000..1ca8e9f
--- /dev/null
@@ -0,0 +1,274 @@
+<?php
+
+/**
+ * @group Database
+ */
+class LinkFilterTest extends MediaWikiLangTestCase {
+
+       protected function setUp() {
+
+               parent::setUp();
+
+               $this->setMwGlobals( 'wgUrlProtocols', array(
+                       'http://',
+                       'https://',
+                       'ftp://',
+                       'irc://',
+                       'ircs://',
+                       'gopher://',
+                       'telnet://',
+                       'nntp://',
+                       'worldwind://',
+                       'mailto:',
+                       'news:',
+                       'svn://',
+                       'git://',
+                       'mms://',
+                       '//',
+               ) );
+
+       }
+
+       /**
+        * createRegexFromLike($like)
+        *
+        * Takes an array as created by LinkFilter::makeLikeArray() and creates a regex from it
+        *
+        * @param Array $like Array as created by LinkFilter::makeLikeArray()
+        * @return string Regex
+        */
+       function createRegexFromLIKE( $like ) {
+
+               $regex = '!^';
+
+               foreach ( $like as $item ) {
+
+                       if ( $item instanceof LikeMatch ) {
+                               if ( $item->toString() == '%' ) {
+                                       $regex .= '.*';
+                               } elseif ( $item->toString() == '_' ) {
+                                       $regex .= '.';
+                               }
+                       } else {
+                               $regex .= preg_quote( $item, '!' );
+                       }
+
+               }
+
+               $regex .= '$!';
+
+               return $regex;
+
+       }
+
+       /**
+        * provideValidPatterns()
+        *
+        * @return array
+        */
+       public static function provideValidPatterns() {
+
+               return array(
+                       // Protocol, Search pattern, URL which matches the pattern
+                       array( 'http://' , '*.test.com' , 'http://www.test.com' ),
+                       array( 'http://' , 'test.com:8080/dir/file' , 'http://name:pass@test.com:8080/dir/file' ),
+                       array( 'https://' , '*.com' , 'https://s.s.test..com:88/dir/file?a=1&b=2' ),
+                       array( 'https://' , '*.com' , 'https://name:pass@secure.com/index.html' ),
+                       array( 'http://' , 'name:pass@test.com' , 'http://test.com' ),
+                       array( 'http://' , 'test.com' , 'http://name:pass@test.com' ),
+                       array( 'http://' , '*.test.com' , 'http://a.b.c.test.com/dir/dir/file?a=6'),
+                       array( null , 'http://*.test.com' , 'http://www.test.com' ),
+                       array( 'mailto:' , 'name@mail.test123.com' , 'mailto:name@mail.test123.com' ),
+                       array( '' ,
+                               'http://name:pass@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg' ,
+                               'http://name:pass@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg'
+                       ),
+                       array( '' , 'http://name:pass@*.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg' ,
+                               'http://name:pass@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg' ),
+                       array( '' , 'http://name:wrongpass@*.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]' ,
+                               'http://name:pass@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg' ),
+                       array( 'http://' , 'name:pass@*.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg' ,
+                               'http://name:pass@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg' ),
+                       array( ''  , 'http://name:pass@www.test.com:12345' ,
+                               'http://name:pass@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg' ),
+                       array( 'ftp://', 'user:pass@ftp.test.com:1233/home/user/file;type=efw',
+                               'ftp://user:pass@ftp.test.com:1233/home/user/file;type=efw' ),
+                       array( null , 'ftp://otheruser:otherpass@ftp.test.com:1233/home/user/file;type=',
+                               'ftp://user:pass@ftp.test.com:1233/home/user/file;type=efw' ),
+                       array( null , 'ftp://@ftp.test.com:1233/home/user/file;type=',
+                               'ftp://user:pass@ftp.test.com:1233/home/user/file;type=efw' ),
+                       array( null , 'ftp://ftp.test.com/',
+                               'ftp://user:pass@ftp.test.com/home/user/file;type=efw' ),
+                       array( null , 'ftp://ftp.test.com/',
+                               'ftp://user:pass@ftp.test.com/home/user/file;type=efw' ),
+                       array( null , 'ftp://*.test.com:222/',
+                               'ftp://user:pass@ftp.test.com:222/home' ),
+                       array( 'irc://', '*.myserver:6667/', 'irc://test.myserver:6667/' ),
+                       array( 'irc://', 'name:pass@*.myserver/', 'irc://test.myserver:6667/' ),
+                       array( 'irc://', 'name:pass@*.myserver/', 'irc://other:@test.myserver:6667/' ),
+                       array( '', 'irc://test/name,string,abc?msg=t', 'irc://test/name,string,abc?msg=test' ),
+                       array( '', 'https://gerrit.wikimedia.org/r/#/q/status:open,n,z',
+                               'https://gerrit.wikimedia.org/r/#/q/status:open,n,z' ),
+                       array( '', 'https://gerrit.wikimedia.org',
+                               'https://gerrit.wikimedia.org/r/#/q/status:open,n,z' ),
+                       array( 'mailto:', '*.test.com', 'mailto:name@pop3.test.com' ),
+                       array( 'mailto:', 'test.com', 'mailto:name@test.com' ),
+                       array( 'news:', 'test.1234afc@news.test.com', 'news:test.1234afc@news.test.com' ),
+                       array( 'news:', '*.test.com', 'news:test.1234afc@news.test.com' ),
+                       array( '' , 'news:4df8kh$iagfewewf(at)newsbf02aaa.news.aol.com',
+                               'news:4df8kh$iagfewewf(at)newsbf02aaa.news.aol.com' ),
+                       array( '' , 'news:*.aol.com',
+                               'news:4df8kh$iagfewewf(at)newsbf02aaa.news.aol.com' ),
+                       array( '' , 'git://github.com/prwef/abc-def.git' , 'git://github.com/prwef/abc-def.git' ),
+                       array( 'git://' , 'github.com/' , 'git://github.com/prwef/abc-def.git' ),
+                       array( 'git://' , '*.github.com/' , 'git://a.b.c.d.e.f.github.com/prwef/abc-def.git' ),
+                       array( '' , 'gopher://*.test.com/' , 'gopher://gopher.test.com/0/v2/vstat'),
+                       array( 'telnet://' , '*.test.com' , 'telnet://shell.test.com/~home/'),
+
+                       //
+                       // The following only work in PHP >= 5.3.7, due to a bug in parse_url which eats
+                       // the path from the url (https://bugs.php.net/bug.php?id=54180)
+                       //
+                       // array( '', 'http://test.com', 'http://test.com/index?arg=1' ),
+                       // array( 'http://', '*.test.com', 'http://www.test.com/index?arg=1' ),
+                       // array( '' ,
+                       //    'http://xx23124:__ffdfdef__@www.test.com:12345/dir' ,
+                       //    'http://name:pass@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg'
+                       // ),
+                       //
+
+                       //
+                       // Tests for false positives
+                       //
+                       array( 'http://' , 'test.com' , 'http://www.test.com', false ),
+                       array( 'http://' , 'www1.test.com' , 'http://www.test.com', false ),
+                       array( 'http://' , '*.test.com' , 'http://www.test.t.com', false ),
+                       array( '' , 'http://test.com:8080' , 'http://www.test.com:8080', false ),
+                       array( '' , 'https://test.com' , 'http://test.com', false ),
+                       array( '' , 'http://test.com' , 'https://test.com', false ),
+                       array( 'http://', 'http://test.com', 'http://test.com', false ),
+                       array( null, 'http://www.test.com', 'http://www.test.com:80', false ),
+                       array( null, 'http://www.test.com:80', 'http://www.test.com', false ),
+                       array( null, 'http://*.test.com:80', 'http://www.test.com', false ),
+                       array( '', 'https://gerrit.wikimedia.org/r/#/XXX/status:open,n,z',
+                               'https://gerrit.wikimedia.org/r/#/q/status:open,n,z', false ),
+                       array( '', 'https://*.wikimedia.org/r/#/q/status:open,n,z',
+                               'https://gerrit.wikimedia.org/r/#/XXX/status:open,n,z', false ),
+                       array( 'mailto:', '@test.com', '@abc.test.com', false ),
+                       array( 'mailto:', 'mail@test.com', 'mail2@test.com', false ),
+                       array( '', 'mailto:mail@test.com', 'mail2@test.com', false ),
+                       array( '', 'mailto:@test.com', '@abc.test.com', false ),
+                       array( 'ftp://', '*.co', 'ftp://www.co.uk', false ),
+                       array( 'ftp://', '*.co', 'ftp://www.co.m', false ),
+                       array( 'ftp://', '*.co/dir/', 'ftp://www.co/dir2/', false ),
+                       array( 'ftp://', 'www.co/dir/', 'ftp://www.co/dir2/', false ),
+                       array( 'ftp://', 'test.com/dir/', 'ftp://test.com/', false ),
+                       array( '', 'http://test.com:8080/dir/', 'http://test.com:808/dir/', false ),
+                       array( '', 'http://test.com/dir/index.html', 'http://test.com/dir/index.php', false ),
+
+                       //
+                       // These are false positives too and ideally shouldn't match, but that
+                       // would require using regexes and RLIKE instead of LIKE
+                       //
+                       // array( null, 'http://*.test.com', 'http://www.test.com:80', false ),
+                       // array( '', 'https://*.wikimedia.org/r/#/q/status:open,n,z',
+                       //      'https://gerrit.wikimedia.org/XXX/r/#/q/status:open,n,z', false ),
+               );
+
+       }
+
+       /**
+        * testMakeLikeArrayWithValidPatterns()
+        *
+        * Tests whether the LIKE clause produced by LinkFilter::makeLikeArray($pattern, $protocol)
+        * will find one of the URL indexes produced by wfMakeUrlIndexes($url)
+        *
+        * @dataProvider provideValidPatterns
+        *
+        * @param String $protocol Protocol, e.g. 'http://' or 'mailto:'
+        * @param String $pattern Search pattern to feed to LinkFilter::makeLikeArray
+        * @param String $url URL to feed to wfMakeUrlIndexes
+        * @param bool $shouldBeFound Should the URL be found? (defaults true)
+        */
+       function testMakeLikeArrayWithValidPatterns( $protocol, $pattern, $url, $shouldBeFound = true ) {
+
+               $indexes = wfMakeUrlIndexes( $url );
+               $likeArray = LinkFilter::makeLikeArray( $pattern, $protocol );
+
+               $this->assertTrue( $likeArray !== false,
+                       "LinkFilter::makeLikeArray('$pattern', '$protocol') returned false on a valid pattern"
+               );
+
+               $regex = $this->createRegexFromLIKE( $likeArray );
+               $debugmsg = "Regex: '" . $regex . "'\n";
+               $debugmsg .= count( $indexes ) . " index(es) created by wfMakeUrlIndexes():\n";
+
+               $matches = 0;
+
+               foreach( $indexes as $index ) {
+                       $matches += preg_match( $regex, $index );
+                       $debugmsg .= "\t'$index'\n";
+               }
+
+               if ( $shouldBeFound ) {
+                       $this->assertTrue(
+                               $matches > 0,
+                               "Search pattern '$protocol$pattern' does not find url '$url' \n$debugmsg"
+                       );
+               } else {
+                       $this->assertFalse(
+                               $matches > 0,
+                               "Search pattern '$protocol$pattern' should not find url '$url' \n$debugmsg"
+                       );
+               }
+
+       }
+
+       /**
+        * provideInvalidPatterns()
+        *
+        * @return array
+        */
+       public static function provideInvalidPatterns() {
+
+               return array(
+                       array( '' ),
+                       array( '*' ),
+                       array( 'http://*' ),
+                       array( 'http://*/' ),
+                       array( 'http://*/dir/file' ),
+                       array( 'test.*.com' ),
+                       array( 'http://test.*.com' ),
+                       array( 'test.*.com' ),
+                       array( 'http://*.test.*' ),
+                       array( 'http://*test.com' ),
+                       array( 'https://*' ),
+                       array( '*://test.com'),
+                       array( 'mailto:name:pass@t*est.com' ),
+                       array( 'http://*:888/'),
+                       array( '*http://'),
+                       array( 'test.com/*/index' ),
+                       array( 'test.com/dir/index?arg=*' ),
+               );
+
+       }
+
+       /**
+        * testMakeLikeArrayWithInvalidPatterns()
+        *
+        * Tests whether LinkFilter::makeLikeArray($pattern) will reject invalid search patterns
+        *
+        * @dataProvider provideInvalidPatterns
+        *
+        * @param $pattern string: Invalid search pattern
+        */
+       function testMakeLikeArrayWithInvalidPatterns( $pattern ) {
+
+               $this->assertFalse(
+                       LinkFilter::makeLikeArray( $pattern ),
+                       "'$pattern' is not a valid pattern and should be rejected"
+               );
+
+       }
+
+}