From d045b999ec4f83deb78831945eb1b7ea0ad01312 Mon Sep 17 00:00:00 2001 From: Roan Kattouw Date: Mon, 14 Nov 2011 09:13:58 +0000 Subject: [PATCH] (bug 29854) Store protocol-relative links twice in the externallinks table, one with http: in el_index and once with https: . Modified patch by Brad Jorsch --- includes/AutoLoader.php | 1 + includes/GlobalFunctions.php | 13 ++- includes/LinksUpdate.php | 12 +-- includes/api/ApiQueryExternalLinks.php | 5 ++ includes/installer/DatabaseUpdater.php | 3 +- maintenance/fixExtLinksProtocolRelative.php | 81 +++++++++++++++++++ .../includes/GlobalFunctions/GlobalTest.php | 29 ++++--- 7 files changed, 124 insertions(+), 20 deletions(-) create mode 100644 maintenance/fixExtLinksProtocolRelative.php diff --git a/includes/AutoLoader.php b/includes/AutoLoader.php index c7d11e10b5..e0de7ec7f3 100644 --- a/includes/AutoLoader.php +++ b/includes/AutoLoader.php @@ -861,6 +861,7 @@ $wgAutoloadLocalClasses = array( 'FakeMaintenance' => 'maintenance/Maintenance.php', 'LoggedUpdateMaintenance' => 'maintenance/Maintenance.php', 'Maintenance' => 'maintenance/Maintenance.php', + 'FixExtLinksProtocolRelative' => 'maintenance/fixExtLinksProtocolRelative.php', 'PopulateCategory' => 'maintenance/populateCategory.php', 'PopulateImageSha1' => 'maintenance/populateImageSha1.php', 'PopulateLogSearch' => 'maintenance/populateLogSearch.php', diff --git a/includes/GlobalFunctions.php b/includes/GlobalFunctions.php index 17e13ec172..42b683a20d 100644 --- a/includes/GlobalFunctions.php +++ b/includes/GlobalFunctions.php @@ -647,12 +647,12 @@ function wfParseUrl( $url ) { } /** - * Make a URL index, appropriate for the el_index field of externallinks. + * Make URL indexes, appropriate for the el_index field of externallinks. * * @param $url String - * @return String + * @return array */ -function wfMakeUrlIndex( $url ) { +function wfMakeUrlIndexes( $url ) { $bits = wfParseUrl( $url ); // Reverse the labels in the hostname, convert to lower case @@ -692,7 +692,12 @@ function wfMakeUrlIndex( $url ) { if ( isset( $bits['fragment'] ) ) { $index .= '#' . $bits['fragment']; } - return $index; + + if ( $prot == '' ) { + return array( "http:$index", "https:$index" ); + } else { + return array( $index ); + } } /** diff --git a/includes/LinksUpdate.php b/includes/LinksUpdate.php index 0002096ad4..38ed4cb4f4 100644 --- a/includes/LinksUpdate.php +++ b/includes/LinksUpdate.php @@ -456,11 +456,13 @@ class LinksUpdate { $arr = array(); $diffs = array_diff_key( $this->mExternals, $existing ); foreach( $diffs as $url => $dummy ) { - $arr[] = array( - 'el_from' => $this->mId, - 'el_to' => $url, - 'el_index' => wfMakeUrlIndex( $url ), - ); + foreach( wfMakeUrlIndexes( $url ) as $index ) { + $arr[] = array( + 'el_from' => $this->mId, + 'el_to' => $url, + 'el_index' => $index, + ); + } } return $arr; } diff --git a/includes/api/ApiQueryExternalLinks.php b/includes/api/ApiQueryExternalLinks.php index c2fed6ace8..c9711651bf 100644 --- a/includes/api/ApiQueryExternalLinks.php +++ b/includes/api/ApiQueryExternalLinks.php @@ -69,6 +69,11 @@ class ApiQueryExternalLinks extends ApiQueryBase { $this->addOption( 'ORDER BY', 'el_from' ); } + // If we're querying all protocols, use DISTINCT to avoid repeating protocol-relative links twice + if ( $protocol === null ) { + $this->addOption( 'DISTINCT' ); + } + $this->addOption( 'LIMIT', $params['limit'] + 1 ); $offset = isset( $params['offset'] ) ? $params['offset'] : 0; if ( $offset ) { diff --git a/includes/installer/DatabaseUpdater.php b/includes/installer/DatabaseUpdater.php index a394f19fb2..94c790c787 100644 --- a/includes/installer/DatabaseUpdater.php +++ b/includes/installer/DatabaseUpdater.php @@ -43,7 +43,8 @@ abstract class DatabaseUpdater { 'DeleteDefaultMessages', 'PopulateRevisionLength', 'PopulateRevisionSha1', - 'PopulateImageSha1' + 'PopulateImageSha1', + 'FixExtLinksProtocolRelative', ); /** diff --git a/maintenance/fixExtLinksProtocolRelative.php b/maintenance/fixExtLinksProtocolRelative.php new file mode 100644 index 0000000000..e10b194cc2 --- /dev/null +++ b/maintenance/fixExtLinksProtocolRelative.php @@ -0,0 +1,81 @@ +mDescription = "Fixes any entries in the externallinks table containing protocol-relative URLs"; + } + + protected function getUpdateKey() { + return 'fix protocol-relative URLs in externallinks'; + } + + protected function updateSkippedMessage() { + return 'protocol-relative URLs in externallinks table already fixed.'; + } + + protected function doDBUpdates() { + $db = wfGetDB( DB_MASTER ); + if ( !$db->tableExists( 'externallinks' ) ) { + $this->error( "externallinks table does not exist" ); + return false; + } + $this->output( "Fixing protocol-relative entries in the externallinks table...\n" ); + $res = $db->select( 'externallinks', array( 'el_from', 'el_to', 'el_index' ), + array( 'el_index' . $db->buildLike( '//', $db->anyString() ) ), + __METHOD__ + ); + $count = 0; + foreach ( $res as $row ) { + $count++; + if ( $count % 100 == 0 ) { + $this->output( $count ); + wfWaitForSlaves(); + } + $db->insert( 'externallinks', + array( + array( + 'el_from' => $row->el_from, + 'el_to' => $row->el_to, + 'el_index' => "http:{$row->el_index}", + ), + array( + 'el_from' => $row->el_from, + 'el_to' => $row->el_to, + 'el_index' => "https:{$row->el_index}", + ) + ), __METHOD__, array( 'IGNORE' ) + ); + $db->delete( 'externallinks', array( 'el_index' => $row->el_index ), __METHOD__ ); + } + $this->output( "Done, $count rows updated.\n" ); + return true; + } +} + +$maintClass = "FixExtLinksProtocolRelative"; +require_once( RUN_MAINTENANCE_IF_MAIN ); diff --git a/tests/phpunit/includes/GlobalFunctions/GlobalTest.php b/tests/phpunit/includes/GlobalFunctions/GlobalTest.php index 01da0a522b..5b5ee6400c 100644 --- a/tests/phpunit/includes/GlobalFunctions/GlobalTest.php +++ b/tests/phpunit/includes/GlobalFunctions/GlobalTest.php @@ -831,42 +831,42 @@ class GlobalTest extends MediaWikiTestCase { } /** - * @dataProvider provideMakeUrlIndex() + * @dataProvider provideMakeUrlIndexes() */ - function testMakeUrlIndex( $url, $expected ) { - $index = wfMakeUrlIndex( $url ); - $this->assertEquals( $expected, $index, "wfMakeUrlIndex(\"$url\")" ); + function testMakeUrlIndexes( $url, $expected ) { + $index = wfMakeUrlIndexes( $url ); + $this->assertEquals( $expected, $index, "wfMakeUrlIndexes(\"$url\")" ); } - function provideMakeUrlIndex() { + function provideMakeUrlIndexes() { return array( array( // just a regular :) 'https://bugzilla.wikimedia.org/show_bug.cgi?id=28627', - 'https://org.wikimedia.bugzilla./show_bug.cgi?id=28627' + array( 'https://org.wikimedia.bugzilla./show_bug.cgi?id=28627' ) ), array( // mailtos are handled special // is this really right though? that final . probably belongs earlier? 'mailto:wiki@wikimedia.org', - 'mailto:org.wikimedia@wiki.', + array( 'mailto:org.wikimedia@wiki.' ) ), // file URL cases per bug 28627... array( // three slashes: local filesystem path Unix-style 'file:///whatever/you/like.txt', - 'file://./whatever/you/like.txt' + array( 'file://./whatever/you/like.txt' ) ), array( // three slashes: local filesystem path Windows-style 'file:///c:/whatever/you/like.txt', - 'file://./c:/whatever/you/like.txt' + array( 'file://./c:/whatever/you/like.txt' ) ), array( // two slashes: UNC filesystem path Windows-style 'file://intranet/whatever/you/like.txt', - 'file://intranet./whatever/you/like.txt' + array( 'file://intranet./whatever/you/like.txt' ) ), // Multiple-slash cases that can sorta work on Mozilla // if you hack it just right are kinda pathological, @@ -875,6 +875,15 @@ class GlobalTest extends MediaWikiTestCase { // // Those will survive the algorithm but with results that // are less consistent. + + // protocol-relative URL cases per bug 29854... + array( + '//bugzilla.wikimedia.org/show_bug.cgi?id=28627', + array( + 'http://org.wikimedia.bugzilla./show_bug.cgi?id=28627', + 'https://org.wikimedia.bugzilla./show_bug.cgi?id=28627' + ) + ), ); } -- 2.20.1