'FakeMaintenance' => 'maintenance/Maintenance.php',
'LoggedUpdateMaintenance' => 'maintenance/Maintenance.php',
'Maintenance' => 'maintenance/Maintenance.php',
+ 'FixExtLinksProtocolRelative' => 'maintenance/fixExtLinksProtocolRelative.php',
'PopulateCategory' => 'maintenance/populateCategory.php',
'PopulateImageSha1' => 'maintenance/populateImageSha1.php',
'PopulateLogSearch' => 'maintenance/populateLogSearch.php',
}
/**
- * Make a URL index, appropriate for the el_index field of externallinks.
+ * Make URL indexes, appropriate for the el_index field of externallinks.
*
* @param $url String
- * @return String
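+ * @return array Index strings: one for a URL that has a protocol, two (http: and https:) for a protocol-relative URL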
*/
-function wfMakeUrlIndex( $url ) {
+function wfMakeUrlIndexes( $url ) {
$bits = wfParseUrl( $url );
// Reverse the labels in the hostname, convert to lower case
if ( isset( $bits['fragment'] ) ) {
$index .= '#' . $bits['fragment'];
}
- return $index;
+
+ if ( $prot == '' ) {
+ return array( "http:$index", "https:$index" );
+ } else {
+ return array( $index );
+ }
}
/**
$arr = array();
$diffs = array_diff_key( $this->mExternals, $existing );
foreach( $diffs as $url => $dummy ) {
- $arr[] = array(
- 'el_from' => $this->mId,
- 'el_to' => $url,
- 'el_index' => wfMakeUrlIndex( $url ),
- );
+ foreach( wfMakeUrlIndexes( $url ) as $index ) {
+ $arr[] = array(
+ 'el_from' => $this->mId,
+ 'el_to' => $url,
+ 'el_index' => $index,
+ );
+ }
}
return $arr;
}
$this->addOption( 'ORDER BY', 'el_from' );
}
+ // If we're querying all protocols, use DISTINCT so that protocol-relative links, which are indexed once under http: and once under https:, are not listed twice
+ if ( $protocol === null ) {
+ $this->addOption( 'DISTINCT' );
+ }
+
$this->addOption( 'LIMIT', $params['limit'] + 1 );
$offset = isset( $params['offset'] ) ? $params['offset'] : 0;
if ( $offset ) {
'DeleteDefaultMessages',
'PopulateRevisionLength',
'PopulateRevisionSha1',
- 'PopulateImageSha1'
+ 'PopulateImageSha1',
+ 'FixExtLinksProtocolRelative',
);
/**
--- /dev/null
+<?php
+/**
+ * Fixes any entries for protocol-relative URLs in the externallinks table,
+ * replacing each protocol-relative entry with two entries, one for http
+ * and one for https.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @ingroup Maintenance
+ */
+
+require_once( dirname( __FILE__ ) . '/Maintenance.php' );
+
+/**
+ * Maintenance script that rewrites protocol-relative rows in the externallinks
+ * table: every row whose el_index starts with "//" is replaced by two rows,
+ * one with el_index prefixed by "http:" and one prefixed by "https:".
+ */
+class FixExtLinksProtocolRelative extends LoggedUpdateMaintenance {
+	public function __construct() {
+		parent::__construct();
+		$this->mDescription = "Fixes any entries in the externallinks table containing protocol-relative URLs";
+	}
+
+	/**
+	 * @return String Key identifying this update
+	 *   (presumably used by LoggedUpdateMaintenance to record that it already ran -- confirm against the parent class)
+	 */
+	protected function getUpdateKey() {
+		return 'fix protocol-relative URLs in externallinks';
+	}
+
+	/**
+	 * @return String Message for the case where the update is skipped
+	 *   (presumably shown by the parent class when the update key is already logged -- confirm)
+	 */
+	protected function updateSkippedMessage() {
+		return 'protocol-relative URLs in externallinks table already fixed.';
+	}
+
+	/**
+	 * Replace each protocol-relative externallinks row with an http: and an https: row.
+	 *
+	 * @return bool False if the externallinks table does not exist, true otherwise
+	 */
+	protected function doDBUpdates() {
+		$db = wfGetDB( DB_MASTER );
+		if ( !$db->tableExists( 'externallinks' ) ) {
+			$this->error( "externallinks table does not exist" );
+			return false;
+		}
+		$this->output( "Fixing protocol-relative entries in the externallinks table...\n" );
+		// Protocol-relative entries are the ones whose index starts with "//"
+		$res = $db->select( 'externallinks', array( 'el_from', 'el_to', 'el_index' ),
+			array( 'el_index' . $db->buildLike( '//', $db->anyString() ) ),
+			__METHOD__
+		);
+		$count = 0;
+		foreach ( $res as $row ) {
+			$count++;
+			if ( $count % 100 === 0 ) {
+				// Terminate the progress counter with a newline so successive counts don't run together
+				$this->output( $count . "\n" );
+				wfWaitForSlaves();
+			}
+			// NOTE(review): 'IGNORE' presumably skips rows that already exist, making re-runs safe -- confirm
+			$db->insert( 'externallinks',
+				array(
+					array(
+						'el_from' => $row->el_from,
+						'el_to' => $row->el_to,
+						'el_index' => "http:{$row->el_index}",
+					),
+					array(
+						'el_from' => $row->el_from,
+						'el_to' => $row->el_to,
+						'el_index' => "https:{$row->el_index}",
+					)
+				), __METHOD__, array( 'IGNORE' )
+			);
+			// Delete only the row that was just converted. Matching on el_index alone
+			// would also remove rows from other pages that share this index before
+			// their http:/https: replacements have been inserted, losing those links
+			// if the script is interrupted.
+			$db->delete( 'externallinks',
+				array(
+					'el_index' => $row->el_index,
+					'el_from' => $row->el_from,
+					'el_to' => $row->el_to,
+				),
+				__METHOD__
+			);
+		}
+		$this->output( "Done, $count rows updated.\n" );
+		return true;
+	}
+}
+
+$maintClass = "FixExtLinksProtocolRelative";
+require_once( RUN_MAINTENANCE_IF_MAIN );
}
/**
- * @dataProvider provideMakeUrlIndex()
+ * @dataProvider provideMakeUrlIndexes()
*/
- function testMakeUrlIndex( $url, $expected ) {
- $index = wfMakeUrlIndex( $url );
- $this->assertEquals( $expected, $index, "wfMakeUrlIndex(\"$url\")" );
+ function testMakeUrlIndexes( $url, $expected ) {
+ $index = wfMakeUrlIndexes( $url );
+ $this->assertEquals( $expected, $index, "wfMakeUrlIndexes(\"$url\")" );
}
- function provideMakeUrlIndex() {
+ function provideMakeUrlIndexes() {
return array(
array(
// just a regular :)
'https://bugzilla.wikimedia.org/show_bug.cgi?id=28627',
- 'https://org.wikimedia.bugzilla./show_bug.cgi?id=28627'
+ array( 'https://org.wikimedia.bugzilla./show_bug.cgi?id=28627' )
),
array(
// mailtos are handled special
// is this really right though? that final . probably belongs earlier?
'mailto:wiki@wikimedia.org',
- 'mailto:org.wikimedia@wiki.',
+ array( 'mailto:org.wikimedia@wiki.' )
),
// file URL cases per bug 28627...
array(
// three slashes: local filesystem path Unix-style
'file:///whatever/you/like.txt',
- 'file://./whatever/you/like.txt'
+ array( 'file://./whatever/you/like.txt' )
),
array(
// three slashes: local filesystem path Windows-style
'file:///c:/whatever/you/like.txt',
- 'file://./c:/whatever/you/like.txt'
+ array( 'file://./c:/whatever/you/like.txt' )
),
array(
// two slashes: UNC filesystem path Windows-style
'file://intranet/whatever/you/like.txt',
- 'file://intranet./whatever/you/like.txt'
+ array( 'file://intranet./whatever/you/like.txt' )
),
// Multiple-slash cases that can sorta work on Mozilla
// if you hack it just right are kinda pathological,
//
// Those will survive the algorithm but with results that
// are less consistent.
+
+ // protocol-relative URL cases per bug 29854...
+ array(
+ '//bugzilla.wikimedia.org/show_bug.cgi?id=28627',
+ array(
+ 'http://org.wikimedia.bugzilla./show_bug.cgi?id=28627',
+ 'https://org.wikimedia.bugzilla./show_bug.cgi?id=28627'
+ )
+ ),
);
}