'DateFormats' => __DIR__ . '/maintenance/language/date-formats.php',
'DateFormatter' => __DIR__ . '/includes/parser/DateFormatter.php',
'DeadendPagesPage' => __DIR__ . '/includes/specials/SpecialDeadendpages.php',
+ 'DeduplicateArchiveRevId' => __DIR__ . '/maintenance/deduplicateArchiveRevId.php',
'DeferrableCallback' => __DIR__ . '/includes/deferred/DeferrableCallback.php',
'DeferrableUpdate' => __DIR__ . '/includes/deferred/DeferrableUpdate.php',
'DeferredStringifier' => __DIR__ . '/includes/libs/DeferredStringifier.php',
return true;
}
+	/**
+	 * Execute a maintenance script as one step of the update sequence.
+	 *
+	 * Use this only for scripts that have to finish before subsequent updates
+	 * run. A script that later updates do not depend on belongs in
+	 * DatabaseUpdater::$postDatabaseUpdateMaintenance instead.
+	 *
+	 * The script's execute() method has to return true on success; returning
+	 * false (or throwing an exception) signals failure.
+	 *
+	 * @since 1.32
+	 * @param string $class Maintenance subclass
+	 * @param string $script Script path and filename, usually "maintenance/fooBar.php"
+	 * @throws RuntimeException If the script's execute() returns a falsy value
+	 */
+	public function runMaintenance( $class, $script ) {
+		$this->output( "Running $script...\n" );
+
+		// Spawn the child script and run it right away; its return value is the verdict.
+		$succeeded = $this->maintenance->runChild( $class )->execute();
+		if ( $succeeded ) {
+			$this->output( "done.\n" );
+			return;
+		}
+
+		throw new RuntimeException( "Execution of $script did not complete successfully." );
+	}
+
/**
* Set any .htaccess files or equivalent for storage repos
*
[ 'populateExternallinksIndex60' ],
[ 'modifyfield', 'externallinks', 'el_index_60',
'patch-externallinks-el_index_60-drop-default.sql' ],
+ [ 'runMaintenance', DeduplicateArchiveRevId::class, 'maintenance/deduplicateArchiveRevId.php' ],
];
}
[ 'populateExternallinksIndex60' ],
[ 'modifyfield', 'externallinks', 'el_index_60',
'patch-externallinks-el_index_60-drop-default.sql' ],
+ [ 'runMaintenance', DeduplicateArchiveRevId::class, 'maintenance/deduplicateArchiveRevId.php' ],
];
}
// 1.32
[ 'addTable', 'change_tag_def', 'patch-change_tag_def.sql' ],
[ 'populateExternallinksIndex60' ],
+ [ 'runMaintenance', DeduplicateArchiveRevId::class, 'maintenance/deduplicateArchiveRevId.php' ],
// KEEP THIS AT THE BOTTOM!!
[ 'doRebuildDuplicateFunction' ],
[ 'addTable', 'change_tag_def', 'patch-change_tag_def.sql' ],
[ 'populateExternallinksIndex60' ],
[ 'dropDefault', 'externallinks', 'el_index_60' ],
+ [ 'runMaintenance', DeduplicateArchiveRevId::class, 'maintenance/deduplicateArchiveRevId.php' ],
];
}
[ 'populateExternallinksIndex60' ],
[ 'modifyfield', 'externallinks', 'el_index_60',
'patch-externallinks-el_index_60-drop-default.sql' ],
+ [ 'runMaintenance', DeduplicateArchiveRevId::class, 'maintenance/deduplicateArchiveRevId.php' ],
];
}
--- /dev/null
+<?php
+
+use Wikimedia\Rdbms\IDatabase;
+
+require_once __DIR__ . '/Maintenance.php';
+
+/**
+ * Maintenance script that cleans up archive rows with duplicated ar_rev_id,
+ * both within archive and between archive and revision.
+ *
+ * Archive rows that describe the same change as an existing revision row or
+ * as an earlier archive row are deleted; archive rows that share an ar_rev_id
+ * with a different change are given a new ar_rev_id.
+ *
+ * @ingroup Maintenance
+ * @since 1.32
+ */
+class DeduplicateArchiveRevId extends LoggedUpdateMaintenance {
+
+	/** @var array|null Actor join info for ar_user, filled in by doDBUpdates() */
+	private $arActorQuery = null;
+
+	/** @var int Number of archive rows deleted so far */
+	private $deleted = 0;
+
+	/** @var int Number of archive rows assigned a new ar_rev_id so far */
+	private $reassigned = 0;
+
+	public function __construct() {
+		parent::__construct();
+		$this->addDescription(
+			'Clean up duplicate ar_rev_id, both within archive and between archive and revision.'
+		);
+		$this->setBatchSize( 10000 );
+	}
+
+	/**
+	 * @return string Key under which completion of this update is recorded
+	 */
+	protected function getUpdateKey() {
+		return __CLASS__;
+	}
+
+	/**
+	 * Walk the ar_rev_id range in batches and deduplicate each batch inside
+	 * its own transaction.
+	 *
+	 * @return bool True once all batches have been processed
+	 */
+	protected function doDBUpdates() {
+		$this->output( "Deduplicating ar_rev_id...\n" );
+
+		$dbw = $this->getDB( DB_MASTER );
+
+		$minId = $dbw->selectField( 'archive', 'MIN(ar_rev_id)', [], __METHOD__ );
+		$maxId = $dbw->selectField( 'archive', 'MAX(ar_rev_id)', [], __METHOD__ );
+		if ( $minId === null || $minId === false || $maxId === null || $maxId === false ) {
+			// The archive table is empty, or no row has an ar_rev_id at all.
+			// Without this check "null <= null" would let the loop below run
+			// one pointless locked batch over ID 0.
+			$this->output( "No archive rows with an ar_rev_id found, nothing to do.\n" );
+			return true;
+		}
+		// The database hands back strings; use real integers for the batch arithmetic.
+		$minId = (int)$minId;
+		$maxId = (int)$maxId;
+		$batchSize = $this->getBatchSize();
+
+		$this->arActorQuery = ActorMigration::newMigration()->getJoin( 'ar_user' );
+		$revActorQuery = ActorMigration::newMigration()->getJoin( 'rev_user' );
+
+		for ( $id = $minId; $id <= $maxId; $id += $batchSize ) {
+			$endId = min( $maxId, $id + $batchSize - 1 );
+
+			$this->beginTransaction( $dbw, __METHOD__ );
+
+			// Lock the archive and revision table rows for the IDs we're checking
+			// to try to prevent deletions or undeletions from confusing things.
+			$dbw->selectRowCount(
+				'archive',
+				1,
+				[ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
+				__METHOD__,
+				[ 'FOR UPDATE' ]
+			);
+			$dbw->selectRowCount(
+				'revision',
+				1,
+				[ 'rev_id >= ' . (int)$id, 'rev_id <= ' . (int)$endId ],
+				__METHOD__,
+				[ 'LOCK IN SHARE MODE' ]
+			);
+
+			// Figure out the ar_rev_ids we actually need to look at.
+			// First: IDs that exist in both archive and revision.
+			$res = $dbw->select(
+				[ 'archive', 'revision' ] + $revActorQuery['tables'],
+				[ 'rev_id', 'rev_timestamp', 'rev_sha1' ] + $revActorQuery['fields'],
+				[ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
+				__METHOD__,
+				[ 'DISTINCT' ],
+				[ 'revision' => [ 'JOIN', 'ar_rev_id = rev_id' ] ] + $revActorQuery['joins']
+			);
+			$revRows = [];
+			foreach ( $res as $row ) {
+				$revRows[$row->rev_id] = $row;
+			}
+
+			// Second: IDs that occur more than once within archive itself.
+			$arRevIds = $dbw->selectFieldValues(
+				[ 'archive' ],
+				'ar_rev_id',
+				[ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
+				__METHOD__,
+				[ 'GROUP BY' => 'ar_rev_id', 'HAVING' => 'COUNT(*) > 1' ]
+			);
+			$arRevIds = array_values( array_unique( array_merge( $arRevIds, array_keys( $revRows ) ) ) );
+
+			if ( $arRevIds ) {
+				$this->processArRevIds( $dbw, $arRevIds, $revRows );
+			}
+
+			$this->output( "... $id-$endId\n" );
+			$this->commitTransaction( $dbw, __METHOD__ );
+		}
+
+		$this->output(
+			"Finished deduplicating ar_rev_id. $this->deleted rows deleted, "
+			. "$this->reassigned assigned new IDs.\n"
+		);
+		return true;
+	}
+
+	/**
+	 * Process a set of ar_rev_ids
+	 *
+	 * Deletes archive rows that duplicate a revision row or an earlier archive
+	 * row, and hands rows that conflict with a different change over to
+	 * PopulateArchiveRevId::reassignArRevIds(). Updates the deleted/reassigned
+	 * counters accordingly.
+	 *
+	 * @param IDatabase $dbw
+	 * @param int[] $arRevIds IDs to process
+	 * @param object[] $revRows Existing revision-table row data, keyed by rev_id
+	 */
+	private function processArRevIds( IDatabase $dbw, array $arRevIds, array $revRows ) {
+		// Select all the data we need for deduplication
+		$res = $dbw->select(
+			[ 'archive' ] + $this->arActorQuery['tables'],
+			[ 'ar_id', 'ar_rev_id', 'ar_namespace', 'ar_title', 'ar_timestamp', 'ar_sha1' ]
+				+ $this->arActorQuery['fields'],
+			[ 'ar_rev_id' => $arRevIds ],
+			__METHOD__,
+			[],
+			$this->arActorQuery['joins']
+		);
+
+		// Determine which rows we need to delete or reassign.
+		// $seen maps ar_rev_id => [ 'first' => description, <seen key> => ar_id, ... ]
+		$seen = [];
+		$toDelete = [];
+		$toReassign = [];
+		foreach ( $res as $row ) {
+			// Revision-table row exists?
+			if ( isset( $revRows[$row->ar_rev_id] ) ) {
+				$revRow = $revRows[$row->ar_rev_id];
+
+				// Record the rev_id as seen, so the code below will always delete or reassign.
+				if ( !isset( $seen[$revRow->rev_id] ) ) {
+					$seen[$revRow->rev_id] = [
+						'first' => "revision row",
+					];
+				}
+
+				// Delete the archive row if it seems to be the same regardless
+				// of page, because moves can change IDs and titles.
+				if ( $row->ar_timestamp === $revRow->rev_timestamp &&
+					$row->ar_sha1 === $revRow->rev_sha1 &&
+					$row->ar_user === $revRow->rev_user &&
+					$row->ar_user_text === $revRow->rev_user_text
+				) {
+					$this->output(
+						"Row $row->ar_id duplicates revision row for rev_id $revRow->rev_id, deleting\n"
+					);
+					$toDelete[] = $row->ar_id;
+					continue;
+				}
+			}
+
+			$key = $this->getSeenKey( $row );
+			if ( !isset( $seen[$row->ar_rev_id] ) ) {
+				// This rev_id hasn't even been seen yet, nothing to do besides record it.
+				$seen[$row->ar_rev_id] = [
+					'first' => "archive row $row->ar_id",
+					$key => $row->ar_id,
+				];
+			} elseif ( !isset( $seen[$row->ar_rev_id][$key] ) ) {
+				// The rev_id was seen, but not this particular change. Reassign it.
+				$seen[$row->ar_rev_id][$key] = $row->ar_id;
+				$this->output(
+					"Row $row->ar_id conflicts with {$seen[$row->ar_rev_id]['first']} "
+					. "for rev_id $row->ar_rev_id, reassigning\n"
+				);
+				$toReassign[] = $row->ar_id;
+			} else {
+				// The rev_id was seen with a row that matches this change. Delete it.
+				$this->output(
+					"Row $row->ar_id duplicates archive row {$seen[$row->ar_rev_id][$key]} "
+					. "for rev_id $row->ar_rev_id, deleting\n"
+				);
+				$toDelete[] = $row->ar_id;
+			}
+		}
+
+		// Perform the updates
+		if ( $toDelete ) {
+			$dbw->delete( 'archive', [ 'ar_id' => $toDelete ], __METHOD__ );
+			$this->deleted += $dbw->affectedRows();
+		}
+		if ( $toReassign ) {
+			$this->reassigned += PopulateArchiveRevId::reassignArRevIds( $dbw, $toReassign );
+		}
+	}
+
+	/**
+	 * Make a key identifying a "unique" change from a row
+	 *
+	 * The key joins namespace, title, timestamp, sha1, user ID and user name
+	 * with newlines, so two archive rows get the same key only when all of
+	 * those fields agree.
+	 *
+	 * @param object $row Archive row as selected by processArRevIds()
+	 * @return string
+	 */
+	private function getSeenKey( $row ) {
+		return implode( "\n", [
+			$row->ar_namespace,
+			$row->ar_title,
+			$row->ar_timestamp,
+			$row->ar_sha1,
+			$row->ar_user,
+			$row->ar_user_text,
+		] );
+	}
+
+}
+
+$maintClass = "DeduplicateArchiveRevId";
+require_once RUN_MAINTENANCE_IF_MAIN;
* @since 1.31
*/
class PopulateArchiveRevId extends LoggedUpdateMaintenance {
+
+ /** @var array|null Dummy revision row */
+ private static $dummyRev = null;
+
public function __construct() {
parent::__construct();
$this->addDescription( 'Populate ar_rev_id in pre-1.5 rows' );
return true;
}
- $rev = $this->makeDummyRevisionRow( $dbw );
$count = 0;
while ( true ) {
wfWaitForSlaves();
return true;
}
- try {
- $updates = $dbw->doAtomicSection( __METHOD__, function ( $dbw, $fname ) use ( $arIds, $rev ) {
- // Create new rev_ids by inserting dummy rows into revision and then deleting them.
- $dbw->insert( 'revision', array_fill( 0, count( $arIds ), $rev ), $fname );
- $revIds = $dbw->selectFieldValues(
- 'revision',
- 'rev_id',
- [ 'rev_timestamp' => $rev['rev_timestamp'] ],
- $fname
- );
- if ( !is_array( $revIds ) ) {
- throw new UnexpectedValueException( 'Failed to insert dummy revisions' );
- }
- if ( count( $revIds ) !== count( $arIds ) ) {
- throw new UnexpectedValueException(
- 'Tried to insert ' . count( $arIds ) . ' dummy revisions, but found '
- . count( $revIds ) . ' matching rows.'
- );
- }
- $dbw->delete( 'revision', [ 'rev_id' => $revIds ], $fname );
-
- return array_combine( $arIds, $revIds );
- } );
- } catch ( UnexpectedValueException $ex ) {
- $this->fatalError( $ex->getMessage() );
- }
+ $count += self::reassignArRevIds( $dbw, $arIds, [ 'ar_rev_id' => null ] );
+
+ $min = min( $arIds );
+ $max = max( $arIds );
+ $this->output( " ... $min-$max\n" );
+ }
+ }
- foreach ( $updates as $arId => $revId ) {
- $dbw->update(
- 'archive',
- [ 'ar_rev_id' => $revId ],
- [ 'ar_id' => $arId, 'ar_rev_id' => null ],
- __METHOD__
+ /**
+ * Assign new ar_rev_ids to a set of ar_ids.
+ *
+ * New rev_ids are reserved by inserting one dummy revision row per ar_id,
+ * reading the rev_ids of those rows back via the dummy timestamp, and
+ * deleting the dummy rows again, all inside one atomic section. The dummy
+ * row is built on first use and cached in self::$dummyRev for later calls.
+ * Each archive row is then updated individually; a row is only counted if
+ * it matches both its ar_id and the extra $conds.
+ *
+ * @param IDatabase $dbw
+ * @param int[] $arIds
+ * @param array $conds Extra conditions for the update
+ * @return int Number of updated rows
+ * @throws UnexpectedValueException If the dummy rows cannot be read back, or
+ *  their number differs from the number of ar_ids
+ */
+ public static function reassignArRevIds( IDatabase $dbw, array $arIds, array $conds = [] ) {
+ if ( !self::$dummyRev ) {
+ self::$dummyRev = self::makeDummyRevisionRow( $dbw );
+ }
+
+ $updates = $dbw->doAtomicSection( __METHOD__, function ( $dbw, $fname ) use ( $arIds ) {
+ // Create new rev_ids by inserting dummy rows into revision and then deleting them.
+ $dbw->insert( 'revision', array_fill( 0, count( $arIds ), self::$dummyRev ), $fname );
+ $revIds = $dbw->selectFieldValues(
+ 'revision',
+ 'rev_id',
+ [ 'rev_timestamp' => self::$dummyRev['rev_timestamp'] ],
+ $fname
+ );
+ if ( !is_array( $revIds ) ) {
+ throw new UnexpectedValueException( 'Failed to insert dummy revisions' );
+ }
+ if ( count( $revIds ) !== count( $arIds ) ) {
+ throw new UnexpectedValueException(
+ 'Tried to insert ' . count( $arIds ) . ' dummy revisions, but found '
+ . count( $revIds ) . ' matching rows.'
);
- $count += $dbw->affectedRows();
}
+ $dbw->delete( 'revision', [ 'rev_id' => $revIds ], $fname );
- $min = min( array_keys( $updates ) );
- $max = max( array_keys( $updates ) );
- $this->output( " ... $min-$max\n" );
+ return array_combine( $arIds, $revIds );
+ } );
+
+ $count = 0;
+ foreach ( $updates as $arId => $revId ) {
+ $dbw->update(
+ 'archive',
+ [ 'ar_rev_id' => $revId ],
+ [ 'ar_id' => $arId ] + $conds,
+ __METHOD__
+ );
+ $count += $dbw->affectedRows();
}
+ return $count;
}
/**
*
* The row will have a wildly unlikely timestamp, and possibly a generic
* user and comment, but will otherwise be derived from a revision on the
- * wiki's main page.
+ * wiki's main page or some other revision in the database.
*
* @param IDatabase $dbw
* @return array
*/
- private function makeDummyRevisionRow( IDatabase $dbw ) {
+ private static function makeDummyRevisionRow( IDatabase $dbw ) {
$ts = $dbw->timestamp( '11111111111111' );
+ $rev = null;
+
$mainPage = Title::newMainPage();
- if ( !$mainPage ) {
- $this->fatalError( 'Main page does not exist' );
+ $pageId = $mainPage ? $mainPage->getArticleId() : null;
+ if ( $pageId ) {
+ $rev = $dbw->selectRow(
+ 'revision',
+ '*',
+ [ 'rev_page' => $pageId ],
+ __METHOD__,
+ [ 'ORDER BY' => 'rev_timestamp ASC' ]
+ );
}
- $pageId = $mainPage->getArticleId();
- if ( !$pageId ) {
- $this->fatalError( $mainPage->getPrefixedText() . ' has no ID' );
+
+ if ( !$rev ) {
+ // No main page? Let's see if there are any revisions at all
+ $rev = $dbw->selectRow(
+ 'revision',
+ '*',
+ [],
+ __METHOD__,
+ [ 'ORDER BY' => 'rev_timestamp ASC' ]
+ );
}
- $rev = $dbw->selectRow(
- 'revision',
- '*',
- [ 'rev_page' => $pageId ],
- __METHOD__,
- [ 'ORDER BY' => 'rev_timestamp ASC' ]
- );
if ( !$rev ) {
- $this->fatalError( $mainPage->getPrefixedText() . ' has no revisions' );
+ throw new UnexpectedValueException( 'No revisions are available to copy' );
}
+
unset( $rev->rev_id );
$rev = (array)$rev;
$rev['rev_timestamp'] = $ts;
__METHOD__
);
if ( $any ) {
- $this->fatalError( "... Why does your database contain a revision dated $ts?" );
+ throw new UnexpectedValueException( "... Why does your database contain a revision dated $ts?" );
}
return $rev;