Deduplicate archive.ar_rev_id
author Brad Jorsch <bjorsch@wikimedia.org>
Fri, 27 Apr 2018 17:10:36 +0000 (13:10 -0400)
committer Gergő Tisza <tgr.huwiki@gmail.com>
Wed, 16 May 2018 12:34:11 +0000 (14:34 +0200)
Old bugs and such may have left the archive table with multiple rows
using the same ar_rev_id, or rows that also exist in the revision table.
These need to be cleaned up for MCR.

The maintenance script added here will delete rows that appear to be
duplicates of the same change, and will assign new IDs to rows that do
not appear to be duplicates.

Bug: T193180
Change-Id: I39b0825c9469e074ded3df33a4f06a1ef0edb494

autoload.php
includes/installer/DatabaseUpdater.php
includes/installer/MssqlUpdater.php
includes/installer/MysqlUpdater.php
includes/installer/OracleUpdater.php
includes/installer/PostgresUpdater.php
includes/installer/SqliteUpdater.php
maintenance/deduplicateArchiveRevId.php [new file with mode: 0644]
maintenance/populateArchiveRevId.php

index e0b810b..c55b931 100644 (file)
@@ -361,6 +361,7 @@ $wgAutoloadLocalClasses = [
        'DateFormats' => __DIR__ . '/maintenance/language/date-formats.php',
        'DateFormatter' => __DIR__ . '/includes/parser/DateFormatter.php',
        'DeadendPagesPage' => __DIR__ . '/includes/specials/SpecialDeadendpages.php',
+       'DeduplicateArchiveRevId' => __DIR__ . '/maintenance/deduplicateArchiveRevId.php',
        'DeferrableCallback' => __DIR__ . '/includes/deferred/DeferrableCallback.php',
        'DeferrableUpdate' => __DIR__ . '/includes/deferred/DeferrableUpdate.php',
        'DeferredStringifier' => __DIR__ . '/includes/libs/DeferredStringifier.php',
index e2deed1..ba10278 100644 (file)
@@ -974,6 +974,31 @@ abstract class DatabaseUpdater {
                return true;
        }
 
+       /**
+        * Run a maintenance script
+        *
+        * This should only be used when the maintenance script must run before
+        * later updates. If later updates don't depend on the script, add it to
+        * DatabaseUpdater::$postDatabaseUpdateMaintenance instead.
+        *
+        * The script's execute() method must return true to indicate successful
+        * completion, and must return false (or throw an exception) to indicate
+        * unsuccessful completion.
+        *
+        * @since 1.32
+        * @param string $class Maintenance subclass
+        * @param string $script Script path and filename, usually "maintenance/fooBar.php"
+        */
+       public function runMaintenance( $class, $script ) {
+               $this->output( "Running $script...\n" );
+
+               // Obtain the script object for $class from the updater's own
+               // maintenance object and run it right away. A falsy result from
+               // execute() is treated as failure and aborts the update run.
+               $child = $this->maintenance->runChild( $class );
+               if ( !$child->execute() ) {
+                       throw new RuntimeException( "Execution of $script did not complete successfully." );
+               }
+
+               $this->output( "done.\n" );
+       }
+
        /**
         * Set any .htaccess files or equivilent for storage repos
         *
index e04ad8f..cc48e2f 100644 (file)
@@ -134,6 +134,7 @@ class MssqlUpdater extends DatabaseUpdater {
                        [ 'populateExternallinksIndex60' ],
                        [ 'modifyfield', 'externallinks', 'el_index_60',
                                'patch-externallinks-el_index_60-drop-default.sql' ],
+                       [ 'runMaintenance', DeduplicateArchiveRevId::class, 'maintenance/deduplicateArchiveRevId.php' ],
                ];
        }
 
index 1b9faf2..9365e98 100644 (file)
@@ -354,6 +354,7 @@ class MysqlUpdater extends DatabaseUpdater {
                        [ 'populateExternallinksIndex60' ],
                        [ 'modifyfield', 'externallinks', 'el_index_60',
                                'patch-externallinks-el_index_60-drop-default.sql' ],
+                       [ 'runMaintenance', DeduplicateArchiveRevId::class, 'maintenance/deduplicateArchiveRevId.php' ],
                ];
        }
 
index d55b520..34d81a7 100644 (file)
@@ -151,6 +151,7 @@ class OracleUpdater extends DatabaseUpdater {
                        // 1.32
                        [ 'addTable', 'change_tag_def', 'patch-change_tag_def.sql' ],
                        [ 'populateExternallinksIndex60' ],
+                       [ 'runMaintenance', DeduplicateArchiveRevId::class, 'maintenance/deduplicateArchiveRevId.php' ],
 
                        // KEEP THIS AT THE BOTTOM!!
                        [ 'doRebuildDuplicateFunction' ],
index 70547b4..6263edb 100644 (file)
@@ -574,6 +574,7 @@ class PostgresUpdater extends DatabaseUpdater {
                        [ 'addTable', 'change_tag_def', 'patch-change_tag_def.sql' ],
                        [ 'populateExternallinksIndex60' ],
                        [ 'dropDefault', 'externallinks', 'el_index_60' ],
+                       [ 'runMaintenance', DeduplicateArchiveRevId::class, 'maintenance/deduplicateArchiveRevId.php' ],
                ];
        }
 
index c2f2213..961cb84 100644 (file)
@@ -218,6 +218,7 @@ class SqliteUpdater extends DatabaseUpdater {
                        [ 'populateExternallinksIndex60' ],
                        [ 'modifyfield', 'externallinks', 'el_index_60',
                                'patch-externallinks-el_index_60-drop-default.sql' ],
+                       [ 'runMaintenance', DeduplicateArchiveRevId::class, 'maintenance/deduplicateArchiveRevId.php' ],
                ];
        }
 
diff --git a/maintenance/deduplicateArchiveRevId.php b/maintenance/deduplicateArchiveRevId.php
new file mode 100644 (file)
index 0000000..dad79b0
--- /dev/null
@@ -0,0 +1,209 @@
+<?php
+
+use Wikimedia\Rdbms\IDatabase;
+
+require_once __DIR__ . '/Maintenance.php';
+
+/**
+ * Maintenance script that cleans up archive rows with duplicated ar_rev_id,
+ * both within archive and between archive and revision.
+ *
+ * @ingroup Maintenance
+ * @since 1.32
+ */
+class DeduplicateArchiveRevId extends LoggedUpdateMaintenance {
+
+       /** @var array|null Result of ActorMigration::getJoin( 'ar_user' ), set by doDBUpdates() */
+       private $arActorQuery = null;
+
+       /** @var int Running total of archive rows deleted */
+       private $deleted = 0;
+
+       /** @var int Running total of archive rows that were given a new ar_rev_id */
+       private $reassigned = 0;
+
+       public function __construct() {
+               parent::__construct();
+               $this->addDescription(
+                       'Clean up duplicate ar_rev_id, both within archive and between archive and revision.'
+               );
+               $this->setBatchSize( 10000 );
+       }
+
+       /**
+        * @return string This class's name, used as the update key
+        */
+       protected function getUpdateKey() {
+               return __CLASS__;
+       }
+
+       /**
+        * Walk the whole ar_rev_id range in batches and deduplicate each batch
+        * @return bool Always true
+        */
+       protected function doDBUpdates() {
+               $this->output( "Deduplicating ar_rev_id...\n" );
+
+               $dbw = $this->getDB( DB_MASTER );
+
+               $minId = $dbw->selectField( 'archive', 'MIN(ar_rev_id)', [], __METHOD__ );
+               $maxId = $dbw->selectField( 'archive', 'MAX(ar_rev_id)', [], __METHOD__ );
+               $batchSize = $this->getBatchSize();
+
+               $this->arActorQuery = ActorMigration::newMigration()->getJoin( 'ar_user' );
+               $revActorQuery = ActorMigration::newMigration()->getJoin( 'rev_user' );
+
+               // MIN()/MAX() are null when archive is empty or holds no ar_rev_id at
+               // all (false is checked too in case no row came back). Without this
+               // guard "null <= null" would let the loop run one bogus pass over the
+               // range 0-0 and print "... -".
+               if ( $minId !== null && $minId !== false && $maxId !== null && $maxId !== false ) {
+                       $minId = (int)$minId;
+                       $maxId = (int)$maxId;
+                       for ( $id = $minId; $id <= $maxId; $id += $batchSize ) {
+                               $endId = min( $maxId, $id + $batchSize - 1 );
+                               $this->processBatch( $dbw, $id, $endId, $revActorQuery );
+                       }
+               }
+
+               $this->output(
+                       "Finished deduplicating ar_rev_id. $this->deleted rows deleted, "
+                       . "$this->reassigned assigned new IDs.\n"
+               );
+               return true;
+       }
+
+       /**
+        * Deduplicate one contiguous range of ar_rev_id values in its own transaction
+        * @param IDatabase $dbw
+        * @param int $id First ar_rev_id of the range (inclusive)
+        * @param int $endId Last ar_rev_id of the range (inclusive)
+        * @param array $revActorQuery Result of ActorMigration::getJoin( 'rev_user' )
+        */
+       private function processBatch( IDatabase $dbw, $id, $endId, array $revActorQuery ) {
+               $arRange = [ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ];
+
+               $this->beginTransaction( $dbw, __METHOD__ );
+
+               // Lock the archive and revision table rows for the IDs we're checking
+               // to try to prevent deletions or undeletions from confusing things.
+               $dbw->selectRowCount(
+                       'archive',
+                       1,
+                       $arRange,
+                       __METHOD__,
+                       [ 'FOR UPDATE' ]
+               );
+               $dbw->selectRowCount(
+                       'revision',
+                       1,
+                       [ 'rev_id >= ' . (int)$id, 'rev_id <= ' . (int)$endId ],
+                       __METHOD__,
+                       [ 'LOCK IN SHARE MODE' ]
+               );
+
+               // Archive rows whose ar_rev_id also exists in the revision table
+               $res = $dbw->select(
+                       [ 'archive', 'revision' ] + $revActorQuery['tables'],
+                       [ 'rev_id', 'rev_timestamp', 'rev_sha1' ] + $revActorQuery['fields'],
+                       $arRange,
+                       __METHOD__,
+                       [ 'DISTINCT' ],
+                       [ 'revision' => [ 'JOIN', 'ar_rev_id = rev_id' ] ] + $revActorQuery['joins']
+               );
+               $revRows = [];
+               foreach ( $res as $row ) {
+                       $revRows[$row->rev_id] = $row;
+               }
+
+               // ar_rev_ids that are used by more than one archive row
+               $arRevIds = $dbw->selectFieldValues(
+                       [ 'archive' ],
+                       'ar_rev_id',
+                       $arRange,
+                       __METHOD__,
+                       [ 'GROUP BY' => 'ar_rev_id', 'HAVING' => 'COUNT(*) > 1' ]
+               );
+
+               // Both kinds of conflict are handled together, each ID only once
+               $arRevIds = array_values(
+                       array_unique( array_merge( $arRevIds, array_keys( $revRows ) ) )
+               );
+
+               if ( $arRevIds ) {
+                       $this->processArRevIds( $dbw, $arRevIds, $revRows );
+               }
+
+               $this->output( "... $id-$endId\n" );
+               $this->commitTransaction( $dbw, __METHOD__ );
+       }
+
+       /**
+        * Process a set of ar_rev_ids
+        *
+        * Archive rows matching the revision-table row, or matching an archive
+        * row already seen for the same ar_rev_id, are deleted. Other rows that
+        * collide on an ar_rev_id already taken are given a new ar_rev_id.
+        *
+        * @param IDatabase $dbw
+        * @param int[] $arRevIds IDs to process
+        * @param object[] $revRows Existing revision-table row data, keyed by rev_id
+        */
+       private function processArRevIds( IDatabase $dbw, array $arRevIds, array $revRows ) {
+               // Select all the data we need for deduplication
+               $res = $dbw->select(
+                       [ 'archive' ] + $this->arActorQuery['tables'],
+                       [ 'ar_id', 'ar_rev_id', 'ar_namespace', 'ar_title', 'ar_timestamp', 'ar_sha1' ]
+                               + $this->arActorQuery['fields'],
+                       [ 'ar_rev_id' => $arRevIds ],
+                       __METHOD__,
+                       [],
+                       $this->arActorQuery['joins']
+               );
+
+               // Determine which rows we need to delete or reassign.
+               // $seen maps ar_rev_id => [ 'first' => description of the first owner,
+               // <seen key> => ar_id of the first archive row with that key, ... ]
+               $seen = [];
+               $toDelete = [];
+               $toReassign = [];
+               foreach ( $res as $row ) {
+                       // Revision-table row exists?
+                       if ( isset( $revRows[$row->ar_rev_id] ) ) {
+                               $revRow = $revRows[$row->ar_rev_id];
+
+                               // Record the rev_id as seen, so the code below will always delete or reassign.
+                               if ( !isset( $seen[$revRow->rev_id] ) ) {
+                                       $seen[$revRow->rev_id] = [
+                                               'first' => "revision row",
+                                       ];
+                               }
+
+                               // Delete the archive row if it seems to be the same regardless
+                               // of page, because moves can change IDs and titles.
+                               if ( $row->ar_timestamp === $revRow->rev_timestamp &&
+                                       $row->ar_sha1 === $revRow->rev_sha1 &&
+                                       $row->ar_user === $revRow->rev_user &&
+                                       $row->ar_user_text === $revRow->rev_user_text
+                               ) {
+                                       $this->output(
+                                               "Row $row->ar_id duplicates revision row for rev_id $revRow->rev_id, deleting\n"
+                                       );
+                                       $toDelete[] = $row->ar_id;
+                                       continue;
+                               }
+                       }
+
+                       $key = $this->getSeenKey( $row );
+                       if ( !isset( $seen[$row->ar_rev_id] ) ) {
+                               // This rev_id hasn't even been seen yet, nothing to do besides record it.
+                               $seen[$row->ar_rev_id] = [
+                                       'first' => "archive row $row->ar_id",
+                                       $key => $row->ar_id,
+                               ];
+                       } elseif ( !isset( $seen[$row->ar_rev_id][$key] ) ) {
+                               // The rev_id was seen, but not this particular change. Reassign it.
+                               $seen[$row->ar_rev_id][$key] = $row->ar_id;
+                               $this->output(
+                                       "Row $row->ar_id conflicts with {$seen[$row->ar_rev_id]['first']} "
+                                       . "for rev_id $row->ar_rev_id, reassigning\n"
+                               );
+                               $toReassign[] = $row->ar_id;
+                       } else {
+                               // The rev_id was seen with a row that matches this change. Delete it.
+                               $this->output(
+                                       "Row $row->ar_id duplicates archive row {$seen[$row->ar_rev_id][$key]} "
+                                       . "for rev_id $row->ar_rev_id, deleting\n"
+                               );
+                               $toDelete[] = $row->ar_id;
+                       }
+               }
+
+               // Perform the updates
+               if ( $toDelete ) {
+                       $dbw->delete( 'archive', [ 'ar_id' => $toDelete ], __METHOD__ );
+                       $this->deleted += $dbw->affectedRows();
+               }
+               if ( $toReassign ) {
+                       $this->reassigned += PopulateArchiveRevId::reassignArRevIds( $dbw, $toReassign );
+               }
+       }
+
+       /**
+        * Make a key identifying a "unique" change from a row
+        *
+        * The key joins namespace, title, timestamp, sha1, user ID and user
+        * name with newlines.
+        *
+        * @param object $row Archive row as selected by processArRevIds()
+        * @return string
+        */
+       private function getSeenKey( $row ) {
+               return implode( "\n", [
+                       $row->ar_namespace,
+                       $row->ar_title,
+                       $row->ar_timestamp,
+                       $row->ar_sha1,
+                       $row->ar_user,
+                       $row->ar_user_text,
+               ] );
+       }
+
+}
+
+$maintClass = "DeduplicateArchiveRevId";
+require_once RUN_MAINTENANCE_IF_MAIN;
index b8b9e68..e493506 100644 (file)
@@ -32,6 +32,10 @@ require_once __DIR__ . '/Maintenance.php';
  * @since 1.31
  */
 class PopulateArchiveRevId extends LoggedUpdateMaintenance {
+
+       /** @var array|null Dummy revision row */
+       private static $dummyRev = null;
+
        public function __construct() {
                parent::__construct();
                $this->addDescription( 'Populate ar_rev_id in pre-1.5 rows' );
@@ -58,7 +62,6 @@ class PopulateArchiveRevId extends LoggedUpdateMaintenance {
                        return true;
                }
 
-               $rev = $this->makeDummyRevisionRow( $dbw );
                $count = 0;
                while ( true ) {
                        wfWaitForSlaves();
@@ -75,47 +78,60 @@ class PopulateArchiveRevId extends LoggedUpdateMaintenance {
                                return true;
                        }
 
-                       try {
-                               $updates = $dbw->doAtomicSection( __METHOD__, function ( $dbw, $fname ) use ( $arIds, $rev ) {
-                                       // Create new rev_ids by inserting dummy rows into revision and then deleting them.
-                                       $dbw->insert( 'revision', array_fill( 0, count( $arIds ), $rev ), $fname );
-                                       $revIds = $dbw->selectFieldValues(
-                                               'revision',
-                                               'rev_id',
-                                               [ 'rev_timestamp' => $rev['rev_timestamp'] ],
-                                               $fname
-                                       );
-                                       if ( !is_array( $revIds ) ) {
-                                               throw new UnexpectedValueException( 'Failed to insert dummy revisions' );
-                                       }
-                                       if ( count( $revIds ) !== count( $arIds ) ) {
-                                               throw new UnexpectedValueException(
-                                                       'Tried to insert ' . count( $arIds ) . ' dummy revisions, but found '
-                                                       . count( $revIds ) . ' matching rows.'
-                                               );
-                                       }
-                                       $dbw->delete( 'revision', [ 'rev_id' => $revIds ], $fname );
-
-                                       return array_combine( $arIds, $revIds );
-                               } );
-                       } catch ( UnexpectedValueException $ex ) {
-                               $this->fatalError( $ex->getMessage() );
-                       }
+                       $count += self::reassignArRevIds( $dbw, $arIds, [ 'ar_rev_id' => null ] );
+
+                       $min = min( $arIds );
+                       $max = max( $arIds );
+                       $this->output( " ... $min-$max\n" );
+               }
+       }
 
-                       foreach ( $updates as $arId => $revId ) {
-                               $dbw->update(
-                                       'archive',
-                                       [ 'ar_rev_id' => $revId ],
-                                       [ 'ar_id' => $arId, 'ar_rev_id' => null ],
-                                       __METHOD__
+       /**
+        * Assign new ar_rev_ids to a set of ar_ids.
+        * @param IDatabase $dbw
+        * @param int[] $arIds
+        * @param array $conds Extra conditions for the update
+        * @return int Number of updated rows
+        */
+       public static function reassignArRevIds( IDatabase $dbw, array $arIds, array $conds = [] ) {
+               if ( !self::$dummyRev ) {
+                       self::$dummyRev = self::makeDummyRevisionRow( $dbw );
+               }
+
+               $updates = $dbw->doAtomicSection( __METHOD__, function ( $dbw, $fname ) use ( $arIds ) {
+                       // Create new rev_ids by inserting dummy rows into revision and then deleting them.
+                       $dbw->insert( 'revision', array_fill( 0, count( $arIds ), self::$dummyRev ), $fname );
+                       $revIds = $dbw->selectFieldValues(
+                               'revision',
+                               'rev_id',
+                               [ 'rev_timestamp' => self::$dummyRev['rev_timestamp'] ],
+                               $fname
+                       );
+                       if ( !is_array( $revIds ) ) {
+                               throw new UnexpectedValueException( 'Failed to insert dummy revisions' );
+                       }
+                       if ( count( $revIds ) !== count( $arIds ) ) {
+                               throw new UnexpectedValueException(
+                                       'Tried to insert ' . count( $arIds ) . ' dummy revisions, but found '
+                                       . count( $revIds ) . ' matching rows.'
                                );
-                               $count += $dbw->affectedRows();
                        }
+                       $dbw->delete( 'revision', [ 'rev_id' => $revIds ], $fname );
 
-                       $min = min( array_keys( $updates ) );
-                       $max = max( array_keys( $updates ) );
-                       $this->output( " ... $min-$max\n" );
+                       return array_combine( $arIds, $revIds );
+               } );
+
+               $count = 0;
+               foreach ( $updates as $arId => $revId ) {
+                       $dbw->update(
+                               'archive',
+                               [ 'ar_rev_id' => $revId ],
+                               [ 'ar_id' => $arId ] + $conds,
+                               __METHOD__
+                       );
+                       $count += $dbw->affectedRows();
                }
+               return $count;
        }
 
        /**
@@ -123,31 +139,41 @@ class PopulateArchiveRevId extends LoggedUpdateMaintenance {
         *
         * The row will have a wildly unlikely timestamp, and possibly a generic
         * user and comment, but will otherwise be derived from a revision on the
-        * wiki's main page.
+        * wiki's main page or some other revision in the database.
         *
         * @param IDatabase $dbw
         * @return array
         */
-       private function makeDummyRevisionRow( IDatabase $dbw ) {
+       private static function makeDummyRevisionRow( IDatabase $dbw ) {
                $ts = $dbw->timestamp( '11111111111111' );
+               $rev = null;
+
                $mainPage = Title::newMainPage();
-               if ( !$mainPage ) {
-                       $this->fatalError( 'Main page does not exist' );
+               $pageId = $mainPage ? $mainPage->getArticleId() : null;
+               if ( $pageId ) {
+                       $rev = $dbw->selectRow(
+                               'revision',
+                               '*',
+                               [ 'rev_page' => $pageId ],
+                               __METHOD__,
+                               [ 'ORDER BY' => 'rev_timestamp ASC' ]
+                       );
                }
-               $pageId = $mainPage->getArticleId();
-               if ( !$pageId ) {
-                       $this->fatalError( $mainPage->getPrefixedText() . ' has no ID' );
+
+               if ( !$rev ) {
+                       // No main page? Let's see if there are any revisions at all
+                       $rev = $dbw->selectRow(
+                               'revision',
+                               '*',
+                               [],
+                               __METHOD__,
+                               [ 'ORDER BY' => 'rev_timestamp ASC' ]
+                       );
                }
-               $rev = $dbw->selectRow(
-                       'revision',
-                       '*',
-                       [ 'rev_page' => $pageId ],
-                       __METHOD__,
-                       [ 'ORDER BY' => 'rev_timestamp ASC' ]
-               );
                if ( !$rev ) {
-                       $this->fatalError( $mainPage->getPrefixedText() . ' has no revisions' );
+                       throw new UnexpectedValueException( 'No revisions are available to copy' );
                }
+
                unset( $rev->rev_id );
                $rev = (array)$rev;
                $rev['rev_timestamp'] = $ts;
@@ -166,7 +192,7 @@ class PopulateArchiveRevId extends LoggedUpdateMaintenance {
                        __METHOD__
                );
                if ( $any ) {
-                       $this->fatalError( "... Why does your database contain a revision dated $ts?" );
+                       throw new UnexpectedValueException( "... Why does your database contain a revision dated $ts?" );
                }
 
                return $rev;