3 use Wikimedia\Rdbms\IDatabase
;
5 require_once __DIR__
. '/Maintenance.php';
8 * Maintenance script that cleans up archive rows with duplicated ar_rev_id,
9 * both within archive and between archive and revision.
11 * @ingroup Maintenance
14 class DeduplicateArchiveRevId
extends LoggedUpdateMaintenance
{
16 /** @var array|null */
17 private $arActorQuery = null;
20 private $reassigned = 0;
/**
 * Set up the script: run the parent constructor, register the description
 * text for this script, and set the batch size to 10000.
 */
22 public function __construct() {
23 parent
::__construct();
24 $this->addDescription(
25 'Clean up duplicate ar_rev_id, both within archive and between archive and revision.'
// This batch size is read back through getBatchSize() in doDBUpdates(),
// where it determines how many consecutive ar_rev_id values each pass covers.
27 $this->setBatchSize( 10000 );
/**
 * Get the update key for this script.
 *
 * NOTE(review): presumably the identifier LoggedUpdateMaintenance uses to
 * record that this update has already been applied - confirm against the
 * parent class.
 */
30 protected function getUpdateKey() {
/**
 * Walk the ar_rev_id range of the archive table in batches and deduplicate
 * each batch.
 *
 * Each batch runs inside its own transaction. Within a batch this gathers the
 * revision rows whose rev_id equals an archive ar_rev_id in the range, plus
 * the ar_rev_id values that occur more than once, and passes them to
 * processArRevIds(). The final status text reports $this->deleted and
 * $this->reassigned.
 */
34 protected function doDBUpdates() {
35 $this->output( "Deduplicating ar_rev_id...\n" );
37 $dbw = $this->getDB( DB_MASTER
);
38 PopulateArchiveRevId
::checkMysqlAutoIncrementBug( $dbw );
// Lowest and highest ar_rev_id present in archive; the loop below walks
// this range.
40 $minId = $dbw->selectField( 'archive', 'MIN(ar_rev_id)', [], __METHOD__
);
41 $maxId = $dbw->selectField( 'archive', 'MAX(ar_rev_id)', [], __METHOD__
);
42 $batchSize = $this->getBatchSize();
// Actor-migration join info. The archive-side result is stored on $this
// because processArRevIds() reads it; the revision-side result is only
// used by the query in this method.
44 $this->arActorQuery
= ActorMigration
::newMigration()->getJoin( 'ar_user' );
45 $revActorQuery = ActorMigration
::newMigration()->getJoin( 'rev_user' );
// Each pass covers the inclusive ID range [$id, $endId], capped at $maxId.
47 for ( $id = $minId; $id <= $maxId; $id +
= $batchSize ) {
48 $endId = min( $maxId, $id +
$batchSize - 1 );
50 $this->beginTransaction( $dbw, __METHOD__
);
52 // Lock the archive and revision table rows for the IDs we're checking
53 // to try to prevent deletions or undeletions from confusing things.
57 [ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
64 [ 'rev_id >= ' . (int)$id, 'rev_id <= ' . (int)$endId ],
66 [ 'LOCK IN SHARE MODE' ]
69 // Figure out the ar_rev_ids we actually need to look at
71 [ 'archive', 'revision' ] +
$revActorQuery['tables'],
72 [ 'rev_id', 'rev_timestamp', 'rev_sha1' ] +
$revActorQuery['fields'],
73 [ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
76 [ 'revision' => [ 'JOIN', 'ar_rev_id = rev_id' ] ] +
$revActorQuery['joins']
// Index the joined revision rows by rev_id for lookup in processArRevIds().
79 foreach ( $res as $row ) {
80 $revRows[$row->rev_id
] = $row;
// ar_rev_id values in this range that appear on more than one row
// (GROUP BY ar_rev_id HAVING COUNT(*) > 1).
83 $arRevIds = $dbw->selectFieldValues(
86 [ 'ar_rev_id >= ' . (int)$id, 'ar_rev_id <= ' . (int)$endId ],
88 [ 'GROUP BY' => 'ar_rev_id', 'HAVING' => 'COUNT(*) > 1' ]
// Add the rev_ids that also exist in revision, drop repeats, and reindex
// so the result is a plain list.
90 $arRevIds = array_values( array_unique( array_merge( $arRevIds, array_keys( $revRows ) ) ) );
93 $this->processArRevIds( $dbw, $arRevIds, $revRows );
96 $this->output( "... $id-$endId\n" );
97 $this->commitTransaction( $dbw, __METHOD__
);
101 "Finished deduplicating ar_rev_id. $this->deleted rows deleted, "
102 . "$this->reassigned assigned new IDs.\n"
108 * Process a set of ar_rev_ids
109 * @param IDatabase $dbw
110 * @param int[] $arRevIds IDs to process
111 * @param object[] $revRows Existing revision-table row data
113 private function processArRevIds( IDatabase
$dbw, array $arRevIds, array $revRows ) {
// Bookkeeping used below:
//  - $seen is keyed by rev_id / ar_rev_id. Each entry has a 'first' label
//    (quoted in the "conflicts with" message) and, per getSeenKey() key,
//    the ar_id of the row recorded for that change.
//  - $toDelete collects ar_id values removed by the delete() call at the end.
//  - $toReassign collects ar_id values handed to
//    PopulateArchiveRevId::reassignArRevIds() at the end.
114 // Select all the data we need for deduplication
116 [ 'archive' ] +
$this->arActorQuery
['tables'],
117 [ 'ar_id', 'ar_rev_id', 'ar_namespace', 'ar_title', 'ar_timestamp', 'ar_sha1' ]
118 +
$this->arActorQuery
['fields'],
119 [ 'ar_rev_id' => $arRevIds ],
122 $this->arActorQuery
['joins']
125 // Determine which rows we need to delete or reassign
129 foreach ( $res as $row ) {
130 // Revision-table row exists?
131 if ( isset( $revRows[$row->ar_rev_id
] ) ) {
132 $revRow = $revRows[$row->ar_rev_id
];
134 // Record the rev_id as seen, so the code below will always delete or reassign.
135 if ( !isset( $seen[$revRow->rev_id
] ) ) {
136 $seen[$revRow->rev_id
] = [
137 'first' => "revision row",
141 // Delete the archive row if it seems to be the same regardless
142 // of page, because moves can change IDs and titles.
143 if ( $row->ar_timestamp
=== $revRow->rev_timestamp
&&
144 $row->ar_sha1
=== $revRow->rev_sha1
&&
145 $row->ar_user
=== $revRow->rev_user
&&
146 $row->ar_user_text
=== $revRow->rev_user_text
149 "Row $row->ar_id duplicates revision row for rev_id $revRow->rev_id, deleting\n"
151 $toDelete[] = $row->ar_id
;
// The key from getSeenKey() tells apart different changes that share one
// ar_rev_id.
156 $key = $this->getSeenKey( $row );
157 if ( !isset( $seen[$row->ar_rev_id
] ) ) {
158 // This rev_id hasn't even been seen yet, nothing to do besides record it.
159 $seen[$row->ar_rev_id
] = [
160 'first' => "archive row $row->ar_id",
163 } elseif ( !isset( $seen[$row->ar_rev_id
][$key] ) ) {
164 // The rev_id was seen, but not this particular change. Reassign it.
165 $seen[$row->ar_rev_id
][$key] = $row->ar_id
;
167 "Row $row->ar_id conflicts with {$seen[$row->ar_rev_id]['first']} "
168 . "for rev_id $row->ar_rev_id, reassigning\n"
170 $toReassign[] = $row->ar_id
;
172 // The rev_id was seen with a row that matches this change. Delete it.
174 "Row $row->ar_id duplicates archive row {$seen[$row->ar_rev_id][$key]} "
175 . "for rev_id $row->ar_rev_id, deleting\n"
177 $toDelete[] = $row->ar_id
;
181 // Perform the updates
183 $dbw->delete( 'archive', [ 'ar_id' => $toDelete ], __METHOD__
);
// Add the number of rows affected by the delete above to the running total
// reported at the end of doDBUpdates().
184 $this->deleted +
= $dbw->affectedRows();
// NOTE(review): reassignArRevIds() presumably returns the number of rows
// given a new ar_rev_id - confirm in PopulateArchiveRevId.
187 $this->reassigned +
= PopulateArchiveRevId
::reassignArRevIds( $dbw, $toReassign );
192 * Make a key identifying a "unique" change from a row
196 private function getSeenKey( $row ) {
// The key is the listed values joined with a newline. processArRevIds()
// uses it as the second-level index of its $seen map, so two rows with the
// same ar_rev_id and the same key are treated as the same change.
197 return implode( "\n", [
// $maintClass holds this script's class name, then RUN_MAINTENANCE_IF_MAIN
// is included.
// NOTE(review): RUN_MAINTENANCE_IF_MAIN is presumably defined via
// Maintenance.php and runs $maintClass when this file is the invoked
// script - confirm.
209 $maintClass = "DeduplicateArchiveRevId";
210 require_once RUN_MAINTENANCE_IF_MAIN
;