From 4c73c8a67647c1daaa3bdfa798a32c13ae466e6f Mon Sep 17 00:00:00 2001 From: Piotr Miazga Date: Thu, 8 Nov 2018 18:45:47 -0500 Subject: [PATCH] Provide a script to reset the page_random column Bug: T208909 Change-Id: I914ff44d1212c565e08ff17effbd682dfe7f70fb --- autoload.php | 1 + maintenance/resetPageRandom.php | 126 ++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 maintenance/resetPageRandom.php diff --git a/autoload.php b/autoload.php index 3daeee1939..8e764ae7ef 100644 --- a/autoload.php +++ b/autoload.php @@ -1207,6 +1207,7 @@ $wgAutoloadLocalClasses = [ 'RepoGroup' => __DIR__ . '/includes/filerepo/RepoGroup.php', 'RequestContext' => __DIR__ . '/includes/context/RequestContext.php', 'ResetAuthenticationThrottle' => __DIR__ . '/maintenance/resetAuthenticationThrottle.php', + 'ResetPageRandom' => __DIR__ . '/maintenance/resetPageRandom.php', 'ResetUserEmail' => __DIR__ . '/maintenance/resetUserEmail.php', 'ResetUserTokens' => __DIR__ . '/maintenance/resetUserTokens.php', 'ResourceFileCache' => __DIR__ . '/includes/cache/ResourceFileCache.php', diff --git a/maintenance/resetPageRandom.php b/maintenance/resetPageRandom.php new file mode 100644 index 0000000000..61102218e2 --- /dev/null +++ b/maintenance/resetPageRandom.php @@ -0,0 +1,126 @@ +addDescription( 'Reset the page_random for articles within given date range' ); + $this->addOption( 'from', + 'From date range selector to select articles to update, ex: 20041011000000' ); + $this->addOption( 'to', + 'To date range selector to select articles to update, ex: 20050708000000' ); + $this->addOption( 'dry', 'Do not update column' ); + $this->addOption( 'batch-start', + 'Optional: Use when you need to restart the reset process from a given page ID offset' + . ' in case a previous reset failed or was stopped' + ); + // Initialize batch size to a good default value and enable the batch size option. 
+		$this->setBatchSize( 200 );
+	}
+
+	/**
+	 * Assign a fresh page_random value to every page whose oldest revision falls
+	 * within the --from/--to range, walking the page table in batches ordered by
+	 * page ID. With --dry the matching pages are only counted, not updated.
+	 *
+	 * @return bool False when --from or --to does not yield a timestamp, or when
+	 *  --from is not smaller than --to; true once all batches have been processed
+	 */
+	public function execute() {
+		$batchSize = $this->getBatchSize();
+		$dbw = $this->getDB( DB_MASTER );
+		$lbFactory = \MediaWiki\MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
+		$dbr = $this->getDB( DB_REPLICA );
+		$from = wfTimestampOrNull( TS_MW, $this->getOption( 'from' ) );
+		$to = wfTimestampOrNull( TS_MW, $this->getOption( 'to' ) );
+
+		if ( $from === null || $to === null ) {
+			$this->output( "--from and --to have to be provided" . PHP_EOL );
+			return false;
+		}
+		if ( $from >= $to ) {
+			$this->output( "--from has to be smaller than --to" . PHP_EOL );
+			return false;
+		}
+		$batchStart = (int)$this->getOption( 'batch-start', 0 );
+		$changed = 0;
+		$dry = (bool)$this->getOption( 'dry' );
+
+		$message = "Resetting page_random column within date range from $from to $to";
+		if ( $batchStart > 0 ) {
+			$message .= " starting from page ID $batchStart";
+		}
+		$message .= $dry ? ". dry run" : '.';
+
+		$this->output( $message . PHP_EOL );
+
+		// The creation-date condition is identical for every batch, so build it once:
+		// the oldest revision timestamp of the page has to lie within [$from, $to].
+		$oldestRevision = $dbr->selectSQLText( 'revision', 'MIN(rev_timestamp)', 'rev_page=page_id' );
+		$createdInRange = '(' . $oldestRevision . ') ' .
+			'BETWEEN ' . $dbr->addQuotes( $dbr->timestamp( $from ) ) .
+			' AND ' . $dbr->addQuotes( $dbr->timestamp( $to ) );
+
+		do {
+			$this->output( " ...doing chunk of $batchSize from $batchStart " . PHP_EOL );
+
+			// Find the oldest page revision associated with each page_id. Iff it falls in the given
+			// time range AND the page ID is greater than $batchStart, yield the page ID. If it falls
+			// outside the time range, it was created before or after the occurrence of T208909 and
+			// its page_random is considered valid. The replica is used for this read since page_id
+			// and the rev_timestamp will not change between queries.
+			$res = $dbr->select(
+				'page',
+				'page_id',
+				[
+					$createdInRange,
+					'page_id > ' . $dbr->addQuotes( $batchStart )
+				],
+				__METHOD__,
+				[ 'LIMIT' => $batchSize, 'ORDER BY' => 'page_id' ]
+			);
+
+			// Reset per batch: foreach leaves $row untouched when $res is empty, so without
+			// this the check below would read an undefined variable (first batch) or a stale
+			// row left over from the previous batch.
+			$row = null;
+			foreach ( $res as $row ) {
+				if ( !$dry ) {
+					# Update the row...
+					$dbw->update( 'page',
+						[ 'page_random' => wfRandom() ],
+						[ 'page_id' => $row->page_id ],
+						__METHOD__ );
+					$changed += $dbw->affectedRows();
+				} else {
+					$changed++;
+				}
+			}
+			if ( $row !== null ) {
+				// Resume the next batch after the last page ID handled in this one.
+				$batchStart = $row->page_id;
+			}
+			// Otherwise $res was empty: $batchStart is left alone and the while() condition
+			// below evaluates to false, ending the loop.
+
+			$lbFactory->waitForReplication();
+		} while ( $res->numRows() === $batchSize );
+		$this->output( "page_random reset complete ... changed $changed rows" . PHP_EOL );
+
+		return true;
+	}
+}
+
+$maintClass = ResetPageRandom::class;
+require_once RUN_MAINTENANCE_IF_MAIN;
-- 
2.20.1