From a43f751cf6b6849edddc0a6504553330aef8912e Mon Sep 17 00:00:00 2001 From: Tim Starling Date: Tue, 12 Mar 2013 11:26:12 +1100 Subject: [PATCH] Reduce disruption during updateCollation.php Have updateCollation.php order by cl_to, so that each category is updated all at once. This minimises the time during which a category will appear to be incorrectly sorted, while the maintenance script is in progress. Mark the cl_collation index as needing deletion; it was always pretty pointless. You can't do much better than a full table scan when you're changing the collation value on a wiki. Increase the batch size since the lack of a cl_to,cl_from index means that it will have to filesort each category. A larger batch size means fewer sorts. As noted by Liangent on bug 45970, you can't order by cl_sortkey since that will change during execution. Also fix an inappropriate use of $wgMiserMode and remove a no-op from the SET clause of the UPDATE. Very lightly tested. Change-Id: I19bc8d6701f5f78040aa9c521427ac98ef488d89 --- maintenance/tables.sql | 4 +-- maintenance/updateCollation.php | 62 +++++++++++++++++++++++---------- 2 files changed, 45 insertions(+), 21 deletions(-) diff --git a/maintenance/tables.sql b/maintenance/tables.sql index a9177838fb..4307c0c711 100644 --- a/maintenance/tables.sql +++ b/maintenance/tables.sql @@ -562,10 +562,10 @@ CREATE UNIQUE INDEX /*i*/cl_from ON /*_*/categorylinks (cl_from,cl_to); -- callers won't be using an index: fix this? CREATE INDEX /*i*/cl_sortkey ON /*_*/categorylinks (cl_to,cl_type,cl_sortkey,cl_from); --- Not really used? 
+-- Used by the API (and some extensions) CREATE INDEX /*i*/cl_timestamp ON /*_*/categorylinks (cl_to,cl_timestamp); --- For finding rows with outdated collation +-- FIXME: Not used, delete this CREATE INDEX /*i*/cl_collation ON /*_*/categorylinks (cl_collation); -- diff --git a/maintenance/updateCollation.php b/maintenance/updateCollation.php index 04a2d47b72..2132938011 100644 --- a/maintenance/updateCollation.php +++ b/maintenance/updateCollation.php @@ -35,7 +35,7 @@ require_once( __DIR__ . '/Maintenance.php' ); * @ingroup Maintenance */ class UpdateCollation extends Maintenance { - const BATCH_SIZE = 50; // Number of rows to process in one batch + const BATCH_SIZE = 10000; // Number of rows to process in one batch const SYNC_INTERVAL = 20; // Wait for slaves after this many batches public $sizeHistogram = array(); @@ -82,10 +82,13 @@ TEXT; $collation = Collation::singleton(); } - $options = array( 'LIMIT' => self::BATCH_SIZE, 'STRAIGHT_JOIN' ); + $options = array( + 'LIMIT' => self::BATCH_SIZE, + 'ORDER BY' => 'cl_to, cl_type, cl_from', + 'STRAIGHT_JOIN', + ); if ( $force || $dryRun ) { - $options['ORDER BY'] = 'cl_from, cl_to'; $collationConds = array(); } else { if ( $this->hasOption( 'previous-collation' ) ) { @@ -96,20 +99,20 @@ TEXT; ); } - if ( !$wgMiserMode ) { + $count = $dbw->estimateRowCount( + 'categorylinks', + '*', + $collationConds, + __METHOD__ + ); + // Improve estimate if feasible + if ( $count < 1000000 ) { $count = $dbw->selectField( 'categorylinks', 'COUNT(*)', $collationConds, __METHOD__ ); - } else { - $count = $dbw->estimateRowCount( - 'categorylinks', - '*', - $collationConds, - __METHOD__ - ); } if ( $count == 0 ) { $this->output( "Collations up-to-date.\n" ); @@ -126,7 +129,7 @@ TEXT; $res = $dbw->select( array( 'categorylinks', 'page' ), array( 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation', - 'cl_sortkey', 'page_namespace', 'page_title' + 'cl_sortkey', 'cl_type', 'page_namespace', 'page_title' ), array_merge( 
$collationConds, $batchConds, array( 'cl_from = page_id' ) ), __METHOD__, @@ -175,7 +178,6 @@ TEXT; 'cl_sortkey_prefix' => $prefix, 'cl_collation' => $collationName, 'cl_type' => $type, - 'cl_timestamp = cl_timestamp', ), array( 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ), __METHOD__ @@ -186,12 +188,8 @@ TEXT; $dbw->commit( __METHOD__ ); } - if ( ( $force || $dryRun ) && $row ) { - $encFrom = $dbw->addQuotes( $row->cl_from ); - $encTo = $dbw->addQuotes( $row->cl_to ); - $batchConds = array( - "(cl_from = $encFrom AND cl_to > $encTo) " . - " OR cl_from > $encFrom" ); + if ( $row ) { + $batchConds = array( $this->getBatchCondition( $row ) ); } $count += $res->numRows(); @@ -212,6 +210,32 @@ TEXT; } } + /** + * Return an SQL expression selecting rows which sort above the given row, + * assuming an ordering of cl_to, cl_type, cl_from + */ + function getBatchCondition( $row ) { + $dbw = $this->getDB( DB_MASTER ); + $fields = array( 'cl_to', 'cl_type', 'cl_from' ); + $first = true; + $cond = false; + $prefix = false; + foreach ( $fields as $field ) { + $encValue = $dbw->addQuotes( $row->$field ); + $inequality = "$field > $encValue"; + $equality = "$field = $encValue"; + if ( $first ) { + $cond = $inequality; + $prefix = $equality; + $first = false; + } else { + $cond .= " OR ($prefix AND $inequality)"; + $prefix .= " AND $equality"; + } + } + return $cond; + } + function updateSortKeySizeHistogram( $key ) { $length = strlen( $key ); if ( !isset( $this->sizeHistogram[$length] ) ) { -- 2.20.1