From 748a667f5a0918fe268ff7db3be95d65721e442c Mon Sep 17 00:00:00 2001
From: Tim Starling
Date: Wed, 12 Aug 2009 05:00:30 +0000
Subject: [PATCH] In response to a report from Domas that we are seeing
 HTMLCacheUpdate::invalidate() queries that touch hundreds of thousands of
 rows and cause significant slave lag:
 * Check the number of rows to be updated before actually doing the query,
 and if it is too large, repartition the job. Due to caching and job queue
 lag, it is possible that the original partitioning could be pathologically
 inaccurate.
 * Respect $wgUpdateRowsPerQuery (regression due to r47317) but increase the
 default from 10 to 100. It was originally chosen with a low value because I
 imagined that it would help reduce slave lag, but this is not generally the
 case since the queries may be in the same transaction.
 * Fix lack of initialisation of $jobs in insertJobs() (sloppy but not a bug)
 * To avoid queueing up jobs unnecessarily and to reduce the chance of jobs
 being repartitioned a large number of times as links are incrementally
 added, make the size threshold for queueing double the job size instead of
 equal to the job size
 * Add a check of title array size to the immediate case, to avoid updating
 hundreds of thousands of rows when an incorrect size is stored to memcached.

--- includes/DefaultSettings.php | 2 +- includes/HTMLCacheUpdate.php | 137 +++++++++++++++++++++++++++++------ 2 files changed, 116 insertions(+), 23 deletions(-) diff --git a/includes/DefaultSettings.php b/includes/DefaultSettings.php index 7da728af4c..46704e9629 100644 --- a/includes/DefaultSettings.php +++ b/includes/DefaultSettings.php @@ -3635,7 +3635,7 @@ $wgUpdateRowsPerJob = 500; /** * Number of rows to update per query */ -$wgUpdateRowsPerQuery = 10; +$wgUpdateRowsPerQuery = 100; /** * Enable AJAX framework diff --git a/includes/HTMLCacheUpdate.php b/includes/HTMLCacheUpdate.php index bd63c072de..7c4731b0ad 100644 --- a/includes/HTMLCacheUpdate.php +++ b/includes/HTMLCacheUpdate.php @@ -25,38 +25,119 @@ */ class HTMLCacheUpdate { - public $mTitle, $mTable, $mPrefix; + public $mTitle, $mTable, $mPrefix, $mStart, $mEnd; public $mRowsPerJob, $mRowsPerQuery; - function __construct( $titleTo, $table ) { + function __construct( $titleTo, $table, $start = false, $end = false ) { global $wgUpdateRowsPerJob, $wgUpdateRowsPerQuery; $this->mTitle = $titleTo; $this->mTable = $table; + $this->mStart = $start; + $this->mEnd = $end; $this->mRowsPerJob = $wgUpdateRowsPerJob; $this->mRowsPerQuery = $wgUpdateRowsPerQuery; $this->mCache = $this->mTitle->getBacklinkCache(); } public function doUpdate() { - # Fetch the IDs - $numRows = $this->mCache->getNumLinks( $this->mTable ); + if ( $this->mStart || $this->mEnd ) { + $this->doPartialUpdate(); + return; + } - if ( $numRows != 0 ) { - if ( $numRows > $this->mRowsPerJob ) { - $this->insertJobs(); + # Get an estimate of the number of rows from the BacklinkCache + $numRows = $this->mCache->getNumLinks( $this->mTable ); + if ( $numRows > $this->mRowsPerJob * 2 ) { + # Do fast cached partition + $this->insertJobs(); + } else { + # Get the links from the DB + $titleArray = $this->mCache->getLinks( $this->mTable ); + # Check if the row count estimate was correct + if ( $titleArray->count() > $this->mRowsPerJob * 2 ) { + # Not 
correct, do accurate partition + wfDebug( __METHOD__.": row count estimate was incorrect, repartitioning\n" ); + $this->insertJobsFromTitles( $titleArray ); } else { - $this->invalidate(); + $this->invalidateTitles( $titleArray ); } } wfRunHooks( 'HTMLCacheUpdate::doUpdate', array($this->mTitle) ); } + /** + * Update some of the backlinks, defined by a page ID range + */ + protected function doPartialUpdate() { + $titleArray = $this->mCache->getLinks( $this->mTable, $this->mStart, $this->mEnd ); + if ( $titleArray->count() <= $this->mRowsPerJob * 2 ) { + # This partition is small enough, do the update + $this->invalidateTitles( $titleArray ); + } else { + # Partitioning was excessively inaccurate. Divide the job further. + # This can occur when a large number of links are added in a short + # period of time, say by updating a heavily-used template. + $this->insertJobsFromTitles( $titleArray ); + } + } + + /** + * Partition the current range given by $this->mStart and $this->mEnd, + * using a pre-calculated title array which gives the links in that range. + * Queue the resulting jobs. + */ + protected function insertJobsFromTitles( $titleArray ) { + # We make subpartitions in the sense that the start of the first job + # will be the start of the parent partition, and the end of the last + # job will be the end of the parent partition. 
+ $jobs = array(); + $start = $this->mStart; # start of the current job + $numTitles = 0; + foreach ( $titleArray as $title ) { + $id = $title->getArticleID(); + # $numTitles is now the number of titles in the current job not + # including the current ID + if ( $numTitles >= $this->mRowsPerJob ) { + # Add a job up to but not including the current ID + $params = array( + 'table' => $this->mTable, + 'start' => $start, + 'end' => $id - 1 + ); + $jobs[] = new HTMLCacheUpdateJob( $this->mTitle, $params ); + $start = $id; + $numTitles = 0; + } + $numTitles++; + } + # Last job + $params = array( + 'table' => $this->mTable, + 'start' => $start, + 'end' => $this->mEnd + ); + $jobs[] = new HTMLCacheUpdateJob( $this->mTitle, $params ); + wfDebug( __METHOD__.": repartitioning into " . count( $jobs ) . " jobs\n" ); + + if ( count( $jobs ) < 2 ) { + # I don't think this is possible at present, but handling this case + # makes the code a bit more robust against future code updates and + # avoids a potential infinite loop of repartitioning + wfDebug( __METHOD__.": repartitioning failed!\n" ); + $this->invalidateTitles( $titleArray ); + return; + } + + Job::batchInsert( $jobs ); + } + protected function insertJobs() { $batches = $this->mCache->partition( $this->mTable, $this->mRowsPerJob ); if ( !$batches ) { return; } + $jobs = array(); foreach ( $batches as $batch ) { $params = array( 'table' => $this->mTable, @@ -68,17 +149,20 @@ class HTMLCacheUpdate Job::batchInsert( $jobs ); } - /** - * Invalidate a set of pages, right now + * Invalidate a range of pages, right now + * @deprecated */ public function invalidate( $startId = false, $endId = false ) { - global $wgUseFileCache, $wgUseSquid; - $titleArray = $this->mCache->getLinks( $this->mTable, $startId, $endId ); - if ( $titleArray->count() == 0 ) { - return; - } + $this->invalidateTitles( $titleArray ); + } + + /** + * Invalidate an array (or iterator) of Title objects, right now + */ + protected function invalidateTitles( 
$titleArray ) { + global $wgUseFileCache, $wgUseSquid; $dbw = wfGetDB( DB_MASTER ); $timestamp = $dbw->timestamp(); @@ -88,12 +172,20 @@ class HTMLCacheUpdate foreach ( $titleArray as $title ) { $ids[] = $title->getArticleID(); } + + if ( !$ids ) { + return; + } + # Update page_touched - $dbw->update( 'page', - array( 'page_touched' => $timestamp ), - array( 'page_id IN (' . $dbw->makeList( $ids ) . ')' ), - __METHOD__ - ); + $batches = array_chunk( $ids, $this->mRowsPerQuery ); + foreach ( $batches as $batch ) { + $dbw->update( 'page', + array( 'page_touched' => $timestamp ), + array( 'page_id IN (' . $dbw->makeList( $batch ) . ')' ), + __METHOD__ + ); + } # Update squid if ( $wgUseSquid ) { @@ -108,6 +200,7 @@ class HTMLCacheUpdate } } } + } /** @@ -133,8 +226,8 @@ class HTMLCacheUpdateJob extends Job { } public function run() { - $update = new HTMLCacheUpdate( $this->title, $this->table ); - $update->invalidate( $this->start, $this->end ); + $update = new HTMLCacheUpdate( $this->title, $this->table, $this->start, $this->end ); + $update->doUpdate(); return true; } } -- 2.20.1