From 41da159c87ce11e89fa867c1e08a639ddb0dc9c8 Mon Sep 17 00:00:00 2001
From: Aaron Schulz
Date: Wed, 27 Nov 2013 16:29:36 -0800
Subject: [PATCH] Converted html cache updates to use BacklinkJobUtils

* De-duplicate squid purges using page_touched
* Removed excess title validity check
* Also removed broken versioning in refreshLinks jobs
* Added TODO comment about $wgMaxBacklinksInvalidate
* Other documentation tweaks

Change-Id: If91e06549e5afa2d4e27595d0e131b4d78412ad2
---
 includes/job/jobs/HTMLCacheUpdateJob.php | 268 +++++++----------------
 includes/job/jobs/RefreshLinksJob.php    |  14 +-
 2 files changed, 85 insertions(+), 197 deletions(-)

diff --git a/includes/job/jobs/HTMLCacheUpdateJob.php b/includes/job/jobs/HTMLCacheUpdateJob.php
index 8885e25a79..0297a35e42 100644
--- a/includes/job/jobs/HTMLCacheUpdateJob.php
+++ b/includes/job/jobs/HTMLCacheUpdateJob.php
@@ -22,244 +22,132 @@
  */
 
 /**
- * Job wrapper for HTMLCacheUpdate. Gets run whenever a related
- * job gets called from the queue.
+ * Job to purge the cache for all pages that link to or use another page or file
  *
- * This class is designed to work efficiently with small numbers of links, and
- * to work reasonably well with up to ~10^5 links. Above ~10^6 links, the memory
- * and time requirements of loading all backlinked IDs in doUpdate() might become
- * prohibitive. The requirements measured at Wikimedia are approximately:
- *
- *   memory: 48 bytes per row
- *   time: 16us per row for the query plus processing
- *
- * The reason this query is done is to support partitioning of the job
- * by backlinked ID. The memory issue could be allieviated by doing this query in
- * batches, but of course LIMIT with an offset is inefficient on the DB side.
- *
- * The class is nevertheless a vast improvement on the previous method of using
- * File::getLinksTo() and Title::touchArray(), which uses about 2KB of memory per
- * link.
+ * This job comes in a few variants:
+ *   - a) Recursive jobs to purge caches for backlink pages for a given title.
+ *        These jobs have (recursive:true,table:<table>) set.
+ *   - b) Jobs to purge caches for a set of titles (the job title is ignored).
+ *        These jobs have (pages:(<page ID>:(<namespace>,<title>),...) set.
  *
  * @ingroup JobQueue
  */
 class HTMLCacheUpdateJob extends Job {
-	/** @var BacklinkCache */
-	protected $blCache;
-
-	/** @var int Number of rows to update per job, see $wgUpdateRowsPerJob */
-	protected $rowsPerJob;
-
-	/** @var int Number of rows to update per query, see $wgUpdateRowsPerQuery */
-	protected $rowsPerQuery;
-
-	/**
-	 * Construct a job
-	 * @param Title $title The title linked to
-	 * @param array $params job parameters (table, start and end page_ids)
-	 * @param int $id Job id
-	 */
-	function __construct( $title, $params, $id = 0 ) {
-		global $wgUpdateRowsPerJob, $wgUpdateRowsPerQuery;
-
+	function __construct( $title, $params = '', $id = 0 ) {
 		parent::__construct( 'htmlCacheUpdate', $title, $params, $id );
-
-		$this->rowsPerJob = $wgUpdateRowsPerJob;
-		$this->rowsPerQuery = $wgUpdateRowsPerQuery;
-		$this->blCache = $title->getBacklinkCache();
-	}
-
-	public function run() {
-		if ( isset( $this->params['start'] ) && isset( $this->params['end'] ) ) {
-			# This is hit when a job is actually performed
-			return $this->doPartialUpdate();
-		} else {
-			# This is hit when the jobs have to be inserted
-			return $this->doFullUpdate();
-		}
+		// Base backlink purge jobs can be de-duplicated
+		$this->removeDuplicates = ( !isset( $params['range'] ) && !isset( $params['pages'] ) );
 	}
 
-	/**
-	 * Update all of the backlinks
-	 */
-	protected function doFullUpdate() {
-		global $wgMaxBacklinksInvalidate;
+	function run() {
+		global $wgUpdateRowsPerJob, $wgUpdateRowsPerQuery, $wgMaxBacklinksInvalidate;
 
-		# Get an estimate of the number of rows from the BacklinkCache
-		$max = max( $this->rowsPerJob * 2, $wgMaxBacklinksInvalidate ) + 1;
-		$numRows = $this->blCache->getNumLinks( $this->params['table'], $max );
-		if ( $wgMaxBacklinksInvalidate !== false && $numRows > $wgMaxBacklinksInvalidate ) {
-			wfDebug( "Skipped HTML cache invalidation of {$this->title->getPrefixedText()}." );
-
-			return true;
-		}
+		static $expected = array( 'recursive', 'pages' ); // new jobs have one of these
 
-		if ( $numRows > $this->rowsPerJob * 2 ) {
-			# Do fast cached partition
-			$this->insertPartitionJobs();
-		} else {
-			# Get the links from the DB
-			$titleArray = $this->blCache->getLinks( $this->params['table'] );
-			# Check if the row count estimate was correct
-			if ( $titleArray->count() > $this->rowsPerJob * 2 ) {
-				# Not correct, do accurate partition
-				wfDebug( __METHOD__ . ": row count estimate was incorrect, repartitioning\n" );
-				$this->insertJobsFromTitles( $titleArray );
+		$oldRangeJob = false;
+		if ( !array_intersect( array_keys( $this->params ), $expected ) ) {
+			// B/C for older job params formats that lack these fields:
+			// a) base jobs with just ("table") and b) range jobs with ("table","start","end")
+			if ( isset( $this->params['start'] ) && isset( $this->params['end'] ) ) {
+				$oldRangeJob = true;
 			} else {
-				$this->invalidateTitles( $titleArray ); // just do the query
+				$this->params['recursive'] = true; // base job
 			}
 		}
-
-		return true;
-	}
-
-	/**
-	 * Update some of the backlinks, defined by a page ID range
-	 */
-	protected function doPartialUpdate() {
-		$titleArray = $this->blCache->getLinks(
-			$this->params['table'], $this->params['start'], $this->params['end'] );
-		if ( $titleArray->count() <= $this->rowsPerJob * 2 ) {
-			# This partition is small enough, do the update
-			$this->invalidateTitles( $titleArray );
-		} else {
-			# Partitioning was excessively inaccurate. Divide the job further.
-			# This can occur when a large number of links are added in a short
-			# period of time, say by updating a heavily-used template.
-			$this->insertJobsFromTitles( $titleArray );
-		}
-
-		return true;
-	}
+		// Job to purge all (or a range of) backlink pages for a page
+		if ( !empty( $this->params['recursive'] ) ) {
+			// @TODO: try to use delayed jobs if possible?
+			if ( !isset( $this->params['range'] ) && $wgMaxBacklinksInvalidate !== false ) {
+				$numRows = $this->title->getBacklinkCache()->getNumLinks(
+					$this->params['table'], $wgMaxBacklinksInvalidate );
+				if ( $numRows > $wgMaxBacklinksInvalidate ) {
+					return true;
+				}
+			}
+			// Convert this into no more than $wgUpdateRowsPerJob HTMLCacheUpdateJob per-title
+			// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
+			$jobs = BacklinkJobUtils::partitionBacklinkJob(
+				$this,
+				$wgUpdateRowsPerJob,
+				$wgUpdateRowsPerQuery, // jobs-per-title
+				// Carry over information for de-duplication
+				array( 'params' => $this->getRootJobParams() )
+			);
+			JobQueueGroup::singleton()->push( $jobs );
+		// Job to purge pages for a set of titles
+		} elseif ( isset( $this->params['pages'] ) ) {
+			$this->invalidateTitles( $this->params['pages'] );
+		// B/C for job to purge a range of backlink pages for a given page
+		} elseif ( $oldRangeJob ) {
+			$titleArray = $this->title->getBacklinkCache()->getLinks(
+				$this->params['table'], $this->params['start'], $this->params['end'] );
+
+			$pages = array(); // same format BacklinkJobUtils uses
+			foreach ( $titleArray as $tl ) {
+				$pages[$tl->getArticleId()] = array( $tl->getNamespace(), $tl->getDbKey() );
+			}
 
-	/**
-	 * Partition the current range given by $this->params['start'] and $this->params['end'],
-	 * using a pre-calculated title array which gives the links in that range.
-	 * Queue the resulting jobs.
-	 *
-	 * @param array|TitleArrayFromResult $titleArray
-	 * @param array $rootJobParams
-	 */
-	protected function insertJobsFromTitles( $titleArray, $rootJobParams = array() ) {
-		// Carry over any "root job" information
-		$rootJobParams = $this->getRootJobParams();
-		# We make subpartitions in the sense that the start of the first job
-		# will be the start of the parent partition, and the end of the last
-		# job will be the end of the parent partition.
-		$jobs = array();
-		$start = $this->params['start']; # start of the current job
-		$numTitles = 0;
-		/** @var Title $title */
-		foreach ( $titleArray as $title ) {
-			$id = $title->getArticleID();
-			# $numTitles is now the number of titles in the current job not
-			# including the current ID
-			if ( $numTitles >= $this->rowsPerJob ) {
-				# Add a job up to but not including the current ID
+			$jobs = array();
+			foreach ( array_chunk( $pages, $wgUpdateRowsPerJob ) as $pageChunk ) {
 				$jobs[] = new HTMLCacheUpdateJob( $this->title,
 					array(
 						'table' => $this->params['table'],
-						'start' => $start,
-						'end' => $id - 1
-					) + $rootJobParams // carry over information for de-duplication
+						'pages' => $pageChunk
+					) + $this->getRootJobParams() // carry over information for de-duplication
 				);
-				$start = $id;
-				$numTitles = 0;
 			}
-			$numTitles++;
-		}
-		# Last job
-		$jobs[] = new HTMLCacheUpdateJob( $this->title,
-			array(
-				'table' => $this->params['table'],
-				'start' => $start,
-				'end' => $this->params['end']
-			) + $rootJobParams // carry over information for de-duplication
-		);
-		wfDebug( __METHOD__ . ": repartitioning into " . count( $jobs ) .
" jobs\n" ); - - if ( count( $jobs ) < 2 ) { - # I don't think this is possible at present, but handling this case - # makes the code a bit more robust against future code updates and - # avoids a potential infinite loop of repartitioning - wfDebug( __METHOD__ . ": repartitioning failed!\n" ); - $this->invalidateTitles( $titleArray ); - } else { JobQueueGroup::singleton()->push( $jobs ); } + + return true; } /** - * @param array $rootJobParams + * @param array $pages Map of (page ID => (namespace, DB key)) entries */ - protected function insertPartitionJobs( $rootJobParams = array() ) { - // Carry over any "root job" information - $rootJobParams = $this->getRootJobParams(); + protected function invalidateTitles( array $pages ) { + global $wgUpdateRowsPerQuery, $wgUseFileCache, $wgUseSquid; - $batches = $this->blCache->partition( $this->params['table'], $this->rowsPerJob ); - if ( !count( $batches ) ) { - return; // no jobs to insert - } - - $jobs = array(); - foreach ( $batches as $batch ) { - list( $start, $end ) = $batch; - $jobs[] = new HTMLCacheUpdateJob( $this->title, - array( - 'table' => $this->params['table'], - 'start' => $start, - 'end' => $end, - ) + $rootJobParams // carry over information for de-duplication - ); + // Get all page IDs in this query into an array + $pageIds = array_keys( $pages ); + if ( !$pageIds ) { + return; } - JobQueueGroup::singleton()->push( $jobs ); - } - - /** - * Invalidate an array (or iterator) of Title objects, right now - * @param array|TitleArrayFromResult $titleArray - */ - protected function invalidateTitles( $titleArray ) { - global $wgUseFileCache, $wgUseSquid; - $dbw = wfGetDB( DB_MASTER ); $timestamp = $dbw->timestamp(); - # Get all IDs in this query into an array - $ids = array(); - /** @var Title $title */ - foreach ( $titleArray as $title ) { - $ids[] = $title->getArticleID(); - } - - if ( !$ids ) { - return; - } - - # Don't invalidated pages that were already invalidated + // Don't invalidated pages that were already invalidated $touchedCond = isset( $this->params['rootJobTimestamp'] ) ? array( "page_touched < " . $dbw->addQuotes( $dbw->timestamp( $this->params['rootJobTimestamp'] ) ) ) : array(); - # Update page_touched - $batches = array_chunk( $ids, $this->rowsPerQuery ); - foreach ( $batches as $batch ) { + // Update page_touched (skipping pages already touched since the root job). + // Check $wgUpdateRowsPerQuery for sanity; batch jobs are sized by that already. 
+		foreach ( array_chunk( $pageIds, $wgUpdateRowsPerQuery ) as $batch ) {
 			$dbw->update( 'page',
 				array( 'page_touched' => $timestamp ),
 				array( 'page_id' => $batch ) + $touchedCond,
 				__METHOD__
 			);
 		}
-
-		# Update squid
+		// Get the list of affected pages (races only mean something else did the purge)
+		$titleArray = TitleArray::newFromResult( $dbw->select(
+			'page',
+			array( 'page_namespace', 'page_title' ),
+			array( 'page_id' => $pageIds, 'page_touched' => $timestamp ),
+			__METHOD__
+		) );
+
+		// Update squid
 		if ( $wgUseSquid ) {
 			$u = SquidUpdate::newFromTitles( $titleArray );
 			$u->doUpdate();
 		}
 
-		# Update file cache
+		// Update file cache
 		if ( $wgUseFileCache ) {
 			foreach ( $titleArray as $title ) {
 				HTMLFileCache::clearFileCache( $title );
diff --git a/includes/job/jobs/RefreshLinksJob.php b/includes/job/jobs/RefreshLinksJob.php
index 0372d85927..81a6f2072b 100644
--- a/includes/job/jobs/RefreshLinksJob.php
+++ b/includes/job/jobs/RefreshLinksJob.php
@@ -25,18 +25,18 @@
  * Job to update link tables for pages
  *
  * This job comes in a few variants:
- *   - a) Recursive jobs to update links for backlink pages for a given title
- *   - b) Jobs to update links for a set of titles (the job title is ignored)
- *   - c) Jobs to update links for a single title (the job title)
+ *   - a) Recursive jobs to update links for backlink pages for a given title.
+ *        These jobs have (recursive:true,table:<table>) set.
+ *   - b) Jobs to update links for a set of pages (the job title is ignored).
+ *        These jobs have (pages:(<page ID>:(<namespace>,<title>),...) set.
+ *   - c) Jobs to update links for a single page (the job title)
+ *        These jobs need no extra fields set.
  *
  * @ingroup JobQueue
  */
 class RefreshLinksJob extends Job {
-	const VERSION = 1;
-
 	function __construct( $title, $params = '', $id = 0 ) {
 		parent::__construct( 'refreshLinks', $title, $params, $id );
-		$this->params['version'] = self::VERSION;
 		// Base backlink update jobs and per-title update jobs can be de-duplicated.
 		// If template A changes twice before any jobs run, a clean queue will have:
 		// (A base, A base)
@@ -65,7 +65,7 @@ class RefreshLinksJob extends Job {
 		}
 
 		// Job to update all (or a range of) backlink pages for a page
-		if ( isset( $this->params['recursive'] ) ) {
+		if ( !empty( $this->params['recursive'] ) ) {
 			// Carry over information for de-duplication
 			$extraParams = $this->getRootJobParams();
 			// Avoid slave lag when fetching templates.
-- 
2.20.1
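
For context, here is a minimal sketch (not part of the patch) of how a recursive base job in the new params format could be enqueued, e.g. after editing a widely used template. The template title and the root-job key string are hypothetical examples; the calls used (Title::newFromText, Job::newRootJobParams, JobQueueGroup::singleton()->push) are existing MediaWiki APIs of that era.

<?php
// Hypothetical example: queue a recursive HTML cache purge for everything
// that transcludes a template. The job later splits itself into per-title
// 'pages' jobs via BacklinkJobUtils::partitionBacklinkJob() as shown above.
$title = Title::newFromText( 'Template:Navbox' ); // made-up title
$job = new HTMLCacheUpdateJob(
	$title,
	array(
		'table' => 'templatelinks', // backlink table to walk
		'recursive' => true // base job; de-duplicated per the constructor above
	) + Job::newRootJobParams( // adds rootJobSignature/rootJobTimestamp
		"htmlCacheUpdate:templatelinks:{$title->getPrefixedText()}"
	)
);
JobQueueGroup::singleton()->push( $job );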