includes/jobqueue/jobs/HTMLCacheUpdateJob.php

   1 <?php
   2 /**
   3  * HTML cache invalidation of all pages linking to a given title.
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License along
  16  * with this program; if not, write to the Free Software Foundation, Inc.,
  17  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18  * http://www.gnu.org/copyleft/gpl.html
  19  *
  20  * @file
  21  * @ingroup JobQueue
  22  * @ingroup Cache
  23  */
  24
  25 use MediaWiki\MediaWikiServices;
  26
  27 /**
  28  * Job to purge the cache for all pages that link to or use another page or file
  29  *
  30  * This job comes in a few variants:
  31  *   - a) Recursive jobs to purge caches for backlink pages for a given title.
  32  *        These jobs have (recursive:true,table:<table>) set.
  33  *   - b) Jobs to purge caches for a set of titles (the job title is ignored).
  34  *        These jobs have (pages:(<page ID>:(<namespace>,<title>),...) set.
  35  *
  36  * @ingroup JobQueue
  37  */
  38 class HTMLCacheUpdateJob extends Job {
  39         function __construct( Title $title, array $params ) {
  40                 parent::__construct( 'htmlCacheUpdate', $title, $params );
  41                 // Avoid the overhead of de-duplication when it would be pointless.
  42                 // Note that these jobs always set page_touched to the current time,
  43                 // so letting the older existing job "win" is still correct.
  44                 $this->removeDuplicates = (
  45                         // Ranges rarely will line up
  46                         !isset( $params['range'] ) &&
  47                         // Multiple pages per job make matches unlikely
  48                         !( isset( $params['pages'] ) && count( $params['pages'] ) != 1 )
  49                 );
  50         }
  51
  52         /**
  53          * @param Title $title Title to purge backlink pages from
  54          * @param string $table Backlink table name
  55          * @return HTMLCacheUpdateJob
  56          */
  57         public static function newForBacklinks( Title $title, $table ) {
  58                 return new self(
  59                         $title,
  60                         [
  61                                 'table' => $table,
  62                                 'recursive' => true
  63                         ] + Job::newRootJobParams( // "overall" refresh links job info
  64                                 "htmlCacheUpdate:{$table}:{$title->getPrefixedText()}"
  65                         )
  66                 );
  67         }
  68
  69         function run() {
  70                 global $wgUpdateRowsPerJob, $wgUpdateRowsPerQuery;
  71
  72                 if ( isset( $this->params['table'] ) && !isset( $this->params['pages'] ) ) {
  73                         $this->params['recursive'] = true; // b/c; base job
  74                 }
  75
  76                 // Job to purge all (or a range of) backlink pages for a page
  77                 if ( !empty( $this->params['recursive'] ) ) {
  78                         // Convert this into no more than $wgUpdateRowsPerJob HTMLCacheUpdateJob per-title
  79                         // jobs and possibly a recursive HTMLCacheUpdateJob job for the rest of the backlinks
  80                         $jobs = BacklinkJobUtils::partitionBacklinkJob(
  81                                 $this,
  82                                 $wgUpdateRowsPerJob,
  83                                 $wgUpdateRowsPerQuery, // jobs-per-title
  84                                 // Carry over information for de-duplication
  85                                 [ 'params' => $this->getRootJobParams() ]
  86                         );
  87                         JobQueueGroup::singleton()->push( $jobs );
  88                 // Job to purge pages for a set of titles
  89                 } elseif ( isset( $this->params['pages'] ) ) {
  90                         $this->invalidateTitles( $this->params['pages'] );
  91                 // Job to update a single title
  92                 } else {
  93                         $t = $this->title;
  94                         $this->invalidateTitles( [
  95                                 $t->getArticleID() => [ $t->getNamespace(), $t->getDBkey() ]
  96                         ] );
  97                 }
  98
  99                 return true;
 100         }
 101
 102         /**
 103          * @param array $pages Map of (page ID => (namespace, DB key)) entries
 104          */
 105         protected function invalidateTitles( array $pages ) {
 106                 global $wgUpdateRowsPerQuery, $wgUseFileCache;
 107
 108                 // Get all page IDs in this query into an array
 109                 $pageIds = array_keys( $pages );
 110                 if ( !$pageIds ) {
 111                         return;
 112                 }
 113
 114                 // Bump page_touched to the current timestamp. This used to use the root job timestamp
 115                 // (e.g. template/file edit time), which was a bit more efficient when template edits are
 116                 // rare and don't effect the same pages much. However, this way allows for better
 117                 // de-duplication, which is much more useful for wikis with high edit rates. Note that
 118                 // RefreshLinksJob, which is enqueued alongside HTMLCacheUpdateJob, saves the parser output
 119                 // since it has to parse anyway. We assume that vast majority of the cache jobs finish
 120                 // before the link jobs, so using the current timestamp instead of the root timestamp is
 121                 // not expected to invalidate these cache entries too often.
 122                 $touchTimestamp = wfTimestampNow();
 123                 // If page_touched is higher than this, then something else already bumped it after enqueue
 124                 $condTimestamp = isset( $this->params['rootJobTimestamp'] )
 125                         ? $this->params['rootJobTimestamp']
 126                         : $touchTimestamp;
 127
 128                 $dbw = wfGetDB( DB_MASTER );
 129                 $factory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
 130                 $ticket = $factory->getEmptyTransactionTicket( __METHOD__ );
 131                 // Update page_touched (skipping pages already touched since the root job).
 132                 // Check $wgUpdateRowsPerQuery for sanity; batch jobs are sized by that already.
 133                 foreach ( array_chunk( $pageIds, $wgUpdateRowsPerQuery ) as $batch ) {
 134                         $factory->commitAndWaitForReplication( __METHOD__, $ticket );
 135
 136                         $dbw->update( 'page',
 137                                 [ 'page_touched' => $dbw->timestamp( $touchTimestamp ) ],
 138                                 [ 'page_id' => $batch,
 139                                         // don't invalidated pages that were already invalidated
 140                                         "page_touched < " . $dbw->addQuotes( $dbw->timestamp( $condTimestamp ) )
 141                                 ],
 142                                 __METHOD__
 143                         );
 144                 }
 145                 // Get the list of affected pages (races only mean something else did the purge)
 146                 $titleArray = TitleArray::newFromResult( $dbw->select(
 147                         'page',
 148                         [ 'page_namespace', 'page_title' ],
 149                         [ 'page_id' => $pageIds, 'page_touched' => $dbw->timestamp( $touchTimestamp ) ],
 150                         __METHOD__
 151                 ) );
 152
 153                 // Update CDN; call purge() directly so as to not bother with secondary purges
 154                 $urls = [];
 155                 foreach ( $titleArray as $title ) {
 156                         /** @var Title $title */
 157                         $urls = array_merge( $urls, $title->getCdnUrls() );
 158                 }
 159                 CdnCacheUpdate::purge( $urls );
 160
 161                 // Update file cache
 162                 if ( $wgUseFileCache ) {
 163                         foreach ( $titleArray as $title ) {
 164                                 HTMLFileCache::clearFileCache( $title );
 165                         }
 166                 }
 167         }
 168
 169         public function workItemCount() {
 170                 if ( !empty( $this->params['recursive'] ) ) {
 171                         return 0; // nothing actually purged
 172                 } elseif ( isset( $this->params['pages'] ) ) {
 173                         return count( $this->params['pages'] );
 174                 }
 175
 176                 return 1; // one title
 177         }
 178 }