RefreshLinksJob performance tweaks.
authorAaron Schulz <aschulz@wikimedia.org>
Thu, 2 Aug 2012 04:19:00 +0000 (21:19 -0700)
committerGerrit Code Review <gerrit@wikimedia.org>
Tue, 7 Aug 2012 00:35:38 +0000 (00:35 +0000)
* Made refreshLinksJob2 always spawn smaller jobs. This can reduce
  the problem of all runners doing the same refresh jobs by increasing the
  granularity of the work to single pages parses per job.
* Avoid master queries when fetching the latest revision for refresh links jobs.
  Also avoid the master for template fetching on parse. A LoadBalancer waitFor()
  call is used instead. The main reason for hitting the master to fetch templates
  was this job itself.
* Fixed bug in refreshLinksJob2 where one missing page would cause all the
  remaining updates for pages to be aborted.
* Factored out some code duplication between the two refresh links job classes.

Change-Id: Ieca51567a888f50a6f15b6c2606323da80d6584b

includes/job/RefreshLinksJob.php
includes/parser/Parser.php

index 7ccf00d..2eac326 100644 (file)
@@ -37,7 +37,6 @@ class RefreshLinksJob extends Job {
         * @return boolean success
         */
        function run() {
-               global $wgParser, $wgContLang;
                wfProfileIn( __METHOD__ );
 
                $linkCache = LinkCache::singleton();
@@ -49,26 +48,41 @@ class RefreshLinksJob extends Job {
                        return false;
                }
 
-               $revision = Revision::newFromTitle( $this->title );
+               # Wait for the DB of the current/next slave DB handle to catch up to the master.
+               # This way, we get the correct page_latest for templates or files that just changed
+               # milliseconds ago, having triggered this job to begin with.
+               if ( isset( $this->params['masterPos'] ) ) {
+                       wfGetLB()->waitFor( $this->params['masterPos'] );
+               }
+
+               $revision = Revision::newFromTitle( $this->title, 0, Revision::AVOID_MASTER );
                if ( !$revision ) {
-                       $this->error = 'refreshLinks: Article not found "' . $this->title->getPrefixedDBkey() . '"';
+                       $this->error = 'refreshLinks: Article not found "' .
+                               $this->title->getPrefixedDBkey() . '"';
                        wfProfileOut( __METHOD__ );
-                       return false;
+                       return false; // XXX: what if it was just deleted?
                }
 
-               wfProfileIn( __METHOD__.'-parse' );
-               $options = ParserOptions::newFromUserAndLang( new User, $wgContLang );
-               $parserOutput = $wgParser->parse( $revision->getText(), $this->title, $options, true, true, $revision->getId() );
-               wfProfileOut( __METHOD__.'-parse' );
-               wfProfileIn( __METHOD__.'-update' );
+               self::runForTitleInternal( $this->title, $revision, __METHOD__ );
 
-               $updates = $parserOutput->getSecondaryDataUpdates( $this->title, false );
-               DataUpdate::runUpdates( $updates );
-
-               wfProfileOut( __METHOD__.'-update' );
                wfProfileOut( __METHOD__ );
                return true;
        }
+
+       public static function runForTitleInternal( Title $title, Revision $revision, $fname ) {
+               global $wgParser, $wgContLang;
+
+               wfProfileIn( $fname . '-parse' );
+               $options = ParserOptions::newFromUserAndLang( new User, $wgContLang );
+               $parserOutput = $wgParser->parse(
+                       $revision->getText(), $title, $options, true, true, $revision->getId() );
+               wfProfileOut( $fname . '-parse' );
+
+               wfProfileIn( $fname . '-update' );
+               $updates = $parserOutput->getSecondaryDataUpdates( $title, false );
+               DataUpdate::runUpdates( $updates );
+               wfProfileOut( $fname . '-update' );
+       }
 }
 
 /**
@@ -78,6 +92,7 @@ class RefreshLinksJob extends Job {
  * @ingroup JobQueue
  */
 class RefreshLinksJob2 extends Job {
+       const MAX_TITLES_RUN = 10;
 
        function __construct( $title, $params, $id = 0 ) {
                parent::__construct( 'refreshLinks2', $title, $params, $id );
@@ -88,62 +103,100 @@ class RefreshLinksJob2 extends Job {
         * @return boolean success
         */
        function run() {
-               global $wgParser, $wgContLang;
-
                wfProfileIn( __METHOD__ );
 
                $linkCache = LinkCache::singleton();
                $linkCache->clear();
 
-               if( is_null( $this->title ) ) {
+               if ( is_null( $this->title ) ) {
                        $this->error = "refreshLinks2: Invalid title";
                        wfProfileOut( __METHOD__ );
                        return false;
-               }
-               if( !isset($this->params['start']) || !isset($this->params['end']) ) {
+               } elseif ( !isset( $this->params['start'] ) || !isset( $this->params['end'] ) ) {
                        $this->error = "refreshLinks2: Invalid params";
                        wfProfileOut( __METHOD__ );
                        return false;
                }
+
                // Back compat for pre-r94435 jobs
                $table = isset( $this->params['table'] ) ? $this->params['table'] : 'templatelinks';
-               $titles = $this->title->getBacklinkCache()->getLinks( 
-                       $table, $this->params['start'], $this->params['end']);
-               
-               # Not suitable for page load triggered job running!
-               # Gracefully switch to refreshLinks jobs if this happens.
-               if( php_sapi_name() != 'cli' ) {
+
+               // Avoid slave lag when fetching templates
+               if ( isset( $this->params['masterPos'] ) ) {
+                       $masterPos = $this->params['masterPos'];
+               } elseif ( wfGetLB()->getServerCount() > 1  ) {
+                       $masterPos = wfGetLB()->getMasterPos();
+               } else {
+                       $masterPos = false;
+               }
+
+               $titles = $this->title->getBacklinkCache()->getLinks(
+                       $table, $this->params['start'], $this->params['end'] );
+
+               if ( $titles->count() > self::MAX_TITLES_RUN ) {
+                       # We don't want to parse too many pages per job as it can starve other jobs.
+                       # If there are too many pages to parse, break this up into smaller jobs. By passing
+                       # in the master position here we can cut down on the time spent waiting for slaves to
+                       # catch up by the runners handling these jobs since time will have passed between now
+                       # and when they pop these jobs off the queue.
+                       $start = 0; // batch start
+                       $end   = 0; // batch end
+                       $bsize = 0; // batch size
+                       $first = true; // first of batch
+                       $jobs  = array();
+                       foreach ( $titles as $title ) {
+                               $start = $first ? $title->getArticleId() : $start;
+                               $end   = $title->getArticleId();
+                               $first = false;
+                               if ( ++$bsize >= self::MAX_TITLES_RUN ) {
+                                       $jobs[] = new RefreshLinksJob2( $this->title, array(
+                                               'table'     => $table,
+                                               'start'     => $start,
+                                               'end'       => $end,
+                                               'masterPos' => $masterPos
+                                       ) );
+                                       $first = true;
+                                       $start = $end = $bsize = 0;
+                               }
+                       }
+                       if ( $bsize > 0 ) { // group remaining pages into a job
+                               $jobs[] = new RefreshLinksJob2( $this->title, array(
+                                       'table'     => $table,
+                                       'start'     => $start,
+                                       'end'       => $end,
+                                       'masterPos' => $masterPos
+                               ) );
+                       }
+                       Job::batchInsert( $jobs );
+               } elseif ( php_sapi_name() != 'cli' ) {
+                       # Not suitable for page load triggered job running!
+                       # Gracefully switch to refreshLinks jobs if this happens.
                        $jobs = array();
                        foreach ( $titles as $title ) {
-                               $jobs[] = new RefreshLinksJob( $title, '' );
+                               $jobs[] = new RefreshLinksJob( $title, array( 'masterPos' => $masterPos ) );
                        }
                        Job::batchInsert( $jobs );
-
-                       wfProfileOut( __METHOD__ );
-                       return true;
-               }
-               $options = ParserOptions::newFromUserAndLang( new User, $wgContLang );
-               # Re-parse each page that transcludes this page and update their tracking links...
-               foreach ( $titles as $title ) {
-                       $revision = Revision::newFromTitle( $title );
-                       if ( !$revision ) {
-                               $this->error = 'refreshLinks: Article not found "' . $title->getPrefixedDBkey() . '"';
-                               wfProfileOut( __METHOD__ );
-                               return false;
+               } else {
+                       # Wait for the DB of the current/next slave DB handle to catch up to the master.
+                       # This way, we get the correct page_latest for templates or files that just changed
+                       # milliseconds ago, having triggered this job to begin with.
+                       if ( $masterPos ) {
+                               wfGetLB()->waitFor( $masterPos );
+                       }
+                       # Re-parse each page that transcludes this page and update their tracking links...
+                       foreach ( $titles as $title ) {
+                               $revision = Revision::newFromTitle( $title, 0, Revision::AVOID_MASTER );
+                               if ( !$revision ) {
+                                       $this->error = 'refreshLinks: Article not found "' .
+                                               $title->getPrefixedDBkey() . '"';
+                                       continue; // skip this page
+                               }
+                               RefreshLinksJob::runForTitleInternal( $title, $revision, __METHOD__ );
+                               wfWaitForSlaves();
                        }
-                       wfProfileIn( __METHOD__.'-parse' );
-                       $parserOutput = $wgParser->parse( $revision->getText(), $title, $options, true, true, $revision->getId() );
-                       wfProfileOut( __METHOD__.'-parse' );
-                       wfProfileIn( __METHOD__.'-update' );
-
-                       $updates = $parserOutput->getSecondaryDataUpdates( $title, false );
-                       DataUpdate::runUpdates( $updates );
-
-                       wfProfileOut( __METHOD__.'-update' );
-                       wfWaitForSlaves();
                }
-               wfProfileOut( __METHOD__ );
 
+               wfProfileOut( __METHOD__ );
                return true;
        }
 }
index 16f9cb2..28957df 100644 (file)
@@ -3552,7 +3552,7 @@ class Parser {
                        # Get the revision
                        $rev = $id
                                ? Revision::newFromId( $id )
-                               : Revision::newFromTitle( $title );
+                               : Revision::newFromTitle( $title, 0, Revision::AVOID_MASTER );
                        $rev_id = $rev ? $rev->getId() : 0;
                        # If there is no current revision, there is no page
                        if ( $id === false && !$rev ) {