[JobQueue] Reduced the change of deadlocks in recycleStaleJobs().
authorAaron Schulz <aschulz@wikimedia.org>
Thu, 6 Dec 2012 21:02:34 +0000 (13:02 -0800)
committerAaron Schulz <aschulz@wikimedia.org>
Thu, 6 Dec 2012 21:04:14 +0000 (13:04 -0800)
* Do the DELETE by primary key to avoid gap locks. This should cut
  down on deadlocks, though their frequency is still fairly modest.
* Also use GET_LOCK() to assure that two processes are not doing this
  update at the same time.

Change-Id: Ic3de3b7cb3acd5294a151b4a00bfed65c33d0f1e

includes/job/JobQueueDB.php

index cf6f1b9..3e057eb 100644 (file)
@@ -304,26 +304,42 @@ class JobQueueDB extends JobQueue {
                $dbw   = $this->getMasterDB();
                $count = 0; // affected rows
 
-               if ( $this->claimTTL > 0 ) { // re-try stale jobs...
+               if ( !$dbw->lock( "jobqueue-recycle-{$this->type}", __METHOD__, 1 ) ) {
+                       return $count; // already in progress
+               }
+
+               // Remove claims on jobs acquired for too long if enabled...
+               if ( $this->claimTTL > 0 ) {
                        $claimCutoff = $dbw->timestamp( $now - $this->claimTTL );
-                       // Reset job_token for these jobs so that other runners will pick them up.
-                       // Set the timestamp to the current time, as it is useful to now that the job
-                       // was already tried before.
-                       $dbw->update( 'job',
-                               array(
-                                       'job_token' => '',
-                                       'job_token_timestamp' => $dbw->timestamp( $now ) ), // time of release
+                       // Get the IDs of jobs that have be claimed but not finished after too long.
+                       // These jobs can be recycled into the queue by expiring the claim. Selecting
+                       // the IDs first means that the UPDATE can be done by primary key (less deadlocks).
+                       $res = $dbw->select( 'job', 'job_id',
                                array(
                                        'job_cmd' => $this->type,
                                        "job_token != {$dbw->addQuotes( '' )}", // was acquired
                                        "job_token_timestamp < {$dbw->addQuotes( $claimCutoff )}", // stale
-                                       "job_attempts < {$dbw->addQuotes( self::MAX_ATTEMPTS )}" ),
+                                       "job_attempts < {$dbw->addQuotes( self::MAX_ATTEMPTS )}" ), // retries left
                                __METHOD__
                        );
-                       $count += $dbw->affectedRows();
+                       $ids = array_map( function( $o ) { return $o->job_id; }, iterator_to_array( $res ) );
+                       if ( count( $ids ) ) {
+                               // Reset job_token for these jobs so that other runners will pick them up.
+                               // Set the timestamp to the current time, as it is useful to now that the job
+                               // was already tried before (the timestamp becomes the "released" time).
+                               $dbw->update( 'job',
+                                       array(
+                                               'job_token' => '',
+                                               'job_token_timestamp' => $dbw->timestamp( $now ) ), // time of release
+                                       array(
+                                               'job_id' => $ids ),
+                                       __METHOD__
+                               );
+                               $count += $dbw->affectedRows();
+                       }
                }
 
-               // Just destroy stale jobs...
+               // Just destroy any stale jobs...
                $pruneCutoff = $dbw->timestamp( $now - self::MAX_AGE_PRUNE );
                $conds = array(
                        'job_cmd' => $this->type,
@@ -333,8 +349,16 @@ class JobQueueDB extends JobQueue {
                if ( $this->claimTTL > 0 ) { // only prune jobs attempted too many times...
                        $conds[] = "job_attempts >= {$dbw->addQuotes( self::MAX_ATTEMPTS )}";
                }
-               $dbw->delete( 'job', $conds, __METHOD__ );
-               $count += $dbw->affectedRows();
+               // Get the IDs of jobs that are considered stale and should be removed. Selecting
+               // the IDs first means that the UPDATE can be done by primary key (less deadlocks).
+               $res = $dbw->select( 'job', 'job_id', $conds, __METHOD__ );
+               $ids = array_map( function( $o ) { return $o->job_id; }, iterator_to_array( $res ) );
+               if ( count( $ids ) ) {
+                       $dbw->delete( 'job', array( 'job_id' => $ids ), __METHOD__ );
+                       $count += $dbw->affectedRows();
+               }
+
+               $dbw->unlock( "jobqueue-recycle-{$this->type}", __METHOD__ );
 
                return $count;
        }