Don't throw an exception when waiting for replication times out
authorTim Starling <tstarling@wikimedia.org>
Thu, 16 Aug 2018 07:01:55 +0000 (17:01 +1000)
committerTim Starling <tstarling@wikimedia.org>
Mon, 3 Sep 2018 02:29:35 +0000 (12:29 +1000)
For maintenance scripts it is usually harmful to throw an exception.
For jobs the exception was already caught and handled appropriately,
so this can continue as before. For DeferredUpdates it was extremely
harmful to throw an exception. So in the web case, reduce the timeout to
1s and continue as normal if the 1s timeout is reached. This allows the
DeferredUpdate to be throttled without being killed.

In the updater, increase the replication wait timeout to 5 minutes.
ALTER TABLE could indeed cause replication lag, but exiting the update
script with an exception will probably ruin your day. Update actions are
not necessarily efficiently restartable.

Do not call JobQueue::waitForBackups() when jobs are popped. Maybe it
makes sense to call a queue-specific replication wait function for
bulk inserts, like copyJobQueue.php, but doing it when jobs are popped
just makes no sense. Surely the worst that could happen is that the
queue would become locally empty? Removing this waitForBackups() call
avoids waiting for replication twice when JobQueueDB is used.

Bug: T201482
Change-Id: Ia820196caccf9c95007aea12175faf809800f084

includes/GlobalFunctions.php
includes/installer/DatabaseUpdater.php
includes/installer/MysqlUpdater.php
includes/jobqueue/JobRunner.php
includes/jobqueue/jobs/RecentChangesUpdateJob.php
includes/jobqueue/jobs/RefreshLinksJob.php
includes/libs/rdbms/exception/DBReplicationWaitError.php
includes/libs/rdbms/lbfactory/ILBFactory.php
includes/libs/rdbms/lbfactory/LBFactory.php
maintenance/Maintenance.php
maintenance/updateSpecialPages.php

index 567db85..000951d 100644 (file)
@@ -30,7 +30,6 @@ use MediaWiki\Session\SessionManager;
 use MediaWiki\MediaWikiServices;
 use MediaWiki\Shell\Shell;
 use Wikimedia\ScopedCallback;
-use Wikimedia\Rdbms\DBReplicationWaitError;
 use Wikimedia\WrappedString;
 
 /**
@@ -2944,17 +2943,13 @@ function wfGetNull() {
  * @param float|null $ifWritesSince Only wait if writes were done since this UNIX timestamp
  * @param string|bool $wiki Wiki identifier accepted by wfGetLB
  * @param string|bool $cluster Cluster name accepted by LBFactory. Default: false.
- * @param int|null $timeout Max wait time. Default: 1 day (cli), ~10 seconds (web)
+ * @param int|null $timeout Max wait time. Default: 60 seconds (cli), 1 second (web)
  * @return bool Success (able to connect and no timeouts reached)
  * @deprecated since 1.27 Use LBFactory::waitForReplication
  */
 function wfWaitForSlaves(
        $ifWritesSince = null, $wiki = false, $cluster = false, $timeout = null
 ) {
-       if ( $timeout === null ) {
-               $timeout = wfIsCLI() ? 60 : 10;
-       }
-
        if ( $cluster === '*' ) {
                $cluster = false;
                $wiki = false;
@@ -2962,20 +2957,18 @@ function wfWaitForSlaves(
                $wiki = wfWikiID();
        }
 
-       try {
-               $lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
-               $lbFactory->waitForReplication( [
-                       'wiki' => $wiki,
-                       'cluster' => $cluster,
-                       'timeout' => $timeout,
-                       // B/C: first argument used to be "max seconds of lag"; ignore such values
-                       'ifWritesSince' => ( $ifWritesSince > 1e9 ) ? $ifWritesSince : null
-               ] );
-       } catch ( DBReplicationWaitError $e ) {
-               return false;
+       $opts = [
+               'wiki' => $wiki,
+               'cluster' => $cluster,
+               // B/C: first argument used to be "max seconds of lag"; ignore such values
+               'ifWritesSince' => ( $ifWritesSince > 1e9 ) ? $ifWritesSince : null
+       ];
+       if ( $timeout !== null ) {
+               $opts['timeout'] = $timeout;
        }
 
-       return true;
+       $lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
+       return $lbFactory->waitForReplication( $opts );
 }
 
 /**
index e49a846..9cea96b 100644 (file)
@@ -34,6 +34,8 @@ require_once __DIR__ . '/../../maintenance/Maintenance.php';
  * @since 1.17
  */
 abstract class DatabaseUpdater {
+       const REPLICATION_WAIT_TIMEOUT = 300;
+
        /**
         * Array of updates to perform on the database
         *
@@ -484,7 +486,7 @@ abstract class DatabaseUpdater {
                        flush();
                        if ( $ret !== false ) {
                                $updatesDone[] = $origParams;
-                               $lbFactory->waitForReplication();
+                               $lbFactory->waitForReplication( [ 'timeout' => self::REPLICATION_WAIT_TIMEOUT ] );
                        } else {
                                $updatesSkipped[] = [ $func, $params, $origParams ];
                        }
index ac0a520..e231042 100644 (file)
@@ -925,7 +925,8 @@ class MysqlUpdater extends DatabaseUpdater {
                                $count = ( $count + 1 ) % 100;
                                if ( $count == 0 ) {
                                        $lbFactory = $services->getDBLoadBalancerFactory();
-                                       $lbFactory->waitForReplication( [ 'wiki' => wfWikiID() ] );
+                                       $lbFactory->waitForReplication( [
+                                               'wiki' => wfWikiID(), 'timeout' => self::REPLICATION_WAIT_TIMEOUT ] );
                                }
                                $this->db->insert( 'templatelinks',
                                        [
index 7c5d0ae..dab9b14 100644 (file)
@@ -29,7 +29,6 @@ use Psr\Log\LoggerInterface;
 use Wikimedia\ScopedCallback;
 use Wikimedia\Rdbms\LBFactory;
 use Wikimedia\Rdbms\DBError;
-use Wikimedia\Rdbms\DBReplicationWaitError;
 
 /**
  * Job queue runner utility methods
@@ -225,21 +224,16 @@ class JobRunner implements LoggerAwareInterface {
                                // other wikis in the farm (on different masters) get a chance.
                                $timePassed = microtime( true ) - $lastCheckTime;
                                if ( $timePassed >= self::LAG_CHECK_PERIOD || $timePassed < 0 ) {
-                                       try {
-                                               $lbFactory->waitForReplication( [
-                                                       'ifWritesSince' => $lastCheckTime,
-                                                       'timeout' => self::MAX_ALLOWED_LAG
-                                               ] );
-                                       } catch ( DBReplicationWaitError $e ) {
+                                       $success = $lbFactory->waitForReplication( [
+                                               'ifWritesSince' => $lastCheckTime,
+                                               'timeout' => self::MAX_ALLOWED_LAG,
+                                       ] );
+                                       if ( !$success ) {
                                                $response['reached'] = 'replica-lag-limit';
                                                break;
                                        }
                                        $lastCheckTime = microtime( true );
                                }
-                               // Don't let any queue replica DBs/backups fall behind
-                               if ( $jobsPopped > 0 && ( $jobsPopped % 100 ) == 0 ) {
-                                       $group->waitForBackups();
-                               }
 
                                // Bail if near-OOM instead of in a job
                                if ( !$this->checkMemoryOK() ) {
index 8f50828..223ae32 100644 (file)
@@ -19,7 +19,6 @@
  * @ingroup JobQueue
  */
 use MediaWiki\MediaWikiServices;
-use Wikimedia\Rdbms\DBReplicationWaitError;
 
 /**
  * Job for pruning recent changes
@@ -105,11 +104,9 @@ class RecentChangesUpdateJob extends Job {
                                $dbw->delete( 'recentchanges', [ 'rc_id' => $rcIds ], __METHOD__ );
                                Hooks::run( 'RecentChangesPurgeRows', [ $rows ] );
                                // There might be more, so try waiting for replica DBs
-                               try {
-                                       $factory->commitAndWaitForReplication(
-                                               __METHOD__, $ticket, [ 'timeout' => 3 ]
-                                       );
-                               } catch ( DBReplicationWaitError $e ) {
+                               if ( !$factory->commitAndWaitForReplication(
+                                       __METHOD__, $ticket, [ 'timeout' => 3 ]
+                               ) ) {
                                        // Another job will continue anyway
                                        break;
                                }
index 8854c65..8c4d1a1 100644 (file)
@@ -21,7 +21,6 @@
  * @ingroup JobQueue
  */
 use MediaWiki\MediaWikiServices;
-use Wikimedia\Rdbms\DBReplicationWaitError;
 
 /**
  * Job to update link tables for pages
@@ -89,13 +88,11 @@ class RefreshLinksJob extends Job {
                        // From then on, we know that any template changes at the time the base job was
                        // enqueued will be reflected in backlink page parses when the leaf jobs run.
                        if ( !isset( $this->params['range'] ) ) {
-                               try {
-                                       $lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
-                                       $lbFactory->waitForReplication( [
+                               $lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
+                               if ( !$lbFactory->waitForReplication( [
                                                'wiki'    => wfWikiID(),
                                                'timeout' => self::LAG_WAIT_TIMEOUT
-                                       ] );
-                               } catch ( DBReplicationWaitError $e ) { // only try so hard
+                               ] ) ) { // only try so hard
                                        $stats = MediaWikiServices::getInstance()->getStatsdDataFactory();
                                        $stats->increment( 'refreshlinks.lag_wait_failed' );
                                }
index c5dd8ae..74abace 100644 (file)
@@ -23,6 +23,7 @@ namespace Wikimedia\Rdbms;
 
 /**
  * Exception class for replica DB wait timeouts
+ * @deprecated since 1.32
  * @ingroup Database
  */
 class DBReplicationWaitError extends DBExpectedError {
index 23699c7..8e35456 100644 (file)
@@ -269,9 +269,9 @@ interface ILBFactory {
         * @param array $opts Optional fields that include:
         *   - domain : wait on the load balancer DBs that handles the given domain ID
         *   - cluster : wait on the given external load balancer DBs
-        *   - timeout : Max wait time. Default: ~60 seconds
+        *   - timeout : Max wait time. Default: 60 seconds for CLI, 1 second for web.
         *   - ifWritesSince: Only wait if writes were done since this UNIX timestamp
-        * @throws DBReplicationWaitError If a timeout or error occurred waiting on a DB cluster
+        * @return bool True on success, false if a timeout or error occurred while waiting
         */
        public function waitForReplication( array $opts = [] );
 
@@ -301,7 +301,7 @@ interface ILBFactory {
         * @param string $fname Caller name (e.g. __METHOD__)
         * @param mixed $ticket Result of getEmptyTransactionTicket()
         * @param array $opts Options to waitForReplication()
-        * @throws DBReplicationWaitError
+        * @return bool True if the wait was successful, false on timeout
         */
        public function commitAndWaitForReplication( $fname, $ticket, array $opts = [] );
 
index be1a851..e736ab9 100644 (file)
@@ -375,7 +375,7 @@ abstract class LBFactory implements ILBFactory {
                $opts += [
                        'domain' => false,
                        'cluster' => false,
-                       'timeout' => $this->cliMode ? 60 : 10,
+                       'timeout' => $this->cliMode ? 60 : 1,
                        'ifWritesSince' => null
                ];
 
@@ -432,13 +432,7 @@ abstract class LBFactory implements ILBFactory {
                        }
                }
 
-               if ( $failed ) {
-                       throw new DBReplicationWaitError(
-                               null,
-                               "Could not wait for replica DBs to catch up to " .
-                               implode( ', ', $failed )
-                       );
-               }
+               return !$failed;
        }
 
        public function setWaitForReplicationListener( $name, callable $callback = null ) {
@@ -478,12 +472,13 @@ abstract class LBFactory implements ILBFactory {
                }
 
                $this->commitMasterChanges( $fnameEffective );
-               $this->waitForReplication( $opts );
+               $waitSucceeded = $this->waitForReplication( $opts );
                // If a nested caller committed on behalf of $fname, start another empty $fname
                // transaction, leaving the caller with the same empty transaction state as before.
                if ( $fnameEffective !== $fname ) {
                        $this->beginMasterChanges( $fnameEffective );
                }
+               return $waitSucceeded;
        }
 
        public function getChronologyProtectorTouched( $dbName ) {
index 6377734..b446cc1 100644 (file)
@@ -26,7 +26,6 @@ require_once __DIR__ . '/../includes/PHPVersionCheck.php';
 wfEntryPointCheck( 'cli' );
 
 use MediaWiki\Shell\Shell;
-use Wikimedia\Rdbms\DBReplicationWaitError;
 
 /**
  * @defgroup MaintenanceArchive Maintenance archives
@@ -1393,17 +1392,12 @@ abstract class Maintenance {
         */
        protected function commitTransaction( IDatabase $dbw, $fname ) {
                $dbw->commit( $fname );
-               try {
-                       $lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
-                       $lbFactory->waitForReplication(
-                               [ 'timeout' => 30, 'ifWritesSince' => $this->lastReplicationWait ]
-                       );
-                       $this->lastReplicationWait = microtime( true );
-
-                       return true;
-               } catch ( DBReplicationWaitError $e ) {
-                       return false;
-               }
+               $lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
+               $waitSucceeded = $lbFactory->waitForReplication(
+                       [ 'timeout' => 30, 'ifWritesSince' => $this->lastReplicationWait ]
+               );
+               $this->lastReplicationWait = microtime( true );
+               return $waitSucceeded;
        }
 
        /**
index 01aace0..6627daf 100644 (file)
@@ -25,7 +25,6 @@
 require_once __DIR__ . '/Maintenance.php';
 
 use MediaWiki\MediaWikiServices;
-use Wikimedia\Rdbms\DBReplicationWaitError;
 
 /**
  * Maintenance script to update cached special pages.
@@ -133,11 +132,7 @@ class UpdateSpecialPages extends Maintenance {
                        $this->output( "Reconnected\n\n" );
                }
                // Wait for the replica DB to catch up
-               try {
-                       $lbFactory->waitForReplication();
-               } catch ( DBReplicationWaitError $e ) {
-                       // ignore
-               }
+               $lbFactory->waitForReplication();
        }
 
        public function doSpecialPageCacheUpdates( $dbw ) {