From: Tim Starling Date: Thu, 16 Aug 2018 07:01:55 +0000 (+1000) Subject: Don't throw an exception when waiting for replication times out X-Git-Tag: 1.34.0-rc.0~4205^2 X-Git-Url: http://git.cyclocoop.org/?a=commitdiff_plain;h=e8df0fbab17b9ef40b306c437a626acc852902bf;p=lhc%2Fweb%2Fwiklou.git Don't throw an exception when waiting for replication times out For maintenance scripts it is usually harmful to throw an exception. For jobs the exception was already caught and handled appropriately, so this can continue as before. For DeferredUpdates it was extremely harmful to throw an exception. So in the web case, reduce the timeout to 1s and continue as normal if the 1s timeout is reached. This allows the DeferredUpdate to be throttled without being killed. In the updater, increase the replication wait timeout to 5 minutes. ALTER TABLE could indeed cause replication lag, but exiting the update script with an exception will probably ruin your day. Update actions are not necessarily efficiently restartable. Do not call JobQueue::waitForBackups() when jobs are popped. Maybe it makes sense to call a queue-specific replication wait function for bulk inserts, like copyJobQueue.php, but doing it when jobs are popped just makes no sense. Surely the worst that could happen is that the queue would become locally empty? Removing this waitForBackups() call avoids waiting for replication twice when JobQueueDB is used. Bug: T201482 Change-Id: Ia820196caccf9c95007aea12175faf809800f084 --- diff --git a/includes/GlobalFunctions.php b/includes/GlobalFunctions.php index 567db853ee..000951d70e 100644 --- a/includes/GlobalFunctions.php +++ b/includes/GlobalFunctions.php @@ -30,7 +30,6 @@ use MediaWiki\Session\SessionManager; use MediaWiki\MediaWikiServices; use MediaWiki\Shell\Shell; use Wikimedia\ScopedCallback; -use Wikimedia\Rdbms\DBReplicationWaitError; use Wikimedia\WrappedString; /** @@ -2944,17 +2943,13 @@ function wfGetNull() { * @param float|null $ifWritesSince Only wait if writes were done since this UNIX timestamp * @param string|bool $wiki Wiki identifier accepted by wfGetLB * @param string|bool $cluster Cluster name accepted by LBFactory. Default: false. - * @param int|null $timeout Max wait time. Default: 1 day (cli), ~10 seconds (web) + * @param int|null $timeout Max wait time. Default: 60 seconds (cli), 1 second (web) * @return bool Success (able to connect and no timeouts reached) * @deprecated since 1.27 Use LBFactory::waitForReplication */ function wfWaitForSlaves( $ifWritesSince = null, $wiki = false, $cluster = false, $timeout = null ) { - if ( $timeout === null ) { - $timeout = wfIsCLI() ? 60 : 10; - } - if ( $cluster === '*' ) { $cluster = false; $wiki = false; @@ -2962,20 +2957,18 @@ function wfWaitForSlaves( $wiki = wfWikiID(); } - try { - $lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory(); - $lbFactory->waitForReplication( [ - 'wiki' => $wiki, - 'cluster' => $cluster, - 'timeout' => $timeout, - // B/C: first argument used to be "max seconds of lag"; ignore such values - 'ifWritesSince' => ( $ifWritesSince > 1e9 ) ? $ifWritesSince : null - ] ); - } catch ( DBReplicationWaitError $e ) { - return false; + $opts = [ + 'wiki' => $wiki, + 'cluster' => $cluster, + // B/C: first argument used to be "max seconds of lag"; ignore such values + 'ifWritesSince' => ( $ifWritesSince > 1e9 ) ? $ifWritesSince : null + ]; + if ( $timeout !== null ) { + $opts['timeout'] = $timeout; } - return true; + $lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory(); + return $lbFactory->waitForReplication( $opts ); } /** diff --git a/includes/installer/DatabaseUpdater.php b/includes/installer/DatabaseUpdater.php index e49a846679..9cea96bd16 100644 --- a/includes/installer/DatabaseUpdater.php +++ b/includes/installer/DatabaseUpdater.php @@ -34,6 +34,8 @@ require_once __DIR__ . '/../../maintenance/Maintenance.php'; * @since 1.17 */ abstract class DatabaseUpdater { + const REPLICATION_WAIT_TIMEOUT = 300; + /** * Array of updates to perform on the database * @@ -484,7 +486,7 @@ abstract class DatabaseUpdater { flush(); if ( $ret !== false ) { $updatesDone[] = $origParams; - $lbFactory->waitForReplication(); + $lbFactory->waitForReplication( [ 'timeout' => self::REPLICATION_WAIT_TIMEOUT ] ); } else { $updatesSkipped[] = [ $func, $params, $origParams ]; } diff --git a/includes/installer/MysqlUpdater.php b/includes/installer/MysqlUpdater.php index ac0a520435..e231042f6b 100644 --- a/includes/installer/MysqlUpdater.php +++ b/includes/installer/MysqlUpdater.php @@ -925,7 +925,8 @@ class MysqlUpdater extends DatabaseUpdater { $count = ( $count + 1 ) % 100; if ( $count == 0 ) { $lbFactory = $services->getDBLoadBalancerFactory(); - $lbFactory->waitForReplication( [ 'wiki' => wfWikiID() ] ); + $lbFactory->waitForReplication( [ + 'wiki' => wfWikiID(), 'timeout' => self::REPLICATION_WAIT_TIMEOUT ] ); } $this->db->insert( 'templatelinks', [ diff --git a/includes/jobqueue/JobRunner.php b/includes/jobqueue/JobRunner.php index 7c5d0ae703..dab9b14d19 100644 --- a/includes/jobqueue/JobRunner.php +++ b/includes/jobqueue/JobRunner.php @@ -29,7 +29,6 @@ use Psr\Log\LoggerInterface; use Wikimedia\ScopedCallback; use Wikimedia\Rdbms\LBFactory; use Wikimedia\Rdbms\DBError; -use Wikimedia\Rdbms\DBReplicationWaitError; /** * Job queue runner utility methods @@ -225,21 +224,16 @@ class JobRunner implements LoggerAwareInterface { // other wikis in the farm (on different masters) get a chance. $timePassed = microtime( true ) - $lastCheckTime; if ( $timePassed >= self::LAG_CHECK_PERIOD || $timePassed < 0 ) { - try { - $lbFactory->waitForReplication( [ - 'ifWritesSince' => $lastCheckTime, - 'timeout' => self::MAX_ALLOWED_LAG - ] ); - } catch ( DBReplicationWaitError $e ) { + $success = $lbFactory->waitForReplication( [ + 'ifWritesSince' => $lastCheckTime, + 'timeout' => self::MAX_ALLOWED_LAG, + ] ); + if ( !$success ) { $response['reached'] = 'replica-lag-limit'; break; } $lastCheckTime = microtime( true ); } - // Don't let any queue replica DBs/backups fall behind - if ( $jobsPopped > 0 && ( $jobsPopped % 100 ) == 0 ) { - $group->waitForBackups(); - } // Bail if near-OOM instead of in a job if ( !$this->checkMemoryOK() ) { diff --git a/includes/jobqueue/jobs/RecentChangesUpdateJob.php b/includes/jobqueue/jobs/RecentChangesUpdateJob.php index 8f508283d7..223ae324b8 100644 --- a/includes/jobqueue/jobs/RecentChangesUpdateJob.php +++ b/includes/jobqueue/jobs/RecentChangesUpdateJob.php @@ -19,7 +19,6 @@ * @ingroup JobQueue */ use MediaWiki\MediaWikiServices; -use Wikimedia\Rdbms\DBReplicationWaitError; /** * Job for pruning recent changes @@ -105,11 +104,9 @@ class RecentChangesUpdateJob extends Job { $dbw->delete( 'recentchanges', [ 'rc_id' => $rcIds ], __METHOD__ ); Hooks::run( 'RecentChangesPurgeRows', [ $rows ] ); // There might be more, so try waiting for replica DBs - try { - $factory->commitAndWaitForReplication( - __METHOD__, $ticket, [ 'timeout' => 3 ] - ); - } catch ( DBReplicationWaitError $e ) { + if ( !$factory->commitAndWaitForReplication( + __METHOD__, $ticket, [ 'timeout' => 3 ] + ) ) { // Another job will continue anyway break; } diff --git a/includes/jobqueue/jobs/RefreshLinksJob.php b/includes/jobqueue/jobs/RefreshLinksJob.php index 8854c6560f..8c4d1a1c8e 100644 --- a/includes/jobqueue/jobs/RefreshLinksJob.php +++ b/includes/jobqueue/jobs/RefreshLinksJob.php @@ -21,7 +21,6 @@ * @ingroup JobQueue */ use MediaWiki\MediaWikiServices; -use Wikimedia\Rdbms\DBReplicationWaitError; /** * Job to update link tables for pages @@ -89,13 +88,11 @@ class RefreshLinksJob extends Job { // From then on, we know that any template changes at the time the base job was // enqueued will be reflected in backlink page parses when the leaf jobs run. if ( !isset( $this->params['range'] ) ) { - try { - $lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory(); - $lbFactory->waitForReplication( [ + $lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory(); + if ( !$lbFactory->waitForReplication( [ 'wiki' => wfWikiID(), 'timeout' => self::LAG_WAIT_TIMEOUT - ] ); - } catch ( DBReplicationWaitError $e ) { // only try so hard + ] ) ) { // only try so hard $stats = MediaWikiServices::getInstance()->getStatsdDataFactory(); $stats->increment( 'refreshlinks.lag_wait_failed' ); } diff --git a/includes/libs/rdbms/exception/DBReplicationWaitError.php b/includes/libs/rdbms/exception/DBReplicationWaitError.php index c5dd8ae19c..74abace87f 100644 --- a/includes/libs/rdbms/exception/DBReplicationWaitError.php +++ b/includes/libs/rdbms/exception/DBReplicationWaitError.php @@ -23,6 +23,7 @@ namespace Wikimedia\Rdbms; /** * Exception class for replica DB wait timeouts + * @deprecated since 1.32 * @ingroup Database */ class DBReplicationWaitError extends DBExpectedError { diff --git a/includes/libs/rdbms/lbfactory/ILBFactory.php b/includes/libs/rdbms/lbfactory/ILBFactory.php index 23699c7a88..8e35456904 100644 --- a/includes/libs/rdbms/lbfactory/ILBFactory.php +++ b/includes/libs/rdbms/lbfactory/ILBFactory.php @@ -269,9 +269,9 @@ interface ILBFactory { * @param array $opts Optional fields that include: * - domain : wait on the load balancer DBs that handles the given domain ID * - cluster : wait on the given external load balancer DBs - * - timeout : Max wait time. Default: ~60 seconds + * - timeout : Max wait time. Default: 60 seconds for CLI, 1 second for web. * - ifWritesSince: Only wait if writes were done since this UNIX timestamp - * @throws DBReplicationWaitError If a timeout or error occurred waiting on a DB cluster + * @return bool True on success, false if a timeout or error occurred while waiting */ public function waitForReplication( array $opts = [] ); @@ -301,7 +301,7 @@ interface ILBFactory { * @param string $fname Caller name (e.g. __METHOD__) * @param mixed $ticket Result of getEmptyTransactionTicket() * @param array $opts Options to waitForReplication() - * @throws DBReplicationWaitError + * @return bool True if the wait was successful, false on timeout */ public function commitAndWaitForReplication( $fname, $ticket, array $opts = [] ); diff --git a/includes/libs/rdbms/lbfactory/LBFactory.php b/includes/libs/rdbms/lbfactory/LBFactory.php index be1a8511dc..e736ab9cc7 100644 --- a/includes/libs/rdbms/lbfactory/LBFactory.php +++ b/includes/libs/rdbms/lbfactory/LBFactory.php @@ -375,7 +375,7 @@ abstract class LBFactory implements ILBFactory { $opts += [ 'domain' => false, 'cluster' => false, - 'timeout' => $this->cliMode ? 60 : 10, + 'timeout' => $this->cliMode ? 60 : 1, 'ifWritesSince' => null ]; @@ -432,13 +432,7 @@ abstract class LBFactory implements ILBFactory { } } - if ( $failed ) { - throw new DBReplicationWaitError( - null, - "Could not wait for replica DBs to catch up to " . - implode( ', ', $failed ) - ); - } + return !$failed; } public function setWaitForReplicationListener( $name, callable $callback = null ) { @@ -478,12 +472,13 @@ abstract class LBFactory implements ILBFactory { } $this->commitMasterChanges( $fnameEffective ); - $this->waitForReplication( $opts ); + $waitSucceeded = $this->waitForReplication( $opts ); // If a nested caller committed on behalf of $fname, start another empty $fname // transaction, leaving the caller with the same empty transaction state as before. if ( $fnameEffective !== $fname ) { $this->beginMasterChanges( $fnameEffective ); } + return $waitSucceeded; } public function getChronologyProtectorTouched( $dbName ) { diff --git a/maintenance/Maintenance.php b/maintenance/Maintenance.php index 63777343d5..b446cc12b7 100644 --- a/maintenance/Maintenance.php +++ b/maintenance/Maintenance.php @@ -26,7 +26,6 @@ require_once __DIR__ . '/../includes/PHPVersionCheck.php'; wfEntryPointCheck( 'cli' ); use MediaWiki\Shell\Shell; -use Wikimedia\Rdbms\DBReplicationWaitError; /** * @defgroup MaintenanceArchive Maintenance archives @@ -1393,17 +1392,12 @@ abstract class Maintenance { */ protected function commitTransaction( IDatabase $dbw, $fname ) { $dbw->commit( $fname ); - try { - $lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory(); - $lbFactory->waitForReplication( - [ 'timeout' => 30, 'ifWritesSince' => $this->lastReplicationWait ] - ); - $this->lastReplicationWait = microtime( true ); - - return true; - } catch ( DBReplicationWaitError $e ) { - return false; - } + $lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory(); + $waitSucceeded = $lbFactory->waitForReplication( + [ 'timeout' => 30, 'ifWritesSince' => $this->lastReplicationWait ] + ); + $this->lastReplicationWait = microtime( true ); + return $waitSucceeded; } /** diff --git a/maintenance/updateSpecialPages.php b/maintenance/updateSpecialPages.php index 01aace0738..6627dafe26 100644 --- a/maintenance/updateSpecialPages.php +++ b/maintenance/updateSpecialPages.php @@ -25,7 +25,6 @@ require_once __DIR__ . '/Maintenance.php'; use MediaWiki\MediaWikiServices; -use Wikimedia\Rdbms\DBReplicationWaitError; /** * Maintenance script to update cached special pages. @@ -133,11 +132,7 @@ class UpdateSpecialPages extends Maintenance { $this->output( "Reconnected\n\n" ); } // Wait for the replica DB to catch up - try { - $lbFactory->waitForReplication(); - } catch ( DBReplicationWaitError $e ) { - // ignore - } + $lbFactory->waitForReplication(); } public function doSpecialPageCacheUpdates( $dbw ) {