From fef85e5d92ba19b778dd3d3e9c8bc67c203b9e7e Mon Sep 17 00:00:00 2001 From: Aaron Schulz Date: Wed, 13 Jan 2016 14:33:38 -0800 Subject: [PATCH] Add better error logging for DB getLag() calls Bug: T32257 Change-Id: I4ea5db670fe96d20b1d593cc2d759f9c3f570790 --- includes/db/DatabaseMysqlBase.php | 14 ++++++++ includes/db/loadbalancer/LoadBalancer.php | 10 +++--- includes/db/loadbalancer/LoadMonitorMySQL.php | 34 ++++++++++++++----- 3 files changed, 45 insertions(+), 13 deletions(-) diff --git a/includes/db/DatabaseMysqlBase.php b/includes/db/DatabaseMysqlBase.php index 5bcfd66d0c..3a8f7373b6 100644 --- a/includes/db/DatabaseMysqlBase.php +++ b/includes/db/DatabaseMysqlBase.php @@ -654,6 +654,13 @@ abstract class DatabaseMysqlBase extends Database { protected function getLagFromPtHeartbeat() { $masterInfo = $this->getMasterServerInfo(); if ( !$masterInfo ) { + wfLogDBError( + "Unable to query master of {db_server} for server ID", + $this->getLogContext( array( + 'method' => __METHOD__ + ) ) + ); + return false; // could not get master server ID } @@ -666,6 +673,13 @@ abstract class DatabaseMysqlBase extends Database { return max( $nowUnix - $timeUnix, 0.0 ); } + wfLogDBError( + "Unable to find pt-heartbeat row for {db_server}", + $this->getLogContext( array( + 'method' => __METHOD__ + ) ) + ); + return false; } diff --git a/includes/db/loadbalancer/LoadBalancer.php b/includes/db/loadbalancer/LoadBalancer.php index 85a6bf08e8..3fe963890e 100644 --- a/includes/db/loadbalancer/LoadBalancer.php +++ b/includes/db/loadbalancer/LoadBalancer.php @@ -190,11 +190,13 @@ class LoadBalancer { if ( isset( $this->mServers[$i]['max lag'] ) ) { $maxServerLag = min( $maxServerLag, $this->mServers[$i]['max lag'] ); } + + $host = $this->getServerName( $i ); if ( $lag === false ) { - wfDebugLog( 'replication', "Server #$i is not replicating" ); + wfDebugLog( 'replication', "Server $host (#$i) is not replicating?" ); unset( $loads[$i] ); } elseif ( $lag > $maxServerLag ) { - wfDebugLog( 'replication', "Server #$i is excessively lagged ($lag seconds)" ); + wfDebugLog( 'replication', "Server $host (#$i) has >= $lag seconds of lag" ); unset( $loads[$i] ); } } @@ -679,9 +681,7 @@ class LoadBalancer { * * @param int $i Server index * @param string|bool $wiki Wiki ID, or false for the current wiki - * @return DatabaseBase - * - * @access private + * @return DatabaseBase|bool Returns false on errors */ public function openConnection( $i, $wiki = false ) { if ( $wiki !== false ) { diff --git a/includes/db/loadbalancer/LoadMonitorMySQL.php b/includes/db/loadbalancer/LoadMonitorMySQL.php index 31f616366e..e251501626 100644 --- a/includes/db/loadbalancer/LoadMonitorMySQL.php +++ b/includes/db/loadbalancer/LoadMonitorMySQL.php @@ -88,18 +88,33 @@ class LoadMonitorMySQL implements LoadMonitor { $lagTimes = array(); foreach ( $serverIndexes as $i ) { - if ( $i == 0 ) { # Master - $lagTimes[$i] = 0; + if ( $i == $this->parent->getWriterIndex() ) { + $lagTimes[$i] = 0; // master always has no lag continue; } + $conn = $this->parent->getAnyOpenConnection( $i ); - if ( $conn !== false ) { - $lagTimes[$i] = $conn->getLag(); + if ( $conn ) { + $close = false; // already open + } else { + $conn = $this->parent->openConnection( $i, $wiki ); + $close = true; // new connection + } + + if ( !$conn ) { + $lagTimes[$i] = false; + $host = $this->parent->getServerName( $i ); + wfDebugLog( 'replication', __METHOD__ . ": host $host (#$i) is unreachable" ); continue; } - $conn = $this->parent->openConnection( $i, $wiki ); - if ( $conn !== false ) { - $lagTimes[$i] = $conn->getLag(); + + $lagTimes[$i] = $conn->getLag(); + if ( $lagTimes[$i] === false ) { + $host = $this->parent->getServerName( $i ); + wfDebugLog( 'replication', __METHOD__ . ": host $host (#$i) is not replicating?" ); + } + + if ( $close ) { # Close the connection to avoid sleeper connections piling up. # Note that the caller will pick one of these DBs and reconnect, # which is slightly inefficient, but this only matters for the lag @@ -124,7 +139,10 @@ class LoadMonitorMySQL implements LoadMonitor { } private function getLagTimeCacheKey() { + $writerIndex = $this->parent->getWriterIndex(); // Lag is per-server, not per-DB, so key on the master DB name - return $this->srvCache->makeGlobalKey( 'lag-times', $this->parent->getServerName( 0 ) ); + return $this->srvCache->makeGlobalKey( + 'lag-times', $this->parent->getServerName( $writerIndex ) + ); } } -- 2.20.1