Add better error logging for DB getLag() calls
authorAaron Schulz <aschulz@wikimedia.org>
Wed, 13 Jan 2016 22:33:38 +0000 (14:33 -0800)
committerBryanDavis <bdavis@wikimedia.org>
Thu, 14 Jan 2016 01:34:56 +0000 (01:34 +0000)
Bug: T32257
Change-Id: I4ea5db670fe96d20b1d593cc2d759f9c3f570790

includes/db/DatabaseMysqlBase.php
includes/db/loadbalancer/LoadBalancer.php
includes/db/loadbalancer/LoadMonitorMySQL.php

index 5bcfd66..3a8f737 100644 (file)
@@ -654,6 +654,13 @@ abstract class DatabaseMysqlBase extends Database {
        protected function getLagFromPtHeartbeat() {
                $masterInfo = $this->getMasterServerInfo();
                if ( !$masterInfo ) {
+                       wfLogDBError(
+                               "Unable to query master of {db_server} for server ID",
+                               $this->getLogContext( array(
+                                       'method' => __METHOD__
+                               ) )
+                       );
+
                        return false; // could not get master server ID
                }
 
@@ -666,6 +673,13 @@ abstract class DatabaseMysqlBase extends Database {
                        return max( $nowUnix - $timeUnix, 0.0 );
                }
 
+               wfLogDBError(
+                       "Unable to find pt-heartbeat row for {db_server}",
+                       $this->getLogContext( array(
+                               'method' => __METHOD__
+                       ) )
+               );
+
                return false;
        }
 
index 85a6bf0..3fe9638 100644 (file)
@@ -190,11 +190,13 @@ class LoadBalancer {
                                if ( isset( $this->mServers[$i]['max lag'] ) ) {
                                        $maxServerLag = min( $maxServerLag, $this->mServers[$i]['max lag'] );
                                }
+
+                               $host = $this->getServerName( $i );
                                if ( $lag === false ) {
-                                       wfDebugLog( 'replication', "Server #$i is not replicating" );
+                                       wfDebugLog( 'replication', "Server $host (#$i) is not replicating?" );
                                        unset( $loads[$i] );
                                } elseif ( $lag > $maxServerLag ) {
-                                       wfDebugLog( 'replication', "Server #$i is excessively lagged ($lag seconds)" );
+                                       wfDebugLog( 'replication', "Server $host (#$i) has >= $lag seconds of lag" );
                                        unset( $loads[$i] );
                                }
                        }
@@ -679,9 +681,7 @@ class LoadBalancer {
         *
         * @param int $i Server index
         * @param string|bool $wiki Wiki ID, or false for the current wiki
-        * @return DatabaseBase
-        *
-        * @access private
+        * @return DatabaseBase|bool Returns false on errors
         */
        public function openConnection( $i, $wiki = false ) {
                if ( $wiki !== false ) {
index 31f6163..e251501 100644 (file)
@@ -88,18 +88,33 @@ class LoadMonitorMySQL implements LoadMonitor {
 
                $lagTimes = array();
                foreach ( $serverIndexes as $i ) {
-                       if ( $i == 0 ) { # Master
-                               $lagTimes[$i] = 0;
+                       if ( $i == $this->parent->getWriterIndex() ) {
+                               $lagTimes[$i] = 0; // master always has no lag
                                continue;
                        }
+
                        $conn = $this->parent->getAnyOpenConnection( $i );
-                       if ( $conn !== false ) {
-                               $lagTimes[$i] = $conn->getLag();
+                       if ( $conn ) {
+                               $close = false; // already open
+                       } else {
+                               $conn = $this->parent->openConnection( $i, $wiki );
+                               $close = true; // new connection
+                       }
+
+                       if ( !$conn ) {
+                               $lagTimes[$i] = false;
+                               $host = $this->parent->getServerName( $i );
+                               wfDebugLog( 'replication', __METHOD__ . ": host $host (#$i) is unreachable" );
                                continue;
                        }
-                       $conn = $this->parent->openConnection( $i, $wiki );
-                       if ( $conn !== false ) {
-                               $lagTimes[$i] = $conn->getLag();
+
+                       $lagTimes[$i] = $conn->getLag();
+                       if ( $lagTimes[$i] === false ) {
+                               $host = $this->parent->getServerName( $i );
+                               wfDebugLog( 'replication', __METHOD__ . ": host $host (#$i) is not replicating?" );
+                       }
+
+                       if ( $close ) {
                                # Close the connection to avoid sleeper connections piling up.
                                # Note that the caller will pick one of these DBs and reconnect,
                                # which is slightly inefficient, but this only matters for the lag
@@ -124,7 +139,10 @@ class LoadMonitorMySQL implements LoadMonitor {
        }
 
        private function getLagTimeCacheKey() {
+               $writerIndex = $this->parent->getWriterIndex();
                // Lag is per-server, not per-DB, so key on the master DB name
-               return $this->srvCache->makeGlobalKey( 'lag-times', $this->parent->getServerName( 0 ) );
+               return $this->srvCache->makeGlobalKey(
+                       'lag-times', $this->parent->getServerName( $writerIndex )
+               );
        }
 }