From a4041dd87f7b33b28de29ef0a4eebe33c7b0e6ca Mon Sep 17 00:00:00 2001
From: Charles Reiss <charles@eecs.berkeley.edu>
Date: Thu, 13 Dec 2012 16:11:08 -0800
Subject: Log duplicate slaveLost() calls in ClusterScheduler.

---
 .../src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala b/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala
index d160379b14..ab200decb1 100644
--- a/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala
+++ b/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala
@@ -254,14 +254,20 @@ private[spark] class ClusterScheduler(val sc: SparkContext)
     synchronized {
       val host = slaveIdToHost(slaveId)
       if (hostsAlive.contains(host)) {
+        logError("Lost an executor on " + host + ": " + reason)
         slaveIdsWithExecutors -= slaveId
         hostsAlive -= host
         activeTaskSetsQueue.foreach(_.hostLost(host))
         failedHost = Some(host)
+      } else {
+        // We may get multiple slaveLost() calls with different loss reasons. For example, one 
+        // may be triggered by a dropped connection from the slave while another may be a report
+        // of executor termination from Mesos. We produce log messages for both so we eventually
+        // report the termination reason.
+        logError("Lost an executor on " + host + " (already removed): " + reason)
       }
     }
     if (failedHost != None) {
-      logError("Lost an executor on " + failedHost.get + ": " + reason)
       listener.hostLost(failedHost.get)
       backend.reviveOffers()
     }
-- 
cgit v1.2.3