diff options
author | Matei Zaharia <matei@eecs.berkeley.edu> | 2013-01-22 15:48:37 -0800 |
---|---|---|
committer | Matei Zaharia <matei@eecs.berkeley.edu> | 2013-01-22 15:48:37 -0800 |
commit | 49c78446c83fd26ab6f3b3b168dc40b9e0088a5c (patch) | |
tree | bbf2f06e9210d38bedc23c12aca90c01fd583cda | |
parent | d1de9d7d15591d13c3ae27721f9926e94f7b6cf0 (diff) | |
parent | 27b3f3f0a980f86bac14a14516b5d52a32aa8cbb (diff) | |
download | spark-49c78446c83fd26ab6f3b3b168dc40b9e0088a5c.tar.gz spark-49c78446c83fd26ab6f3b3b168dc40b9e0088a5c.tar.bz2 spark-49c78446c83fd26ab6f3b3b168dc40b9e0088a5c.zip |
Merge pull request #402 from stephenh/handleslavelosttoosoon
Handle slaveLost before slaveIdToHost knows about it.
-rw-r--r-- | core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala | 31 |
1 files changed, 18 insertions, 13 deletions
diff --git a/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala b/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala
index 20f6e65020..a639b72795 100644
--- a/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala
+++ b/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala
@@ -252,19 +252,24 @@ private[spark] class ClusterScheduler(val sc: SparkContext)
   def slaveLost(slaveId: String, reason: ExecutorLossReason) {
     var failedHost: Option[String] = None
     synchronized {
-      val host = slaveIdToHost(slaveId)
-      if (hostsAlive.contains(host)) {
-        logError("Lost an executor on " + host + ": " + reason)
-        slaveIdsWithExecutors -= slaveId
-        hostsAlive -= host
-        activeTaskSetsQueue.foreach(_.hostLost(host))
-        failedHost = Some(host)
-      } else {
-        // We may get multiple slaveLost() calls with different loss reasons. For example, one
-        // may be triggered by a dropped connection from the slave while another may be a report
-        // of executor termination from Mesos. We produce log messages for both so we eventually
-        // report the termination reason.
-        logError("Lost an executor on " + host + " (already removed): " + reason)
+      slaveIdToHost.get(slaveId) match {
+        case Some(host) =>
+          if (hostsAlive.contains(host)) {
+            logError("Lost an executor on " + host + ": " + reason)
+            slaveIdsWithExecutors -= slaveId
+            hostsAlive -= host
+            activeTaskSetsQueue.foreach(_.hostLost(host))
+            failedHost = Some(host)
+          } else {
+            // We may get multiple slaveLost() calls with different loss reasons. For example, one
+            // may be triggered by a dropped connection from the slave while another may be a report
+            // of executor termination from Mesos. We produce log messages for both so we eventually
+            // report the termination reason.
+            logError("Lost an executor on " + host + " (already removed): " + reason)
+          }
+        case None =>
+          // We were told about a slave being lost before we could even allocate work to it
+          logError("Lost slave " + slaveId + " (no work assigned yet)")
       }
     }
     if (failedHost != None) {