author     Matei Zaharia <matei@eecs.berkeley.edu>  2013-01-22 15:48:37 -0800
committer  Matei Zaharia <matei@eecs.berkeley.edu>  2013-01-22 15:48:37 -0800
commit     49c78446c83fd26ab6f3b3b168dc40b9e0088a5c (patch)
tree       bbf2f06e9210d38bedc23c12aca90c01fd583cda
parent     d1de9d7d15591d13c3ae27721f9926e94f7b6cf0 (diff)
parent     27b3f3f0a980f86bac14a14516b5d52a32aa8cbb (diff)
Merge pull request #402 from stephenh/handleslavelosttoosoon
Handle slaveLost before slaveIdToHost knows about it.
-rw-r--r--  core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala | 31
1 file changed, 18 insertions, 13 deletions
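
The bug: slaveIdToHost(slaveId) is Scala's Map.apply, which throws NoSuchElementException when the key is absent, so a slaveLost notification arriving before the slave was ever assigned work would crash the scheduler. slaveIdToHost.get(slaveId) instead returns an Option[String] that can be pattern-matched safely. A minimal standalone sketch of the idiom (the map contents and object name here are made up for illustration, not Spark code):

object SlaveLookupSketch {
  def main(args: Array[String]): Unit = {
    val slaveIdToHost = Map("slave-1" -> "host-a")

    // Map.apply throws when the key is missing -- the crash this patch avoids:
    //   slaveIdToHost("slave-2")   // java.util.NoSuchElementException: key not found: slave-2

    // Map.get returns an Option, so the not-yet-registered case is handled explicitly:
    slaveIdToHost.get("slave-2") match {
      case Some(host) => println("Lost an executor on " + host)
      case None       => println("Lost slave slave-2 (no work assigned yet)")
    }
  }
}
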
diff --git a/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala b/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala
index 20f6e65020..a639b72795 100644
--- a/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala
+++ b/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala
@@ -252,19 +252,24 @@ private[spark] class ClusterScheduler(val sc: SparkContext)
   def slaveLost(slaveId: String, reason: ExecutorLossReason) {
     var failedHost: Option[String] = None
     synchronized {
-      val host = slaveIdToHost(slaveId)
-      if (hostsAlive.contains(host)) {
-        logError("Lost an executor on " + host + ": " + reason)
-        slaveIdsWithExecutors -= slaveId
-        hostsAlive -= host
-        activeTaskSetsQueue.foreach(_.hostLost(host))
-        failedHost = Some(host)
-      } else {
-        // We may get multiple slaveLost() calls with different loss reasons. For example, one
-        // may be triggered by a dropped connection from the slave while another may be a report
-        // of executor termination from Mesos. We produce log messages for both so we eventually
-        // report the termination reason.
-        logError("Lost an executor on " + host + " (already removed): " + reason)
+      slaveIdToHost.get(slaveId) match {
+        case Some(host) =>
+          if (hostsAlive.contains(host)) {
+            logError("Lost an executor on " + host + ": " + reason)
+            slaveIdsWithExecutors -= slaveId
+            hostsAlive -= host
+            activeTaskSetsQueue.foreach(_.hostLost(host))
+            failedHost = Some(host)
+          } else {
+            // We may get multiple slaveLost() calls with different loss reasons. For example, one
+            // may be triggered by a dropped connection from the slave while another may be a report
+            // of executor termination from Mesos. We produce log messages for both so we eventually
+            // report the termination reason.
+            logError("Lost an executor on " + host + " (already removed): " + reason)
+          }
+        case None =>
+          // We were told about a slave being lost before we could even allocate work to it
+          logError("Lost slave " + slaveId + " (no work assigned yet)")
       }
     }
     if (failedHost != None) {
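
One more pattern visible in the hunk: failedHost is assigned while the lock is held, but the trailing context shows it is checked only after the synchronized block ends, so whatever follow-up the scheduler performs on a lost host does not run under the scheduler lock. A small sketch of that shape, with hypothetical names standing in for the scheduler's state and listener:

import scala.collection.mutable

object DeferredNotificationSketch {
  private val hostsAlive = mutable.Set("host-a", "host-b")
  private val lock = new Object

  // Hypothetical callback standing in for the scheduler's listener.
  private def notifyHostLost(host: String): Unit =
    println("notifying listener: lost " + host)

  def hostLost(host: String): Unit = {
    var failedHost: Option[String] = None
    lock.synchronized {
      // Mutate shared state under the lock...
      if (hostsAlive.remove(host)) {
        failedHost = Some(host)
      }
    }
    // ...but run the callback outside it, so a slow listener cannot block the scheduler.
    failedHost.foreach(notifyHostLost)
  }
}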