diff options
author | Matei Zaharia <matei@eecs.berkeley.edu> | 2013-01-22 15:48:37 -0800 |
---|---|---|
committer | Matei Zaharia <matei@eecs.berkeley.edu> | 2013-01-22 15:48:37 -0800 |
commit | 49c78446c83fd26ab6f3b3b168dc40b9e0088a5c (patch) | |
tree | bbf2f06e9210d38bedc23c12aca90c01fd583cda | |
parent | d1de9d7d15591d13c3ae27721f9926e94f7b6cf0 (diff) | |
parent | 27b3f3f0a980f86bac14a14516b5d52a32aa8cbb (diff) | |
download | spark-49c78446c83fd26ab6f3b3b168dc40b9e0088a5c.tar.gz spark-49c78446c83fd26ab6f3b3b168dc40b9e0088a5c.tar.bz2 spark-49c78446c83fd26ab6f3b3b168dc40b9e0088a5c.zip |
Merge pull request #402 from stephenh/handleslavelosttoosoon
Handle slaveLost before slaveIdToHost knows about it.
-rw-r--r-- | core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala | 31 |
1 files changed, 18 insertions, 13 deletions
diff --git a/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala b/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala
index 20f6e65020..a639b72795 100644
--- a/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala
+++ b/core/src/main/scala/spark/scheduler/cluster/ClusterScheduler.scala
@@ -252,19 +252,24 @@ private[spark] class ClusterScheduler(val sc: SparkContext)
   def slaveLost(slaveId: String, reason: ExecutorLossReason) {
     var failedHost: Option[String] = None
     synchronized {
-      val host = slaveIdToHost(slaveId)
-      if (hostsAlive.contains(host)) {
-        logError("Lost an executor on " + host + ": " + reason)
-        slaveIdsWithExecutors -= slaveId
-        hostsAlive -= host
-        activeTaskSetsQueue.foreach(_.hostLost(host))
-        failedHost = Some(host)
-      } else {
-        // We may get multiple slaveLost() calls with different loss reasons. For example, one
-        // may be triggered by a dropped connection from the slave while another may be a report
-        // of executor termination from Mesos. We produce log messages for both so we eventually
-        // report the termination reason.
-        logError("Lost an executor on " + host + " (already removed): " + reason)
+      slaveIdToHost.get(slaveId) match {
+        case Some(host) =>
+          if (hostsAlive.contains(host)) {
+            logError("Lost an executor on " + host + ": " + reason)
+            slaveIdsWithExecutors -= slaveId
+            hostsAlive -= host
+            activeTaskSetsQueue.foreach(_.hostLost(host))
+            failedHost = Some(host)
+          } else {
+            // We may get multiple slaveLost() calls with different loss reasons. For example, one
+            // may be triggered by a dropped connection from the slave while another may be a report
+            // of executor termination from Mesos. We produce log messages for both so we eventually
+            // report the termination reason.
+            logError("Lost an executor on " + host + " (already removed): " + reason)
+          }
+        case None =>
+          // We were told about a slave being lost before we could even allocate work to it
+          logError("Lost slave " + slaveId + " (no work assigned yet)")
       }
     }
     if (failedHost != None) {