 core/src/main/scala/org/apache/spark/deploy/master/ApplicationState.scala |  2 --
 core/src/main/scala/org/apache/spark/deploy/master/Master.scala           |  7 ++++++-
 docs/spark-standalone.md                                                  | 15 +++++++++++++++
 3 files changed, 21 insertions(+), 3 deletions(-)
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ApplicationState.scala b/core/src/main/scala/org/apache/spark/deploy/master/ApplicationState.scala
index 37bfcdfdf4..097728c821 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/ApplicationState.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/ApplicationState.scala
@@ -22,6 +22,4 @@ private[master] object ApplicationState extends Enumeration {
type ApplicationState = Value
val WAITING, RUNNING, FINISHED, FAILED, KILLED, UNKNOWN = Value
-
- val MAX_NUM_RETRY = 10
}
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
index fded8475a0..dfffc47703 100644
--- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
@@ -58,6 +58,7 @@ private[deploy] class Master(
private val RETAINED_DRIVERS = conf.getInt("spark.deploy.retainedDrivers", 200)
private val REAPER_ITERATIONS = conf.getInt("spark.dead.worker.persistence", 15)
private val RECOVERY_MODE = conf.get("spark.deploy.recoveryMode", "NONE")
+ private val MAX_EXECUTOR_RETRIES = conf.getInt("spark.deploy.maxExecutorRetries", 10)
val workers = new HashSet[WorkerInfo]
val idToApp = new HashMap[String, ApplicationInfo]
@@ -265,7 +266,11 @@ private[deploy] class Master(
val normalExit = exitStatus == Some(0)
// Only retry certain number of times so we don't go into an infinite loop.
- if (!normalExit && appInfo.incrementRetryCount() >= ApplicationState.MAX_NUM_RETRY) {
+ // Important note: this code path is not exercised by tests, so be very careful when
+ // changing this `if` condition.
+ if (!normalExit
+ && appInfo.incrementRetryCount() >= MAX_EXECUTOR_RETRIES
+ && MAX_EXECUTOR_RETRIES >= 0) { // < 0 disables this application-killing path
val execs = appInfo.executors.values
if (!execs.exists(_.state == ExecutorState.RUNNING)) {
logError(s"Application ${appInfo.desc.name} with ID ${appInfo.id} failed " +
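For the hunk above, the intent of the new three-part condition can be summarized with a small, self-contained Scala sketch. This is illustrative only: the object and parameter names (RetryLimitSketch, retryCount, anyExecutorRunning) are hypothetical, while the default of 10, the meaning of a negative limit, and the comparison logic mirror the patch.

// Illustrative sketch of the retry-limit semantics added above (not Spark's actual Master code).
// maxRetries mirrors spark.deploy.maxExecutorRetries; retryCount is the number of
// back-to-back executor failures recorded for one application.
object RetryLimitSketch {
  def shouldRemoveApplication(
      normalExit: Boolean,
      retryCount: Int,
      maxRetries: Int,
      anyExecutorRunning: Boolean): Boolean = {
    // A negative limit (e.g. -1) disables the application-killing path entirely.
    val limitEnabled = maxRetries >= 0
    // Remove only after an abnormal exit, once the consecutive-failure count has reached
    // the limit and none of the application's executors is currently running.
    !normalExit && limitEnabled && retryCount >= maxRetries && !anyExecutorRunning
  }

  def main(args: Array[String]): Unit = {
    // With the default limit of 10, ten consecutive abnormal exits with no running
    // executor trigger removal...
    println(shouldRemoveApplication(normalExit = false, retryCount = 10, maxRetries = 10,
      anyExecutorRunning = false)) // true
    // ...while a limit of -1 keeps the application alive regardless of the failure count.
    println(shouldRemoveApplication(normalExit = false, retryCount = 100, maxRetries = -1,
      anyExecutorRunning = false)) // false
  }
}

As the spark-standalone.md hunk below documents, spark.deploy.maxExecutorRetries is one of the system properties supported through SPARK_MASTER_OPTS, and setting it to -1 disables this removal path.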
diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md
index c864c90308..5ae63fe4e6 100644
--- a/docs/spark-standalone.md
+++ b/docs/spark-standalone.md
@@ -196,6 +196,21 @@ SPARK_MASTER_OPTS supports the following system properties:
</td>
</tr>
<tr>
+ <td><code>spark.deploy.maxExecutorRetries</code></td>
+ <td>10</td>
+ <td>
+ The maximum number of back-to-back executor failures that can occur before the
+ standalone cluster manager removes a faulty application. An application will never be
+ removed if it has any running executors. If an application experiences more than
+ <code>spark.deploy.maxExecutorRetries</code> failures in a row, no executor
+ successfully starts running in between those failures, and the application has no
+ running executors, then the standalone cluster manager removes the application and
+ marks it as failed. To disable this automatic removal, set
+ <code>spark.deploy.maxExecutorRetries</code> to <code>-1</code>.
+ <br/>
+ </td>
+</tr>
+<tr>
<td><code>spark.worker.timeout</code></td>
<td>60</td>
<td>