diff options
-rw-r--r-- | core/src/main/scala/org/apache/spark/deploy/master/ApplicationState.scala | 2 | ||||
-rw-r--r-- | core/src/main/scala/org/apache/spark/deploy/master/Master.scala | 7 | ||||
-rw-r--r-- | docs/spark-standalone.md | 15 |
3 files changed, 21 insertions, 3 deletions
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ApplicationState.scala b/core/src/main/scala/org/apache/spark/deploy/master/ApplicationState.scala index 37bfcdfdf4..097728c821 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ApplicationState.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ApplicationState.scala @@ -22,6 +22,4 @@ private[master] object ApplicationState extends Enumeration { type ApplicationState = Value val WAITING, RUNNING, FINISHED, FAILED, KILLED, UNKNOWN = Value - - val MAX_NUM_RETRY = 10 } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index fded8475a0..dfffc47703 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -58,6 +58,7 @@ private[deploy] class Master( private val RETAINED_DRIVERS = conf.getInt("spark.deploy.retainedDrivers", 200) private val REAPER_ITERATIONS = conf.getInt("spark.dead.worker.persistence", 15) private val RECOVERY_MODE = conf.get("spark.deploy.recoveryMode", "NONE") + private val MAX_EXECUTOR_RETRIES = conf.getInt("spark.deploy.maxExecutorRetries", 10) val workers = new HashSet[WorkerInfo] val idToApp = new HashMap[String, ApplicationInfo] @@ -265,7 +266,11 @@ private[deploy] class Master( val normalExit = exitStatus == Some(0) // Only retry certain number of times so we don't go into an infinite loop. - if (!normalExit && appInfo.incrementRetryCount() >= ApplicationState.MAX_NUM_RETRY) { + // Important note: this code path is not exercised by tests, so be very careful when + // changing this `if` condition. 
+ if (!normalExit + && appInfo.incrementRetryCount() >= MAX_EXECUTOR_RETRIES + && MAX_EXECUTOR_RETRIES >= 0) { // < 0 disables this application-killing path val execs = appInfo.executors.values if (!execs.exists(_.state == ExecutorState.RUNNING)) { logError(s"Application ${appInfo.desc.name} with ID ${appInfo.id} failed " + diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index c864c90308..5ae63fe4e6 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -196,6 +196,21 @@ SPARK_MASTER_OPTS supports the following system properties: </td> </tr> <tr> + <td><code>spark.deploy.maxExecutorRetries</code></td> + <td>10</td> + <td> + Limit on the maximum number of back-to-back executor failures that can occur before the + standalone cluster manager removes a faulty application. An application will never be removed + if it has any running executors. If an application experiences more than + <code>spark.deploy.maxExecutorRetries</code> failures in a row, no executors + successfully start running in between those failures, and the application has no running + executors, then the standalone cluster manager will remove the application and mark it as failed. + To disable this automatic removal, set <code>spark.deploy.maxExecutorRetries</code> to + <code>-1</code>. + <br/> + </td> +</tr> +<tr> <td><code>spark.worker.timeout</code></td> <td>60</td> <td> |