author     Thomas Graves <tgraves@apache.org>  2014-08-19 09:40:31 -0500
committer  Thomas Graves <tgraves@apache.org>  2014-08-19 09:40:31 -0500
commit     7eb9cbc273d758522e787fcb2ef68ef65911475f (patch)
tree       baea2c9bfa0b5afd89978971e00ec5587c844b09 /yarn/alpha
parent     cd0720ca77894d481fb73a8b5bb517013843cb1e (diff)
[SPARK-3072] YARN - Exit when reach max number failed executors
In some cases on Hadoop 2.x the Spark application master doesn't properly
exit, and hangs around for 10 minutes after it's really done. We should make
sure it exits properly and stops the driver.

Author: Thomas Graves <tgraves@apache.org>

Closes #2022 from tgravescs/SPARK-3072 and squashes the following commits:

665701d [Thomas Graves] Exit when reach max number failed executors
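The gist of the patch: every allocator loop now also checks a shared
isFinished flag, and a new checkNumExecutorsFailed() helper marks the
application FAILED and stops the SparkContext so the user thread exits
promptly instead of idling until the YARN timeout. As a rough illustration,
here is a minimal, self-contained Scala sketch of that monitor-and-shutdown
pattern; FailureMonitorSketch, finishAndStopDriver, and the atomic fields are
illustrative stand-ins for the AM's state, not Spark's actual API.

import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger}

object FailureMonitorSketch {
  // Illustrative stand-ins: the real code reads
  // yarnAllocator.getNumExecutorsFailed and maxNumExecutorFailures.
  private val maxNumExecutorFailures = 3
  private val numExecutorsFailed = new AtomicInteger(0)
  private val isFinished = new AtomicBoolean(false)

  // Stand-in for finishApplicationMaster(FinalApplicationStatus.FAILED, ...)
  // followed by sparkContext.stop() in the real patch.
  private def finishAndStopDriver(msg: String): Unit = {
    if (isFinished.compareAndSet(false, true)) {
      println(s"FAILED: $msg -- unregistering AM and stopping the driver")
    }
  }

  private def checkNumExecutorsFailed(): Unit = {
    if (numExecutorsFailed.get() >= maxNumExecutorFailures) {
      finishAndStopDriver("max number of executor failures reached")
    }
  }

  def main(args: Array[String]): Unit = {
    val monitor = new Thread {
      override def run(): Unit = {
        // Mirrors the patched loops: also stop once isFinished flips,
        // rather than spinning until the user thread happens to die.
        while (!isFinished.get()) {
          checkNumExecutorsFailed()
          Thread.sleep(100)
        }
      }
    }
    monitor.setDaemon(true)
    monitor.start()

    // Simulate executors failing until the threshold trips.
    (1 to maxNumExecutorFailures).foreach(_ => numExecutorsFailed.incrementAndGet())
    monitor.join(2000)
  }
}

The compareAndSet guard plays the same role as the isFinished check in the
patch: shutdown runs once, and every loop observing the flag terminates on
its next iteration.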
Diffstat (limited to 'yarn/alpha')
-rw-r--r--  yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala  33
-rw-r--r--  yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/ExecutorLauncher.scala    5
2 files changed, 25 insertions(+), 13 deletions(-)
diff --git a/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
index 62b5c3bc5f..46a01f5a9a 100644
--- a/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
+++ b/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
@@ -267,12 +267,10 @@ class ApplicationMaster(args: ApplicationMasterArguments, conf: Configuration,
     // TODO: This is a bit ugly. Can we make it nicer?
     // TODO: Handle container failure

-    // Exists the loop if the user thread exits.
-    while (yarnAllocator.getNumExecutorsRunning < args.numExecutors && userThread.isAlive) {
-      if (yarnAllocator.getNumExecutorsFailed >= maxNumExecutorFailures) {
-        finishApplicationMaster(FinalApplicationStatus.FAILED,
-          "max number of executor failures reached")
-      }
+    // Exits the loop if the user thread exits.
+    while (yarnAllocator.getNumExecutorsRunning < args.numExecutors && userThread.isAlive
+        && !isFinished) {
+      checkNumExecutorsFailed()
       yarnAllocator.allocateContainers(
         math.max(args.numExecutors - yarnAllocator.getNumExecutorsRunning, 0))
       Thread.sleep(ApplicationMaster.ALLOCATE_HEARTBEAT_INTERVAL)
@@ -303,11 +301,8 @@ class ApplicationMaster(args: ApplicationMasterArguments, conf: Configuration,
     val t = new Thread {
       override def run() {
-        while (userThread.isAlive) {
-          if (yarnAllocator.getNumExecutorsFailed >= maxNumExecutorFailures) {
-            finishApplicationMaster(FinalApplicationStatus.FAILED,
-              "max number of executor failures reached")
-          }
+        while (userThread.isAlive && !isFinished) {
+          checkNumExecutorsFailed()
           val missingExecutorCount = args.numExecutors - yarnAllocator.getNumExecutorsRunning
           if (missingExecutorCount > 0) {
             logInfo("Allocating %d containers to make up for (potentially) lost containers".
@@ -327,6 +322,22 @@ class ApplicationMaster(args: ApplicationMasterArguments, conf: Configuration,
     t
   }

+  private def checkNumExecutorsFailed() {
+    if (yarnAllocator.getNumExecutorsFailed >= maxNumExecutorFailures) {
+      logInfo("max number of executor failures reached")
+      finishApplicationMaster(FinalApplicationStatus.FAILED,
+        "max number of executor failures reached")
+      // make sure to stop the user thread
+      val sparkContext = ApplicationMaster.sparkContextRef.get()
+      if (sparkContext != null) {
+        logInfo("Invoking sc stop from checkNumExecutorsFailed")
+        sparkContext.stop()
+      } else {
+        logError("sparkContext is null when should shutdown")
+      }
+    }
+  }
+
   private def sendProgress() {
     logDebug("Sending progress")
     // Simulated with an allocate request with no nodes requested ...
diff --git a/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/ExecutorLauncher.scala b/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/ExecutorLauncher.scala
index 184e2ad6c8..72c7143edc 100644
--- a/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/ExecutorLauncher.scala
+++ b/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/ExecutorLauncher.scala
@@ -249,7 +249,8 @@ class ExecutorLauncher(args: ApplicationMasterArguments, conf: Configuration, sp
     // Wait until all containers have finished
     // TODO: This is a bit ugly. Can we make it nicer?
     // TODO: Handle container failure
-    while ((yarnAllocator.getNumExecutorsRunning < args.numExecutors) && (!driverClosed)) {
+    while ((yarnAllocator.getNumExecutorsRunning < args.numExecutors) && (!driverClosed) &&
+        !isFinished) {
       yarnAllocator.allocateContainers(
         math.max(args.numExecutors - yarnAllocator.getNumExecutorsRunning, 0))
       checkNumExecutorsFailed()
@@ -271,7 +272,7 @@ class ExecutorLauncher(args: ApplicationMasterArguments, conf: Configuration, sp
     val t = new Thread {
       override def run() {
-        while (!driverClosed) {
+        while (!driverClosed && !isFinished) {
           checkNumExecutorsFailed()
           val missingExecutorCount = args.numExecutors - yarnAllocator.getNumExecutorsRunning
           if (missingExecutorCount > 0) {