about | summary | refs | log | tree | commit | diff
path: root/yarn
diff options
context:
space:
mode:
author Marcelo Vanzin <vanzin@cloudera.com> 2014-11-28 15:15:30 -0500
committer Patrick Wendell <pwendell@gmail.com> 2014-11-28 15:16:05 -0500
commit 915f8eeb3a493a0bb4b8d05d795ddd21f373d2ff (patch)
tree 0b12ab2fec2ba2a558e617d189dada9a4865334c /yarn
parent e464f0ac2d7210a4bf715478885fe7a8d397fe89 (diff)
download spark-915f8eeb3a493a0bb4b8d05d795ddd21f373d2ff.tar.gz
spark-915f8eeb3a493a0bb4b8d05d795ddd21f373d2ff.tar.bz2
spark-915f8eeb3a493a0bb4b8d05d795ddd21f373d2ff.zip
[SPARK-4584] [yarn] Remove security manager from Yarn AM.
The security manager adds a lot of overhead to the runtime of the app, and causes a severe performance regression. Even stubbing out all unneeded methods (all except checkExit()) does not help.

So, instead, penalize users who do an explicit System.exit() by leaving them in "undefined behavior" territory: if they do that, the Yarn backend won't be able to report the final app status to the RM. The result is that the final status of the application might not match the user's expectations.

One side-effect of the change is that users who do an explicit System.exit() will lose the AM retry functionality. Since there is no way to know if the exit was because of success or failure, the AM right now errs on the side of it being a successful exit.

Author: Marcelo Vanzin <vanzin@cloudera.com>

Closes #3484 from vanzin/SPARK-4584 and squashes the following commits:

21f2502 [Marcelo Vanzin] Do not retry apps that use System.exit().
4198b3b [Marcelo Vanzin] [SPARK-4584] [yarn] Remove security manager from Yarn AM.
Diffstat (limited to 'yarn')
-rw-r--r-- yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala 60
1 files changed, 14 insertions, 46 deletions
diff --git a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
index e90672c004..987b3373fb 100644
--- a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
+++ b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
@@ -60,7 +60,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments,
@volatile private var exitCode = 0
@volatile private var unregistered = false
@volatile private var finished = false
- @volatile private var finalStatus = FinalApplicationStatus.UNDEFINED
+ @volatile private var finalStatus = FinalApplicationStatus.SUCCEEDED
@volatile private var finalMsg: String = ""
@volatile private var userClassThread: Thread = _
@@ -106,10 +106,14 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments,
val isLastAttempt = client.getAttemptId().getAttemptId() >= maxAppAttempts
if (!finished) {
- // this shouldn't ever happen, but if it does assume weird failure
- finish(FinalApplicationStatus.FAILED,
- ApplicationMaster.EXIT_UNCAUGHT_EXCEPTION,
- "shutdown hook called without cleanly finishing")
+ // This happens when the user application calls System.exit(). We have the choice
+ // of either failing or succeeding at this point. We report success to avoid
+ // retrying applications that have succeeded (System.exit(0)), which means that
+ // applications that explicitly exit with a non-zero status will also show up as
+ // succeeded in the RM UI.
+ finish(finalStatus,
+ ApplicationMaster.EXIT_SUCCESS,
+ "Shutdown hook called before final status was reported.")
}
if (!unregistered) {
@@ -164,17 +168,18 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments,
final def finish(status: FinalApplicationStatus, code: Int, msg: String = null) = synchronized {
if (!finished) {
+ val inShutdown = Utils.inShutdown()
logInfo(s"Final app status: ${status}, exitCode: ${code}" +
Option(msg).map(msg => s", (reason: $msg)").getOrElse(""))
exitCode = code
finalStatus = status
finalMsg = msg
finished = true
- if (Thread.currentThread() != reporterThread && reporterThread != null) {
+ if (!inShutdown && Thread.currentThread() != reporterThread && reporterThread != null) {
logDebug("shutting down reporter thread")
reporterThread.interrupt()
}
- if (Thread.currentThread() != userClassThread && userClassThread != null) {
+ if (!inShutdown && Thread.currentThread() != userClassThread && userClassThread != null) {
logDebug("shutting down user thread")
userClassThread.interrupt()
}
@@ -214,7 +219,6 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments,
private def runDriver(securityMgr: SecurityManager): Unit = {
addAmIpFilter()
- setupSystemSecurityManager()
userClassThread = startUserClass()
// This a bit hacky, but we need to wait until the spark.driver.port property has
@@ -403,45 +407,9 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments,
}
/**
- * This system security manager applies to the entire process.
- * It's main purpose is to handle the case if the user code does a System.exit.
- * This allows us to catch that and properly set the YARN application status and
- * cleanup if needed.
- */
- private def setupSystemSecurityManager(): Unit = {
- try {
- var stopped = false
- System.setSecurityManager(new java.lang.SecurityManager() {
- override def checkExit(paramInt: Int) {
- if (!stopped) {
- logInfo("In securityManager checkExit, exit code: " + paramInt)
- if (paramInt == 0) {
- finish(FinalApplicationStatus.SUCCEEDED, ApplicationMaster.EXIT_SUCCESS)
- } else {
- finish(FinalApplicationStatus.FAILED,
- paramInt,
- "User class exited with non-zero exit code")
- }
- stopped = true
- }
- }
- // required for the checkExit to work properly
- override def checkPermission(perm: java.security.Permission): Unit = {}
- })
- }
- catch {
- case e: SecurityException =>
- finish(FinalApplicationStatus.FAILED,
- ApplicationMaster.EXIT_SECURITY,
- "Error in setSecurityManager")
- logError("Error in setSecurityManager:", e)
- }
- }
-
- /**
* Start the user class, which contains the spark driver, in a separate Thread.
- * If the main routine exits cleanly or exits with System.exit(0) we
- * assume it was successful, for all other cases we assume failure.
+ * If the main routine exits cleanly or exits with System.exit(N) for any N
+ * we assume it was successful, for all other cases we assume failure.
*
* Returns the user thread that was started.
*/