From 41e0a21b22ccd2788dc079790788e505b0d4e37d Mon Sep 17 00:00:00 2001
From: Thomas Graves
Date: Tue, 5 Aug 2014 15:57:32 -0500
Subject: SPARK-1680: use configs for specifying environment variables on YARN

Note that this also documents spark.executorEnv.*, which to me means it's
public. If we don't want that, please speak up.

Author: Thomas Graves

Closes #1512 from tgravescs/SPARK-1680 and squashes the following commits:

11525df [Thomas Graves] more doc changes
553bad0 [Thomas Graves] fix documentation
152bf7c [Thomas Graves] fix docs
5382326 [Thomas Graves] try fix docs
32f86a4 [Thomas Graves] use configs for specifying environment variables on YARN
---
 docs/configuration.md                              |  8 ++++++++
 docs/running-on-yarn.md                            | 22 +++++++++++++++++-----
 .../org/apache/spark/deploy/yarn/ClientBase.scala  | 13 +++++++++++++
 .../spark/deploy/yarn/ExecutorRunnableUtil.scala   |  6 +++++-
 4 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/docs/configuration.md b/docs/configuration.md
index 25adea210c..5e7556c08e 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -206,6 +206,14 @@ Apart from these, the following properties are also available, and may be useful
     used during aggregation goes above this amount, it will spill the data into disks.
   </td>
 </tr>
+<tr>
+  <td><code>spark.executorEnv.[EnvironmentVariableName]</code></td>
+  <td>(none)</td>
+  <td>
+    Add the environment variable specified by <code>EnvironmentVariableName</code> to the
+    Executor process. The user can specify multiple of these to set multiple environment variables.
+  </td>
+</tr>
 </table>

 #### Shuffle Behavior
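The new `spark.executorEnv.*` prefix documented above is set like any other Spark property. A minimal sketch of application-side usage, assuming a plain `SparkConf`; the variable names and paths below are placeholders, not values from this patch:

```scala
import org.apache.spark.SparkConf

// Each spark.executorEnv.* key becomes one environment variable in the
// executor process. Placeholder names and paths, for illustration only.
val conf = new SparkConf()
  .setAppName("executor-env-example")
  .set("spark.executorEnv.JAVA_HOME", "/jdk64") // was SPARK_YARN_USER_ENV="JAVA_HOME=/jdk64"
  .set("spark.executorEnv.FOO", "bar")

// SparkConf also offers a convenience setter for the same prefix:
conf.setExecutorEnv("FOO", "bar") // equivalent to set("spark.executorEnv.FOO", "bar")
```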
diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md
index 573930dbf4..9bc20dbf92 100644
--- a/docs/running-on-yarn.md
+++ b/docs/running-on-yarn.md
@@ -17,10 +17,6 @@ To build Spark yourself, refer to the [building with Maven guide](building-with-

 Most of the configs are the same for Spark on YARN as for other deployment modes. See the [configuration page](configuration.html) for more information on those. These are configs that are specific to Spark on YARN.

-#### Environment Variables
-
-* `SPARK_YARN_USER_ENV`, to add environment variables to the Spark processes launched on YARN. This can be a comma separated list of environment variables, e.g. `SPARK_YARN_USER_ENV="JAVA_HOME=/jdk64,FOO=bar"`.
-
 #### Spark Properties

 <table class="table">
@@ -110,7 +106,23 @@ Most of the configs are the same for Spark on YARN as for other deployment modes
   <td><code>spark.yarn.access.namenodes</code></td>
   <td>(none)</td>
   <td>
-    A list of secure HDFS namenodes your Spark application is going to access. For example, `spark.yarn.access.namenodes=hdfs://nn1.com:8032,hdfs://nn2.com:8032`. The Spark application must have acess to the namenodes listed and Kerberos must be properly configured to be able to access them (either in the same realm or in a trusted realm). Spark acquires security tokens for each of the namenodes so that the Spark application can access those remote HDFS clusters.
+    A list of secure HDFS namenodes your Spark application is going to access. For
+    example, `spark.yarn.access.namenodes=hdfs://nn1.com:8032,hdfs://nn2.com:8032`.
+    The Spark application must have access to the namenodes listed and Kerberos must
+    be properly configured to be able to access them (either in the same realm or in
+    a trusted realm). Spark acquires security tokens for each of the namenodes so that
+    the Spark application can access those remote HDFS clusters.
+  </td>
+</tr>
+<tr>
+  <td><code>spark.yarn.appMasterEnv.[EnvironmentVariableName]</code></td>
+  <td>(none)</td>
+  <td>
+    Add the environment variable specified by <code>EnvironmentVariableName</code> to the
+    Application Master process launched on YARN. The user can specify multiple of
+    these to set multiple environment variables. In yarn-cluster mode this controls
+    the environment of the Spark driver, and in yarn-client mode it only controls
+    the environment of the executor launcher.
+  </td>
+</tr>
 </table>
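Taken together, the two prefixes cover both halves of a YARN application. A minimal sketch, assuming yarn-cluster mode (where, per the documentation above, `spark.yarn.appMasterEnv.*` shapes the driver's environment); the values are placeholders:

```scala
import org.apache.spark.SparkConf

// Placeholder values: appMasterEnv.* reaches the Application Master (and,
// in yarn-cluster mode, the driver running inside it), while executorEnv.*
// reaches each executor container.
val conf = new SparkConf()
  .set("spark.yarn.appMasterEnv.JAVA_HOME", "/jdk64")
  .set("spark.executorEnv.JAVA_HOME", "/jdk64")
```

The same pairs can also be supplied through `spark-defaults.conf`, which keeps environment details out of application code.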
diff --git a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
index 44e025b8f6..1da0a1b675 100644
--- a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
+++ b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
@@ -259,6 +259,14 @@ trait ClientBase extends Logging {
     localResources
   }

+  /** Get all application master environment variables set on this SparkConf. */
+  def getAppMasterEnv: Seq[(String, String)] = {
+    val prefix = "spark.yarn.appMasterEnv."
+    sparkConf.getAll.filter { case (k, v) => k.startsWith(prefix) }
+      .map { case (k, v) => (k.substring(prefix.length), v) }
+  }
+
+
   def setupLaunchEnv(
       localResources: HashMap[String, LocalResource],
       stagingDir: String): HashMap[String, String] = {
@@ -276,6 +284,11 @@ trait ClientBase extends Logging {
     distCacheMgr.setDistFilesEnv(env)
     distCacheMgr.setDistArchivesEnv(env)

+    getAppMasterEnv.foreach { case (key, value) =>
+      YarnSparkHadoopUtil.addToEnvironment(env, key, value, File.pathSeparator)
+    }
+
+    // Keep this for backwards compatibility, but users should move to the config.
     sys.env.get("SPARK_YARN_USER_ENV").foreach { userEnvs =>
     // Allow users to specify some environment variables.
       YarnSparkHadoopUtil.setEnvFromInputString(env, userEnvs, File.pathSeparator)

diff --git a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnableUtil.scala b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnableUtil.scala
index 4ba7133a95..71a9e42846 100644
--- a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnableUtil.scala
+++ b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnableUtil.scala
@@ -171,7 +171,11 @@ trait ExecutorRunnableUtil extends Logging {
     val extraCp = sparkConf.getOption("spark.executor.extraClassPath")
     ClientBase.populateClasspath(null, yarnConf, sparkConf, env, extraCp)

-    // Allow users to specify some environment variables
+    sparkConf.getExecutorEnv.foreach { case (key, value) =>
+      YarnSparkHadoopUtil.addToEnvironment(env, key, value, File.pathSeparator)
+    }
+
+    // Keep this for backwards compatibility, but users should move to the config.
     YarnSparkHadoopUtil.setEnvFromInputString(env, System.getenv("SPARK_YARN_USER_ENV"),
       File.pathSeparator)
--
cgit v1.2.3
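To make the key-to-variable mapping concrete, here is a self-contained sketch that mirrors the filter/map logic of `getAppMasterEnv` over plain data; it is an illustration, not code from the patch, and the sample entries are invented:

```scala
object AppMasterEnvDemo {
  // Mirror of the patch's getAppMasterEnv: keep only keys carrying the
  // prefix, then strip the prefix to recover the environment variable name.
  def extractEnv(settings: Seq[(String, String)], prefix: String): Seq[(String, String)] =
    settings.filter { case (k, _) => k.startsWith(prefix) }
      .map { case (k, v) => (k.substring(prefix.length), v) }

  def main(args: Array[String]): Unit = {
    val settings = Seq(
      "spark.yarn.appMasterEnv.JAVA_HOME" -> "/jdk64", // sample entry
      "spark.executorEnv.FOO" -> "bar",                // different prefix: filtered out
      "spark.app.name" -> "demo")                      // unrelated key: filtered out
    println(extractEnv(settings, "spark.yarn.appMasterEnv.")) // List((JAVA_HOME,/jdk64))
  }
}
```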