aboutsummaryrefslogtreecommitdiff
path: root/yarn
diff options
context:
space:
mode:
author Matei Zaharia <matei.zaharia@gmail.com> 2013-08-28 12:44:46 -0700
committer Matei Zaharia <matei.zaharia@gmail.com> 2013-08-28 12:44:46 -0700
commit baa84e7e4c5e0afc8bc3b177379311d309c00cd2 (patch)
tree 76aeeb15613a583c9472eefc6e82d3b9b582dd5c /yarn
parent cd043cf922692aa493308cf1e6da6f7522d80b78 (diff)
parent aac1214ee48ef143b0164f740380cdb0a5a7383b (diff)
download spark-baa84e7e4c5e0afc8bc3b177379311d309c00cd2.tar.gz
spark-baa84e7e4c5e0afc8bc3b177379311d309c00cd2.tar.bz2
spark-baa84e7e4c5e0afc8bc3b177379311d309c00cd2.zip
Merge pull request #865 from tgravescs/fixtmpdir
Spark on Yarn should use yarn approved directories for spark.local.dir and tmp
Diffstat (limited to 'yarn')
-rw-r--r-- yarn/src/main/scala/spark/deploy/yarn/ApplicationMaster.scala 18
-rw-r--r-- yarn/src/main/scala/spark/deploy/yarn/Client.scala 4
-rw-r--r-- yarn/src/main/scala/spark/deploy/yarn/WorkerRunnable.scala 4
3 files changed, 26 insertions, 0 deletions
diff --git a/yarn/src/main/scala/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/spark/deploy/yarn/ApplicationMaster.scala
index 15dbd1c0fb..0f3b6bc1a6 100644
--- a/yarn/src/main/scala/spark/deploy/yarn/ApplicationMaster.scala
+++ b/yarn/src/main/scala/spark/deploy/yarn/ApplicationMaster.scala
@@ -47,6 +47,9 @@ class ApplicationMaster(args: ApplicationMasterArguments, conf: Configuration) e
private var isFinished:Boolean = false
def run() {
+ // set up the directories so things go to yarn approved directories rather
+ // than user specified and /tmp
+ System.setProperty("spark.local.dir", getLocalDirs())
appAttemptId = getApplicationAttemptId()
resourceManager = registerWithResourceManager()
@@ -89,6 +92,21 @@ class ApplicationMaster(args: ApplicationMasterArguments, conf: Configuration) e
System.exit(0)
}
+
+ /** Get the Yarn approved local directories. */
+ private def getLocalDirs(): String = {
+ // Hadoop 0.23 and 2.x have different Environment variable names for the
+ // local dirs, so let's check both. We assume one of the 2 is set.
+ // LOCAL_DIRS => 2.X, YARN_LOCAL_DIRS => 0.23.X
+ val localDirs = Option(System.getenv("YARN_LOCAL_DIRS"))
+ .getOrElse(Option(System.getenv("LOCAL_DIRS"))
+ .getOrElse(""))
+
+ if (localDirs.isEmpty()) {
+ throw new Exception("Yarn Local dirs can't be empty")
+ }
+ return localDirs
+ }
private def getApplicationAttemptId(): ApplicationAttemptId = {
val envs = System.getenv()
diff --git a/yarn/src/main/scala/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/spark/deploy/yarn/Client.scala
index e84fb7c985..eb2a8cc642 100644
--- a/yarn/src/main/scala/spark/deploy/yarn/Client.scala
+++ b/yarn/src/main/scala/spark/deploy/yarn/Client.scala
@@ -223,6 +223,10 @@ class Client(conf: Configuration, args: ClientArguments) extends YarnClientImpl
// Add Xmx for am memory
JAVA_OPTS += "-Xmx" + amMemory + "m "
+ JAVA_OPTS += " -Djava.io.tmpdir=" + new Path(Environment.PWD.$(),
+ YarnConfiguration.DEFAULT_CONTAINER_TEMP_DIR)
+
+
// Commenting it out for now - so that people can refer to the properties if required. Remove it once cpuset version is pushed out.
// The context is, default gc for server class machines end up using all cores to do gc - hence if there are multiple containers in same
// node, spark gc effects all other containers performance (which can also be other spark containers)
diff --git a/yarn/src/main/scala/spark/deploy/yarn/WorkerRunnable.scala b/yarn/src/main/scala/spark/deploy/yarn/WorkerRunnable.scala
index 3e007509e6..0e1fd9b680 100644
--- a/yarn/src/main/scala/spark/deploy/yarn/WorkerRunnable.scala
+++ b/yarn/src/main/scala/spark/deploy/yarn/WorkerRunnable.scala
@@ -75,6 +75,10 @@ class WorkerRunnable(container: Container, conf: Configuration, masterAddress: S
if (env.isDefinedAt("SPARK_JAVA_OPTS")) {
JAVA_OPTS += env("SPARK_JAVA_OPTS") + " "
}
+
+ JAVA_OPTS += " -Djava.io.tmpdir=" + new Path(Environment.PWD.$(),
+ YarnConfiguration.DEFAULT_CONTAINER_TEMP_DIR)
+
// Commenting it out for now - so that people can refer to the properties if required. Remove it once cpuset version is pushed out.
// The context is, default gc for server class machines end up using all cores to do gc - hence if there are multiple containers in same
// node, spark gc effects all other containers performance (which can also be other spark containers)