From 5081a0a9d47ca31900ea4de570de2cbb0e063105 Mon Sep 17 00:00:00 2001
From: Andrew Or
Date: Sat, 24 May 2014 18:01:49 -0700
Subject: [SPARK-1900 / 1918] PySpark on YARN is broken

If I run the following on a YARN cluster

```
bin/spark-submit sheep.py --master yarn-client
```

it fails because of a mismatch in paths: `spark-submit` thinks that `sheep.py` resides on HDFS, and balks when it can't find the file there. A natural workaround is to add the `file:` prefix to the file:

```
bin/spark-submit file:/path/to/sheep.py --master yarn-client
```

However, this also fails. This time it is because Python does not understand URI schemes.

This PR fixes this by properly resolving all paths passed as command line arguments to `spark-submit`. This has the added benefit of keeping file and jar paths consistent across different cluster modes. For Python, we strip the URI scheme before we actually try to run it.

Much of the code is originally written by @mengxr. Tested on YARN cluster. More tests pending.

Author: Andrew Or

Closes #853 from andrewor14/submit-paths and squashes the following commits:

0bb097a [Andrew Or] Format path correctly before adding it to PYTHONPATH
323b45c [Andrew Or] Include --py-files on PYTHONPATH for pyspark shell
3c36587 [Andrew Or] Improve error messages (minor)
854aa6a [Andrew Or] Guard against NPE if user gives pathological paths
6638a6b [Andrew Or] Fix spark-shell jar paths after #849 went in
3bb0359 [Andrew Or] Update more comments (minor)
2a1f8a0 [Andrew Or] Update comments (minor)
6af2c77 [Andrew Or] Merge branch 'master' of github.com:apache/spark into submit-paths
a68c4d1 [Andrew Or] Handle Windows python file path correctly
427a250 [Andrew Or] Resolve paths properly for Windows
a591a4a [Andrew Or] Update tests for resolving URIs
6c8621c [Andrew Or] Move resolveURIs to Utils
db8255e [Andrew Or] Merge branch 'master' of github.com:apache/spark into submit-paths
f542dce [Andrew Or] Fix outdated tests
691c4ce [Andrew Or] Ignore special primary resource names
5342ac7 [Andrew Or] Add missing space in error message
02f77f3 [Andrew Or] Resolve command line arguments to spark-submit properly
---
 repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'repl')

diff --git a/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala
index 5f34362ccd..e1db4d5395 100644
--- a/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala
+++ b/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala
@@ -942,7 +942,7 @@ class SparkILoop(in0: Option[BufferedReader], protected val out: JPrintWriter,
 
   def createSparkContext(): SparkContext = {
     val execUri = System.getenv("SPARK_EXECUTOR_URI")
-    val jars = SparkILoop.getAddedJars.map(new java.io.File(_).getAbsolutePath)
+    val jars = SparkILoop.getAddedJars
     val conf = new SparkConf()
       .setMaster(getMaster())
       .setAppName("Spark shell")
@@ -997,7 +997,8 @@ object SparkILoop {
     val propJars = sys.props.get("spark.jars").flatMap { p =>
       if (p == "") None else Some(p)
     }
-    propJars.orElse(envJars).map(_.split(",")).getOrElse(Array.empty)
+    val jars = propJars.orElse(envJars).getOrElse("")
+    Utils.resolveURIs(jars).split(",").filter(_.nonEmpty)
   }
 
   // Designed primarily for use by test code: take a String with a
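To make the `getAddedJars` change above concrete, the following is a minimal sketch of the kind of resolution a helper like `Utils.resolveURIs` performs: a path with no URI scheme is treated as a local file and turned into an absolute `file:` URI, while paths that already carry a scheme (e.g. `hdfs://`) pass through unchanged. The object and method names here are assumptions for illustration; the actual implementation in the PR (e.g. its Windows and fragment handling) may differ.

```scala
import java.io.File
import java.net.URI

// Illustrative sketch only; not the actual org.apache.spark.util.Utils code.
object ResolveUrisSketch {

  // Resolve a single path: if it has no URI scheme, treat it as a local
  // file and produce an absolute file: URI; otherwise leave it as-is.
  def resolveURI(path: String): URI = {
    val uri = new URI(path)
    if (uri.getScheme != null) uri
    else new File(path).getAbsoluteFile.toURI
  }

  // Resolve a comma-separated list of paths, the format used by spark.jars.
  def resolveURIs(paths: String): String = {
    if (paths == null || paths.trim.isEmpty) ""
    else paths.split(",").filter(_.nonEmpty).map(resolveURI(_).toString).mkString(",")
  }

  def main(args: Array[String]): Unit = {
    // "sheep.py" becomes something like file:/current/dir/sheep.py,
    // while the hdfs:// path is passed through unchanged.
    println(resolveURIs("sheep.py,hdfs:///user/me/lib.jar"))
  }
}
```

This mirrors why the patched `getAddedJars` can drop the `new java.io.File(_).getAbsolutePath` mapping: once every entry is a full URI, downstream code no longer has to guess whether a bare path is local or remote.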