author     Patrick Wendell <pwendell@gmail.com>   2014-06-12 15:43:32 -0700
committer  Patrick Wendell <pwendell@gmail.com>   2014-06-12 15:43:32 -0700
commit     1c04652c8f18566baafb13dbae355f8ad2ad8d37 (patch)
tree       ea88c5147a602b2dfb8cfaa4de6c436716fac508
parent     ecde5b837534b11d365fcab78089820990b815cf (diff)
SPARK-1843: Replace assemble-deps with env variable.
(This change is actually small; I moved some logic into compute-classpath that was previously in spark-class.)

Assemble-deps has existed for a while to let developers run local code with new changes quickly. When I'm developing I typically use a simpler approach that just prepends the Spark classes to the classpath before the assembly jar. This is well defined in the JVM, and the Spark classes take precedence over those in the assembly. This approach is portable across both builds, which is the main reason I'd like to switch to it. It's also a bit easier to toggle on and off quickly. You use it as follows:

```
$ ./bin/spark-shell            # Use spark with the normal assembly
$ export SPARK_PREPEND_CLASSES=true
$ ./bin/spark-shell            # Now it's using compiled classes
$ unset SPARK_PREPEND_CLASSES
$ ./bin/spark-shell            # Back to normal
```

Author: Patrick Wendell <pwendell@gmail.com>

Closes #877 from pwendell/assemble-deps and squashes the following commits:

8a11345 [Patrick Wendell] Merge remote-tracking branch 'apache/master' into assemble-deps
faa3168 [Patrick Wendell] Adding a warning for compatibility
3f151a7 [Patrick Wendell] Small fix
bbfb73c [Patrick Wendell] Review feedback
328e9f8 [Patrick Wendell] SPARK-1843: Replace assemble-deps with env variable.
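The mechanism is plain classpath ordering: the JVM loads a class from the first classpath entry that contains it, so locally compiled classes listed ahead of the assembly jar shadow the copies inside it. Below is a minimal sketch of that ordering; the paths and the SPARK_HOME/SCALA_VERSION placeholders are illustrative, not the exact values compute-classpath.sh emits.

```sh
#!/usr/bin/env bash
# Sketch only: show why prepending freshly compiled classes wins over the assembly jar.
# SPARK_HOME and SCALA_VERSION are assumed placeholders, not values taken from this patch.
SPARK_HOME="${SPARK_HOME:-/opt/spark}"
SCALA_VERSION="${SCALA_VERSION:-2.10}"

CLASSPATH="$SPARK_HOME/conf"
if [ -n "$SPARK_PREPEND_CLASSES" ]; then
  # Locally built classes come first, so they take precedence over the same classes in the assembly.
  CLASSPATH="$CLASSPATH:$SPARK_HOME/core/target/scala-$SCALA_VERSION/classes"
fi
# The assembly jar comes last and is only consulted for classes not found earlier on the classpath.
CLASSPATH="$CLASSPATH:$SPARK_HOME/assembly/target/scala-$SCALA_VERSION/spark-assembly.jar"
echo "$CLASSPATH"
```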
-rwxr-xr-x  bin/compute-classpath.sh                                  | 34
-rwxr-xr-x  bin/spark-class                                           | 17
-rw-r--r--  core/src/main/scala/org/apache/spark/SparkContext.scala   |  3
-rw-r--r--  project/SparkBuild.scala                                  | 16
4 files changed, 40 insertions(+), 30 deletions(-)
diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh
index 7df43a555d..2cf4e381c1 100755
--- a/bin/compute-classpath.sh
+++ b/bin/compute-classpath.sh
@@ -38,8 +38,10 @@ else
   JAR_CMD="jar"
 fi
 
-# First check if we have a dependencies jar. If so, include binary classes with the deps jar
-if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
+# A developer option to prepend more recently compiled Spark classes
+if [ -n "$SPARK_PREPEND_CLASSES" ]; then
+  echo "NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark"\
+    "classes ahead of assembly." >&2
   CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SCALA_VERSION/classes"
@@ -51,17 +53,31 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/yarn/stable/target/scala-$SCALA_VERSION/classes"
+fi
- ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar 2>/dev/null)
+# Use spark-assembly jar from either RELEASE or assembly directory
+if [ -f "$FWDIR/RELEASE" ]; then
+ assembly_folder="$FWDIR"/lib
else
- # Else use spark-assembly jar from either RELEASE or assembly directory
- if [ -f "$FWDIR/RELEASE" ]; then
- ASSEMBLY_JAR=$(ls "$FWDIR"/lib/spark-assembly*hadoop*.jar 2>/dev/null)
- else
- ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*.jar 2>/dev/null)
- fi
+ assembly_folder="$ASSEMBLY_DIR"
fi
+num_jars=$(ls "$assembly_folder" | grep "spark-assembly.*hadoop.*\.jar" | wc -l)
+if [ "$num_jars" -eq "0" ]; then
+ echo "Failed to find Spark assembly in $assembly_folder"
+ echo "You need to build Spark before running this program."
+ exit 1
+fi
+if [ "$num_jars" -gt "1" ]; then
+ jars_list=$(ls "$assembly_folder" | grep "spark-assembly.*hadoop.*.jar")
+ echo "Found multiple Spark assembly jars in $assembly_folder:"
+ echo "$jars_list"
+ echo "Please remove all but one jar."
+ exit 1
+fi
+
+ASSEMBLY_JAR=$(ls "$assembly_folder"/spark-assembly*hadoop*.jar 2>/dev/null)
+
# Verify that versions of java used to build the jars and run Spark are compatible
jar_error_check=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" nonexistent/class/path 2>&1)
if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then
diff --git a/bin/spark-class b/bin/spark-class
index e884511010..cfe363a71d 100755
--- a/bin/spark-class
+++ b/bin/spark-class
@@ -108,23 +108,6 @@ fi
 export JAVA_OPTS
 # Attention: when changing the way the JAVA_OPTS are assembled, the change must be reflected in CommandUtils.scala!
 
-if [ ! -f "$FWDIR/RELEASE" ]; then
-  # Exit if the user hasn't compiled Spark
-  num_jars=$(ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/ | grep "spark-assembly.*hadoop.*.jar" | wc -l)
-  jars_list=$(ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/ | grep "spark-assembly.*hadoop.*.jar")
-  if [ "$num_jars" -eq "0" ]; then
-    echo "Failed to find Spark assembly in $FWDIR/assembly/target/scala-$SCALA_VERSION/" >&2
-    echo "You need to build Spark before running this program." >&2
-    exit 1
-  fi
-  if [ "$num_jars" -gt "1" ]; then
-    echo "Found multiple Spark assembly jars in $FWDIR/assembly/target/scala-$SCALA_VERSION:" >&2
-    echo "$jars_list"
-    echo "Please remove all but one jar."
-    exit 1
-  fi
-fi
-
 TOOLS_DIR="$FWDIR"/tools
 SPARK_TOOLS_JAR=""
 if [ -e "$TOOLS_DIR"/target/scala-$SCALA_VERSION/*assembly*[0-9Tg].jar ]; then
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index df15186195..8fbda2c667 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -290,6 +290,9 @@ class SparkContext(config: SparkConf) extends Logging {
       value <- Option(System.getenv(envKey)).orElse(Option(System.getProperty(propKey)))} {
       executorEnvs(envKey) = value
     }
+    Option(System.getenv("SPARK_PREPEND_CLASSES")).foreach { v =>
+      executorEnvs("SPARK_PREPEND_CLASSES") = v
+    }
     // The Mesos scheduler backend relies on this environment variable to set executor memory.
     // TODO: Set this only in the Mesos scheduler.
     executorEnvs("SPARK_EXECUTOR_MEMORY") = executorMemory + "m"
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index ecd9d70680..8b4885d3bb 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -90,7 +90,16 @@ object SparkBuild extends Build {
   lazy val assemblyProj = Project("assembly", file("assembly"), settings = assemblyProjSettings)
     .dependsOn(core, graphx, bagel, mllib, streaming, repl, sql) dependsOn(maybeYarn: _*) dependsOn(maybeHive: _*) dependsOn(maybeGanglia: _*)
 
-  lazy val assembleDeps = TaskKey[Unit]("assemble-deps", "Build assembly of dependencies and packages Spark projects")
+  lazy val assembleDepsTask = TaskKey[Unit]("assemble-deps")
+  lazy val assembleDeps = assembleDepsTask := {
+    println()
+    println("**** NOTE ****")
+    println("'sbt/sbt assemble-deps' is no longer supported.")
+    println("Instead create a normal assembly and:")
+    println("  export SPARK_PREPEND_CLASSES=1 (toggle on)")
+    println("  unset SPARK_PREPEND_CLASSES (toggle off)")
+    println()
+  }
 
   // A configuration to set an alternative publishLocalConfiguration
   lazy val MavenCompile = config("m2r") extend(Compile)
@@ -373,6 +382,7 @@ object SparkBuild extends Build {
"net.sf.py4j" % "py4j" % "0.8.1"
),
libraryDependencies ++= maybeAvro,
+ assembleDeps,
previousArtifact := sparkPreviousArtifact("spark-core")
)
@@ -584,9 +594,7 @@ object SparkBuild extends Build {
 
   def assemblyProjSettings = sharedSettings ++ Seq(
     name := "spark-assembly",
-    assembleDeps in Compile <<= (packageProjects.map(packageBin in Compile in _) ++ Seq(packageDependency in Compile)).dependOn,
-    jarName in assembly <<= version map { v => "spark-assembly-" + v + "-hadoop" + hadoopVersion + ".jar" },
-    jarName in packageDependency <<= version map { v => "spark-assembly-" + v + "-hadoop" + hadoopVersion + "-deps.jar" }
+    jarName in assembly <<= version map { v => "spark-assembly-" + v + "-hadoop" + hadoopVersion + ".jar" }
   ) ++ assemblySettings ++ extraAssemblySettings
 
   def extraAssemblySettings() = Seq(