path: root/bin
author     Aaron Davidson <aaron@databricks.com>    2014-04-06 17:48:41 -0700
committer  Patrick Wendell <pwendell@gmail.com>     2014-04-06 17:48:41 -0700
commit     4106558435889261243d186f5f0b51c5f9e98d56 (patch)
tree       6735046be9dbc5048867a619a951c39d884f3d1f /bin
parent     7ce52c4a7a07b0db5e7c1312b1920efb1165ce6a (diff)
SPARK-1314: Use SPARK_HIVE to determine if we include Hive in packaging
Previously, we based our decision regarding including datanucleus jars on the existence of a spark-hive-assembly jar, which was incidentally built whenever "sbt assembly" was run. This meant that a typical and previously supported pathway would start using hive jars.

This patch has the following features/bug fixes:

- Use of SPARK_HIVE (default false) to determine if we should include Hive in the assembly jar.
- Analogous feature in Maven with -Phive (previously, there was no support for adding Hive to any of our jars produced by Maven)
- assemble-deps fixed since we no longer use a different ASSEMBLY_DIR
- avoid adding log message in compute-classpath.sh to the classpath :)

Still TODO before mergeable:

- We need to download the datanucleus jars outside of sbt. Perhaps we can have spark-class download them if SPARK_HIVE is set similar to how sbt downloads itself.
- Spark SQL documentation updates.

Author: Aaron Davidson <aaron@databricks.com>

Closes #237 from aarondav/master and squashes the following commits:

5dc4329 [Aaron Davidson] Typo fixes
dd4f298 [Aaron Davidson] Doc update
dd1a365 [Aaron Davidson] Eliminate need for SPARK_HIVE at runtime by d/ling datanucleus from Maven
a9269b5 [Aaron Davidson] [WIP] Use SPARK_HIVE to determine if we include Hive in packaging
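For context, the two build paths described in the commit message would look roughly like the following; the exact launcher path and Maven goals are illustrative assumptions rather than part of this commit:

    # sbt: opt in to Hive via the SPARK_HIVE environment variable (default: false)
    SPARK_HIVE=true sbt/sbt assembly

    # Maven: the analogous -Phive profile includes Hive in the jars Maven produces
    mvn -Phive -DskipTests clean package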
Diffstat (limited to 'bin')
-rwxr-xr-x  bin/compute-classpath.sh   35
-rwxr-xr-x  bin/spark-class             2
2 files changed, 19 insertions, 18 deletions
diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh
index bef42df71c..be37102dc0 100755
--- a/bin/compute-classpath.sh
+++ b/bin/compute-classpath.sh
@@ -30,21 +30,7 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
# Build up classpath
CLASSPATH="$SPARK_CLASSPATH:$FWDIR/conf"
-# Support for interacting with Hive. Since hive pulls in a lot of dependencies that might break
-# existing Spark applications, it is not included in the standard spark assembly. Instead, we only
-# include it in the classpath if the user has explicitly requested it by running "sbt hive/assembly"
-# Hopefully we will find a way to avoid uber-jars entirely and deploy only the needed packages in
-# the future.
-if [ -f "$FWDIR"/sql/hive/target/scala-$SCALA_VERSION/spark-hive-assembly-*.jar ]; then
-
- # Datanucleus jars do not work if only included in the uberjar as plugin.xml metadata is lost.
- DATANUCLEUSJARS=$(JARS=("$FWDIR/lib_managed/jars"/datanucleus-*.jar); IFS=:; echo "${JARS[*]}")
- CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
-
- ASSEMBLY_DIR="$FWDIR/sql/hive/target/scala-$SCALA_VERSION/"
-else
- ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION/"
-fi
+ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION"
# First check if we have a dependencies jar. If so, include binary classes with the deps jar
if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
@@ -59,7 +45,7 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"
- DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark*-assembly*hadoop*-deps.jar`
+ DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar`
CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
else
# Else use spark-assembly jar from either RELEASE or assembly directory
@@ -71,6 +57,23 @@ else
CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
fi
+# When Hive support is needed, Datanucleus jars must be included on the classpath.
+# Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost.
+# Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is
+# built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark
+# assembly is built for Hive, before actually populating the CLASSPATH with the jars.
+# Note that this check order is faster (by up to half a second) in the case where Hive is not used.
+num_datanucleus_jars=$(ls "$FWDIR"/lib_managed/jars/ | grep "datanucleus-.*\\.jar" | wc -l)
+if [ $num_datanucleus_jars -gt 0 ]; then
+ AN_ASSEMBLY_JAR=${ASSEMBLY_JAR:-$DEPS_ASSEMBLY_JAR}
+ num_hive_files=$(jar tvf "$AN_ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null | wc -l)
+ if [ $num_hive_files -gt 0 ]; then
+ echo "Spark assembly has been built with Hive, including Datanucleus jars on classpath" 1>&2
+ DATANUCLEUSJARS=$(echo "$FWDIR/lib_managed/jars"/datanucleus-*.jar | tr " " :)
+ CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
+ fi
+fi
+
# Add test classes if we're running from SBT or Maven with SPARK_TESTING set to 1
if [[ $SPARK_TESTING == 1 ]]; then
CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/test-classes"
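The new block added above guards the classpath change twice: first it checks that the Datanucleus jars were downloaded into lib_managed/jars/, then it checks that the assembly actually contains Hive classes. The final join turns the glob of jar paths into a colon-separated classpath fragment; a small stand-alone illustration, with hypothetical file names and versions:

    # Suppose lib_managed/jars/ contains (versions are hypothetical):
    #   datanucleus-api-jdo-3.2.1.jar  datanucleus-core-3.2.2.jar  datanucleus-rdbms-3.2.1.jar
    DATANUCLEUSJARS=$(echo "$FWDIR/lib_managed/jars"/datanucleus-*.jar | tr " " :)
    echo "$DATANUCLEUSJARS"
    # -> .../datanucleus-api-jdo-3.2.1.jar:.../datanucleus-core-3.2.2.jar:.../datanucleus-rdbms-3.2.1.jar

Note that the echo/tr join assumes none of the jar paths contain spaces, which holds for the lib_managed layout the build produces.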
diff --git a/bin/spark-class b/bin/spark-class
index 0dcf0e156c..76fde3e448 100755
--- a/bin/spark-class
+++ b/bin/spark-class
@@ -154,5 +154,3 @@ if [ "$SPARK_PRINT_LAUNCH_COMMAND" == "1" ]; then
fi
exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"
-
-