aboutsummaryrefslogtreecommitdiff
path: root/bin
diff options
context:
space:
mode:
authorAndrew Or <andrewor14@gmail.com>2014-05-05 16:28:07 -0700
committerPatrick Wendell <pwendell@gmail.com>2014-05-05 16:28:07 -0700
commitcf0a8f0204bb8acdaf441b03c924c278fef08e28 (patch)
tree319660a9003529da27e8abfcdc2ab43c119327e6 /bin
parenta975a19f21e71f448b3fdb2ed4461e28ef439900 (diff)
downloadspark-cf0a8f0204bb8acdaf441b03c924c278fef08e28.tar.gz
spark-cf0a8f0204bb8acdaf441b03c924c278fef08e28.tar.bz2
spark-cf0a8f0204bb8acdaf441b03c924c278fef08e28.zip
[SPARK-1681] Include datanucleus jars in Spark Hive distribution
This copies the datanucleus jars over from `lib_managed` into `dist/lib`, if any. The `CLASSPATH` must also be updated to reflect this change. Author: Andrew Or <andrewor14@gmail.com> Closes #610 from andrewor14/hive-distribution and squashes the following commits: a4bc96f [Andrew Or] Rename search path in jar error check fa205e1 [Andrew Or] Merge branch 'master' of github.com:apache/spark into hive-distribution 7855f58 [Andrew Or] Have jar command respect JAVA_HOME + check for jar errors both cases c16bbfd [Andrew Or] Merge branch 'master' of github.com:apache/spark into hive-distribution 32f6826 [Andrew Or] Leave the double colons 940a1bb [Andrew Or] Add back 2>/dev/null 58357cc [Andrew Or] Include datanucleus jars in Spark distribution built with Hive support
Diffstat (limited to 'bin')
-rwxr-xr-xbin/compute-classpath.sh58
1 files changed, 33 insertions, 25 deletions
diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh
index 8dc547b379..7df43a555d 100755
--- a/bin/compute-classpath.sh
+++ b/bin/compute-classpath.sh
@@ -32,8 +32,8 @@ CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH:$FWDIR/conf"
ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION"
-if [ -n "${JAVA_HOME}" ]; then
- JAR_CMD="${JAVA_HOME}/bin/jar"
+if [ -n "$JAVA_HOME" ]; then
+ JAR_CMD="$JAVA_HOME/bin/jar"
else
JAR_CMD="jar"
fi
@@ -52,40 +52,48 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/yarn/stable/target/scala-$SCALA_VERSION/classes"
- DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar`
- CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
+ ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar 2>/dev/null)
else
# Else use spark-assembly jar from either RELEASE or assembly directory
if [ -f "$FWDIR/RELEASE" ]; then
- ASSEMBLY_JAR=`ls "$FWDIR"/lib/spark-assembly*hadoop*.jar`
+ ASSEMBLY_JAR=$(ls "$FWDIR"/lib/spark-assembly*hadoop*.jar 2>/dev/null)
else
- ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*.jar`
+ ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*.jar 2>/dev/null)
fi
- jar_error_check=$($JAR_CMD -tf $ASSEMBLY_JAR org/apache/spark/SparkContext 2>&1)
- if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then
- echo "Loading Spark jar with '$JAR_CMD' failed. "
- echo "This is likely because Spark was compiled with Java 7 and run "
- echo "with Java 6. (see SPARK-1703). Please use Java 7 to run Spark "
- echo "or build Spark with Java 6."
- exit 1
- fi
- CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
fi
+# Verify that versions of java used to build the jars and run Spark are compatible
+jar_error_check=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" nonexistent/class/path 2>&1)
+if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then
+ echo "Loading Spark jar with '$JAR_CMD' failed. "
+ echo "This is likely because Spark was compiled with Java 7 and run "
+ echo "with Java 6. (see SPARK-1703). Please use Java 7 to run Spark "
+ echo "or build Spark with Java 6."
+ exit 1
+fi
+
+CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
+
# When Hive support is needed, Datanucleus jars must be included on the classpath.
-# Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost.
+# Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost.
# Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is
# built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark
# assembly is built for Hive, before actually populating the CLASSPATH with the jars.
# Note that this check order is faster (by up to half a second) in the case where Hive is not used.
-num_datanucleus_jars=$(ls "$FWDIR"/lib_managed/jars/ 2>/dev/null | grep "datanucleus-.*\\.jar" | wc -l)
-if [ $num_datanucleus_jars -gt 0 ]; then
- AN_ASSEMBLY_JAR=${ASSEMBLY_JAR:-$DEPS_ASSEMBLY_JAR}
- num_hive_files=$(jar tvf "$AN_ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null | wc -l)
- if [ $num_hive_files -gt 0 ]; then
+if [ -f "$FWDIR/RELEASE" ]; then
+ datanucleus_dir="$FWDIR"/lib
+else
+ datanucleus_dir="$FWDIR"/lib_managed/jars
+fi
+
+datanucleus_jars=$(find "$datanucleus_dir" 2>/dev/null | grep "datanucleus-.*\\.jar")
+datanucleus_jars=$(echo "$datanucleus_jars" | tr "\n" : | sed s/:$//g)
+
+if [ -n "$datanucleus_jars" ]; then
+ hive_files=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null)
+ if [ -n "$hive_files" ]; then
echo "Spark assembly has been built with Hive, including Datanucleus jars on classpath" 1>&2
- DATANUCLEUSJARS=$(echo "$FWDIR/lib_managed/jars"/datanucleus-*.jar | tr " " :)
- CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
+ CLASSPATH="$CLASSPATH:$datanucleus_jars"
fi
fi
@@ -105,10 +113,10 @@ fi
# Add hadoop conf dir if given -- otherwise FileSystem.*, etc fail !
# Note, this assumes that there is either a HADOOP_CONF_DIR or YARN_CONF_DIR which hosts
# the configurtion files.
-if [ "x" != "x$HADOOP_CONF_DIR" ]; then
+if [ -n "$HADOOP_CONF_DIR" ]; then
CLASSPATH="$CLASSPATH:$HADOOP_CONF_DIR"
fi
-if [ "x" != "x$YARN_CONF_DIR" ]; then
+if [ -n "$YARN_CONF_DIR" ]; then
CLASSPATH="$CLASSPATH:$YARN_CONF_DIR"
fi