From cd4ed293262e2349794c13467d1737974385c019 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Wed, 23 Apr 2014 10:19:32 -0700 Subject: SPARK-1119 and other build improvements 1. Makes assembly and examples jar naming consistent in maven/sbt. 2. Updates make-distribution.sh to use Maven and fixes some bugs. 3. Updates the create-release script to call make-distribution script. Author: Patrick Wendell Closes #502 from pwendell/make-distribution and squashes the following commits: 1a97f0d [Patrick Wendell] SPARK-1119 and other build improvements --- assembly/pom.xml | 2 +- bin/compute-classpath.sh | 4 +-- bin/run-example | 11 +++--- dev/create-release/create-release.sh | 18 +++++----- examples/pom.xml | 2 +- make-distribution.sh | 70 ++++++++++++++++++++++++------------ project/SparkBuild.scala | 4 ++- 7 files changed, 70 insertions(+), 41 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index 923bf47f70..bdb3880649 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -33,7 +33,7 @@ scala-${scala.binary.version} - ${project.artifactId}-${project.version}-hadoop${hadoop.version}.jar + spark-assembly-${project.version}-hadoop${hadoop.version}.jar ${project.build.directory}/${spark.jar.dir}/${spark.jar.basename} spark /usr/share/spark diff --git a/bin/compute-classpath.sh b/bin/compute-classpath.sh index 2a2bb376fd..3a59f599fd 100755 --- a/bin/compute-classpath.sh +++ b/bin/compute-classpath.sh @@ -50,9 +50,9 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then else # Else use spark-assembly jar from either RELEASE or assembly directory if [ -f "$FWDIR/RELEASE" ]; then - ASSEMBLY_JAR=`ls "$FWDIR"/jars/spark*-assembly*.jar` + ASSEMBLY_JAR=`ls "$FWDIR"/lib/spark-assembly*hadoop*.jar` else - ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark*-assembly*hadoop*.jar` + ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*.jar` fi CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR" fi diff --git a/bin/run-example b/bin/run-example index b2999198a8..d8a94f2e31 100755 
--- a/bin/run-example +++ b/bin/run-example @@ -40,12 +40,15 @@ fi # Figure out the JAR file that our examples were packaged into. This includes a bit of a hack # to avoid the -sources and -doc packages that are built by publish-local. EXAMPLES_DIR="$FWDIR"/examples -SPARK_EXAMPLES_JAR="" -if [ -e "$EXAMPLES_DIR"/target/scala-$SCALA_VERSION/*assembly*[0-9Tg].jar ]; then - export SPARK_EXAMPLES_JAR=`ls "$EXAMPLES_DIR"/target/scala-$SCALA_VERSION/*assembly*[0-9Tg].jar` + +if [ -f "$FWDIR/RELEASE" ]; then + export SPARK_EXAMPLES_JAR=`ls "$FWDIR"/lib/spark-examples-*hadoop*.jar` +elif [ -e "$EXAMPLES_DIR"/target/scala-$SCALA_VERSION/spark-examples-*hadoop*.jar ]; then + export SPARK_EXAMPLES_JAR=`ls "$EXAMPLES_DIR"/target/scala-$SCALA_VERSION/spark-examples-*hadoop*.jar` fi + if [[ -z $SPARK_EXAMPLES_JAR ]]; then - echo "Failed to find Spark examples assembly in $FWDIR/examples/target" >&2 + echo "Failed to find Spark examples assembly in $FWDIR/lib or $FWDIR/examples/target" >&2 echo "You need to build Spark with sbt/sbt assembly before running this program" >&2 exit 1 fi diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh index fb9d9f9e07..ad38c8d53e 100755 --- a/dev/create-release/create-release.sh +++ b/dev/create-release/create-release.sh @@ -83,15 +83,15 @@ rm -rf spark-$RELEASE_VERSION make_binary_release() { NAME=$1 - MAVEN_FLAGS=$2 - + FLAGS=$2 cp -r spark spark-$RELEASE_VERSION-bin-$NAME + cd spark-$RELEASE_VERSION-bin-$NAME - export MAVEN_OPTS="-Xmx3g -XX:MaxPermSize=1g -XX:ReservedCodeCacheSize=1g" - mvn $MAVEN_FLAGS -DskipTests clean package - find . -name test-classes -type d | xargs rm -rf - find . -name classes -type d | xargs rm -rf + ./make-distribution.sh $FLAGS --name $NAME --tgz cd .. + cp spark-$RELEASE_VERSION-bin-$NAME/spark-$RELEASE_VERSION-bin-$NAME.tgz . 
+ rm -rf spark-$RELEASE_VERSION-bin-$NAME + tar cvzf spark-$RELEASE_VERSION-bin-$NAME.tgz spark-$RELEASE_VERSION-bin-$NAME echo $GPG_PASSPHRASE | gpg --passphrase-fd 0 --armour \ --output spark-$RELEASE_VERSION-bin-$NAME.tgz.asc \ @@ -105,9 +105,9 @@ make_binary_release() { rm -rf spark-$RELEASE_VERSION-bin-$NAME } -make_binary_release "hadoop1" "-Dhadoop.version=1.0.4" -make_binary_release "cdh4" "-Dhadoop.version=2.0.0-mr1-cdh4.2.0" -make_binary_release "hadoop2" "-Pyarn -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0" +make_binary_release "hadoop1" "--hadoop 1.0.4" +make_binary_release "cdh4" "--hadoop 2.0.0-mr1-cdh4.2.0" +make_binary_release "hadoop2" "--with-yarn --hadoop 2.2.0" # Copy data echo "Copying release tarballs" diff --git a/examples/pom.xml b/examples/pom.xml index 0b6212b5d1..704d6df7c5 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -187,7 +187,7 @@ maven-shade-plugin false - ${project.build.directory}/scala-${scala.binary.version}/${project.artifactId}-assembly-${project.version}.jar + ${project.build.directory}/scala-${scala.binary.version}/spark-examples-${project.version}-hadoop${hadoop.version}.jar *:* diff --git a/make-distribution.sh b/make-distribution.sh index 5c780fcbda..83dfc74585 100755 --- a/make-distribution.sh +++ b/make-distribution.sh @@ -28,6 +28,8 @@ # --tgz: Additionally creates spark-$VERSION-bin.tar.gz # --hadoop VERSION: Builds against specified version of Hadoop. # --with-yarn: Enables support for Hadoop YARN. +# --with-hive: Enable support for reading Hive tables. +# --name: A moniker for the release target. Defaults to the Hadoop version. 
# # Recommended deploy/testing procedure (standalone mode): # 1) Rsync / deploy the dist/ dir to one host @@ -41,25 +43,20 @@ FWDIR="$(cd `dirname $0`; pwd)" DISTDIR="$FWDIR/dist" -# Get version from SBT -export TERM=dumb # Prevents color codes in SBT output - -VERSIONSTRING=$($FWDIR/sbt/sbt "show version") - +VERSION=$(mvn help:evaluate -Dexpression=project.version |grep -v "INFO") if [ $? == -1 ] ;then - echo -e "You need sbt installed and available on your path." - echo -e "Download sbt from http://www.scala-sbt.org/" + echo -e "You need Maven installed to build Spark." + echo -e "Download Maven from https://maven.apache.org." exit -1; fi -VERSION=$(echo "${VERSIONSTRING}" | tail -1 | cut -f 2 | sed 's/^\([a-zA-Z0-9.-]*\).*/\1/') -echo "Version is ${VERSION}" - # Initialize defaults SPARK_HADOOP_VERSION=1.0.4 SPARK_YARN=false +SPARK_HIVE=false SPARK_TACHYON=false MAKE_TGZ=false +NAME=none # Parse arguments while (( "$#" )); do @@ -71,23 +68,37 @@ while (( "$#" )); do --with-yarn) SPARK_YARN=true ;; + --with-hive) + SPARK_HIVE=true + ;; --with-tachyon) SPARK_TACHYON=true ;; --tgz) MAKE_TGZ=true ;; + --name) + NAME="$2" + shift + ;; esac shift done +if [ "$NAME" == "none" ]; then + NAME=$SPARK_HADOOP_VERSION +fi + +echo "Spark version is $VERSION" + if [ "$MAKE_TGZ" == "true" ]; then - echo "Making spark-$VERSION-hadoop_$SPARK_HADOOP_VERSION-bin.tar.gz" + echo "Making spark-$VERSION-bin-$NAME.tgz" else - echo "Making distribution for Spark $VERSION in $DISTDIR..." + echo "Making distribution for Spark $VERSION in $DISTDIR..." 
fi echo "Hadoop version set to $SPARK_HADOOP_VERSION" +echo "Release name set to $NAME" if [ "$SPARK_YARN" == "true" ]; then echo "YARN enabled" else @@ -100,20 +111,32 @@ else echo "Tachyon Disabled" fi -# Build fat JAR -export SPARK_HADOOP_VERSION -export SPARK_YARN +# Build uber fat JAR cd $FWDIR -"sbt/sbt" "assembly/assembly" +export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m" + +if [ "$SPARK_HIVE" == "true" ]; then + MAYBE_HIVE="-Phive" +else + MAYBE_HIVE="" +fi + +if [ "$SPARK_YARN" == "true" ]; then + mvn clean package -DskipTests -Pyarn -Dhadoop.version=$SPARK_HADOOP_VERSION \ + -Dyarn.version=$SPARK_HADOOP_VERSION $MAYBE_HIVE +else + mvn clean package -DskipTests -Dhadoop.version=$SPARK_HADOOP_VERSION $MAYBE_HIVE +fi # Make directories rm -rf "$DISTDIR" -mkdir -p "$DISTDIR/jars" +mkdir -p "$DISTDIR/lib" echo "Spark $VERSION built for Hadoop $SPARK_HADOOP_VERSION" > "$DISTDIR/RELEASE" # Copy jars -cp $FWDIR/assembly/target/scala*/*assembly*hadoop*.jar "$DISTDIR/jars/" +cp $FWDIR/assembly/target/scala*/*assembly*hadoop*.jar "$DISTDIR/lib/" +cp $FWDIR/examples/target/scala*/spark-examples*.jar "$DISTDIR/lib/" # Copy other things mkdir "$DISTDIR"/conf @@ -135,16 +158,16 @@ if [ "$SPARK_TACHYON" == "true" ]; then wget "$TACHYON_URL" tar xf "tachyon-${TACHYON_VERSION}-bin.tar.gz" - cp "tachyon-${TACHYON_VERSION}/target/tachyon-${TACHYON_VERSION}-jar-with-dependencies.jar" "$DISTDIR/jars" + cp "tachyon-${TACHYON_VERSION}/target/tachyon-${TACHYON_VERSION}-jar-with-dependencies.jar" "$DISTDIR/lib" mkdir -p "$DISTDIR/tachyon/src/main/java/tachyon/web" cp -r "tachyon-${TACHYON_VERSION}"/{bin,conf,libexec} "$DISTDIR/tachyon" cp -r "tachyon-${TACHYON_VERSION}"/src/main/java/tachyon/web/resources "$DISTDIR/tachyon/src/main/java/tachyon/web" if [[ `uname -a` == Darwin* ]]; then # need to run sed differently on osx - nl=$'\n'; sed -i "" -e "s|export TACHYON_JAR=\$TACHYON_HOME/target/\(.*\)|# This is set for spark's make-distribution\\$nl export 
TACHYON_JAR=\$TACHYON_HOME/../jars/\1|" "$DISTDIR/tachyon/libexec/tachyon-config.sh" + nl=$'\n'; sed -i "" -e "s|export TACHYON_JAR=\$TACHYON_HOME/target/\(.*\)|# This is set for spark's make-distribution\\$nl export TACHYON_JAR=\$TACHYON_HOME/../lib/\1|" "$DISTDIR/tachyon/libexec/tachyon-config.sh" else - sed -i "s|export TACHYON_JAR=\$TACHYON_HOME/target/\(.*\)|# This is set for spark's make-distribution\n export TACHYON_JAR=\$TACHYON_HOME/../jars/\1|" "$DISTDIR/tachyon/libexec/tachyon-config.sh" + sed -i "s|export TACHYON_JAR=\$TACHYON_HOME/target/\(.*\)|# This is set for spark's make-distribution\n export TACHYON_JAR=\$TACHYON_HOME/../lib/\1|" "$DISTDIR/tachyon/libexec/tachyon-config.sh" fi popd > /dev/null @@ -152,8 +175,9 @@ if [ "$SPARK_TACHYON" == "true" ]; then fi if [ "$MAKE_TGZ" == "true" ]; then - TARDIR="$FWDIR/spark-$VERSION" + TARDIR_NAME=spark-$VERSION-bin-$NAME + TARDIR="$FWDIR/$TARDIR_NAME" cp -r "$DISTDIR" "$TARDIR" - tar -zcf "spark-$VERSION-hadoop_$SPARK_HADOOP_VERSION-bin.tar.gz" -C "$FWDIR" "spark-$VERSION" + tar czf "spark-$VERSION-bin-$NAME.tgz" -C "$FWDIR" "$TARDIR_NAME" rm -rf "$TARDIR" fi diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 852358501a..b8af2bbd2e 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -411,7 +411,9 @@ object SparkBuild extends Build { ) def examplesSettings = sharedSettings ++ Seq( - name := "spark-examples", + name := "spark-examples", + jarName in assembly <<= version map { + v => "spark-examples-" + v + "-hadoop" + hadoopVersion + ".jar" }, libraryDependencies ++= Seq( "com.twitter" %% "algebird-core" % "0.1.11", "org.apache.hbase" % "hbase" % HBASE_VERSION excludeAll(excludeNetty, excludeAsm, excludeOldAsm, excludeCommonsLogging), -- cgit v1.2.3