#!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # # Script to create a binary distribution for easy deploys of Spark. # The distribution directory defaults to dist/ but can be overridden below. # The distribution contains fat (assembly) jars that include the Scala library, # so it is completely self contained. # It does not contain source or *.class files. set -o pipefail set -e set -x # Figure out where the Spark framework is installed SPARK_HOME="$(cd "`dirname "$0"`/.."; pwd)" DISTDIR="$SPARK_HOME/dist" MAKE_TGZ=false MAKE_PIP=false MAKE_R=false NAME=none MVN="$SPARK_HOME/build/mvn" function exit_with_usage { echo "make-distribution.sh - tool for making binary distributions of Spark" echo "" echo "usage:" cl_options="[--name] [--tgz] [--pip] [--r] [--mvn ]" echo "make-distribution.sh $cl_options " echo "See Spark's \"Building Spark\" doc for correct Maven options." echo "" exit 1 } # Parse arguments while (( "$#" )); do case $1 in --tgz) MAKE_TGZ=true ;; --pip) MAKE_PIP=true ;; --r) MAKE_R=true ;; --mvn) MVN="$2" shift ;; --name) NAME="$2" shift ;; --help) exit_with_usage ;; *) break ;; esac shift done if [ -z "$JAVA_HOME" ]; then # Fall back on JAVA_HOME from rpm, if found if [ $(command -v rpm) ]; then RPM_JAVA_HOME="$(rpm -E %java_home 2>/dev/null)" if [ "$RPM_JAVA_HOME" != "%java_home" ]; then JAVA_HOME="$RPM_JAVA_HOME" echo "No JAVA_HOME set, proceeding with '$JAVA_HOME' learned from rpm" fi fi if [ -z "$JAVA_HOME" ]; then if [ `command -v java` ]; then # If java is in /usr/bin/java, we want /usr JAVA_HOME="$(dirname $(dirname $(which java)))" fi fi fi if [ -z "$JAVA_HOME" ]; then echo "Error: JAVA_HOME is not set, cannot proceed." exit -1 fi if [ $(command -v git) ]; then GITREV=$(git rev-parse --short HEAD 2>/dev/null || :) if [ ! -z "$GITREV" ]; then GITREVSTRING=" (git revision $GITREV)" fi unset GITREV fi if [ ! "$(command -v "$MVN")" ] ; then echo -e "Could not locate Maven command: '$MVN'." echo -e "Specify the Maven command with the --mvn flag" exit -1; fi VERSION=$("$MVN" help:evaluate -Dexpression=project.version $@ 2>/dev/null | grep -v "INFO" | tail -n 1) SCALA_VERSION=$("$MVN" help:evaluate -Dexpression=scala.binary.version $@ 2>/dev/null\ | grep -v "INFO"\ | tail -n 1) SPARK_HADOOP_VERSION=$("$MVN" help:evaluate -Dexpression=hadoop.version $@ 2>/dev/null\ | grep -v "INFO"\ | tail -n 1) SPARK_HIVE=$("$MVN" help:evaluate -Dexpression=project.activeProfiles -pl sql/hive $@ 2>/dev/null\ | grep -v "INFO"\ | fgrep --count "hive";\ # Reset exit status to 0, otherwise the script stops here if the last grep finds nothing\ # because we use "set -o pipefail" echo -n) if [ "$NAME" == "none" ]; then NAME=$SPARK_HADOOP_VERSION fi echo "Spark version is $VERSION" if [ "$MAKE_TGZ" == "true" ]; then echo "Making spark-$VERSION-bin-$NAME.tgz" else echo "Making distribution for Spark $VERSION in '$DISTDIR'..." fi # Build uber fat JAR cd "$SPARK_HOME" export MAVEN_OPTS="${MAVEN_OPTS:--Xmx2g -XX:ReservedCodeCacheSize=512m}" # Store the command as an array because $MVN variable might have spaces in it. # Normal quoting tricks don't work. # See: http://mywiki.wooledge.org/BashFAQ/050 BUILD_COMMAND=("$MVN" -T 1C clean package -DskipTests $@) # Actually build the jar echo -e "\nBuilding with..." echo -e "\$ ${BUILD_COMMAND[@]}\n" "${BUILD_COMMAND[@]}" # Make directories rm -rf "$DISTDIR" mkdir -p "$DISTDIR/jars" echo "Spark $VERSION$GITREVSTRING built for Hadoop $SPARK_HADOOP_VERSION" > "$DISTDIR/RELEASE" echo "Build flags: $@" >> "$DISTDIR/RELEASE" # Copy jars cp "$SPARK_HOME"/assembly/target/scala*/jars/* "$DISTDIR/jars/" # Only create the yarn directory if the yarn artifacts were build. if [ -f "$SPARK_HOME"/common/network-yarn/target/scala*/spark-*-yarn-shuffle.jar ]; then mkdir "$DISTDIR/yarn" cp "$SPARK_HOME"/common/network-yarn/target/scala*/spark-*-yarn-shuffle.jar "$DISTDIR/yarn" fi # Copy examples and dependencies mkdir -p "$DISTDIR/examples/jars" cp "$SPARK_HOME"/examples/target/scala*/jars/* "$DISTDIR/examples/jars" # Deduplicate jars that have already been packaged as part of the main Spark dependencies. for f in "$DISTDIR"/examples/jars/*; do name=$(basename "$f") if [ -f "$DISTDIR/jars/$name" ]; then rm "$DISTDIR/examples/jars/$name" fi done # Copy example sources (needed for python and SQL) mkdir -p "$DISTDIR/examples/src/main" cp -r "$SPARK_HOME/examples/src/main" "$DISTDIR/examples/src/" # Copy license and ASF files cp "$SPARK_HOME/LICENSE" "$DISTDIR" cp -r "$SPARK_HOME/licenses" "$DISTDIR" cp "$SPARK_HOME/NOTICE" "$DISTDIR" if [ -e "$SPARK_HOME/CHANGES.txt" ]; then cp "$SPARK_HOME/CHANGES.txt" "$DISTDIR" fi # Copy data files cp -r "$SPARK_HOME/data" "$DISTDIR" # Make pip package if [ "$MAKE_PIP" == "true" ]; then echo "Building python distribution package" pushd "$SPARK_HOME/python" > /dev/null # Delete the egg info file if it exists, this can cache older setup files. rm -rf pyspark.egg-info || echo "No existing egg info file, skipping deletion" python setup.py sdist popd > /dev/null else echo "Skipping building python distribution package" fi # Make R package - this is used for both CRAN release and packing R layout into distribution if [ "$MAKE_R" == "true" ]; then echo "Building R source package" R_PACKAGE_VERSION=`grep Version "$SPARK_HOME/R/pkg/DESCRIPTION" | awk '{print $NF}'` pushd "$SPARK_HOME/R" > /dev/null # Build source package and run full checks # Do not source the check-cran.sh - it should be run from where it is for it to set SPARK_HOME NO_TESTS=1 "$SPARK_HOME/R/check-cran.sh" # Move R source package to match the Spark release version if the versions are not the same. # NOTE(shivaram): `mv` throws an error on Linux if source and destination are same file if [ "$R_PACKAGE_VERSION" != "$VERSION" ]; then mv "$SPARK_HOME/R/SparkR_$R_PACKAGE_VERSION.tar.gz" "$SPARK_HOME/R/SparkR_$VERSION.tar.gz" fi # Install source package to get it to generate vignettes rds files, etc. VERSION=$VERSION "$SPARK_HOME/R/install-source-package.sh" popd > /dev/null else echo "Skipping building R source package" fi # Copy other things mkdir "$DISTDIR/conf" cp "$SPARK_HOME"/conf/*.template "$DISTDIR/conf" cp "$SPARK_HOME/README.md" "$DISTDIR" cp -r "$SPARK_HOME/bin" "$DISTDIR" cp -r "$SPARK_HOME/python" "$DISTDIR" # Remove the python distribution from dist/ if we built it if [ "$MAKE_PIP" == "true" ]; then rm -f "$DISTDIR"/python/dist/pyspark-*.tar.gz fi cp -r "$SPARK_HOME/sbin" "$DISTDIR" # Copy SparkR if it exists if [ -d "$SPARK_HOME/R/lib/SparkR" ]; then mkdir -p "$DISTDIR/R/lib" cp -r "$SPARK_HOME/R/lib/SparkR" "$DISTDIR/R/lib" cp "$SPARK_HOME/R/lib/sparkr.zip" "$DISTDIR/R/lib" fi if [ "$MAKE_TGZ" == "true" ]; then TARDIR_NAME=spark-$VERSION-bin-$NAME TARDIR="$SPARK_HOME/$TARDIR_NAME" rm -rf "$TARDIR" cp -r "$DISTDIR" "$TARDIR" tar czf "spark-$VERSION-bin-$NAME.tgz" -C "$SPARK_HOME" "$TARDIR_NAME" rm -rf "$TARDIR" fi