From 50a0496a43f09d70593419efc38587c8441843bf Mon Sep 17 00:00:00 2001
From: Brennon York
Date: Wed, 17 Jun 2015 12:00:34 -0700
Subject: [SPARK-7017] [BUILD] [PROJECT INFRA] Refactor dev/run-tests into Python

All, this is a first attempt at refactoring `dev/run-tests` into Python.
Initially I merely converted all Bash calls over to Python, then moved to a
much more modular approach (more functions, moved the calls around, etc.).
What is here is the initial culmination and should provide a great base to
various downstream issues (e.g. SPARK-7016, modularize / parallelize testing,
etc.). Would love comments / suggestions for this initial first step!

/cc srowen pwendell nchammas

Author: Brennon York

Closes #5694 from brennonyork/SPARK-7017 and squashes the following commits:

154ed73 [Brennon York] updated finding java binary if JAVA_HOME not set
3922a85 [Brennon York] removed necessary passed in variable
f9fbe54 [Brennon York] reverted doc test change
8135518 [Brennon York] removed the test check for documentation changes until jenkins can get updated
05d435b [Brennon York] added check for jekyll install
22edb78 [Brennon York] add check if jekyll isn't installed on the path
2dff136 [Brennon York] fixed pep8 whitespace errors
767a668 [Brennon York] fixed path joining issues, ensured docs actually build on doc changes
c42cf9a [Brennon York] unpack set operations with splat (*)
fb85a41 [Brennon York] fixed minor set bug
0379833 [Brennon York] minor doc addition to print the changed modules
aa03d9e [Brennon York] added documentation builds as a top level test component, altered high level project changes to properly execute core tests only when necessary, changed variable names for simplicity
ec1ae78 [Brennon York] minor name changes, bug fixes
b7c72b9 [Brennon York] reverting streaming context
03fdd7b [Brennon York] fixed the tuple () wraps around example lambda
705d12e [Brennon York] changed example to comply with pep3113 supporting python3
60b3d51 [Brennon York] prepend rather than append onto PATH
7d2f5e2 [Brennon York] updated python tests to remove unused variable
2898717 [Brennon York] added a change to streaming test to check if it only runs streaming tests
eb684b6 [Brennon York] fixed sbt_test_goals reference error
db7ae6f [Brennon York] reverted SPARK_HOME from start of command
1ecca26 [Brennon York] fixed merge conflicts
2fcdfc0 [Brennon York] testing targte branch dump on jenkins
1f607b1 [Brennon York] finalizing revisions to modular tests
8afbe93 [Brennon York] made error codes a global
0629de8 [Brennon York] updated to refactor and remove various small bugs, removed pep8 complaints
d90ab2d [Brennon York] fixed merge conflicts, ensured that for regular builds both core and sql tests always run
b1248dc [Brennon York] exec python rather than running python and exiting with return code
f9deba1 [Brennon York] python to python2 and removed newline
6d0a052 [Brennon York] incorporated merge conflicts with SPARK-7249
f950010 [Brennon York] removed building hive-0.12.0 per SPARK-6908
703f095 [Brennon York] fixed merge conflicts
b1ca593 [Brennon York] reverted the sparkR test
afeb093 [Brennon York] updated to make sparkR test fail
1dada6b [Brennon York] reverted pyspark test failure
9a592ec [Brennon York] reverted mima exclude issue, added pyspark test failure
d825aa4 [Brennon York] revert build break, add mima break
f041d8a [Brennon York] added space from commented import to now test build breaking
983f2a2 [Brennon York] comment out import to fail build test
2386785 [Brennon York] Merge
remote-tracking branch 'upstream/master' into SPARK-7017 76335fb [Brennon York] reverted rat license issue for sparkconf e4a96cc [Brennon York] removed the import error and added license error, fixed the way run-tests and run-tests.py report their error codes 56d3cb9 [Brennon York] changed test back and commented out import to break compile b37328c [Brennon York] fixed typo and added default return is no error block was found in the environment 7613558 [Brennon York] updated to return the proper env variable for return codes a5bd445 [Brennon York] reverted license, changed test in shuffle to fail 803143a [Brennon York] removed license file for SparkContext b0b2604 [Brennon York] comment out import to see if build fails and returns properly 83e80ef [Brennon York] attempt at better python output when called from bash c095fa6 [Brennon York] removed another wait() call 26e18e8 [Brennon York] removed unnecessary wait() 07210a9 [Brennon York] minor doc string change for java version with namedtuple update ec03bf3 [Brennon York] added namedtuple for java version to add readability 2cb413b [Brennon York] upcased global variables, changes various calling methods from check_output to check_call 639f1e9 [Brennon York] updated with pep8 rules, fixed minor bugs, added run-tests file in bash to call the run-tests.py script 3c53a1a [Brennon York] uncomment the scala tests :) 6126c4f [Brennon York] refactored run-tests into python --- dev/run-tests | 219 +-------------- dev/run-tests-codes.sh | 11 +- dev/run-tests-jenkins | 2 + dev/run-tests.py | 536 +++++++++++++++++++++++++++++++++++++ examples/src/main/python/kmeans.py | 2 +- 5 files changed, 546 insertions(+), 224 deletions(-) create mode 100755 dev/run-tests.py diff --git a/dev/run-tests b/dev/run-tests index d178e2a460..a00d9f0c27 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -17,224 +17,7 @@ # limitations under the License. # -# Go to the Spark project root directory FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" -# Clean up work directory and caches -rm -rf ./work -rm -rf ~/.ivy2/local/org.apache.spark -rm -rf ~/.ivy2/cache/org.apache.spark - -source "$FWDIR/dev/run-tests-codes.sh" - -CURRENT_BLOCK=$BLOCK_GENERAL - -function handle_error () { - echo "[error] Got a return code of $? on line $1 of the run-tests script." - exit $CURRENT_BLOCK -} - - -# Build against the right version of Hadoop. -{ - if [ -n "$AMPLAB_JENKINS_BUILD_PROFILE" ]; then - if [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop1.0" ]; then - export SBT_MAVEN_PROFILES_ARGS="-Phadoop-1 -Dhadoop.version=1.2.1" - elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.0" ]; then - export SBT_MAVEN_PROFILES_ARGS="-Phadoop-1 -Dhadoop.version=2.0.0-mr1-cdh4.1.1" - elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.2" ]; then - export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.2" - elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.3" ]; then - export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0" - fi - fi - - if [ -z "$SBT_MAVEN_PROFILES_ARGS" ]; then - export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0" - fi -} - -export SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Pkinesis-asl" - -# Determine Java path and version. -{ - if test -x "$JAVA_HOME/bin/java"; then - declare java_cmd="$JAVA_HOME/bin/java" - else - declare java_cmd=java - fi - - # We can't use sed -r -e due to OS X / BSD compatibility; hence, all the parentheses. 
- JAVA_VERSION=$( - $java_cmd -version 2>&1 \ - | grep -e "^java version" --max-count=1 \ - | sed "s/java version \"\(.*\)\.\(.*\)\.\(.*\)\"/\1\2/" - ) - - if [ "$JAVA_VERSION" -lt 18 ]; then - echo "[warn] Java 8 tests will not run because JDK version is < 1.8." - fi -} - -# Only run Hive tests if there are SQL changes. -# Partial solution for SPARK-1455. -if [ -n "$AMPLAB_JENKINS" ]; then - target_branch="$ghprbTargetBranch" - git fetch origin "$target_branch":"$target_branch" - - # AMP_JENKINS_PRB indicates if the current build is a pull request build. - if [ -n "$AMP_JENKINS_PRB" ]; then - # It is a pull request build. - sql_diffs=$( - git diff --name-only "$target_branch" \ - | grep -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh" - ) - - non_sql_diffs=$( - git diff --name-only "$target_branch" \ - | grep -v -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh" - ) - - if [ -n "$sql_diffs" ]; then - echo "[info] Detected changes in SQL. Will run Hive test suite." - _RUN_SQL_TESTS=true - - if [ -z "$non_sql_diffs" ]; then - echo "[info] Detected no changes except in SQL. Will only run SQL tests." - _SQL_TESTS_ONLY=true - fi - fi - else - # It is a regular build. We should run SQL tests. - _RUN_SQL_TESTS=true - fi -fi - -set -o pipefail -trap 'handle_error $LINENO' ERR - -echo "" -echo "=========================================================================" -echo "Running Apache RAT checks" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_RAT - -./dev/check-license - -echo "" -echo "=========================================================================" -echo "Running Scala style checks" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_SCALA_STYLE - -./dev/lint-scala - -echo "" -echo "=========================================================================" -echo "Running Python style checks" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_PYTHON_STYLE - -./dev/lint-python - -echo "" -echo "=========================================================================" -echo "Building Spark" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_BUILD - -{ - HIVE_BUILD_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive -Phive-thriftserver" - echo "[info] Compile with Hive 0.13.1" - [ -d "lib_managed" ] && rm -rf lib_managed - echo "[info] Building Spark with these arguments: $HIVE_BUILD_ARGS" - - if [ "${AMPLAB_JENKINS_BUILD_TOOL}" == "maven" ]; then - build/mvn $HIVE_BUILD_ARGS clean package -DskipTests - else - echo -e "q\n" \ - | build/sbt $HIVE_BUILD_ARGS package assembly/assembly streaming-kafka-assembly/assembly \ - | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including" - fi -} - -echo "" -echo "=========================================================================" -echo "Detecting binary incompatibilities with MiMa" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_MIMA - -./dev/mima - -echo "" -echo "=========================================================================" -echo "Running Spark unit tests" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_SPARK_UNIT_TESTS - -{ - # If the Spark SQL tests are enabled, run the tests with the Hive profiles enabled. - # This must be a single argument, as it is. 
- if [ -n "$_RUN_SQL_TESTS" ]; then - SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive -Phive-thriftserver" - fi - - if [ -n "$_SQL_TESTS_ONLY" ]; then - # This must be an array of individual arguments. Otherwise, having one long string - # will be interpreted as a single test, which doesn't work. - SBT_MAVEN_TEST_ARGS=("catalyst/test" "sql/test" "hive/test" "hive-thriftserver/test" "mllib/test") - else - SBT_MAVEN_TEST_ARGS=("test") - fi - - echo "[info] Running Spark tests with these arguments: $SBT_MAVEN_PROFILES_ARGS ${SBT_MAVEN_TEST_ARGS[@]}" - - if [ "${AMPLAB_JENKINS_BUILD_TOOL}" == "maven" ]; then - build/mvn test $SBT_MAVEN_PROFILES_ARGS --fail-at-end - else - # NOTE: echo "q" is needed because sbt on encountering a build file with failure - # (either resolution or compilation) prompts the user for input either q, r, etc - # to quit or retry. This echo is there to make it not block. - # NOTE: Do not quote $SBT_MAVEN_PROFILES_ARGS or else it will be interpreted as a - # single argument! - # "${SBT_MAVEN_TEST_ARGS[@]}" is cool because it's an array. - # QUESTION: Why doesn't 'yes "q"' work? - # QUESTION: Why doesn't 'grep -v -e "^\[info\] Resolving"' work? - echo -e "q\n" \ - | build/sbt $SBT_MAVEN_PROFILES_ARGS "${SBT_MAVEN_TEST_ARGS[@]}" \ - | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including" - fi -} - -echo "" -echo "=========================================================================" -echo "Running PySpark tests" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_PYSPARK_UNIT_TESTS - -# add path for python 3 in jenkins -export PATH="${PATH}:/home/anaconda/envs/py3k/bin" -./python/run-tests - -echo "" -echo "=========================================================================" -echo "Running SparkR tests" -echo "=========================================================================" - -CURRENT_BLOCK=$BLOCK_SPARKR_UNIT_TESTS - -if [ $(command -v R) ]; then - ./R/install-dev.sh - ./R/run-tests.sh -else - echo "Ignoring SparkR tests as R was not found in PATH" -fi - +exec python -u ./dev/run-tests.py diff --git a/dev/run-tests-codes.sh b/dev/run-tests-codes.sh index 154e01255b..f4b238e1b7 100644 --- a/dev/run-tests-codes.sh +++ b/dev/run-tests-codes.sh @@ -21,8 +21,9 @@ readonly BLOCK_GENERAL=10 readonly BLOCK_RAT=11 readonly BLOCK_SCALA_STYLE=12 readonly BLOCK_PYTHON_STYLE=13 -readonly BLOCK_BUILD=14 -readonly BLOCK_MIMA=15 -readonly BLOCK_SPARK_UNIT_TESTS=16 -readonly BLOCK_PYSPARK_UNIT_TESTS=17 -readonly BLOCK_SPARKR_UNIT_TESTS=18 +readonly BLOCK_DOCUMENTATION=14 +readonly BLOCK_BUILD=15 +readonly BLOCK_MIMA=16 +readonly BLOCK_SPARK_UNIT_TESTS=17 +readonly BLOCK_PYSPARK_UNIT_TESTS=18 +readonly BLOCK_SPARKR_UNIT_TESTS=19 diff --git a/dev/run-tests-jenkins b/dev/run-tests-jenkins index 641b0ff3c4..c4d39d95d5 100755 --- a/dev/run-tests-jenkins +++ b/dev/run-tests-jenkins @@ -210,6 +210,8 @@ done failing_test="Scala style tests" elif [ "$test_result" -eq "$BLOCK_PYTHON_STYLE" ]; then failing_test="Python style tests" + elif [ "$test_result" -eq "$BLOCK_DOCUMENTATION" ]; then + failing_test="to generate documentation" elif [ "$test_result" -eq "$BLOCK_BUILD" ]; then failing_test="to build" elif [ "$test_result" -eq "$BLOCK_MIMA" ]; then diff --git a/dev/run-tests.py b/dev/run-tests.py new file mode 100755 index 0000000000..04a7b45741 --- /dev/null +++ b/dev/run-tests.py @@ -0,0 +1,536 @@ +#!/usr/bin/env python2 + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# 
contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import re +import sys +import shutil +import subprocess +from collections import namedtuple + +SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") +USER_HOME = os.environ.get("HOME") + + +def get_error_codes(err_code_file): + """Function to retrieve all block numbers from the `run-tests-codes.sh` + file to maintain backwards compatibility with the `run-tests-jenkins` + script""" + + with open(err_code_file, 'r') as f: + err_codes = [e.split()[1].strip().split('=') + for e in f if e.startswith("readonly")] + return dict(err_codes) + + +ERROR_CODES = get_error_codes(os.path.join(SPARK_HOME, "dev/run-tests-codes.sh")) + + +def exit_from_command_with_retcode(cmd, retcode): + print "[error] running", cmd, "; received return code", retcode + sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) + + +def rm_r(path): + """Given an arbitrary path properly remove it with the correct python + construct if it exists + - from: http://stackoverflow.com/a/9559881""" + + if os.path.isdir(path): + shutil.rmtree(path) + elif os.path.exists(path): + os.remove(path) + + +def run_cmd(cmd): + """Given a command as a list of arguments will attempt to execute the + command from the determined SPARK_HOME directory and, on failure, print + an error message""" + + if not isinstance(cmd, list): + cmd = cmd.split() + try: + subprocess.check_call(cmd) + except subprocess.CalledProcessError as e: + exit_from_command_with_retcode(e.cmd, e.returncode) + + +def is_exe(path): + """Check if a given path is an executable file + - from: http://stackoverflow.com/a/377028""" + + return os.path.isfile(path) and os.access(path, os.X_OK) + + +def which(program): + """Find and return the given program by its absolute path or 'None' + - from: http://stackoverflow.com/a/377028""" + + fpath, fname = os.path.split(program) + + if fpath: + if is_exe(program): + return program + else: + for path in os.environ.get("PATH").split(os.pathsep): + path = path.strip('"') + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + return None + + +def determine_java_executable(): + """Will return the path of the java executable that will be used by Spark's + tests or `None`""" + + # Any changes in the way that Spark's build detects java must be reflected + # here. 
Currently the build looks for $JAVA_HOME/bin/java then falls back to + # the `java` executable on the path + + java_home = os.environ.get("JAVA_HOME") + + # check if there is an executable at $JAVA_HOME/bin/java + java_exe = which(os.path.join(java_home, "bin", "java")) if java_home else None + # if the java_exe wasn't set, check for a `java` version on the $PATH + return java_exe if java_exe else which("java") + + +JavaVersion = namedtuple('JavaVersion', ['major', 'minor', 'patch', 'update']) + + +def determine_java_version(java_exe): + """Given a valid java executable will return its version in named tuple format + with accessors '.major', '.minor', '.patch', '.update'""" + + raw_output = subprocess.check_output([java_exe, "-version"], + stderr=subprocess.STDOUT) + raw_version_str = raw_output.split('\n')[0] # eg 'java version "1.8.0_25"' + version_str = raw_version_str.split()[-1].strip('"') # eg '1.8.0_25' + version, update = version_str.split('_') # eg ['1.8.0', '25'] + + # map over the values and convert them to integers + version_info = [int(x) for x in version.split('.') + [update]] + + return JavaVersion(major=version_info[0], + minor=version_info[1], + patch=version_info[2], + update=version_info[3]) + + +def set_title_and_block(title, err_block): + os.environ["CURRENT_BLOCK"] = ERROR_CODES[err_block] + line_str = '=' * 72 + + print + print line_str + print title + print line_str + + +def run_apache_rat_checks(): + set_title_and_block("Running Apache RAT checks", "BLOCK_RAT") + run_cmd([os.path.join(SPARK_HOME, "dev", "check-license")]) + + +def run_scala_style_checks(): + set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE") + run_cmd([os.path.join(SPARK_HOME, "dev", "lint-scala")]) + + +def run_python_style_checks(): + set_title_and_block("Running Python style checks", "BLOCK_PYTHON_STYLE") + run_cmd([os.path.join(SPARK_HOME, "dev", "lint-python")]) + + +def build_spark_documentation(): + set_title_and_block("Building Spark Documentation", "BLOCK_DOCUMENTATION") + os.environ["PRODUCTION"] = "1 jekyll build" + + os.chdir(os.path.join(SPARK_HOME, "docs")) + + jekyll_bin = which("jekyll") + + if not jekyll_bin: + print "[error] Cannot find a version of `jekyll` on the system; please", + print "install one and retry to build documentation." + sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) + else: + run_cmd([jekyll_bin, "build"]) + + os.chdir(SPARK_HOME) + + +def exec_maven(mvn_args=[]): + """Will call Maven in the current directory with the list of mvn_args passed + in and returns the subprocess for any further processing""" + + run_cmd([os.path.join(SPARK_HOME, "build", "mvn")] + mvn_args) + + +def exec_sbt(sbt_args=[]): + """Will call SBT in the current directory with the list of mvn_args passed + in and returns the subprocess for any further processing""" + + sbt_cmd = [os.path.join(SPARK_HOME, "build", "sbt")] + sbt_args + + sbt_output_filter = re.compile("^.*[info].*Resolving" + "|" + + "^.*[warn].*Merging" + "|" + + "^.*[info].*Including") + + # NOTE: echo "q" is needed because sbt on encountering a build file + # with failure (either resolution or compilation) prompts the user for + # input either q, r, etc to quit or retry. This echo is there to make it + # not block. 
+ echo_proc = subprocess.Popen(["echo", "\"q\n\""], stdout=subprocess.PIPE) + sbt_proc = subprocess.Popen(sbt_cmd, + stdin=echo_proc.stdout, + stdout=subprocess.PIPE) + echo_proc.wait() + for line in iter(sbt_proc.stdout.readline, ''): + if not sbt_output_filter.match(line): + print line, + retcode = sbt_proc.wait() + + if retcode > 0: + exit_from_command_with_retcode(sbt_cmd, retcode) + + +def get_hadoop_profiles(hadoop_version): + """Return a list of profiles indicating which Hadoop version to use from + a Hadoop version tag.""" + + sbt_maven_hadoop_profiles = { + "hadoop1.0": ["-Phadoop-1", "-Dhadoop.version=1.0.4"], + "hadoop2.0": ["-Phadoop-1", "-Dhadoop.version=2.0.0-mr1-cdh4.1.1"], + "hadoop2.2": ["-Pyarn", "-Phadoop-2.2"], + "hadoop2.3": ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"], + } + + if hadoop_version in sbt_maven_hadoop_profiles: + return sbt_maven_hadoop_profiles[hadoop_version] + else: + print "[error] Could not find", hadoop_version, "in the list. Valid options", + print "are", sbt_maven_hadoop_profiles.keys() + sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) + + +def get_build_profiles(hadoop_version="hadoop2.3", + enable_base_profiles=True, + enable_hive_profiles=False, + enable_doc_profiles=False): + """Returns a list of hadoop profiles to be used as looked up from the passed in hadoop profile + key with the option of adding on the base and hive profiles.""" + + base_profiles = ["-Pkinesis-asl"] + hive_profiles = ["-Phive", "-Phive-thriftserver"] + doc_profiles = [] + hadoop_profiles = get_hadoop_profiles(hadoop_version) + + build_profiles = hadoop_profiles + + if enable_base_profiles: + build_profiles += base_profiles + + if enable_hive_profiles: + build_profiles += hive_profiles + + if enable_doc_profiles: + build_profiles += doc_profiles + + return build_profiles + + +def build_spark_maven(hadoop_version): + # we always build with Hive support even if we skip Hive tests in most builds + build_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=True) + mvn_goals = ["clean", "package", "-DskipTests"] + profiles_and_goals = build_profiles + mvn_goals + + print "[info] Building Spark (w/Hive 0.13.1) using Maven with these arguments:", + print " ".join(profiles_and_goals) + + exec_maven(profiles_and_goals) + + +def build_spark_sbt(hadoop_version): + build_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=True) + sbt_goals = ["package", + "assembly/assembly", + "streaming-kafka-assembly/assembly"] + profiles_and_goals = build_profiles + sbt_goals + + print "[info] Building Spark (w/Hive 0.13.1) using SBT with these arguments:", + print " ".join(profiles_and_goals) + + exec_sbt(profiles_and_goals) + + +def build_apache_spark(build_tool, hadoop_version): + """Will build Spark against Hive v0.13.1 given the passed in build tool (either `sbt` or + `maven`). Defaults to using `sbt`.""" + + set_title_and_block("Building Spark", "BLOCK_BUILD") + + rm_r("lib_managed") + + if build_tool == "maven": + build_spark_maven(hadoop_version) + else: + build_spark_sbt(hadoop_version) + + +def detect_binary_inop_with_mima(): + set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA") + run_cmd([os.path.join(SPARK_HOME, "dev", "mima")]) + + +def identify_changed_modules(test_env): + """Given the passed in environment will determine the changed modules and + return them as a set. If the environment is local, will simply run all tests. 
+ If run under the `amplab_jenkins` environment will determine the changed files + as compared to the `ghprbTargetBranch` and execute the necessary set of tests + to provide coverage for the changed code.""" + changed_modules = set() + + if test_env == "amplab_jenkins": + target_branch = os.environ["ghprbTargetBranch"] + + run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)]) + + raw_output = subprocess.check_output(['git', 'diff', '--name-only', target_branch]) + # remove any empty strings + changed_files = [f for f in raw_output.split('\n') if f] + + sql_files = [f for f in changed_files + if any(f.startswith(p) for p in + ["sql/", + "bin/spark-sql", + "sbin/start-thriftserver.sh", + "examples/src/main/java/org/apache/spark/examples/sql/", + "examples/src/main/scala/org/apache/spark/examples/sql/"])] + mllib_files = [f for f in changed_files + if any(f.startswith(p) for p in + ["examples/src/main/java/org/apache/spark/examples/mllib/", + "examples/src/main/scala/org/apache/spark/examples/mllib", + "data/mllib/", + "mllib/"])] + streaming_files = [f for f in changed_files + if any(f.startswith(p) for p in + ["examples/scala-2.10/", + "examples/src/main/java/org/apache/spark/examples/streaming/", + "examples/src/main/scala/org/apache/spark/examples/streaming/", + "external/", + "extras/java8-tests/", + "extras/kinesis-asl/", + "streaming/"])] + graphx_files = [f for f in changed_files + if any(f.startswith(p) for p in + ["examples/src/main/scala/org/apache/spark/examples/graphx/", + "graphx/"])] + doc_files = [f for f in changed_files if f.startswith("docs/")] + + # union together all changed top level project files + top_level_project_files = set().union(*[set(f) for f in [sql_files, + mllib_files, + streaming_files, + graphx_files, + doc_files]]) + changed_core_files = set(changed_files).difference(top_level_project_files) + + if changed_core_files: + changed_modules.add("CORE") + if sql_files: + print "[info] Detected changes in SQL. Will run Hive test suite." + changed_modules.add("SQL") + if mllib_files: + print "[info] Detected changes in MLlib. Will run MLlib test suite." + changed_modules.add("MLLIB") + if streaming_files: + print "[info] Detected changes in Streaming. Will run Streaming test suite." + changed_modules.add("STREAMING") + if graphx_files: + print "[info] Detected changes in GraphX. Will run GraphX test suite." + changed_modules.add("GRAPHX") + if doc_files: + print "[info] Detected changes in documentation. Will build spark with documentation." 
+ changed_modules.add("DOCS") + + return changed_modules + else: + # we aren't in the Amplab environment so simply run all tests + changed_modules.add("ALL") + return changed_modules + + +def run_scala_tests_maven(test_profiles): + mvn_test_goals = ["test", "--fail-at-end"] + profiles_and_goals = test_profiles + mvn_test_goals + + print "[info] Running Spark tests using Maven with these arguments:", + print " ".join(profiles_and_goals) + + exec_maven(profiles_and_goals) + + +def run_scala_tests_sbt(test_modules, test_profiles): + # declare the variable for reference + sbt_test_goals = None + + if "ALL" in test_modules: + sbt_test_goals = ["test"] + else: + # if we only have changes in SQL, MLlib, Streaming, or GraphX then build + # a custom test list + if "SQL" in test_modules and "CORE" not in test_modules: + sbt_test_goals = ["catalyst/test", + "sql/test", + "hive/test", + "hive-thriftserver/test", + "mllib/test", + "examples/test"] + if "MLLIB" in test_modules and "CORE" not in test_modules: + sbt_test_goals += ["mllib/test", "examples/test"] + if "STREAMING" in test_modules and "CORE" not in test_modules: + sbt_test_goals += ["streaming/test", + "streaming-flume/test", + "streaming-flume-sink/test", + "streaming-kafka/test", + "streaming-mqtt/test", + "streaming-twitter/test", + "streaming-zeromq/test", + "examples/test"] + if "GRAPHX" in test_modules and "CORE" not in test_modules: + sbt_test_goals += ["graphx/test", "examples/test"] + if not sbt_test_goals: + sbt_test_goals = ["test"] + + profiles_and_goals = test_profiles + sbt_test_goals + + print "[info] Running Spark tests using SBT with these arguments:", + print " ".join(profiles_and_goals) + + exec_sbt(profiles_and_goals) + + +def run_scala_tests(build_tool, hadoop_version, test_modules): + """Function to properly execute all tests passed in as a set from the + `determine_test_suites` function""" + set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") + + test_modules = set(test_modules) + + hive_profiles = ("SQL" in test_modules) + test_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=hive_profiles) + + if build_tool == "maven": + run_scala_tests_maven(test_profiles) + else: + run_scala_tests_sbt(test_modules, test_profiles) + + +def run_python_tests(): + set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") + + run_cmd([os.path.join(SPARK_HOME, "python", "run-tests")]) + + +def run_sparkr_tests(): + set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS") + + if which("R"): + run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")]) + run_cmd([os.path.join(SPARK_HOME, "R", "run-tests.sh")]) + else: + print "Ignoring SparkR tests as R was not found in PATH" + + +def main(): + # Ensure the user home directory (HOME) is valid and is an absolute directory + if not USER_HOME or not os.path.isabs(USER_HOME): + print "[error] Cannot determine your home directory as an absolute path;", + print "ensure the $HOME environment variable is set properly." + sys.exit(1) + + os.chdir(SPARK_HOME) + + rm_r(os.path.join(SPARK_HOME, "work")) + rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark")) + rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark")) + + os.environ["CURRENT_BLOCK"] = ERROR_CODES["BLOCK_GENERAL"] + + java_exe = determine_java_executable() + + if not java_exe: + print "[error] Cannot find a version of `java` on the system; please", + print "install one and retry." 
+ sys.exit(2) + + java_version = determine_java_version(java_exe) + + if java_version.minor < 8: + print "[warn] Java 8 tests will not run because JDK version is < 1.8." + + if os.environ.get("AMPLAB_JENKINS"): + # if we're on the Amplab Jenkins build servers setup variables + # to reflect the environment settings + build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt") + hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3") + test_env = "amplab_jenkins" + # add path for Python3 in Jenkins if we're calling from a Jenkins machine + os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get("PATH") + else: + # else we're running locally and can use local settings + build_tool = "sbt" + hadoop_version = "hadoop2.3" + test_env = "local" + + print "[info] Using build tool", build_tool, "with profile", hadoop_version, + print "under environment", test_env + + # determine high level changes + changed_modules = identify_changed_modules(test_env) + print "[info] Found the following changed modules:", ", ".join(changed_modules) + + # license checks + run_apache_rat_checks() + + # style checks + run_scala_style_checks() + run_python_style_checks() + + # determine if docs were changed and if we're inside the amplab environment + # note - the below commented out until *all* Jenkins workers can get `jekyll` installed + # if "DOCS" in changed_modules and test_env == "amplab_jenkins": + # build_spark_documentation() + + # spark build + build_apache_spark(build_tool, hadoop_version) + + # backwards compatibility checks + detect_binary_inop_with_mima() + + # run the test suites + run_scala_tests(build_tool, hadoop_version, changed_modules) + run_python_tests() + run_sparkr_tests() + +if __name__ == "__main__": + main() diff --git a/examples/src/main/python/kmeans.py b/examples/src/main/python/kmeans.py index 1456c87312..0ea7cfb702 100755 --- a/examples/src/main/python/kmeans.py +++ b/examples/src/main/python/kmeans.py @@ -68,7 +68,7 @@ if __name__ == "__main__": closest = data.map( lambda p: (closestPoint(p, kPoints), (p, 1))) pointStats = closest.reduceByKey( - lambda (p1, c1), (p2, c2): (p1 + p2, c1 + c2)) + lambda p1_c1, p2_c2: (p1_c1[0] + p2_c2[0], p1_c1[1] + p2_c2[1])) newPoints = pointStats.map( lambda st: (st[0], st[1][0] / st[1][1])).collect() -- cgit v1.2.3
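
Editor's note on the final kmeans.py hunk: Python 3 removed tuple parameter unpacking (PEP 3113), so a combiner written as `lambda (p1, c1), (p2, c2): (p1 + p2, c1 + c2)` is a syntax error under Python 3; the patch rewrites it to take plain tuple arguments and index into them. Below is a minimal, hedged sketch of that same combining pattern outside Spark; the data, the `combine` helper, and the `stats` dictionary are illustrative names invented for this example, not part of the patch.

    # Hypothetical sketch (not from the patch): combining (point_sum, count)
    # pairs the way the rewritten reduceByKey combiner does, written so it
    # parses under both Python 2 and Python 3 (PEP 3113 removed tuple
    # parameters, so we index into the tuples instead of unpacking them
    # in the function signature).

    # Each element is (cluster_key, (sum_of_points, count)).
    pairs = [(0, (1.0, 1)), (0, (2.0, 1)), (1, (5.0, 1))]

    def combine(p1_c1, p2_c2):
        # Same arithmetic as the patched lambda: add the sums, add the counts.
        return (p1_c1[0] + p2_c2[0], p1_c1[1] + p2_c2[1])

    stats = {}
    for key, value in pairs:
        stats[key] = combine(stats[key], value) if key in stats else value

    print(stats)  # {0: (3.0, 2), 1: (5.0, 1)}

The original tuple-unpacking lambda still works on Python 2, so the index-based form is simply the variant that is accepted by both interpreter versions, which is why the example file could be changed without affecting Python 2 users.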