From 665466dfff4f89196627a0777eabd3d3894cd296 Mon Sep 17 00:00:00 2001
From: Josh Rosen
Date: Thu, 27 Dec 2012 22:47:37 -0800
Subject: Simplify PySpark installation.

- Bundle the Py4J binaries, since Py4J is hard to install.

- Use Spark's `run` script to launch the Py4J gateway, inheriting the
  settings in spark-env.sh.

With these changes, (hopefully) nothing more than running `sbt/sbt package`
will be necessary to run PySpark.
---
 pyspark/README                  | 23 ++---------------------
 pyspark/lib/PY4J_LICENSE.txt    | 27 +++++++++++++++++++++++++++
 pyspark/lib/PY4J_VERSION.txt    |  1 +
 pyspark/lib/py4j0.7.egg         | Bin 0 -> 191756 bytes
 pyspark/lib/py4j0.7.jar         | Bin 0 -> 103286 bytes
 pyspark/pyspark-shell           |  2 +-
 pyspark/pyspark/__init__.py     |  3 +++
 pyspark/pyspark/java_gateway.py | 35 ++++++++++++++++++++++++++---------
 pyspark/pyspark/shell.py        | 19 +++++++++++--------
 pyspark/requirements.txt        |  7 -------
 pyspark/run-pyspark             |  2 +-
 run                             |  4 ++++
 run2.cmd                        |  2 ++
 13 files changed, 78 insertions(+), 47 deletions(-)
 create mode 100644 pyspark/lib/PY4J_LICENSE.txt
 create mode 100644 pyspark/lib/PY4J_VERSION.txt
 create mode 100644 pyspark/lib/py4j0.7.egg
 create mode 100644 pyspark/lib/py4j0.7.jar
 delete mode 100644 pyspark/requirements.txt

diff --git a/pyspark/README b/pyspark/README
index 461176de7d..d8d521c72c 100644
--- a/pyspark/README
+++ b/pyspark/README
@@ -32,30 +32,11 @@ The `pyspark/pyspark/examples` directory contains a few complete examples.
 
 ## Installing PySpark
-
-PySpark requires a development version of Py4J, a Python library for
-interacting with Java processes. It can be installed from
-https://github.com/bartdag/py4j; make sure to install a version that
-contains at least the commits through b7924aabe9.
-
-PySpark requires the `argparse` module, which is included in Python 2.7
-and is is available for Python 2.6 through `pip` or `easy_install`.
-
-PySpark uses the `PYTHONPATH` environment variable to search for Python
-classes; Py4J should be on this path, along with any libraries used by
-PySpark programs. `PYTHONPATH` will be automatically shipped to worker
-machines, but the files that it points to must be present on each
-machine.
-
-PySpark requires the Spark assembly JAR, which can be created by running
-`sbt/sbt assembly` in the Spark directory.
-
-Additionally, `SPARK_HOME` should be set to the location of the Spark
+#
+To use PySpark, `SPARK_HOME` should be set to the location of the Spark
 package.
 
 ## Running PySpark
 
 The easiest way to run PySpark is to use the `run-pyspark` and
 `pyspark-shell` scripts, which are included in the `pyspark` directory.
-These scripts automatically load the `spark-conf.sh` file, set
-`SPARK_HOME`, and add the `pyspark` package to the `PYTHONPATH`.
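As the trimmed README now states, a driver program only needs `SPARK_HOME` in its environment; the `pyspark` package takes care of finding the bundled Py4J. A minimal driver sketch under those assumptions (the `pyspark.context` import path and the `SparkContext` constructor arguments follow this tree's layout and are assumptions, not part of the patch):

    import os
    os.environ.setdefault("SPARK_HOME", "/path/to/spark")  # normally exported by run-pyspark

    from pyspark.context import SparkContext  # assumed module path in pyspark/pyspark/context.py

    sc = SparkContext("local", "ReadmeExample")  # assumed (master, jobName) signature
    print sc.parallelize([1, 2, 3, 4]).map(lambda x: x * x).collect()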
diff --git a/pyspark/lib/PY4J_LICENSE.txt b/pyspark/lib/PY4J_LICENSE.txt
new file mode 100644
index 0000000000..a70279ca14
--- /dev/null
+++ b/pyspark/lib/PY4J_LICENSE.txt
@@ -0,0 +1,27 @@
+
+Copyright (c) 2009-2011, Barthelemy Dagenais All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+- The name of the author may not be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/pyspark/lib/PY4J_VERSION.txt b/pyspark/lib/PY4J_VERSION.txt
new file mode 100644
index 0000000000..04a0cd52a8
--- /dev/null
+++ b/pyspark/lib/PY4J_VERSION.txt
@@ -0,0 +1 @@
+b7924aabe9c5e63f0a4d8bbd17019534c7ec014e
diff --git a/pyspark/lib/py4j0.7.egg b/pyspark/lib/py4j0.7.egg
new file mode 100644
index 0000000000..f8a339d8ee
Binary files /dev/null and b/pyspark/lib/py4j0.7.egg differ
diff --git a/pyspark/lib/py4j0.7.jar b/pyspark/lib/py4j0.7.jar
new file mode 100644
index 0000000000..73b7ddb7d1
Binary files /dev/null and b/pyspark/lib/py4j0.7.jar differ
diff --git a/pyspark/pyspark-shell b/pyspark/pyspark-shell
index 4ed3e6010c..e3736826e8 100755
--- a/pyspark/pyspark-shell
+++ b/pyspark/pyspark-shell
@@ -1,3 +1,3 @@
-#!/bin/sh
+#!/usr/bin/env bash
 FWDIR="`dirname $0`"
 exec $FWDIR/run-pyspark $FWDIR/pyspark/shell.py "$@"
diff --git a/pyspark/pyspark/__init__.py b/pyspark/pyspark/__init__.py
index e69de29bb2..549c2d2711 100644
--- a/pyspark/pyspark/__init__.py
+++ b/pyspark/pyspark/__init__.py
@@ -0,0 +1,3 @@
+import sys
+import os
+sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], "pyspark/lib/py4j0.7.egg"))
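The three-line `__init__.py` above is what makes the bundled egg usable: prepending the egg's path to `sys.path` lets `import py4j` resolve from inside the zip archive via zipimport, with no pip or easy_install step. The same technique in isolation, with an added existence check (the helper name is illustrative, not part of the patch):

    import os
    import sys

    def add_bundled_egg(egg_path):
        # Eggs are zip files; an entry on sys.path is enough for zipimport
        # to serve the packages inside, so no installation step is needed.
        if not os.path.exists(egg_path):
            raise RuntimeError("bundled dependency not found: %s" % egg_path)
        if egg_path not in sys.path:
            sys.path.insert(0, egg_path)

    add_bundled_egg(os.path.join(os.environ["SPARK_HOME"], "pyspark/lib/py4j0.7.egg"))
    from py4j.java_gateway import JavaGateway  # now resolvable from the egg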
diff --git a/pyspark/pyspark/java_gateway.py b/pyspark/pyspark/java_gateway.py
index 3726bcbf17..d4a4434c05 100644
--- a/pyspark/pyspark/java_gateway.py
+++ b/pyspark/pyspark/java_gateway.py
@@ -1,19 +1,36 @@
-import glob
 import os
-from py4j.java_gateway import java_import, JavaGateway
+from subprocess import Popen, PIPE
+from threading import Thread
+from py4j.java_gateway import java_import, JavaGateway, GatewayClient
 
 SPARK_HOME = os.environ["SPARK_HOME"]
 
-assembly_jar = glob.glob(os.path.join(SPARK_HOME, "core/target") + \
-    "/spark-core-assembly-*.jar")[0]
-
-# TODO: what if multiple assembly jars are found?
-
 
 def launch_gateway():
-    gateway = JavaGateway.launch_gateway(classpath=assembly_jar,
-                                         javaopts=["-Xmx256m"], die_on_exit=True)
+    # Launch the Py4j gateway using Spark's run command so that we pick up the
+    # proper classpath and SPARK_MEM settings from spark-env.sh
+    command = [os.path.join(SPARK_HOME, "run"), "py4j.GatewayServer",
+               "--die-on-broken-pipe", "0"]
+    proc = Popen(command, stdout=PIPE, stdin=PIPE)
+    # Determine which ephemeral port the server started on:
+    port = int(proc.stdout.readline())
+    # Create a thread to echo output from the GatewayServer, which is required
+    # for Java log output to show up:
+    class EchoOutputThread(Thread):
+        def __init__(self, stream):
+            Thread.__init__(self)
+            self.daemon = True
+            self.stream = stream
+
+        def run(self):
+            while True:
+                line = self.stream.readline()
+                print line,
+    EchoOutputThread(proc.stdout).start()
+    # Connect to the gateway
+    gateway = JavaGateway(GatewayClient(port=port))
+    # Import the classes used by PySpark
     java_import(gateway.jvm, "spark.api.java.*")
     java_import(gateway.jvm, "spark.api.python.*")
     java_import(gateway.jvm, "scala.Tuple2")
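The new `launch_gateway` relies on a small handshake: the `GatewayServer` is started on port 0 with `--die-on-broken-pipe` (so the JVM exits when the Python side goes away), it prints the ephemeral port it actually bound as its first line of stdout, and Python reads that line before handing the rest of the stream to a daemon echo thread so Java log output stays visible. The same pattern in isolation (`server_command` is a placeholder for any command that announces its port this way):

    from subprocess import Popen, PIPE
    from threading import Thread

    def launch_announcing_server(server_command):
        proc = Popen(server_command, stdout=PIPE, stdin=PIPE)
        # The child's first stdout line is the port it bound:
        port = int(proc.stdout.readline())

        # Echo everything else the child prints so its logs are not swallowed:
        def echo(stream):
            while True:
                line = stream.readline()
                if not line:
                    break  # child closed its stdout
                print line,

        echoer = Thread(target=echo, args=(proc.stdout,))
        echoer.daemon = True  # do not keep the interpreter alive at exit
        echoer.start()
        return proc, port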
diff --git a/pyspark/pyspark/shell.py b/pyspark/pyspark/shell.py
index 7012884abc..bd39b0283f 100644
--- a/pyspark/pyspark/shell.py
+++ b/pyspark/pyspark/shell.py
@@ -1,7 +1,7 @@
 """
 An interactive shell.
 """
-import argparse  # argparse is avaiable for Python < 2.7 through easy_install.
+import optparse  # argparse is preferred, but it is not included with Python < 2.7
 import code
 import sys
 
@@ -21,10 +21,13 @@ def main(master='local', ipython=False):
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument("master", help="Spark master host (default='local')",
-                        nargs='?', type=str, default="local")
-    parser.add_argument("-i", "--ipython", help="Run IPython shell",
-                        action="store_true")
-    args = parser.parse_args()
-    main(args.master, args.ipython)
+    usage = "usage: %prog [options] master"
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option("-i", "--ipython", help="Run IPython shell",
+                      action="store_true")
+    (options, args) = parser.parse_args()
+    if args:
+        master = args[0]
+    else:
+        master = 'local'
+    main(master, options.ipython)
diff --git a/pyspark/requirements.txt b/pyspark/requirements.txt
deleted file mode 100644
index 2464ca0074..0000000000
--- a/pyspark/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-# The Python API relies on some new features from the Py4J development branch.
-# pip can't install Py4J from git because the setup.py file for the Python
-# package is not at the root of the git repository. It may be possible to
-# install Py4J from git once https://github.com/pypa/pip/pull/526 is merged.
-
-# git+git://github.com/bartdag/py4j.git@b7924aabe9c5e63f0a4d8bbd17019534c7ec014e
-argparse
diff --git a/pyspark/run-pyspark b/pyspark/run-pyspark
index 9c5e027962..f8039b8038 100755
--- a/pyspark/run-pyspark
+++ b/pyspark/run-pyspark
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 # Figure out where the Scala framework is installed
 FWDIR="$(cd `dirname $0`; cd ../; pwd)"
diff --git a/run b/run
index 15db23bbe0..8fa61b086f 100755
--- a/run
+++ b/run
@@ -40,6 +40,7 @@ CORE_DIR="$FWDIR/core"
 REPL_DIR="$FWDIR/repl"
 EXAMPLES_DIR="$FWDIR/examples"
 BAGEL_DIR="$FWDIR/bagel"
+PYSPARK_DIR="$FWDIR/pyspark"
 
 # Build up classpath
 CLASSPATH="$SPARK_CLASSPATH"
@@ -61,6 +62,9 @@ for jar in `find $REPL_DIR/lib -name '*jar'`; do
   CLASSPATH+=":$jar"
 done
 CLASSPATH+=":$BAGEL_DIR/target/scala-$SCALA_VERSION/classes"
+for jar in `find $PYSPARK_DIR/lib -name '*jar'`; do
+  CLASSPATH+=":$jar"
+done
 export CLASSPATH # Needed for spark-shell
 
 # Figure out whether to run our class with java or with the scala launcher.
diff --git a/run2.cmd b/run2.cmd
index 097718b526..6024740726 100644
--- a/run2.cmd
+++ b/run2.cmd
@@ -34,6 +34,7 @@ set CORE_DIR=%FWDIR%core
 set REPL_DIR=%FWDIR%repl
 set EXAMPLES_DIR=%FWDIR%examples
 set BAGEL_DIR=%FWDIR%bagel
+set PYSPARK_DIR=%FWDIR%pyspark
 
 rem Build up classpath
 set CLASSPATH=%SPARK_CLASSPATH%;%MESOS_CLASSPATH%;%FWDIR%conf;%CORE_DIR%\target\scala-%SCALA_VERSION%\classes
@@ -42,6 +43,7 @@ set CLASSPATH=%CLASSPATH%;%REPL_DIR%\target\scala-%SCALA_VERSION%\classes;%EXAMP
 for /R "%FWDIR%\lib_managed\jars" %%j in (*.jar) do set CLASSPATH=!CLASSPATH!;%%j
 for /R "%FWDIR%\lib_managed\bundles" %%j in (*.jar) do set CLASSPATH=!CLASSPATH!;%%j
 for /R "%REPL_DIR%\lib" %%j in (*.jar) do set CLASSPATH=!CLASSPATH!;%%j
+for /R "%PYSPARK_DIR%\lib" %%j in (*.jar) do set CLASSPATH=!CLASSPATH!;%%j
 set CLASSPATH=%CLASSPATH%;%BAGEL_DIR%\target\scala-%SCALA_VERSION%\classes
 
 rem Figure out whether to run our class with java or with the scala launcher.
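A closing note on the `shell.py` hunk: optparse parses flags into `options` and leaves positional arguments in the `args` list, so the master must be read from `args` rather than `sys.argv` (testing `len(sys.argv) > 1` would misfire when only `-i` is passed, which is why the hunk reads the positional list instead). The parsing pattern in isolation, with illustrative values:

    import optparse

    parser = optparse.OptionParser(usage="usage: %prog [options] master")
    parser.add_option("-i", "--ipython", action="store_true", default=False,
                      help="Run IPython shell")
    (options, args) = parser.parse_args(["-i", "spark://host:7077"])

    master = args[0] if args else "local"  # positional master, default local
    print master, options.ipython          # -> spark://host:7077 True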