author    Josh Rosen <joshrosen@eecs.berkeley.edu>    2012-12-27 22:47:37 -0800
committer Josh Rosen <joshrosen@eecs.berkeley.edu>    2012-12-27 22:47:37 -0800
commit    665466dfff4f89196627a0777eabd3d3894cd296 (patch)
tree      7fa580209756c5fdbb0a52930f30959bbbbc2ba3
parent    ac32447cd38beac8f6bc7a90be9fd24666bb46ad (diff)
Simplify PySpark installation.
- Bundle the Py4J binaries, since Py4J is hard to install.
- Use Spark's `run` script to launch the Py4J gateway, inheriting the settings in spark-env.sh.

With these changes, (hopefully) nothing more than running `sbt/sbt package` will be necessary to run PySpark.
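For context, the handshake the new launcher relies on is small enough to sketch. This is a minimal standalone version (assuming `SPARK_HOME` points at a built checkout; the `run` script path and the `GatewayServer` flags match the java_gateway.py diff below):

    import os
    from subprocess import Popen, PIPE

    # Start the JVM through Spark's `run` script. Port argument 0 tells
    # py4j.GatewayServer to bind an ephemeral port and print it on stdout;
    # --die-on-broken-pipe makes the JVM exit when this process does.
    command = [os.path.join(os.environ["SPARK_HOME"], "run"),
               "py4j.GatewayServer", "--die-on-broken-pipe", "0"]
    proc = Popen(command, stdout=PIPE, stdin=PIPE)
    port = int(proc.stdout.readline())  # first line printed is the port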
-rw-r--r--  pyspark/README                   23
-rw-r--r--  pyspark/lib/PY4J_LICENSE.txt     27
-rw-r--r--  pyspark/lib/PY4J_VERSION.txt      1
-rw-r--r--  pyspark/lib/py4j0.7.egg          bin 0 -> 191756 bytes
-rw-r--r--  pyspark/lib/py4j0.7.jar          bin 0 -> 103286 bytes
-rwxr-xr-x  pyspark/pyspark-shell             2
-rw-r--r--  pyspark/pyspark/__init__.py       3
-rw-r--r--  pyspark/pyspark/java_gateway.py  35
-rw-r--r--  pyspark/pyspark/shell.py         19
-rw-r--r--  pyspark/requirements.txt          7
-rwxr-xr-x  pyspark/run-pyspark               2
-rwxr-xr-x  run                               4
-rw-r--r--  run2.cmd                          2

13 files changed, 78 insertions, 47 deletions
diff --git a/pyspark/README b/pyspark/README
index 461176de7d..d8d521c72c 100644
--- a/pyspark/README
+++ b/pyspark/README
@@ -32,30 +32,11 @@ The `pyspark/pyspark/examples` directory contains a few complete
examples.
## Installing PySpark
-
-PySpark requires a development version of Py4J, a Python library for
-interacting with Java processes. It can be installed from
-https://github.com/bartdag/py4j; make sure to install a version that
-contains at least the commits through b7924aabe9.
-
-PySpark requires the `argparse` module, which is included in Python 2.7
-and is available for Python 2.6 through `pip` or `easy_install`.
-
-PySpark uses the `PYTHONPATH` environment variable to search for Python
-classes; Py4J should be on this path, along with any libraries used by
-PySpark programs. `PYTHONPATH` will be automatically shipped to worker
-machines, but the files that it points to must be present on each
-machine.
-
-PySpark requires the Spark assembly JAR, which can be created by running
-`sbt/sbt assembly` in the Spark directory.
-
-Additionally, `SPARK_HOME` should be set to the location of the Spark
+#
+To use PySpark, `SPARK_HOME` should be set to the location of the Spark
package.
## Running PySpark
The easiest way to run PySpark is to use the `run-pyspark` and
`pyspark-shell` scripts, which are included in the `pyspark` directory.
-These scripts automatically load the `spark-conf.sh` file, set
-`SPARK_HOME`, and add the `pyspark` package to the `PYTHONPATH`.
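Since `SPARK_HOME` is now the only required setting, a quick sanity check for a configured environment (a sketch, not part of the commit):

    import os

    spark_home = os.environ.get("SPARK_HOME")
    assert spark_home, "SPARK_HOME must point at the Spark package"
    # The Py4J egg is bundled with the tree, so no separate install is needed:
    print os.path.exists(os.path.join(spark_home, "pyspark/lib/py4j0.7.egg"))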
diff --git a/pyspark/lib/PY4J_LICENSE.txt b/pyspark/lib/PY4J_LICENSE.txt
new file mode 100644
index 0000000000..a70279ca14
--- /dev/null
+++ b/pyspark/lib/PY4J_LICENSE.txt
@@ -0,0 +1,27 @@
+
+Copyright (c) 2009-2011, Barthelemy Dagenais All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+- The name of the author may not be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/pyspark/lib/PY4J_VERSION.txt b/pyspark/lib/PY4J_VERSION.txt
new file mode 100644
index 0000000000..04a0cd52a8
--- /dev/null
+++ b/pyspark/lib/PY4J_VERSION.txt
@@ -0,0 +1 @@
+b7924aabe9c5e63f0a4d8bbd17019534c7ec014e
diff --git a/pyspark/lib/py4j0.7.egg b/pyspark/lib/py4j0.7.egg
new file mode 100644
index 0000000000..f8a339d8ee
--- /dev/null
+++ b/pyspark/lib/py4j0.7.egg
Binary files differ
diff --git a/pyspark/lib/py4j0.7.jar b/pyspark/lib/py4j0.7.jar
new file mode 100644
index 0000000000..73b7ddb7d1
--- /dev/null
+++ b/pyspark/lib/py4j0.7.jar
Binary files differ
diff --git a/pyspark/pyspark-shell b/pyspark/pyspark-shell
index 4ed3e6010c..e3736826e8 100755
--- a/pyspark/pyspark-shell
+++ b/pyspark/pyspark-shell
@@ -1,3 +1,3 @@
-#!/bin/sh
+#!/usr/bin/env bash
FWDIR="`dirname $0`"
exec $FWDIR/run-pyspark $FWDIR/pyspark/shell.py "$@"
diff --git a/pyspark/pyspark/__init__.py b/pyspark/pyspark/__init__.py
index e69de29bb2..549c2d2711 100644
--- a/pyspark/pyspark/__init__.py
+++ b/pyspark/pyspark/__init__.py
@@ -0,0 +1,3 @@
+import sys
+import os
+sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], "pyspark/lib/py4j0.7.egg"))
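The effect of the path insertion above is that importing the pyspark package makes the bundled Py4J importable too; a hedged illustration:

    import os
    import sys

    # Mirrors pyspark/__init__.py: put the bundled egg ahead of any
    # system-wide Py4J so the pinned version wins.
    sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"],
                                    "pyspark/lib/py4j0.7.egg"))
    from py4j.java_gateway import JavaGateway  # resolved from the egg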
diff --git a/pyspark/pyspark/java_gateway.py b/pyspark/pyspark/java_gateway.py
index 3726bcbf17..d4a4434c05 100644
--- a/pyspark/pyspark/java_gateway.py
+++ b/pyspark/pyspark/java_gateway.py
@@ -1,19 +1,36 @@
-import glob
import os
-from py4j.java_gateway import java_import, JavaGateway
+from subprocess import Popen, PIPE
+from threading import Thread
+from py4j.java_gateway import java_import, JavaGateway, GatewayClient
SPARK_HOME = os.environ["SPARK_HOME"]
-assembly_jar = glob.glob(os.path.join(SPARK_HOME, "core/target") + \
- "/spark-core-assembly-*.jar")[0]
- # TODO: what if multiple assembly jars are found?
-
-
def launch_gateway():
- gateway = JavaGateway.launch_gateway(classpath=assembly_jar,
- javaopts=["-Xmx256m"], die_on_exit=True)
+ # Launch the Py4j gateway using Spark's run command so that we pick up the
+ # proper classpath and SPARK_MEM settings from spark-env.sh
+ command = [os.path.join(SPARK_HOME, "run"), "py4j.GatewayServer",
+ "--die-on-broken-pipe", "0"]
+ proc = Popen(command, stdout=PIPE, stdin=PIPE)
+ # Determine which ephemeral port the server started on:
+ port = int(proc.stdout.readline())
+ # Create a thread to echo output from the GatewayServer, which is required
+ # for Java log output to show up:
+ class EchoOutputThread(Thread):
+ def __init__(self, stream):
+ Thread.__init__(self)
+ self.daemon = True
+ self.stream = stream
+
+ def run(self):
+ while True:
+ line = self.stream.readline()
+ print line,
+ EchoOutputThread(proc.stdout).start()
+ # Connect to the gateway
+ gateway = JavaGateway(GatewayClient(port=port))
+ # Import the classes used by PySpark
java_import(gateway.jvm, "spark.api.java.*")
java_import(gateway.jvm, "spark.api.python.*")
java_import(gateway.jvm, "scala.Tuple2")
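A sketch of how a caller would consume launch_gateway() (the JavaSparkContext constructor arguments are illustrative; the java_import calls above are what make the class reachable directly on gateway.jvm):

    from pyspark.java_gateway import launch_gateway

    gateway = launch_gateway()  # spawns the JVM via Spark's `run` script
    # spark.api.java.* was java_import-ed, so the Java API is visible here:
    jsc = gateway.jvm.JavaSparkContext("local", "example")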
diff --git a/pyspark/pyspark/shell.py b/pyspark/pyspark/shell.py
index 7012884abc..bd39b0283f 100644
--- a/pyspark/pyspark/shell.py
+++ b/pyspark/pyspark/shell.py
@@ -1,7 +1,7 @@
"""
An interactive shell.
"""
-import argparse # argparse is available for Python < 2.7 through easy_install.
+import optparse # I prefer argparse, but it's not included with Python < 2.7
import code
import sys
@@ -21,10 +21,13 @@ def main(master='local', ipython=False):
if __name__ == '__main__':
- parser = argparse.ArgumentParser()
- parser.add_argument("master", help="Spark master host (default='local')",
- nargs='?', type=str, default="local")
- parser.add_argument("-i", "--ipython", help="Run IPython shell",
- action="store_true")
- args = parser.parse_args()
- main(args.master, args.ipython)
+ usage = "usage: %prog [options] master"
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("-i", "--ipython", help="Run IPython shell",
+ action="store_true")
+ (options, args) = parser.parse_args()
+ if len(sys.argv) > 1:
+ master = args[0]
+ else:
+ master = 'local'
+ main(master, options.ipython)
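One edge case in the new option handling worth noting: `len(sys.argv) > 1` is also true when the only argument is the `-i` flag, in which case `args` is empty and `args[0]` would raise IndexError. A more defensive variant (a sketch, not what the commit does) tests the positional list itself:

    (options, args) = parser.parse_args()
    # optparse strips flags out of args, so this is safe with a bare -i:
    master = args[0] if args else 'local'
    main(master, options.ipython)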
diff --git a/pyspark/requirements.txt b/pyspark/requirements.txt
deleted file mode 100644
index 2464ca0074..0000000000
--- a/pyspark/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-# The Python API relies on some new features from the Py4J development branch.
-# pip can't install Py4J from git because the setup.py file for the Python
-# package is not at the root of the git repository. It may be possible to
-# install Py4J from git once https://github.com/pypa/pip/pull/526 is merged.
-
-# git+git://github.com/bartdag/py4j.git@b7924aabe9c5e63f0a4d8bbd17019534c7ec014e
-argparse
diff --git a/pyspark/run-pyspark b/pyspark/run-pyspark
index 9c5e027962..f8039b8038 100755
--- a/pyspark/run-pyspark
+++ b/pyspark/run-pyspark
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
# Figure out where the Scala framework is installed
FWDIR="$(cd `dirname $0`; cd ../; pwd)"
diff --git a/run b/run
index 15db23bbe0..8fa61b086f 100755
--- a/run
+++ b/run
@@ -40,6 +40,7 @@ CORE_DIR="$FWDIR/core"
REPL_DIR="$FWDIR/repl"
EXAMPLES_DIR="$FWDIR/examples"
BAGEL_DIR="$FWDIR/bagel"
+PYSPARK_DIR="$FWDIR/pyspark"
# Build up classpath
CLASSPATH="$SPARK_CLASSPATH"
@@ -61,6 +62,9 @@ for jar in `find $REPL_DIR/lib -name '*jar'`; do
CLASSPATH+=":$jar"
done
CLASSPATH+=":$BAGEL_DIR/target/scala-$SCALA_VERSION/classes"
+for jar in `find $PYSPARK_DIR/lib -name '*jar'`; do
+ CLASSPATH+=":$jar"
+done
export CLASSPATH # Needed for spark-shell
# Figure out whether to run our class with java or with the scala launcher.
diff --git a/run2.cmd b/run2.cmd
index 097718b526..6024740726 100644
--- a/run2.cmd
+++ b/run2.cmd
@@ -34,6 +34,7 @@ set CORE_DIR=%FWDIR%core
set REPL_DIR=%FWDIR%repl
set EXAMPLES_DIR=%FWDIR%examples
set BAGEL_DIR=%FWDIR%bagel
+set PYSPARK_DIR=%FWDIR%pyspark
rem Build up classpath
set CLASSPATH=%SPARK_CLASSPATH%;%MESOS_CLASSPATH%;%FWDIR%conf;%CORE_DIR%\target\scala-%SCALA_VERSION%\classes
@@ -42,6 +43,7 @@ set CLASSPATH=%CLASSPATH%;%REPL_DIR%\target\scala-%SCALA_VERSION%\classes;%EXAMP
for /R "%FWDIR%\lib_managed\jars" %%j in (*.jar) do set CLASSPATH=!CLASSPATH!;%%j
for /R "%FWDIR%\lib_managed\bundles" %%j in (*.jar) do set CLASSPATH=!CLASSPATH!;%%j
for /R "%REPL_DIR%\lib" %%j in (*.jar) do set CLASSPATH=!CLASSPATH!;%%j
+for /R "%PYSPARK_DIR%\lib" %%j in (*.jar) do set CLASSPATH=!CLASSPATH!;%%j
set CLASSPATH=%CLASSPATH%;%BAGEL_DIR%\target\scala-%SCALA_VERSION%\classes
rem Figure out whether to run our class with java or with the scala launcher.