From ce9f1bbe20eff794cd1d588dc88f109d32588cfe Mon Sep 17 00:00:00 2001
From: Josh Rosen
Date: Tue, 1 Jan 2013 21:25:49 -0800
Subject: Add `pyspark` script to replace the other scripts.
 Expand the PySpark programming guide.

---
 docs/python-programming-guide.md | 49 ++++++++++++++++++++++++++++++++++++----
 docs/quick-start.md              |  4 ++--
 pyspark                          | 32 ++++++++++++++++++++++++++
 pyspark-shell                    |  3 ---
 python/pyspark/shell.py          | 36 ++++++++---------------------
 python/run-tests                 |  9 ++++++++
 run-pyspark                      | 28 -----------------------
 7 files changed, 97 insertions(+), 64 deletions(-)
 create mode 100755 pyspark
 delete mode 100755 pyspark-shell
 create mode 100755 python/run-tests
 delete mode 100755 run-pyspark

diff --git a/docs/python-programming-guide.md b/docs/python-programming-guide.md
index d88d4eb42d..d963551296 100644
--- a/docs/python-programming-guide.md
+++ b/docs/python-programming-guide.md
@@ -24,6 +24,35 @@ There are a few key differences between the Python and Scala APIs:
 - `sample`
 - `sort`
 
+In PySpark, RDDs support the same methods as their Scala counterparts but take Python functions and return Python collection types.
+Short functions can be passed to RDD methods using Python's [`lambda`](http://www.diveintopython.net/power_of_introspection/lambda_functions.html) syntax:
+
+{% highlight python %}
+logData = sc.textFile(logFile).cache()
+errors = logData.filter(lambda s: 'ERROR' in s.split())
+{% endhighlight %}
+
+You can also pass functions that are defined using the `def` keyword; this is useful for more complicated functions that cannot be expressed using `lambda`:
+
+{% highlight python %}
+def is_error(line):
+    return 'ERROR' in line.split()
+errors = logData.filter(is_error)
+{% endhighlight %}
+
+Functions can access objects in enclosing scopes, although modifications to those objects within RDD methods will not be propagated to other tasks:
+
+{% highlight python %}
+error_keywords = ["Exception", "Error"]
+def is_error(line):
+    words = line.split()
+    return any(keyword in words for keyword in error_keywords)
+errors = logData.filter(is_error)
+{% endhighlight %}
+
+PySpark will automatically ship these functions to workers, along with any objects that they reference.
+Instances of classes will be serialized and shipped to workers by PySpark, but classes themselves cannot be automatically distributed to workers.
+The [Standalone Use](#standalone-use) section describes how to ship code dependencies to workers.
 
 # Installing and Configuring PySpark
 
@@ -34,13 +63,14 @@ By default, PySpark's scripts will run programs using `python`; an alternate Pyt
 
 All of PySpark's library dependencies, including [Py4J](http://py4j.sourceforge.net/), are bundled with PySpark and automatically imported.
 
-Standalone PySpark jobs should be run using the `run-pyspark` script, which automatically configures the Java and Python environmnt using the settings in `conf/spark-env.sh`.
+Standalone PySpark jobs should be run using the `pyspark` script, which automatically configures the Java and Python environment using the settings in `conf/spark-env.sh`.
 The script automatically adds the `pyspark` package to the `PYTHONPATH`.
 
 
 # Interactive Use
 
-PySpark's `pyspark-shell` script provides a simple way to learn the API:
+The `pyspark` script launches a Python interpreter that is configured to run PySpark jobs.
+When run without any input files, `pyspark` launches a shell that can be used to explore data interactively, which is a simple way to learn the API:
 
 {% highlight python %}
 >>> words = sc.textFile("/usr/share/dict/words")
@@ -48,9 +78,18 @@ PySpark's `pyspark-shell` script provides a simple way to learn the API:
 [u'spar', u'sparable', u'sparada', u'sparadrap', u'sparagrass']
 {% endhighlight %}
 
+By default, the `pyspark` shell creates a SparkContext that runs jobs locally.
+To connect to a non-local cluster, set the `MASTER` environment variable.
+For example, to use the `pyspark` shell with a [standalone Spark cluster](spark-standalone.html):
+
+{% highlight shell %}
+$ MASTER=spark://IP:PORT ./pyspark
+{% endhighlight %}
+
+
 # Standalone Use
 
-PySpark can also be used from standalone Python scripts by creating a SparkContext in the script and running the script using the `run-pyspark` script in the `pyspark` directory.
+PySpark can also be used from standalone Python scripts by creating a SparkContext in your script and running the script using `pyspark`.
 The Quick Start guide includes a [complete example](quick-start.html#a-standalone-job-in-python) of a standalone Python job.
 
 Code dependencies can be deployed by listing them in the `pyFiles` option in the SparkContext constructor:
@@ -65,8 +104,8 @@ Code dependencies can be added to an existing SparkContext using its `addPyFile(
 
 # Where to Go from Here
 
-PySpark includes several sample programs using the Python API in `pyspark/examples`.
-You can run them by passing the files to the `pyspark-run` script included in PySpark -- for example `./pyspark-run examples/wordcount.py`.
+PySpark includes several sample programs using the Python API in `python/examples`.
+You can run them by passing the files to the `pyspark` script -- for example `./pyspark python/examples/wordcount.py`.
 Each example program prints usage help when run without any arguments.
 
 We currently provide [API documentation](api/pyspark/index.html) for the Python API as Epydoc.
diff --git a/docs/quick-start.md b/docs/quick-start.md
index 8c25df5486..2c7cfbed25 100644
--- a/docs/quick-start.md
+++ b/docs/quick-start.md
@@ -258,11 +258,11 @@ We can pass Python functions to Spark, which are automatically serialized along
 For jobs that use custom classes or third-party libraries, we can add those code dependencies to SparkContext to ensure that they will be available on remote machines; this is described in more detail in the [Python programming guide](python-programming-guide).
 `SimpleJob` is simple enough that we do not need to specify any code dependencies.
 
-We can run this job using the `run-pyspark` script in `$SPARK_HOME/pyspark`:
+We can run this job using the `pyspark` script:
 
 {% highlight python %}
 $ cd $SPARK_HOME
-$ ./pyspark/run-pyspark SimpleJob.py
+$ ./pyspark SimpleJob.py
 ...
 Lines with a: 8422, Lines with b: 1836
 {% endhighlight python %}
diff --git a/pyspark b/pyspark
new file mode 100755
index 0000000000..9e89d51ba2
--- /dev/null
+++ b/pyspark
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+
+# Figure out where the Scala framework is installed
+FWDIR="$(cd `dirname $0`; pwd)"
+
+# Export this as SPARK_HOME
+export SPARK_HOME="$FWDIR"
+
+# Load environment variables from conf/spark-env.sh, if it exists
+if [ -e $FWDIR/conf/spark-env.sh ] ; then
+  . $FWDIR/conf/spark-env.sh
+fi
+
+# Figure out which Python executable to use
+if [ -z "$PYSPARK_PYTHON" ] ; then
+  PYSPARK_PYTHON="python"
+fi
+export PYSPARK_PYTHON
+
+# Add the PySpark classes to the Python path:
+export PYTHONPATH=$SPARK_HOME/python/:$PYTHONPATH
+
+# Load the PySpark shell.py script when ./pyspark is used interactively:
+export OLD_PYTHONSTARTUP=$PYTHONSTARTUP
+export PYTHONSTARTUP=$FWDIR/python/pyspark/shell.py
+
+# Launch with `scala` by default:
+if [[ "$SPARK_LAUNCH_WITH_SCALA" != "0" ]] ; then
+  export SPARK_LAUNCH_WITH_SCALA=1
+fi
+
+exec "$PYSPARK_PYTHON" "$@"
diff --git a/pyspark-shell b/pyspark-shell
deleted file mode 100755
index 27aaac3a26..0000000000
--- a/pyspark-shell
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/usr/bin/env bash
-FWDIR="`dirname $0`"
-exec $FWDIR/run-pyspark $FWDIR/python/pyspark/shell.py "$@"
diff --git a/python/pyspark/shell.py b/python/pyspark/shell.py
index bd39b0283f..7e6ad3aa76 100644
--- a/python/pyspark/shell.py
+++ b/python/pyspark/shell.py
@@ -1,33 +1,17 @@
 """
 An interactive shell.
-"""
-import optparse  # I prefer argparse, but it's not included with Python < 2.7
-import code
-import sys
+This file is designed to be launched as a PYTHONSTARTUP script.
+"""
+import os
 
 from pyspark.context import SparkContext
 
 
-def main(master='local', ipython=False):
-    sc = SparkContext(master, 'PySparkShell')
-    user_ns = {'sc' : sc}
-    banner = "Spark context avaiable as sc."
-    if ipython:
-        import IPython
-        IPython.embed(user_ns=user_ns, banner2=banner)
-    else:
-        print banner
-        code.interact(local=user_ns)
-
+sc = SparkContext(os.environ.get("MASTER", "local"), "PySparkShell")
+print "Spark context available as sc."
 
-if __name__ == '__main__':
-    usage = "usage: %prog [options] master"
-    parser = optparse.OptionParser(usage=usage)
-    parser.add_option("-i", "--ipython", help="Run IPython shell",
-                      action="store_true")
-    (options, args) = parser.parse_args()
-    if len(sys.argv) > 1:
-        master = args[0]
-    else:
-        master = 'local'
-    main(master, options.ipython)
+# The ./pyspark script stores the old PYTHONSTARTUP value in OLD_PYTHONSTARTUP,
+# which allows us to execute the user's PYTHONSTARTUP file:
+_pythonstartup = os.environ.get('OLD_PYTHONSTARTUP')
+if _pythonstartup and os.path.isfile(_pythonstartup):
+    execfile(_pythonstartup)
diff --git a/python/run-tests b/python/run-tests
new file mode 100755
index 0000000000..da9e24cb1f
--- /dev/null
+++ b/python/run-tests
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+# Figure out where the Scala framework is installed
+FWDIR="$(cd `dirname $0`; cd ../; pwd)"
+
+$FWDIR/pyspark pyspark/rdd.py
+$FWDIR/pyspark -m doctest pyspark/broadcast.py
+
+# TODO: in the long-run, it would be nice to use a test runner like `nose`.
diff --git a/run-pyspark b/run-pyspark
deleted file mode 100755
index deb0d708b3..0000000000
--- a/run-pyspark
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env bash
-
-# Figure out where the Scala framework is installed
-FWDIR="$(cd `dirname $0`; pwd)"
-
-# Export this as SPARK_HOME
-export SPARK_HOME="$FWDIR"
-
-# Load environment variables from conf/spark-env.sh, if it exists
-if [ -e $FWDIR/conf/spark-env.sh ] ; then
-  . $FWDIR/conf/spark-env.sh
-fi
-
-# Figure out which Python executable to use
-if [ -z "$PYSPARK_PYTHON" ] ; then
-  PYSPARK_PYTHON="python"
-fi
-export PYSPARK_PYTHON
-
-# Add the PySpark classes to the Python path:
-export PYTHONPATH=$SPARK_HOME/python/:$PYTHONPATH
-
-# Launch with `scala` by default:
-if [[ "$SPARK_LAUNCH_WITH_SCALA" != "0" ]] ; then
-  export SPARK_LAUNCH_WITH_SCALA=1
-fi
-
-exec "$PYSPARK_PYTHON" "$@"
-- 
cgit v1.2.3
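For reference, a minimal standalone job of the kind the patched quick-start runs with `./pyspark SimpleJob.py` might look like the sketch below. It is not part of the patch: the file name `SimpleJob.py`, the `README.md` input path, and the commented-out `pyFiles` list are illustrative assumptions; only the SparkContext constructor, the `pyFiles` option, and the `./pyspark` invocation come from the patched documentation.

"""SimpleJob.py -- a hypothetical standalone PySpark job (illustrative only).

Run it from the Spark directory with:  ./pyspark SimpleJob.py
"""
from pyspark.context import SparkContext

logFile = "README.md"  # assumed input; any local text file works

# Create a SparkContext that runs jobs locally. Per the patched guide, code
# dependencies could be shipped to workers by passing, e.g.,
# pyFiles=["deps.zip"] to the constructor.
sc = SparkContext("local", "Simple job")

logData = sc.textFile(logFile).cache()
numAs = logData.filter(lambda s: 'a' in s).count()
numBs = logData.filter(lambda s: 'b' in s).count()
print "Lines with a: %i, Lines with b: %i" % (numAs, numBs)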