author     Josh Rosen <joshrosen@eecs.berkeley.edu>   2012-10-20 00:16:41 +0000
committer  Josh Rosen <joshrosen@eecs.berkeley.edu>   2012-10-20 00:22:27 +0000
commit     c23bf1aff4b9a1faf9d32c7b64acad2213f9515c (patch)
tree       85c71f77ef78714eb6a459874354c8ba11afdd2c /pyspark
parent     52989c8a2c8c10d7f5610c033f6782e58fd3abc2 (diff)
Add PySpark README and run scripts.
Diffstat (limited to 'pyspark')
-rw-r--r--   pyspark/README                          58
-rwxr-xr-x   pyspark/pyspark-shell                    3
-rw-r--r--   pyspark/pyspark/context.py               5
-rw-r--r--   pyspark/pyspark/examples/wordcount.py   17
-rw-r--r--   pyspark/pyspark/shell.py                21
-rwxr-xr-x   pyspark/run-pyspark                     23
6 files changed, 124 insertions, 3 deletions
diff --git a/pyspark/README b/pyspark/README
new file mode 100644
index 0000000000..63a1def141
--- /dev/null
+++ b/pyspark/README
@@ -0,0 +1,58 @@
+# PySpark
+
+PySpark is a Python API for Spark.
+
+PySpark jobs are written in Python and executed using a standard Python
+interpreter; this supports modules that use Python C extensions. The
+API is based on the Spark Scala API and uses regular Python functions
+and lambdas to support user-defined functions. PySpark supports
+interactive use through a standard Python interpreter; it can
+automatically serialize closures and ship them to worker processes.
+
+PySpark is built on top of the Spark Java API. Data is uniformly
+represented as serialized Python objects and stored in Spark Java
+processes, which communicate with PySpark worker processes over pipes.
+
+## Features
+
+PySpark supports most of the Spark API, including broadcast variables.
+RDDs are dynamically typed and can hold any Python object.
+
+PySpark does not support:
+
+- Special functions on RDDs of doubles
+- Accumulators
+
+## Examples and Documentation
+
+The PySpark source contains docstrings and doctests that document its
+API. The public classes are in `context.py` and `rdd.py`.
+
+The `pyspark/pyspark/examples` directory contains a few complete
+examples.
+
+## Installing PySpark
+
+PySpark requires a development version of Py4J, a Python library for
+interacting with Java processes. It can be installed from
+https://github.com/bartdag/py4j; make sure to install a version that
+contains at least the commits through 3dbf380d3d.
+
+PySpark uses the `PYTHONPATH` environment variable to search for Python
+classes; Py4J should be on this path, along with any libraries used by
+PySpark programs. `PYTHONPATH` will be automatically shipped to worker
+machines, but the files that it points to must be present on each
+machine.
+
+PySpark requires the Spark assembly JAR, which can be created by running
+`sbt/sbt assembly` in the Spark directory.
+
+Additionally, `SPARK_HOME` should be set to the location of the Spark
+package.
+
+## Running PySpark
+
+The easiest way to run PySpark is to use the `run-pyspark` and
+`pyspark-shell` scripts, which are included in the `pyspark` directory.
+These scripts automatically load the `conf/spark-env.sh` file, set
+`SPARK_HOME`, and add the `pyspark` package to the `PYTHONPATH`.
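To make the README's description of the API concrete, here is a minimal driver-side sketch exercising a dynamically typed RDD and a broadcast variable. It assumes the RDD operations mirror the Scala API (`parallelize`, `map`, `collect`) and that `SparkContext.broadcast` returns an object exposing a `value` attribute, as suggested by the broadcast bookkeeping in `context.py`; only `SparkContext(master, name)` and `collect` appear verbatim elsewhere in this commit.

    from pyspark.context import SparkContext

    sc = SparkContext("local", "BroadcastSketch")

    # RDDs are dynamically typed: one RDD can hold ints, strings, floats, tuples, ...
    mixed = sc.parallelize([1, "two", 3.0, ("four", 4)])

    # Ship a read-only lookup table to the workers once via a broadcast variable,
    # instead of embedding it in every serialized closure.
    lookup = sc.broadcast({"two": 2, ("four", 4): 4})

    def resolve(x):
        # Elements found in the broadcast table are replaced by their numeric value.
        return lookup.value.get(x, x)

    print mixed.map(resolve).collect()   # [1, 2, 3.0, 4]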
diff --git a/pyspark/pyspark-shell b/pyspark/pyspark-shell
new file mode 100755
index 0000000000..4ed3e6010c
--- /dev/null
+++ b/pyspark/pyspark-shell
@@ -0,0 +1,3 @@
+#!/bin/sh
+FWDIR="$(dirname "$0")"
+exec "$FWDIR/run-pyspark" "$FWDIR/pyspark/shell.py" "$@"
diff --git a/pyspark/pyspark/context.py b/pyspark/pyspark/context.py
index 3f4db26644..50d57e5317 100644
--- a/pyspark/pyspark/context.py
+++ b/pyspark/pyspark/context.py
@@ -18,14 +18,13 @@ class SparkContext(object):
asPickle = jvm.spark.api.python.PythonRDD.asPickle
arrayAsPickle = jvm.spark.api.python.PythonRDD.arrayAsPickle
- def __init__(self, master, name, defaultParallelism=None,
- pythonExec='python'):
+ def __init__(self, master, name, defaultParallelism=None):
self.master = master
self.name = name
self._jsc = self.jvm.JavaSparkContext(master, name)
self.defaultParallelism = \
defaultParallelism or self._jsc.sc().defaultParallelism()
- self.pythonExec = pythonExec
+ self.pythonExec = os.environ.get("PYSPARK_PYTHON_EXEC", 'python')
# Broadcast's __reduce__ method stores Broadcast instances here.
# This allows other code to determine which Broadcast instances have
# been pickled, so it can determine which Java broadcast objects to
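The change to `context.py` above moves the choice of worker interpreter from a constructor argument to the `PYSPARK_PYTHON_EXEC` environment variable, with `python` as the default. A minimal sketch of selecting a specific interpreter under this scheme; the `/usr/bin/pypy` path is only an illustrative placeholder:

    import os

    # Must be set before the SparkContext is constructed, since __init__ reads it.
    os.environ["PYSPARK_PYTHON_EXEC"] = "/usr/bin/pypy"   # placeholder interpreter path

    from pyspark.context import SparkContext

    sc = SparkContext("local", "CustomInterpreter")
    print sc.pythonExec   # prints /usr/bin/pypy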
diff --git a/pyspark/pyspark/examples/wordcount.py b/pyspark/pyspark/examples/wordcount.py
new file mode 100644
index 0000000000..8365c070e8
--- /dev/null
+++ b/pyspark/pyspark/examples/wordcount.py
@@ -0,0 +1,17 @@
+import sys
+from operator import add
+from pyspark.context import SparkContext
+
+if __name__ == "__main__":
+ if len(sys.argv) < 3:
+ print >> sys.stderr, \
+ "Usage: PythonWordCount <master> <file>"
+ exit(-1)
+ sc = SparkContext(sys.argv[1], "PythonWordCount")
+ lines = sc.textFile(sys.argv[2], 1)
+ counts = lines.flatMap(lambda x: x.split(' ')) \
+ .map(lambda x: (x, 1)) \
+ .reduceByKey(add)
+ output = counts.collect()
+ for (word, count) in output:
+ print "%s : %i" % (word, count)
diff --git a/pyspark/pyspark/shell.py b/pyspark/pyspark/shell.py
new file mode 100644
index 0000000000..7ef30894cb
--- /dev/null
+++ b/pyspark/pyspark/shell.py
@@ -0,0 +1,21 @@
+"""
+An interactive shell.
+"""
+import code
+import sys
+
+from pyspark.context import SparkContext
+
+
+def main(master='local'):
+ sc = SparkContext(master, 'PySparkShell')
+ print "Spark context available as sc."
+ code.interact(local={'sc': sc})
+
+
+if __name__ == '__main__':
+ if len(sys.argv) > 1:
+ master = sys.argv[1]
+ else:
+ master = 'local'
+ main(master)
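Once `shell.py` is launched through `./pyspark-shell [master]`, a `SparkContext` is already bound to `sc`, so an interactive session might look like the sketch below. It reuses only operations that appear in `examples/wordcount.py`; the `README` path is just an example input file assumed to exist in the working directory.

    # Typed at the prompt started by pyspark-shell; `sc` is predefined by shell.py.
    from operator import add

    lines = sc.textFile("README")
    counts = lines.flatMap(lambda x: x.split(' ')) \
                  .map(lambda x: (x, 1)) \
                  .reduceByKey(add)
    print counts.collect()[:5]   # peek at a few (word, count) pairs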
diff --git a/pyspark/run-pyspark b/pyspark/run-pyspark
new file mode 100755
index 0000000000..9c5e027962
--- /dev/null
+++ b/pyspark/run-pyspark
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# Figure out where the Scala framework is installed
+FWDIR="$(cd "$(dirname "$0")/.."; pwd)"
+
+# Export this as SPARK_HOME
+export SPARK_HOME="$FWDIR"
+
+# Load environment variables from conf/spark-env.sh, if it exists
+if [ -e "$FWDIR/conf/spark-env.sh" ] ; then
+  . "$FWDIR/conf/spark-env.sh"
+fi
+
+# Figure out which Python executable to use
+if [ -z "$PYSPARK_PYTHON" ] ; then
+ PYSPARK_PYTHON="python"
+fi
+export PYSPARK_PYTHON
+
+# Add the PySpark classes to the Python path:
+export PYTHONPATH="$SPARK_HOME/pyspark/:$PYTHONPATH"
+
+exec "$PYSPARK_PYTHON" "$@"