author     Josh Rosen <joshrosen@eecs.berkeley.edu>   2012-10-20 00:16:41 +0000
committer  Josh Rosen <joshrosen@eecs.berkeley.edu>   2012-10-20 00:22:27 +0000
commit     c23bf1aff4b9a1faf9d32c7b64acad2213f9515c (patch)
tree       85c71f77ef78714eb6a459874354c8ba11afdd2c /pyspark
parent     52989c8a2c8c10d7f5610c033f6782e58fd3abc2 (diff)
Add PySpark README and run scripts.
Diffstat (limited to 'pyspark')
-rw-r--r--   pyspark/README                          58
-rwxr-xr-x   pyspark/pyspark-shell                    3
-rw-r--r--   pyspark/pyspark/context.py               5
-rw-r--r--   pyspark/pyspark/examples/wordcount.py   17
-rw-r--r--   pyspark/pyspark/shell.py                21
-rwxr-xr-x   pyspark/run-pyspark                     23
6 files changed, 124 insertions, 3 deletions
diff --git a/pyspark/README b/pyspark/README
new file mode 100644
index 0000000000..63a1def141
--- /dev/null
+++ b/pyspark/README
@@ -0,0 +1,58 @@
+# PySpark
+
+PySpark is a Python API for Spark.
+
+PySpark jobs are written in Python and executed using a standard Python
+interpreter; this supports modules that use Python C extensions. The
+API is based on the Spark Scala API and uses regular Python functions
+and lambdas to support user-defined functions. PySpark supports
+interactive use through a standard Python interpreter; it can
+automatically serialize closures and ship them to worker processes.
+
+PySpark is built on top of the Spark Java API. Data is uniformly
+represented as serialized Python objects and stored in Spark Java
+processes, which communicate with PySpark worker processes over pipes.
+
+## Features
+
+PySpark supports most of the Spark API, including broadcast variables.
+RDDs are dynamically typed and can hold any Python object.
+
+PySpark does not support:
+
+- Special functions on RDDs of doubles
+- Accumulators
+
+## Examples and Documentation
+
+The PySpark source contains docstrings and doctests that document its
+API. The public classes are in `context.py` and `rdd.py`.
+
+The `pyspark/pyspark/examples` directory contains a few complete
+examples.
+
+## Installing PySpark
+
+PySpark requires a development version of Py4J, a Python library for
+interacting with Java processes. It can be installed from
+https://github.com/bartdag/py4j; make sure to install a version that
+contains at least the commits through 3dbf380d3d.
+
+PySpark uses the `PYTHONPATH` environment variable to search for Python
+classes; Py4J should be on this path, along with any libraries used by
+PySpark programs. `PYTHONPATH` will be automatically shipped to worker
+machines, but the files that it points to must be present on each
+machine.
+
+PySpark requires the Spark assembly JAR, which can be created by running
+`sbt/sbt assembly` in the Spark directory.
+
+Additionally, `SPARK_HOME` should be set to the location of the Spark
+package.
+
+## Running PySpark
+
+The easiest way to run PySpark is to use the `run-pyspark` and
+`pyspark-shell` scripts, which are included in the `pyspark` directory.
+These scripts automatically load the `conf/spark-env.sh` file, set
+`SPARK_HOME`, and add the `pyspark` package to the `PYTHONPATH`.
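To make the README's description of the API concrete, here is a minimal driver-side sketch exercising a dynamically typed RDD and a broadcast variable. It assumes the RDD operations mirror the Scala API (`parallelize`, `map`, `collect`) and that `SparkContext.broadcast` returns an object exposing a `value` attribute, as suggested by the broadcast bookkeeping in `context.py`; only `SparkContext(master, name)` and `collect` appear verbatim elsewhere in this commit.

    from pyspark.context import SparkContext

    sc = SparkContext("local", "BroadcastSketch")

    # RDDs are dynamically typed: one RDD can hold ints, strings, floats, tuples, ...
    mixed = sc.parallelize([1, "two", 3.0, ("four", 4)])

    # Ship a read-only lookup table to the workers once via a broadcast variable,
    # instead of embedding it in every serialized closure.
    lookup = sc.broadcast({"two": 2, ("four", 4): 4})

    def resolve(x):
        # Elements found in the broadcast table are replaced by their numeric value.
        return lookup.value.get(x, x)

    print mixed.map(resolve).collect()   # [1, 2, 3.0, 4]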
diff --git a/pyspark/pyspark-shell b/pyspark/pyspark-shell
new file mode 100755
index 0000000000..4ed3e6010c
--- /dev/null
+++ b/pyspark/pyspark-shell
@@ -0,0 +1,3 @@
+#!/bin/sh
+FWDIR="$(dirname "$0")"
+exec "$FWDIR/run-pyspark" "$FWDIR/pyspark/shell.py" "$@"
diff --git a/pyspark/pyspark/context.py b/pyspark/pyspark/context.py
index 3f4db26644..50d57e5317 100644
--- a/pyspark/pyspark/context.py
+++ b/pyspark/pyspark/context.py
@@ -18,14 +18,13 @@ class SparkContext(object):
asPickle = jvm.spark.api.python.PythonRDD.asPickle
arrayAsPickle = jvm.spark.api.python.PythonRDD.arrayAsPickle
- def __init__(self, master, name, defaultParallelism=None,
- pythonExec='python'):
+ def __init__(self, master, name, defaultParallelism=None):
self.master = master
self.name = name
self._jsc = self.jvm.JavaSparkContext(master, name)
self.defaultParallelism = \
defaultParallelism or self._jsc.sc().defaultParallelism()
- self.pythonExec = pythonExec
+ self.pythonExec = os.environ.get("PYSPARK_PYTHON_EXEC", 'python')
# Broadcast's __reduce__ method stores Broadcast instances here.
# This allows other code to determine which Broadcast instances have
# been pickled, so it can determine which Java broadcast objects to
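The change to `context.py` above moves the choice of worker interpreter from a constructor argument to the `PYSPARK_PYTHON_EXEC` environment variable, with `python` as the default. A minimal sketch of selecting a specific interpreter under this scheme; the `/usr/bin/pypy` path is only an illustrative placeholder:

    import os

    # Must be set before the SparkContext is constructed, since __init__ reads it.
    os.environ["PYSPARK_PYTHON_EXEC"] = "/usr/bin/pypy"   # placeholder interpreter path

    from pyspark.context import SparkContext

    sc = SparkContext("local", "CustomInterpreter")
    print sc.pythonExec   # prints /usr/bin/pypy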
diff --git a/pyspark/pyspark/examples/wordcount.py b/pyspark/pyspark/examples/wordcount.py
new file mode 100644
index 0000000000..8365c070e8
--- /dev/null
+++ b/pyspark/pyspark/examples/wordcount.py
@@ -0,0 +1,17 @@
+import sys
+from operator import add
+from pyspark.context import SparkContext
+
+if __name__ == "__main__":
+ if len(sys.argv) < 3:
+ print >> sys.stderr, \
+ "Usage: PythonWordCount <master> <file>"
+ exit(-1)
+ sc = SparkContext(sys.argv[1], "PythonWordCount")
+ lines = sc.textFile(sys.argv[2], 1)
+ counts = lines.flatMap(lambda x: x.split(' ')) \
+ .map(lambda x: (x, 1)) \
+ .reduceByKey(add)
+ output = counts.collect()
+ for (word, count) in output:
+ print "%s : %i" % (word, count)
diff --git a/pyspark/pyspark/shell.py b/pyspark/pyspark/shell.py
new file mode 100644
index 0000000000..7ef30894cb
--- /dev/null
+++ b/pyspark/pyspark/shell.py
@@ -0,0 +1,21 @@
+"""
+An interactive shell.
+"""
+import code
+import sys
+
+from pyspark.context import SparkContext
+
+
+def main(master='local'):
+ sc = SparkContext(master, 'PySparkShell')
+ print "Spark context available as sc."
+ code.interact(local={'sc': sc})
+
+
+if __name__ == '__main__':
+ if len(sys.argv) > 1:
+ master = sys.argv[1]
+ else:
+ master = 'local'
+ main(master)
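Once `shell.py` is launched through `./pyspark-shell [master]`, a `SparkContext` is already bound to `sc`, so an interactive session might look like the sketch below. It reuses only operations that appear in `examples/wordcount.py`; the `README` path is just an example input file assumed to exist in the working directory.

    # Typed at the prompt started by pyspark-shell; `sc` is predefined by shell.py.
    from operator import add

    lines = sc.textFile("README")
    counts = lines.flatMap(lambda x: x.split(' ')) \
                  .map(lambda x: (x, 1)) \
                  .reduceByKey(add)
    print counts.collect()[:5]   # peek at a few (word, count) pairs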
diff --git a/pyspark/run-pyspark b/pyspark/run-pyspark
new file mode 100755
index 0000000000..9c5e027962
--- /dev/null
+++ b/pyspark/run-pyspark
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# Figure out where the Scala framework is installed
+FWDIR="$(cd "$(dirname "$0")/.."; pwd)"
+
+# Export this as SPARK_HOME
+export SPARK_HOME="$FWDIR"
+
+# Load environment variables from conf/spark-env.sh, if it exists
+if [ -e "$FWDIR/conf/spark-env.sh" ] ; then
+  . "$FWDIR/conf/spark-env.sh"
+fi
+
+# Figure out which Python executable to use
+if [ -z "$PYSPARK_PYTHON" ] ; then
+ PYSPARK_PYTHON="python"
+fi
+export PYSPARK_PYTHON
+
+# Add the PySpark classes to the Python path:
+export PYTHONPATH="$SPARK_HOME/pyspark/:$PYTHONPATH"
+
+exec "$PYSPARK_PYTHON" "$@"