From c2b105af34f7241ac0597d9c35fbf66633a3eaf6 Mon Sep 17 00:00:00 2001
From: Josh Rosen
Date: Fri, 28 Dec 2012 22:51:28 -0800
Subject: Add documentation for Python API.

---
 pyspark/README                        | 42 ------------------------------
 pyspark/examples/kmeans.py            | 49 +++++++++++++++++++++++++++++++++++
 pyspark/examples/pi.py                | 20 ++++++++++++++
 pyspark/examples/tc.py                | 49 +++++++++++++++++++++++++++++++++++
 pyspark/examples/wordcount.py         | 17 ++++++++++++
 pyspark/pyspark/__init__.py           |  6 +++++
 pyspark/pyspark/examples/__init__.py  |  0
 pyspark/pyspark/examples/kmeans.py    | 49 -----------------------------------
 pyspark/pyspark/examples/pi.py        | 20 --------------
 pyspark/pyspark/examples/tc.py        | 49 -----------------------------------
 pyspark/pyspark/examples/wordcount.py | 17 ------------
 11 files changed, 141 insertions(+), 177 deletions(-)
 delete mode 100644 pyspark/README
 create mode 100644 pyspark/examples/kmeans.py
 create mode 100644 pyspark/examples/pi.py
 create mode 100644 pyspark/examples/tc.py
 create mode 100644 pyspark/examples/wordcount.py
 delete mode 100644 pyspark/pyspark/examples/__init__.py
 delete mode 100644 pyspark/pyspark/examples/kmeans.py
 delete mode 100644 pyspark/pyspark/examples/pi.py
 delete mode 100644 pyspark/pyspark/examples/tc.py
 delete mode 100644 pyspark/pyspark/examples/wordcount.py

diff --git a/pyspark/README b/pyspark/README
deleted file mode 100644
index d8d521c72c..0000000000
--- a/pyspark/README
+++ /dev/null
@@ -1,42 +0,0 @@
-# PySpark
-
-PySpark is a Python API for Spark.
-
-PySpark jobs are writen in Python and executed using a standard Python
-interpreter; this supports modules that use Python C extensions. The
-API is based on the Spark Scala API and uses regular Python functions
-and lambdas to support user-defined functions. PySpark supports
-interactive use through a standard Python interpreter; it can
-automatically serialize closures and ship them to worker processes.
-
-PySpark is built on top of the Spark Java API. Data is uniformly
-represented as serialized Python objects and stored in Spark Java
-processes, which communicate with PySpark worker processes over pipes.
-
-## Features
-
-PySpark supports most of the Spark API, including broadcast variables.
-RDDs are dynamically typed and can hold any Python object.
-
-PySpark does not support:
-
-- Special functions on RDDs of doubles
-- Accumulators
-
-## Examples and Documentation
-
-The PySpark source contains docstrings and doctests that document its
-API. The public classes are in `context.py` and `rdd.py`.
-
-The `pyspark/pyspark/examples` directory contains a few complete
-examples.
-
-## Installing PySpark
-#
-To use PySpark, `SPARK_HOME` should be set to the location of the Spark
-package.
-
-## Running PySpark
-
-The easiest way to run PySpark is to use the `run-pyspark` and
-`pyspark-shell` scripts, which are included in the `pyspark` directory.
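As an aside (not part of the patch itself), the workflow the deleted README describes can be pictured with a few lines of code. The sketch below is illustrative only: it assumes a `local` master, the `SparkContext(master, jobName)` constructor used by the examples in this commit, and that `sc.broadcast()` mirrors the Scala broadcast-variable API by exposing a `.value` attribute.

    # Minimal sketch against the 2012-era PySpark API described in the README.
    # Assumes PySpark is importable and SPARK_HOME is set.
    from pyspark.context import SparkContext

    sc = SparkContext("local", "ReadmeSketch")

    # RDDs are dynamically typed and can hold any Python object.
    doubled = sc.parallelize([1, 2, 3, 4]).map(lambda x: x * 2).collect()
    print doubled                                    # [2, 4, 6, 8]

    # Broadcast variables are supported; the closure below is serialized and
    # shipped to worker processes along with a handle to the broadcast value.
    lookup = sc.broadcast({1: "a", 2: "b"})
    print sc.parallelize([1, 2]).map(lambda k: lookup.value[k]).collect()   # ['a', 'b']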
diff --git a/pyspark/examples/kmeans.py b/pyspark/examples/kmeans.py
new file mode 100644
index 0000000000..9cc366f03c
--- /dev/null
+++ b/pyspark/examples/kmeans.py
@@ -0,0 +1,49 @@
+import sys
+
+from pyspark.context import SparkContext
+from numpy import array, sum as np_sum
+
+
+def parseVector(line):
+    return array([float(x) for x in line.split(' ')])
+
+
+def closestPoint(p, centers):
+    bestIndex = 0
+    closest = float("+inf")
+    for i in range(len(centers)):
+        tempDist = np_sum((p - centers[i]) ** 2)
+        if tempDist < closest:
+            closest = tempDist
+            bestIndex = i
+    return bestIndex
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 5:
+        print >> sys.stderr, \
+            "Usage: PythonKMeans <master> <file> <k> <convergeDist>"
+        exit(-1)
+    sc = SparkContext(sys.argv[1], "PythonKMeans")
+    lines = sc.textFile(sys.argv[2])
+    data = lines.map(parseVector).cache()
+    K = int(sys.argv[3])
+    convergeDist = float(sys.argv[4])
+
+    kPoints = data.takeSample(False, K, 34)
+    tempDist = 1.0
+
+    while tempDist > convergeDist:
+        closest = data.map(
+            lambda p : (closestPoint(p, kPoints), (p, 1)))
+        pointStats = closest.reduceByKey(
+            lambda (x1, y1), (x2, y2): (x1 + x2, y1 + y2))
+        newPoints = pointStats.map(
+            lambda (x, (y, z)): (x, y / z)).collect()
+
+        tempDist = sum(np_sum((kPoints[x] - y) ** 2) for (x, y) in newPoints)
+
+        for (x, y) in newPoints:
+            kPoints[x] = y
+
+    print "Final centers: " + str(kPoints)
diff --git a/pyspark/examples/pi.py b/pyspark/examples/pi.py
new file mode 100644
index 0000000000..348bbc5dce
--- /dev/null
+++ b/pyspark/examples/pi.py
@@ -0,0 +1,20 @@
+import sys
+from random import random
+from operator import add
+from pyspark.context import SparkContext
+
+
+if __name__ == "__main__":
+    if len(sys.argv) == 1:
+        print >> sys.stderr, \
+            "Usage: PythonPi <master> [<slices>]"
+        exit(-1)
+    sc = SparkContext(sys.argv[1], "PythonPi")
+    slices = int(sys.argv[2]) if len(sys.argv) > 2 else 2
+    n = 100000 * slices
+    def f(_):
+        x = random() * 2 - 1
+        y = random() * 2 - 1
+        return 1 if x ** 2 + y ** 2 < 1 else 0
+    count = sc.parallelize(xrange(1, n+1), slices).map(f).reduce(add)
+    print "Pi is roughly %f" % (4.0 * count / n)
diff --git a/pyspark/examples/tc.py b/pyspark/examples/tc.py
new file mode 100644
index 0000000000..9630e72b47
--- /dev/null
+++ b/pyspark/examples/tc.py
@@ -0,0 +1,49 @@
+import sys
+from random import Random
+from pyspark.context import SparkContext
+
+numEdges = 200
+numVertices = 100
+rand = Random(42)
+
+
+def generateGraph():
+    edges = set()
+    while len(edges) < numEdges:
+        src = rand.randrange(0, numEdges)
+        dst = rand.randrange(0, numEdges)
+        if src != dst:
+            edges.add((src, dst))
+    return edges
+
+
+if __name__ == "__main__":
+    if len(sys.argv) == 1:
+        print >> sys.stderr, \
+            "Usage: PythonTC <master> [<slices>]"
+        exit(-1)
+    sc = SparkContext(sys.argv[1], "PythonTC")
+    slices = sys.argv[2] if len(sys.argv) > 2 else 2
+    tc = sc.parallelize(generateGraph(), slices).cache()
+
+    # Linear transitive closure: each round grows paths by one edge,
+    # by joining the graph's edges with the already-discovered paths.
+    # e.g. join the path (y, z) from the TC with the edge (x, y) from
+    # the graph to obtain the path (x, z).
+
+    # Because join() joins on keys, the edges are stored in reversed order.
+    edges = tc.map(lambda (x, y): (y, x))
+
+    oldCount = 0L
+    nextCount = tc.count()
+    while True:
+        oldCount = nextCount
+        # Perform the join, obtaining an RDD of (y, (z, x)) pairs,
+        # then project the result to obtain the new (x, z) paths.
+        new_edges = tc.join(edges).map(lambda (_, (a, b)): (b, a))
+        tc = tc.union(new_edges).distinct().cache()
+        nextCount = tc.count()
+        if nextCount == oldCount:
+            break
+
+    print "TC has %i edges" % tc.count()
diff --git a/pyspark/examples/wordcount.py b/pyspark/examples/wordcount.py
new file mode 100644
index 0000000000..8365c070e8
--- /dev/null
+++ b/pyspark/examples/wordcount.py
@@ -0,0 +1,17 @@
+import sys
+from operator import add
+from pyspark.context import SparkContext
+
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        print >> sys.stderr, \
+            "Usage: PythonWordCount <master> <file>"
+        exit(-1)
+    sc = SparkContext(sys.argv[1], "PythonWordCount")
+    lines = sc.textFile(sys.argv[2], 1)
+    counts = lines.flatMap(lambda x: x.split(' ')) \
+                  .map(lambda x: (x, 1)) \
+                  .reduceByKey(add)
+    output = counts.collect()
+    for (word, count) in output:
+        print "%s : %i" % (word, count)
diff --git a/pyspark/pyspark/__init__.py b/pyspark/pyspark/__init__.py
index 549c2d2711..8f8402b62b 100644
--- a/pyspark/pyspark/__init__.py
+++ b/pyspark/pyspark/__init__.py
@@ -1,3 +1,9 @@
 import sys
 import os
 sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], "pyspark/lib/py4j0.7.egg"))
+
+
+from pyspark.context import SparkContext
+
+
+__all__ = ["SparkContext"]
diff --git a/pyspark/pyspark/examples/__init__.py b/pyspark/pyspark/examples/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/pyspark/pyspark/examples/kmeans.py b/pyspark/pyspark/examples/kmeans.py
deleted file mode 100644
index 9cc366f03c..0000000000
--- a/pyspark/pyspark/examples/kmeans.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import sys
-
-from pyspark.context import SparkContext
-from numpy import array, sum as np_sum
-
-
-def parseVector(line):
-    return array([float(x) for x in line.split(' ')])
-
-
-def closestPoint(p, centers):
-    bestIndex = 0
-    closest = float("+inf")
-    for i in range(len(centers)):
-        tempDist = np_sum((p - centers[i]) ** 2)
-        if tempDist < closest:
-            closest = tempDist
-            bestIndex = i
-    return bestIndex
-
-
-if __name__ == "__main__":
-    if len(sys.argv) < 5:
-        print >> sys.stderr, \
-            "Usage: PythonKMeans <master> <file> <k> <convergeDist>"
-        exit(-1)
-    sc = SparkContext(sys.argv[1], "PythonKMeans")
-    lines = sc.textFile(sys.argv[2])
-    data = lines.map(parseVector).cache()
-    K = int(sys.argv[3])
-    convergeDist = float(sys.argv[4])
-
-    kPoints = data.takeSample(False, K, 34)
-    tempDist = 1.0
-
-    while tempDist > convergeDist:
-        closest = data.map(
-            lambda p : (closestPoint(p, kPoints), (p, 1)))
-        pointStats = closest.reduceByKey(
-            lambda (x1, y1), (x2, y2): (x1 + x2, y1 + y2))
-        newPoints = pointStats.map(
-            lambda (x, (y, z)): (x, y / z)).collect()
-
-        tempDist = sum(np_sum((kPoints[x] - y) ** 2) for (x, y) in newPoints)
-
-        for (x, y) in newPoints:
-            kPoints[x] = y
-
-    print "Final centers: " + str(kPoints)
diff --git a/pyspark/pyspark/examples/pi.py b/pyspark/pyspark/examples/pi.py
deleted file mode 100644
index 348bbc5dce..0000000000
--- a/pyspark/pyspark/examples/pi.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import sys
-from random import random
-from operator import add
-from pyspark.context import SparkContext
-
-
-if __name__ == "__main__":
-    if len(sys.argv) == 1:
-        print >> sys.stderr, \
-            "Usage: PythonPi <master> [<slices>]"
-        exit(-1)
-    sc = SparkContext(sys.argv[1], "PythonPi")
-    slices = int(sys.argv[2]) if len(sys.argv) > 2 else 2
-    n = 100000 * slices
-    def f(_):
-        x = random() * 2 - 1
-        y = random() * 2 - 1
-        return 1 if x ** 2 + y ** 2 < 1 else 0
-    count = sc.parallelize(xrange(1, n+1), slices).map(f).reduce(add)
-    print "Pi is roughly %f" % (4.0 * count / n)
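A side note on the `pyspark/pyspark/__init__.py` hunk earlier in this patch: re-exporting `SparkContext` and declaring it in `__all__` lets callers import the class from the package itself rather than reaching into `pyspark.context`. A minimal sketch (illustrative only, assuming `SPARK_HOME` is set as the path-manipulation line requires):

    # Before this patch, the examples import from the submodule:
    #   from pyspark.context import SparkContext
    # After it, the class is also available at the package level:
    from pyspark import SparkContext

    sc = SparkContext("local", "ImportSketch")
    print sc.parallelize(range(4)).collect()   # [0, 1, 2, 3]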
diff --git a/pyspark/pyspark/examples/tc.py b/pyspark/pyspark/examples/tc.py
deleted file mode 100644
index 9630e72b47..0000000000
--- a/pyspark/pyspark/examples/tc.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import sys
-from random import Random
-from pyspark.context import SparkContext
-
-numEdges = 200
-numVertices = 100
-rand = Random(42)
-
-
-def generateGraph():
-    edges = set()
-    while len(edges) < numEdges:
-        src = rand.randrange(0, numEdges)
-        dst = rand.randrange(0, numEdges)
-        if src != dst:
-            edges.add((src, dst))
-    return edges
-
-
-if __name__ == "__main__":
-    if len(sys.argv) == 1:
-        print >> sys.stderr, \
-            "Usage: PythonTC <master> [<slices>]"
-        exit(-1)
-    sc = SparkContext(sys.argv[1], "PythonTC")
-    slices = sys.argv[2] if len(sys.argv) > 2 else 2
-    tc = sc.parallelize(generateGraph(), slices).cache()
-
-    # Linear transitive closure: each round grows paths by one edge,
-    # by joining the graph's edges with the already-discovered paths.
-    # e.g. join the path (y, z) from the TC with the edge (x, y) from
-    # the graph to obtain the path (x, z).
-
-    # Because join() joins on keys, the edges are stored in reversed order.
-    edges = tc.map(lambda (x, y): (y, x))
-
-    oldCount = 0L
-    nextCount = tc.count()
-    while True:
-        oldCount = nextCount
-        # Perform the join, obtaining an RDD of (y, (z, x)) pairs,
-        # then project the result to obtain the new (x, z) paths.
-        new_edges = tc.join(edges).map(lambda (_, (a, b)): (b, a))
-        tc = tc.union(new_edges).distinct().cache()
-        nextCount = tc.count()
-        if nextCount == oldCount:
-            break
-
-    print "TC has %i edges" % tc.count()
diff --git a/pyspark/pyspark/examples/wordcount.py b/pyspark/pyspark/examples/wordcount.py
deleted file mode 100644
index 8365c070e8..0000000000
--- a/pyspark/pyspark/examples/wordcount.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import sys
-from operator import add
-from pyspark.context import SparkContext
-
-if __name__ == "__main__":
-    if len(sys.argv) < 3:
-        print >> sys.stderr, \
-            "Usage: PythonWordCount <master> <file>"
-        exit(-1)
-    sc = SparkContext(sys.argv[1], "PythonWordCount")
-    lines = sc.textFile(sys.argv[2], 1)
-    counts = lines.flatMap(lambda x: x.split(' ')) \
-                  .map(lambda x: (x, 1)) \
-                  .reduceByKey(add)
-    output = counts.collect()
-    for (word, count) in output:
-        print "%s : %i" % (word, count)
--
cgit v1.2.3
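One last aside on the transitive-closure example: `tc.py` stores the edge set key-reversed because `join()` matches on keys, so a known path (y, z) pairs with an edge x -> y stored as (y, x), and projecting the result yields the new path (x, z). The sketch below replays one round of that join in plain Python (no Spark) on a hypothetical three-edge chain:

    # Plain-Python sketch of one round of the join used in tc.py (illustrative only).
    graph = {(1, 2), (2, 3), (3, 4)}            # edges x -> y
    tc = set(graph)                             # known paths, keyed by their start vertex
    edges = set((y, x) for (x, y) in graph)     # edges keyed by their destination vertex

    # "Join" on the shared vertex y: path (y, z) meets edge (y, x),
    # which projects to the new path (x, z).
    new_paths = set((x, z)
                    for (y, z) in tc
                    for (y2, x) in edges
                    if y == y2)
    tc = tc | new_paths
    print(sorted(tc))    # one round adds (1, 3) and (2, 4)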