From c2b105af34f7241ac0597d9c35fbf66633a3eaf6 Mon Sep 17 00:00:00 2001
From: Josh Rosen
Date: Fri, 28 Dec 2012 22:51:28 -0800
Subject: Add documentation for Python API.

---
 pyspark/README                        | 42 ------------------------------
 pyspark/examples/kmeans.py            | 49 +++++++++++++++++++++++++++++++++++
 pyspark/examples/pi.py                | 20 ++++++++++++++
 pyspark/examples/tc.py                | 49 +++++++++++++++++++++++++++++++++++
 pyspark/examples/wordcount.py         | 17 ++++++++++++
 pyspark/pyspark/__init__.py           |  6 +++++
 pyspark/pyspark/examples/__init__.py  |  0
 pyspark/pyspark/examples/kmeans.py    | 49 -----------------------------------
 pyspark/pyspark/examples/pi.py        | 20 --------------
 pyspark/pyspark/examples/tc.py        | 49 -----------------------------------
 pyspark/pyspark/examples/wordcount.py | 17 ------------
 11 files changed, 141 insertions(+), 177 deletions(-)
 delete mode 100644 pyspark/README
 create mode 100644 pyspark/examples/kmeans.py
 create mode 100644 pyspark/examples/pi.py
 create mode 100644 pyspark/examples/tc.py
 create mode 100644 pyspark/examples/wordcount.py
 delete mode 100644 pyspark/pyspark/examples/__init__.py
 delete mode 100644 pyspark/pyspark/examples/kmeans.py
 delete mode 100644 pyspark/pyspark/examples/pi.py
 delete mode 100644 pyspark/pyspark/examples/tc.py
 delete mode 100644 pyspark/pyspark/examples/wordcount.py

diff --git a/pyspark/README b/pyspark/README
deleted file mode 100644
index d8d521c72c..0000000000
--- a/pyspark/README
+++ /dev/null
@@ -1,42 +0,0 @@
-# PySpark
-
-PySpark is a Python API for Spark.
-
-PySpark jobs are writen in Python and executed using a standard Python
-interpreter; this supports modules that use Python C extensions. The
-API is based on the Spark Scala API and uses regular Python functions
-and lambdas to support user-defined functions. PySpark supports
-interactive use through a standard Python interpreter; it can
-automatically serialize closures and ship them to worker processes.
-
-PySpark is built on top of the Spark Java API. Data is uniformly
-represented as serialized Python objects and stored in Spark Java
-processes, which communicate with PySpark worker processes over pipes.
-
-## Features
-
-PySpark supports most of the Spark API, including broadcast variables.
-RDDs are dynamically typed and can hold any Python object.
-
-PySpark does not support:
-
-- Special functions on RDDs of doubles
-- Accumulators
-
-## Examples and Documentation
-
-The PySpark source contains docstrings and doctests that document its
-API. The public classes are in `context.py` and `rdd.py`.
-
-The `pyspark/pyspark/examples` directory contains a few complete
-examples.
-
-## Installing PySpark
-#
-To use PySpark, `SPARK_HOME` should be set to the location of the Spark
-package.
-
-## Running PySpark
-
-The easiest way to run PySpark is to use the `run-pyspark` and
-`pyspark-shell` scripts, which are included in the `pyspark` directory.
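As an aside (not part of the patch itself), the workflow the deleted README describes can be pictured with a few lines of code. The sketch below is illustrative only: it assumes a `local` master, the `SparkContext(master, jobName)` constructor used by the examples in this commit, and that `sc.broadcast()` mirrors the Scala broadcast-variable API by exposing a `.value` attribute.

    # Minimal sketch against the 2012-era PySpark API described in the README.
    # Assumes PySpark is importable and SPARK_HOME is set.
    from pyspark.context import SparkContext

    sc = SparkContext("local", "ReadmeSketch")

    # RDDs are dynamically typed and can hold any Python object.
    doubled = sc.parallelize([1, 2, 3, 4]).map(lambda x: x * 2).collect()
    print doubled                                    # [2, 4, 6, 8]

    # Broadcast variables are supported; the closure below is serialized and
    # shipped to worker processes along with a handle to the broadcast value.
    lookup = sc.broadcast({1: "a", 2: "b"})
    print sc.parallelize([1, 2]).map(lambda k: lookup.value[k]).collect()   # ['a', 'b']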
diff --git a/pyspark/examples/kmeans.py b/pyspark/examples/kmeans.py
new file mode 100644
index 0000000000..9cc366f03c
--- /dev/null
+++ b/pyspark/examples/kmeans.py
@@ -0,0 +1,49 @@
+import sys
+
+from pyspark.context import SparkContext
+from numpy import array, sum as np_sum
+
+
+def parseVector(line):
+    return array([float(x) for x in line.split(' ')])
+
+
+def closestPoint(p, centers):
+    bestIndex = 0
+    closest = float("+inf")
+    for i in range(len(centers)):
+        tempDist = np_sum((p - centers[i]) ** 2)
+        if tempDist < closest:
+            closest = tempDist
+            bestIndex = i
+    return bestIndex
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 5:
+        print >> sys.stderr, \
+            "Usage: PythonKMeans <master> <file> <k> <convergeDist>"
+        exit(-1)
+    sc = SparkContext(sys.argv[1], "PythonKMeans")
+    lines = sc.textFile(sys.argv[2])
+    data = lines.map(parseVector).cache()
+    K = int(sys.argv[3])
+    convergeDist = float(sys.argv[4])
+
+    kPoints = data.takeSample(False, K, 34)
+    tempDist = 1.0
+
+    while tempDist > convergeDist:
+        closest = data.map(
+            lambda p : (closestPoint(p, kPoints), (p, 1)))
+        pointStats = closest.reduceByKey(
+            lambda (x1, y1), (x2, y2): (x1 + x2, y1 + y2))
+        newPoints = pointStats.map(
+            lambda (x, (y, z)): (x, y / z)).collect()
+
+        tempDist = sum(np_sum((kPoints[x] - y) ** 2) for (x, y) in newPoints)
+
+        for (x, y) in newPoints:
+            kPoints[x] = y
+
+    print "Final centers: " + str(kPoints)
diff --git a/pyspark/examples/pi.py b/pyspark/examples/pi.py
new file mode 100644
index 0000000000..348bbc5dce
--- /dev/null
+++ b/pyspark/examples/pi.py
@@ -0,0 +1,20 @@
+import sys
+from random import random
+from operator import add
+from pyspark.context import SparkContext
+
+
+if __name__ == "__main__":
+    if len(sys.argv) == 1:
+        print >> sys.stderr, \
+            "Usage: PythonPi <master> [<slices>]"
+        exit(-1)
+    sc = SparkContext(sys.argv[1], "PythonPi")
+    slices = int(sys.argv[2]) if len(sys.argv) > 2 else 2
+    n = 100000 * slices
+    def f(_):
+        x = random() * 2 - 1
+        y = random() * 2 - 1
+        return 1 if x ** 2 + y ** 2 < 1 else 0
+    count = sc.parallelize(xrange(1, n+1), slices).map(f).reduce(add)
+    print "Pi is roughly %f" % (4.0 * count / n)
diff --git a/pyspark/examples/tc.py b/pyspark/examples/tc.py
new file mode 100644
index 0000000000..9630e72b47
--- /dev/null
+++ b/pyspark/examples/tc.py
@@ -0,0 +1,49 @@
+import sys
+from random import Random
+from pyspark.context import SparkContext
+
+numEdges = 200
+numVertices = 100
+rand = Random(42)
+
+
+def generateGraph():
+    edges = set()
+    while len(edges) < numEdges:
+        src = rand.randrange(0, numEdges)
+        dst = rand.randrange(0, numEdges)
+        if src != dst:
+            edges.add((src, dst))
+    return edges
+
+
+if __name__ == "__main__":
+    if len(sys.argv) == 1:
+        print >> sys.stderr, \
+            "Usage: PythonTC <master> [<slices>]"
+        exit(-1)
+    sc = SparkContext(sys.argv[1], "PythonTC")
+    slices = sys.argv[2] if len(sys.argv) > 2 else 2
+    tc = sc.parallelize(generateGraph(), slices).cache()
+
+    # Linear transitive closure: each round grows paths by one edge,
+    # by joining the graph's edges with the already-discovered paths.
+    # e.g. join the path (y, z) from the TC with the edge (x, y) from
+    # the graph to obtain the path (x, z).
+
+    # Because join() joins on keys, the edges are stored in reversed order.
+    edges = tc.map(lambda (x, y): (y, x))
+
+    oldCount = 0L
+    nextCount = tc.count()
+    while True:
+        oldCount = nextCount
+        # Perform the join, obtaining an RDD of (y, (z, x)) pairs,
+        # then project the result to obtain the new (x, z) paths.
+        new_edges = tc.join(edges).map(lambda (_, (a, b)): (b, a))
+        tc = tc.union(new_edges).distinct().cache()
+        nextCount = tc.count()
+        if nextCount == oldCount:
+            break
+
+    print "TC has %i edges" % tc.count()
diff --git a/pyspark/examples/wordcount.py b/pyspark/examples/wordcount.py
new file mode 100644
index 0000000000..8365c070e8
--- /dev/null
+++ b/pyspark/examples/wordcount.py
@@ -0,0 +1,17 @@
+import sys
+from operator import add
+from pyspark.context import SparkContext
+
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        print >> sys.stderr, \
+            "Usage: PythonWordCount <master> <file>"
+        exit(-1)
+    sc = SparkContext(sys.argv[1], "PythonWordCount")
+    lines = sc.textFile(sys.argv[2], 1)
+    counts = lines.flatMap(lambda x: x.split(' ')) \
+                  .map(lambda x: (x, 1)) \
+                  .reduceByKey(add)
+    output = counts.collect()
+    for (word, count) in output:
+        print "%s : %i" % (word, count)
diff --git a/pyspark/pyspark/__init__.py b/pyspark/pyspark/__init__.py
index 549c2d2711..8f8402b62b 100644
--- a/pyspark/pyspark/__init__.py
+++ b/pyspark/pyspark/__init__.py
@@ -1,3 +1,9 @@
 import sys
 import os
 sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], "pyspark/lib/py4j0.7.egg"))
+
+
+from pyspark.context import SparkContext
+
+
+__all__ = ["SparkContext"]
diff --git a/pyspark/pyspark/examples/__init__.py b/pyspark/pyspark/examples/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/pyspark/pyspark/examples/kmeans.py b/pyspark/pyspark/examples/kmeans.py
deleted file mode 100644
index 9cc366f03c..0000000000
--- a/pyspark/pyspark/examples/kmeans.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import sys
-
-from pyspark.context import SparkContext
-from numpy import array, sum as np_sum
-
-
-def parseVector(line):
-    return array([float(x) for x in line.split(' ')])
-
-
-def closestPoint(p, centers):
-    bestIndex = 0
-    closest = float("+inf")
-    for i in range(len(centers)):
-        tempDist = np_sum((p - centers[i]) ** 2)
-        if tempDist < closest:
-            closest = tempDist
-            bestIndex = i
-    return bestIndex
-
-
-if __name__ == "__main__":
-    if len(sys.argv) < 5:
-        print >> sys.stderr, \
-            "Usage: PythonKMeans <master> <file> <k> <convergeDist>"
-        exit(-1)
-    sc = SparkContext(sys.argv[1], "PythonKMeans")
-    lines = sc.textFile(sys.argv[2])
-    data = lines.map(parseVector).cache()
-    K = int(sys.argv[3])
-    convergeDist = float(sys.argv[4])
-
-    kPoints = data.takeSample(False, K, 34)
-    tempDist = 1.0
-
-    while tempDist > convergeDist:
-        closest = data.map(
-            lambda p : (closestPoint(p, kPoints), (p, 1)))
-        pointStats = closest.reduceByKey(
-            lambda (x1, y1), (x2, y2): (x1 + x2, y1 + y2))
-        newPoints = pointStats.map(
-            lambda (x, (y, z)): (x, y / z)).collect()
-
-        tempDist = sum(np_sum((kPoints[x] - y) ** 2) for (x, y) in newPoints)
-
-        for (x, y) in newPoints:
-            kPoints[x] = y
-
-    print "Final centers: " + str(kPoints)
diff --git a/pyspark/pyspark/examples/pi.py b/pyspark/pyspark/examples/pi.py
deleted file mode 100644
index 348bbc5dce..0000000000
--- a/pyspark/pyspark/examples/pi.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import sys
-from random import random
-from operator import add
-from pyspark.context import SparkContext
-
-
-if __name__ == "__main__":
-    if len(sys.argv) == 1:
-        print >> sys.stderr, \
-            "Usage: PythonPi <master> [<slices>]"
-        exit(-1)
-    sc = SparkContext(sys.argv[1], "PythonPi")
-    slices = int(sys.argv[2]) if len(sys.argv) > 2 else 2
-    n = 100000 * slices
-    def f(_):
-        x = random() * 2 - 1
-        y = random() * 2 - 1
-        return 1 if x ** 2 + y ** 2 < 1 else 0
-    count = sc.parallelize(xrange(1, n+1), slices).map(f).reduce(add)
-    print "Pi is roughly %f" % (4.0 * count / n)
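A side note on the `pyspark/pyspark/__init__.py` hunk earlier in this patch: re-exporting `SparkContext` and declaring it in `__all__` lets callers import the class from the package itself rather than reaching into `pyspark.context`. A minimal sketch (illustrative only, assuming `SPARK_HOME` is set as the path-manipulation line requires):

    # Before this patch, the examples import from the submodule:
    #   from pyspark.context import SparkContext
    # After it, the class is also available at the package level:
    from pyspark import SparkContext

    sc = SparkContext("local", "ImportSketch")
    print sc.parallelize(range(4)).collect()   # [0, 1, 2, 3]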
diff --git a/pyspark/pyspark/examples/tc.py b/pyspark/pyspark/examples/tc.py
deleted file mode 100644
index 9630e72b47..0000000000
--- a/pyspark/pyspark/examples/tc.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import sys
-from random import Random
-from pyspark.context import SparkContext
-
-numEdges = 200
-numVertices = 100
-rand = Random(42)
-
-
-def generateGraph():
-    edges = set()
-    while len(edges) < numEdges:
-        src = rand.randrange(0, numEdges)
-        dst = rand.randrange(0, numEdges)
-        if src != dst:
-            edges.add((src, dst))
-    return edges
-
-
-if __name__ == "__main__":
-    if len(sys.argv) == 1:
-        print >> sys.stderr, \
-            "Usage: PythonTC <master> [<slices>]"
-        exit(-1)
-    sc = SparkContext(sys.argv[1], "PythonTC")
-    slices = sys.argv[2] if len(sys.argv) > 2 else 2
-    tc = sc.parallelize(generateGraph(), slices).cache()
-
-    # Linear transitive closure: each round grows paths by one edge,
-    # by joining the graph's edges with the already-discovered paths.
-    # e.g. join the path (y, z) from the TC with the edge (x, y) from
-    # the graph to obtain the path (x, z).
-
-    # Because join() joins on keys, the edges are stored in reversed order.
-    edges = tc.map(lambda (x, y): (y, x))
-
-    oldCount = 0L
-    nextCount = tc.count()
-    while True:
-        oldCount = nextCount
-        # Perform the join, obtaining an RDD of (y, (z, x)) pairs,
-        # then project the result to obtain the new (x, z) paths.
-        new_edges = tc.join(edges).map(lambda (_, (a, b)): (b, a))
-        tc = tc.union(new_edges).distinct().cache()
-        nextCount = tc.count()
-        if nextCount == oldCount:
-            break
-
-    print "TC has %i edges" % tc.count()
diff --git a/pyspark/pyspark/examples/wordcount.py b/pyspark/pyspark/examples/wordcount.py
deleted file mode 100644
index 8365c070e8..0000000000
--- a/pyspark/pyspark/examples/wordcount.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import sys
-from operator import add
-from pyspark.context import SparkContext
-
-if __name__ == "__main__":
-    if len(sys.argv) < 3:
-        print >> sys.stderr, \
-            "Usage: PythonWordCount <master> <file>"
-        exit(-1)
-    sc = SparkContext(sys.argv[1], "PythonWordCount")
-    lines = sc.textFile(sys.argv[2], 1)
-    counts = lines.flatMap(lambda x: x.split(' ')) \
-                  .map(lambda x: (x, 1)) \
-                  .reduceByKey(add)
-    output = counts.collect()
-    for (word, count) in output:
-        print "%s : %i" % (word, count)
--
cgit v1.2.3
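One last aside on the transitive-closure example: `tc.py` stores the edge set key-reversed because `join()` matches on keys, so a known path (y, z) pairs with an edge x -> y stored as (y, x), and projecting the result yields the new path (x, z). The sketch below replays one round of that join in plain Python (no Spark) on a hypothetical three-edge chain:

    # Plain-Python sketch of one round of the join used in tc.py (illustrative only).
    graph = {(1, 2), (2, 3), (3, 4)}            # edges x -> y
    tc = set(graph)                             # known paths, keyed by their start vertex
    edges = set((y, x) for (x, y) in graph)     # edges keyed by their destination vertex

    # "Join" on the shared vertex y: path (y, z) meets edge (y, x),
    # which projects to the new path (x, z).
    new_paths = set((x, z)
                    for (y, z) in tc
                    for (y2, x) in edges
                    if y == y2)
    tc = tc | new_paths
    print(sorted(tc))    # one round adds (1, 3) and (2, 4)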