aboutsummaryrefslogtreecommitdiff
path: root/pyspark/examples/tc.py
diff options
context:
space:
mode:
author Josh Rosen <joshrosen@eecs.berkeley.edu> 2012-12-28 22:51:28 -0800
committer Josh Rosen <joshrosen@eecs.berkeley.edu> 2012-12-28 22:51:28 -0800
commit c2b105af34f7241ac0597d9c35fbf66633a3eaf6 (patch)
tree e96946d2b714365937019f60741bf3ae62d565c6 /pyspark/examples/tc.py
parent 7ec3595de28d53839cb3a45e940ec16f81ffdf45 (diff)
download spark-c2b105af34f7241ac0597d9c35fbf66633a3eaf6.tar.gz
spark-c2b105af34f7241ac0597d9c35fbf66633a3eaf6.tar.bz2
spark-c2b105af34f7241ac0597d9c35fbf66633a3eaf6.zip
Add documentation for Python API.
Diffstat (limited to 'pyspark/examples/tc.py')
-rw-r--r-- pyspark/examples/tc.py 49
1 files changed, 49 insertions, 0 deletions
diff --git a/pyspark/examples/tc.py b/pyspark/examples/tc.py
new file mode 100644
index 0000000000..9630e72b47
--- /dev/null
+++ b/pyspark/examples/tc.py
@@ -0,0 +1,49 @@
+import sys
+from random import Random
+from pyspark.context import SparkContext
+
+numEdges = 200  # number of distinct edges generateGraph() collects
+numVertices = 100  # size of the vertex-id space (ids are meant to fall in [0, numVertices))
+rand = Random(42)  # fixed seed, so the same random sequence is produced on every run
+
+
+def generateGraph():
+ edges = set()
+ while len(edges) < numEdges:
+ src = rand.randrange(0, numEdges)
+ dst = rand.randrange(0, numEdges)
+ if src != dst:
+ edges.add((src, dst))
+ return edges
+
+
+if __name__ == "__main__":
+ if len(sys.argv) == 1:
+ print >> sys.stderr, \
+ "Usage: PythonTC <host> [<slices>]"
+ exit(-1)
+ sc = SparkContext(sys.argv[1], "PythonTC")
+ slices = sys.argv[2] if len(sys.argv) > 2 else 2
+ tc = sc.parallelize(generateGraph(), slices).cache()
+
+ # Linear transitive closure: each round grows paths by one edge,
+ # by joining the graph's edges with the already-discovered paths.
+ # e.g. join the path (y, z) from the TC with the edge (x, y) from
+ # the graph to obtain the path (x, z).
+
+ # Because join() joins on keys, the edges are stored in reversed order.
+ edges = tc.map(lambda (x, y): (y, x))
+
+ oldCount = 0L
+ nextCount = tc.count()
+ while True:
+ oldCount = nextCount
+ # Perform the join, obtaining an RDD of (y, (z, x)) pairs,
+ # then project the result to obtain the new (x, z) paths.
+ new_edges = tc.join(edges).map(lambda (_, (a, b)): (b, a))
+ tc = tc.union(new_edges).distinct().cache()
+ nextCount = tc.count()
+ if nextCount == oldCount:
+ break
+
+ print "TC has %i edges" % tc.count()