Diffstat (limited to 'python/examples')
-rw-r--r--   python/examples/kmeans.py                 52
-rwxr-xr-x   python/examples/logistic_regression.py    57
-rw-r--r--   python/examples/pi.py                     21
-rw-r--r--   python/examples/transitive_closure.py     50
-rw-r--r--   python/examples/wordcount.py              19
5 files changed, 199 insertions, 0 deletions
diff --git a/python/examples/kmeans.py b/python/examples/kmeans.py
new file mode 100644
index 0000000000..ad2be21178
--- /dev/null
+++ b/python/examples/kmeans.py
@@ -0,0 +1,52 @@
+"""
+This example requires numpy (http://www.numpy.org/)
+"""
+import sys
+
+import numpy as np
+from pyspark import SparkContext
+
+
+def parseVector(line):
+    return np.array([float(x) for x in line.split(' ')])
+
+
+def closestPoint(p, centers):
+    bestIndex = 0
+    closest = float("+inf")
+    for i in range(len(centers)):
+        tempDist = np.sum((p - centers[i]) ** 2)
+        if tempDist < closest:
+            closest = tempDist
+            bestIndex = i
+    return bestIndex
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 5:
+        print >> sys.stderr, \
+            "Usage: PythonKMeans <master> <file> <k> <convergeDist>"
+        exit(-1)
+    sc = SparkContext(sys.argv[1], "PythonKMeans")
+    lines = sc.textFile(sys.argv[2])
+    data = lines.map(parseVector).cache()
+    K = int(sys.argv[3])
+    convergeDist = float(sys.argv[4])
+
+    kPoints = data.takeSample(False, K, 34)
+    tempDist = 1.0
+
+    while tempDist > convergeDist:
+        closest = data.map(
+            lambda p: (closestPoint(p, kPoints), (p, 1)))
+        pointStats = closest.reduceByKey(
+            lambda (x1, y1), (x2, y2): (x1 + x2, y1 + y2))
+        newPoints = pointStats.map(
+            lambda (x, (y, z)): (x, y / z)).collect()
+
+        tempDist = sum(np.sum((kPoints[x] - y) ** 2) for (x, y) in newPoints)
+
+        for (x, y) in newPoints:
+            kPoints[x] = y
+
+    print "Final centers: " + str(kPoints)
diff --git a/python/examples/logistic_regression.py b/python/examples/logistic_regression.py
new file mode 100755
index 0000000000..f13698a86f
--- /dev/null
+++ b/python/examples/logistic_regression.py
@@ -0,0 +1,57 @@
+"""
+This example requires numpy (http://www.numpy.org/)
+"""
+from collections import namedtuple
+from math import exp
+from os.path import realpath
+import sys
+
+import numpy as np
+from pyspark import SparkContext
+
+
+N = 100000 # Number of data points
+D = 10 # Number of dimensions
+R = 0.7 # Scaling factor
+ITERATIONS = 5
+np.random.seed(42)
+
+
+DataPoint = namedtuple("DataPoint", ['x', 'y'])
+from logistic_regression import DataPoint  # So that DataPoint is properly serialized
+
+
+def generateData():
+    def generatePoint(i):
+        y = -1 if i % 2 == 0 else 1
+        x = np.random.normal(size=D) + (y * R)
+        return DataPoint(x, y)
+    return [generatePoint(i) for i in range(N)]
+
+
+if __name__ == "__main__":
+    if len(sys.argv) == 1:
+        print >> sys.stderr, \
+            "Usage: PythonLR <master> [<slices>]"
+        exit(-1)
+    sc = SparkContext(sys.argv[1], "PythonLR", pyFiles=[realpath(__file__)])
+    slices = int(sys.argv[2]) if len(sys.argv) > 2 else 2
+    points = sc.parallelize(generateData(), slices).cache()
+
+    # Initialize w to a random value
+    w = 2 * np.random.ranf(size=D) - 1
+    print "Initial w: " + str(w)
+
+    def add(x, y):
+        x += y
+        return x
+
+    for i in range(1, ITERATIONS + 1):
+        print "On iteration %i" % i
+
+        gradient = points.map(lambda p:
+            (1.0 / (1.0 + exp(-p.y * np.dot(w, p.x)))) * p.y * p.x
+        ).reduce(add)
+        w -= gradient
+
+    print "Final w: " + str(w)
diff --git a/python/examples/pi.py b/python/examples/pi.py
new file mode 100644
index 0000000000..127cba029b
--- /dev/null
+++ b/python/examples/pi.py
@@ -0,0 +1,21 @@
+import sys
+from random import random
+from operator import add
+
+from pyspark import SparkContext
+
+
+if __name__ == "__main__":
+    if len(sys.argv) == 1:
+        print >> sys.stderr, \
+            "Usage: PythonPi <master> [<slices>]"
+        exit(-1)
+    sc = SparkContext(sys.argv[1], "PythonPi")
+    slices = int(sys.argv[2]) if len(sys.argv) > 2 else 2
+    n = 100000 * slices
+    def f(_):
+        x = random() * 2 - 1
+        y = random() * 2 - 1
+        return 1 if x ** 2 + y ** 2 < 1 else 0
+    count = sc.parallelize(xrange(1, n+1), slices).map(f).reduce(add)
+    print "Pi is roughly %f" % (4.0 * count / n)
diff --git a/python/examples/transitive_closure.py b/python/examples/transitive_closure.py
new file mode 100644
index 0000000000..73f7f8fbaf
--- /dev/null
+++ b/python/examples/transitive_closure.py
@@ -0,0 +1,50 @@
+import sys
+from random import Random
+
+from pyspark import SparkContext
+
+numEdges = 200
+numVertices = 100
+rand = Random(42)
+
+
+def generateGraph():
+    edges = set()
+    while len(edges) < numEdges:
+        src = rand.randrange(0, numVertices)
+        dst = rand.randrange(0, numVertices)
+        if src != dst:
+            edges.add((src, dst))
+    return edges
+
+
+if __name__ == "__main__":
+    if len(sys.argv) == 1:
+        print >> sys.stderr, \
+            "Usage: PythonTC <master> [<slices>]"
+        exit(-1)
+    sc = SparkContext(sys.argv[1], "PythonTC")
+    slices = int(sys.argv[2]) if len(sys.argv) > 2 else 2
+    tc = sc.parallelize(generateGraph(), slices).cache()
+
+    # Linear transitive closure: each round grows paths by one edge,
+    # by joining the graph's edges with the already-discovered paths.
+    # e.g. join the path (y, z) from the TC with the edge (x, y) from
+    # the graph to obtain the path (x, z).
+
+    # Because join() joins on keys, the edges are stored in reversed order.
+    edges = tc.map(lambda (x, y): (y, x))
+
+    oldCount = 0L
+    nextCount = tc.count()
+    while True:
+        oldCount = nextCount
+        # Perform the join, obtaining an RDD of (y, (z, x)) pairs,
+        # then project the result to obtain the new (x, z) paths.
+        new_edges = tc.join(edges).map(lambda (_, (a, b)): (b, a))
+        tc = tc.union(new_edges).distinct().cache()
+        nextCount = tc.count()
+        if nextCount == oldCount:
+            break
+
+    print "TC has %i edges" % tc.count()
diff --git a/python/examples/wordcount.py b/python/examples/wordcount.py
new file mode 100644
index 0000000000..857160624b
--- /dev/null
+++ b/python/examples/wordcount.py
@@ -0,0 +1,19 @@
+import sys
+from operator import add
+
+from pyspark import SparkContext
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        print >> sys.stderr, \
+            "Usage: PythonWordCount <master> <file>"
+        exit(-1)
+    sc = SparkContext(sys.argv[1], "PythonWordCount")
+    lines = sc.textFile(sys.argv[2], 1)
+    counts = lines.flatMap(lambda x: x.split(' ')) \
+                  .map(lambda x: (x, 1)) \
+                  .reduceByKey(add)
+    output = counts.collect()
+    for (word, count) in output:
+        print "%s : %i" % (word, count)