diff options
Diffstat (limited to 'pyspark/examples')
-rw-r--r-- | pyspark/examples/kmeans.py | 13 | ||||
-rwxr-xr-x | pyspark/examples/logistic_regression.py (renamed from pyspark/examples/lr.py) | 4 | ||||
-rw-r--r-- | pyspark/examples/pi.py | 5 | ||||
-rw-r--r-- | pyspark/examples/transitive_closure.py (renamed from pyspark/examples/tc.py) | 5 | ||||
-rw-r--r-- | pyspark/examples/wordcount.py | 4 |
5 files changed, 19 insertions, 12 deletions
diff --git a/pyspark/examples/kmeans.py b/pyspark/examples/kmeans.py index 9cc366f03c..ad2be21178 100644 --- a/pyspark/examples/kmeans.py +++ b/pyspark/examples/kmeans.py @@ -1,18 +1,21 @@ +""" +This example requires numpy (http://www.numpy.org/) +""" import sys -from pyspark.context import SparkContext -from numpy import array, sum as np_sum +import numpy as np +from pyspark import SparkContext def parseVector(line): - return array([float(x) for x in line.split(' ')]) + return np.array([float(x) for x in line.split(' ')]) def closestPoint(p, centers): bestIndex = 0 closest = float("+inf") for i in range(len(centers)): - tempDist = np_sum((p - centers[i]) ** 2) + tempDist = np.sum((p - centers[i]) ** 2) if tempDist < closest: closest = tempDist bestIndex = i @@ -41,7 +44,7 @@ if __name__ == "__main__": newPoints = pointStats.map( lambda (x, (y, z)): (x, y / z)).collect() - tempDist = sum(np_sum((kPoints[x] - y) ** 2) for (x, y) in newPoints) + tempDist = sum(np.sum((kPoints[x] - y) ** 2) for (x, y) in newPoints) for (x, y) in newPoints: kPoints[x] = y diff --git a/pyspark/examples/lr.py b/pyspark/examples/logistic_regression.py index 5fca0266b8..f13698a86f 100755 --- a/pyspark/examples/lr.py +++ b/pyspark/examples/logistic_regression.py @@ -7,7 +7,7 @@ from os.path import realpath import sys import numpy as np -from pyspark.context import SparkContext +from pyspark import SparkContext N = 100000 # Number of data points @@ -32,7 +32,7 @@ def generateData(): if __name__ == "__main__": if len(sys.argv) == 1: print >> sys.stderr, \ - "Usage: PythonLR <host> [<slices>]" + "Usage: PythonLR <master> [<slices>]" exit(-1) sc = SparkContext(sys.argv[1], "PythonLR", pyFiles=[realpath(__file__)]) slices = int(sys.argv[2]) if len(sys.argv) > 2 else 2 diff --git a/pyspark/examples/pi.py b/pyspark/examples/pi.py index 348bbc5dce..127cba029b 100644 --- a/pyspark/examples/pi.py +++ b/pyspark/examples/pi.py @@ -1,13 +1,14 @@ import sys from random import random from operator import add -from pyspark.context import SparkContext + +from pyspark import SparkContext if __name__ == "__main__": if len(sys.argv) == 1: print >> sys.stderr, \ - "Usage: PythonPi <host> [<slices>]" + "Usage: PythonPi <master> [<slices>]" exit(-1) sc = SparkContext(sys.argv[1], "PythonPi") slices = int(sys.argv[2]) if len(sys.argv) > 2 else 2 diff --git a/pyspark/examples/tc.py b/pyspark/examples/transitive_closure.py index 9630e72b47..73f7f8fbaf 100644 --- a/pyspark/examples/tc.py +++ b/pyspark/examples/transitive_closure.py @@ -1,6 +1,7 @@ import sys from random import Random -from pyspark.context import SparkContext + +from pyspark import SparkContext numEdges = 200 numVertices = 100 @@ -20,7 +21,7 @@ def generateGraph(): if __name__ == "__main__": if len(sys.argv) == 1: print >> sys.stderr, \ - "Usage: PythonTC <host> [<slices>]" + "Usage: PythonTC <master> [<slices>]" exit(-1) sc = SparkContext(sys.argv[1], "PythonTC") slices = sys.argv[2] if len(sys.argv) > 2 else 2 diff --git a/pyspark/examples/wordcount.py b/pyspark/examples/wordcount.py index 8365c070e8..857160624b 100644 --- a/pyspark/examples/wordcount.py +++ b/pyspark/examples/wordcount.py @@ -1,6 +1,8 @@ import sys from operator import add -from pyspark.context import SparkContext + +from pyspark import SparkContext + if __name__ == "__main__": if len(sys.argv) < 3: |