aboutsummaryrefslogtreecommitdiff
path: root/pyspark/examples/kmeans.py
diff options
context:
space:
mode:
authorJosh Rosen <joshrosen@eecs.berkeley.edu>2013-01-01 13:52:14 -0800
committerJosh Rosen <joshrosen@eecs.berkeley.edu>2013-01-01 13:52:14 -0800
commit170e451fbdd308ae77065bd9c0f2bd278abf0cb7 (patch)
treeda3df59e2262dac4b381227d5bc712502249d746 /pyspark/examples/kmeans.py
parent6f6a6b79c4c3f3555f8ff427c91e714d02afe8fa (diff)
downloadspark-170e451fbdd308ae77065bd9c0f2bd278abf0cb7.tar.gz
spark-170e451fbdd308ae77065bd9c0f2bd278abf0cb7.tar.bz2
spark-170e451fbdd308ae77065bd9c0f2bd278abf0cb7.zip
Minor documentation and style fixes for PySpark.
Diffstat (limited to 'pyspark/examples/kmeans.py')
-rw-r--r--pyspark/examples/kmeans.py13
1 files changed, 8 insertions, 5 deletions
diff --git a/pyspark/examples/kmeans.py b/pyspark/examples/kmeans.py
index 9cc366f03c..ad2be21178 100644
--- a/pyspark/examples/kmeans.py
+++ b/pyspark/examples/kmeans.py
@@ -1,18 +1,21 @@
+"""
+This example requires numpy (http://www.numpy.org/)
+"""
import sys
-from pyspark.context import SparkContext
-from numpy import array, sum as np_sum
+import numpy as np
+from pyspark import SparkContext
def parseVector(line):
- return array([float(x) for x in line.split(' ')])
+ return np.array([float(x) for x in line.split(' ')])
def closestPoint(p, centers):
bestIndex = 0
closest = float("+inf")
for i in range(len(centers)):
- tempDist = np_sum((p - centers[i]) ** 2)
+ tempDist = np.sum((p - centers[i]) ** 2)
if tempDist < closest:
closest = tempDist
bestIndex = i
@@ -41,7 +44,7 @@ if __name__ == "__main__":
newPoints = pointStats.map(
lambda (x, (y, z)): (x, y / z)).collect()
- tempDist = sum(np_sum((kPoints[x] - y) ** 2) for (x, y) in newPoints)
+ tempDist = sum(np.sum((kPoints[x] - y) ** 2) for (x, y) in newPoints)
for (x, y) in newPoints:
kPoints[x] = y