author     Reynold Xin <rxin@apache.org>   2014-05-25 14:48:27 -0700
committer  Reynold Xin <rxin@apache.org>   2014-05-25 14:48:27 -0700
commit     d79c2b28e17ec0b15198aaedd2e1f403d81f717e (patch)
tree       1917d4285692d387de250f8ee8192f794bb2966c /examples
parent     55fddf9cc0fe420d5396b0e730c8413b2f23d636 (diff)
Fix PEP8 violations in examples/src/main/python.
Author: Reynold Xin <rxin@apache.org>

Closes #870 from rxin/examples-python-pep8 and squashes the following commits:

2829e84 [Reynold Xin] Fix PEP8 violations in examples/src/main/python.
Diffstat (limited to 'examples')
-rwxr-xr-x  examples/src/main/python/als.py                   20
-rwxr-xr-x  examples/src/main/python/kmeans.py                 2
-rwxr-xr-x  examples/src/main/python/logistic_regression.py    4
-rwxr-xr-x  examples/src/main/python/pagerank.py              12
-rwxr-xr-x  examples/src/main/python/pi.py                     2
-rwxr-xr-x  examples/src/main/python/sort.py                   4
6 files changed, 25 insertions, 19 deletions
diff --git a/examples/src/main/python/als.py b/examples/src/main/python/als.py
index f0b46cd28b..1a7c4c51f4 100755
--- a/examples/src/main/python/als.py
+++ b/examples/src/main/python/als.py
@@ -29,22 +29,25 @@ from pyspark import SparkContext
LAMBDA = 0.01 # regularization
np.random.seed(42)
+
def rmse(R, ms, us):
diff = R - ms * us.T
return np.sqrt(np.sum(np.power(diff, 2)) / M * U)
+
def update(i, vec, mat, ratings):
uu = mat.shape[0]
ff = mat.shape[1]
-
+
XtX = mat.T * mat
Xty = mat.T * ratings[i, :].T
-
+
for j in range(ff):
- XtX[j,j] += LAMBDA * uu
-
+ XtX[j, j] += LAMBDA * uu
+
return np.linalg.solve(XtX, Xty)
+
if __name__ == "__main__":
"""
Usage: als [M] [U] [F] [iterations] [slices]"
@@ -57,10 +60,10 @@ if __name__ == "__main__":
slices = int(sys.argv[5]) if len(sys.argv) > 5 else 2
print "Running ALS with M=%d, U=%d, F=%d, iters=%d, slices=%d\n" % \
- (M, U, F, ITERATIONS, slices)
+ (M, U, F, ITERATIONS, slices)
R = matrix(rand(M, F)) * matrix(rand(U, F).T)
- ms = matrix(rand(M ,F))
+ ms = matrix(rand(M, F))
us = matrix(rand(U, F))
Rb = sc.broadcast(R)
@@ -71,8 +74,9 @@ if __name__ == "__main__":
ms = sc.parallelize(range(M), slices) \
.map(lambda x: update(x, msb.value[x, :], usb.value, Rb.value)) \
.collect()
- ms = matrix(np.array(ms)[:, :, 0]) # collect() returns a list, so array ends up being
- # a 3-d array, we take the first 2 dims for the matrix
+ # collect() returns a list, so array ends up being
+ # a 3-d array, we take the first 2 dims for the matrix
+ ms = matrix(np.array(ms)[:, :, 0])
msb = sc.broadcast(ms)
us = sc.parallelize(range(U), slices) \
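For readers skimming the als.py hunks: the update function being reformatted solves a ridge-regularized least-squares system for one row of factors, adding LAMBDA * uu to the diagonal of X^T X before calling np.linalg.solve. A minimal standalone NumPy sketch of that solve; the sizes and random data below are invented for illustration, not taken from the example:

import numpy as np

LAMBDA = 0.01                                # same regularization constant as als.py
np.random.seed(42)

U, F = 10, 5                                 # illustrative sizes
mat = np.matrix(np.random.rand(U, F))        # factor matrix of the "other side"
ratings = np.matrix(np.random.rand(1, U))    # one row of the ratings matrix

XtX = mat.T * mat
Xty = mat.T * ratings.T
XtX += LAMBDA * mat.shape[0] * np.eye(F)     # same effect as the loop over XtX[j, j]
vec = np.linalg.solve(XtX, Xty)              # new (F, 1) factor vector
print(vec.shape)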
diff --git a/examples/src/main/python/kmeans.py b/examples/src/main/python/kmeans.py
index fc16586c28..988fc45baf 100755
--- a/examples/src/main/python/kmeans.py
+++ b/examples/src/main/python/kmeans.py
@@ -59,7 +59,7 @@ if __name__ == "__main__":
while tempDist > convergeDist:
closest = data.map(
- lambda p : (closestPoint(p, kPoints), (p, 1)))
+ lambda p: (closestPoint(p, kPoints), (p, 1)))
pointStats = closest.reduceByKey(
lambda (x1, y1), (x2, y2): (x1 + x2, y1 + y2))
newPoints = pointStats.map(
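The kmeans.py hunk maps each point to (closestPoint(p, kPoints), (p, 1)) and then sums per cluster. closestPoint itself is defined elsewhere in the example and is not part of this diff; a rough plain-NumPy equivalent, with invented centers and point:

import numpy as np

def closestPoint(p, centers):
    # index of the center with the smallest squared Euclidean distance to p
    return int(np.argmin([np.sum((p - c) ** 2) for c in centers]))

kPoints = [np.array([0.0, 0.0]), np.array([5.0, 5.0])]
p = np.array([4.0, 6.0])
print(closestPoint(p, kPoints))   # prints 1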
diff --git a/examples/src/main/python/logistic_regression.py b/examples/src/main/python/logistic_regression.py
index 0f22d0b323..6c33deabfd 100755
--- a/examples/src/main/python/logistic_regression.py
+++ b/examples/src/main/python/logistic_regression.py
@@ -60,8 +60,8 @@ if __name__ == "__main__":
# Compute logistic regression gradient for a matrix of data points
def gradient(matrix, w):
- Y = matrix[:,0] # point labels (first column of input file)
- X = matrix[:,1:] # point coordinates
+ Y = matrix[:, 0] # point labels (first column of input file)
+ X = matrix[:, 1:] # point coordinates
# For each point (x, y), compute gradient function, then sum these up
return ((1.0 / (1.0 + np.exp(-Y * X.dot(w))) - 1.0) * Y * X.T).sum(1)
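The expression kept in this hunk is the batch gradient of the logistic loss, written with NumPy broadcasting: Y is the label column, X the feature columns. A self-contained check of the same expression on made-up data (D and the random matrix are placeholders for this sketch):

import numpy as np

np.random.seed(0)
D = 2                                                     # feature count, chosen for the sketch
matrix = np.random.rand(5, D + 1)                         # column 0 = label, columns 1.. = features
matrix[:, 0] = np.where(matrix[:, 0] > 0.5, 1.0, -1.0)    # fake +/-1 labels
w = np.random.rand(D)

Y = matrix[:, 0]   # point labels (first column)
X = matrix[:, 1:]  # point coordinates
grad = ((1.0 / (1.0 + np.exp(-Y * X.dot(w))) - 1.0) * Y * X.T).sum(1)
print(grad)        # one gradient component per feature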
diff --git a/examples/src/main/python/pagerank.py b/examples/src/main/python/pagerank.py
index d350fa46fa..0b96343158 100755
--- a/examples/src/main/python/pagerank.py
+++ b/examples/src/main/python/pagerank.py
@@ -15,9 +15,8 @@
# limitations under the License.
#
-#!/usr/bin/env python
-
-import re, sys
+import re
+import sys
from operator import add
from pyspark import SparkContext
@@ -26,7 +25,8 @@ from pyspark import SparkContext
def computeContribs(urls, rank):
"""Calculates URL contributions to the rank of other URLs."""
num_urls = len(urls)
- for url in urls: yield (url, rank / num_urls)
+ for url in urls:
+ yield (url, rank / num_urls)
def parseNeighbors(urls):
@@ -59,8 +59,8 @@ if __name__ == "__main__":
# Calculates and updates URL ranks continuously using PageRank algorithm.
for iteration in xrange(int(sys.argv[2])):
# Calculates URL contributions to the rank of other URLs.
- contribs = links.join(ranks).flatMap(lambda (url, (urls, rank)):
- computeContribs(urls, rank))
+ contribs = links.join(ranks).flatMap(
+ lambda (url, (urls, rank)): computeContribs(urls, rank))
# Re-calculates URL ranks based on neighbor contributions.
ranks = contribs.reduceByKey(add).mapValues(lambda rank: rank * 0.85 + 0.15)
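Taken together, the two pagerank.py hunks spread each URL's rank evenly over its outgoing links and then apply the damped update rank * 0.85 + 0.15. A small sketch of the same loop without Spark, over an invented three-node link graph:

def computeContribs(urls, rank):
    """Calculates URL contributions to the rank of other URLs."""
    num_urls = len(urls)
    for url in urls:
        yield (url, rank / num_urls)

links = {'a': ['b', 'c'], 'b': ['c'], 'c': ['a']}
ranks = dict((url, 1.0) for url in links)

for iteration in range(10):
    contribs = {}
    for url, rank in ranks.items():
        for dest, c in computeContribs(links[url], rank):
            contribs[dest] = contribs.get(dest, 0.0) + c
    ranks = dict((url, c * 0.85 + 0.15) for url, c in contribs.items())

print(ranks)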
diff --git a/examples/src/main/python/pi.py b/examples/src/main/python/pi.py
index 234720b55f..21d94a2cd4 100755
--- a/examples/src/main/python/pi.py
+++ b/examples/src/main/python/pi.py
@@ -29,9 +29,11 @@ if __name__ == "__main__":
sc = SparkContext(appName="PythonPi")
slices = int(sys.argv[1]) if len(sys.argv) > 1 else 2
n = 100000 * slices
+
def f(_):
x = random() * 2 - 1
y = random() * 2 - 1
return 1 if x ** 2 + y ** 2 < 1 else 0
+
count = sc.parallelize(xrange(1, n+1), slices).map(f).reduce(add)
print "Pi is roughly %f" % (4.0 * count / n)
diff --git a/examples/src/main/python/sort.py b/examples/src/main/python/sort.py
index 4913ee926a..41d00c1b79 100755
--- a/examples/src/main/python/sort.py
+++ b/examples/src/main/python/sort.py
@@ -27,8 +27,8 @@ if __name__ == "__main__":
sc = SparkContext(appName="PythonSort")
lines = sc.textFile(sys.argv[1], 1)
sortedCount = lines.flatMap(lambda x: x.split(' ')) \
- .map(lambda x: (int(x), 1)) \
- .sortByKey(lambda x: x)
+ .map(lambda x: (int(x), 1)) \
+ .sortByKey(lambda x: x)
# This is just a demo on how to bring all the sorted data back to a single node.
# In reality, we wouldn't want to collect all the data to the driver node.
output = sortedCount.collect()
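For reference, the sort.py pipeline splits each line into tokens, keys each parsed integer with a count of 1, and sorts by key. A rough PySpark sketch of the same chain using an in-memory stand-in for the text file; the app name and data are placeholders:

from pyspark import SparkContext

sc = SparkContext(appName="PythonSortSketch")     # placeholder app name
lines = sc.parallelize(["3 1", "2"])              # stand-in for sc.textFile(path)
sortedCount = lines.flatMap(lambda x: x.split(' ')) \
                   .map(lambda x: (int(x), 1)) \
                   .sortByKey()
print(sortedCount.collect())                      # [(1, 1), (2, 1), (3, 1)]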