From 6ed7e2cd01955adfbb3960e2986b6d19eaee8717 Mon Sep 17 00:00:00 2001 From: Evan Sparks Date: Thu, 8 May 2014 00:24:36 -0400 Subject: Use numpy directly for matrix multiply. Using matrix multiply to compute XtX and XtY yields a 5-20x speedup depending on problem size. For example - the following takes 19s locally after this change vs. 5m21s before the change. (16x speedup). bin/pyspark examples/src/main/python/als.py local[8] 1000 1000 50 10 10 Author: Evan Sparks Closes #687 from etrain/patch-1 and squashes the following commits: e094dbc [Evan Sparks] Touching only diaganols on update. d1ab9b6 [Evan Sparks] Use numpy directly for matrix multiply. --- examples/src/main/python/als.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'examples/src') diff --git a/examples/src/main/python/als.py b/examples/src/main/python/als.py index a77dfb2577..33700ab4f8 100755 --- a/examples/src/main/python/als.py +++ b/examples/src/main/python/als.py @@ -36,14 +36,13 @@ def rmse(R, ms, us): def update(i, vec, mat, ratings): uu = mat.shape[0] ff = mat.shape[1] - XtX = matrix(np.zeros((ff, ff))) - Xty = np.zeros((ff, 1)) - - for j in range(uu): - v = mat[j, :] - XtX += v.T * v - Xty += v.T * ratings[i, j] - XtX += np.eye(ff, ff) * LAMBDA * uu + + XtX = mat.T * mat + XtY = mat.T * ratings[i, :].T + + for j in range(ff): + XtX[j,j] += LAMBDA * uu + return np.linalg.solve(XtX, Xty) if __name__ == "__main__": -- cgit v1.2.3