diff options
author | Xiangrui Meng <meng@databricks.com> | 2014-03-13 00:43:19 -0700 |
---|---|---|
committer | Reynold Xin <rxin@apache.org> | 2014-03-13 00:43:19 -0700 |
commit | e4e8d8f395aea48f0cae00d7c381a863c48a2837 (patch) | |
tree | 283f03c2f4a7be86eb7d950dd0ee51c375b0f737 /mllib/src/test | |
parent | 4ea23db0efff2f39ac5b8f0bd1d9a6ffa3eceb0d (diff) | |
download | spark-e4e8d8f395aea48f0cae00d7c381a863c48a2837.tar.gz spark-e4e8d8f395aea48f0cae00d7c381a863c48a2837.tar.bz2 spark-e4e8d8f395aea48f0cae00d7c381a863c48a2837.zip |
[SPARK-1237, 1238] Improve the computation of YtY for implicit ALS
Computing YtY can be implemented using BLAS's DSPR operations instead of generating y_i y_i^T and then combining them. The latter generates many k-by-k matrices. On the MovieLens data, this change improves performance by 10-20%. The algorithm remains the same, as verified by computing RMSE on the MovieLens data.
To compare the results, I also added an option to set a random seed in ALS.
JIRA:
1. https://spark-project.atlassian.net/browse/SPARK-1237
2. https://spark-project.atlassian.net/browse/SPARK-1238
Author: Xiangrui Meng <meng@databricks.com>
Closes #131 from mengxr/als and squashes the following commits:
ed00432 [Xiangrui Meng] minor changes
d984623 [Xiangrui Meng] minor changes
2fc1641 [Xiangrui Meng] remove commented code
4c7cde2 [Xiangrui Meng] allow specifying a random seed in ALS
200bef0 [Xiangrui Meng] optimize computeYtY and updateBlock
Diffstat (limited to 'mllib/src/test')
-rw-r--r-- | mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala | 15 |
1 files changed, 14 insertions, 1 deletions
```diff
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala
index 45e7d2db00..5aab9aba8f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala
@@ -23,9 +23,10 @@ import scala.util.Random
 
 import org.scalatest.FunSuite
 
-import org.jblas._
+import org.jblas.DoubleMatrix
 
 import org.apache.spark.mllib.util.LocalSparkContext
+import org.apache.spark.SparkContext._
 
 object ALSSuite {
 
@@ -115,6 +116,18 @@ class ALSSuite extends FunSuite with LocalSparkContext {
     testALS(100, 200, 2, 15, 0.7, 0.4, true, false, true)
   }
 
+  test("pseudorandomness") {
+    val ratings = sc.parallelize(ALSSuite.generateRatings(10, 20, 5, 0.5, false, false)._1, 2)
+    val model11 = ALS.train(ratings, 5, 1, 1.0, 2, 1)
+    val model12 = ALS.train(ratings, 5, 1, 1.0, 2, 1)
+    val u11 = model11.userFeatures.values.flatMap(_.toList).collect().toList
+    val u12 = model12.userFeatures.values.flatMap(_.toList).collect().toList
+    val model2 = ALS.train(ratings, 5, 1, 1.0, 2, 2)
+    val u2 = model2.userFeatures.values.flatMap(_.toList).collect().toList
+    assert(u11 == u12)
+    assert(u11 != u2)
+  }
+
   /**
    * Test if we can correctly factorize R = U * P where U and P are of known rank.
    *
```