aboutsummaryrefslogtreecommitdiff
path: root/mllib/src/test
diff options
context:
space:
mode:
authorXiangrui Meng <meng@databricks.com>2014-03-13 00:43:19 -0700
committerReynold Xin <rxin@apache.org>2014-03-13 00:43:19 -0700
commite4e8d8f395aea48f0cae00d7c381a863c48a2837 (patch)
tree283f03c2f4a7be86eb7d950dd0ee51c375b0f737 /mllib/src/test
parent4ea23db0efff2f39ac5b8f0bd1d9a6ffa3eceb0d (diff)
downloadspark-e4e8d8f395aea48f0cae00d7c381a863c48a2837.tar.gz
spark-e4e8d8f395aea48f0cae00d7c381a863c48a2837.tar.bz2
spark-e4e8d8f395aea48f0cae00d7c381a863c48a2837.zip
[SPARK-1237, 1238] Improve the computation of YtY for implicit ALS
Computing YtY can be implemented using BLAS's DSPR operations instead of generating y_i y_i^T and then combining them. The latter generates many k-by-k matrices. On the movielens data, this change improves the performance by 10-20%. The algorithm remains the same, verified by computing RMSE on the movielens data. To compare the results, I also added an option to set a random seed in ALS. JIRA: 1. https://spark-project.atlassian.net/browse/SPARK-1237 2. https://spark-project.atlassian.net/browse/SPARK-1238 Author: Xiangrui Meng <meng@databricks.com> Closes #131 from mengxr/als and squashes the following commits: ed00432 [Xiangrui Meng] minor changes d984623 [Xiangrui Meng] minor changes 2fc1641 [Xiangrui Meng] remove commented code 4c7cde2 [Xiangrui Meng] allow specifying a random seed in ALS 200bef0 [Xiangrui Meng] optimize computeYtY and updateBlock
Diffstat (limited to 'mllib/src/test')
-rw-r--r--mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala15
1 file changed, 14 insertions, 1 deletion
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala
index 45e7d2db00..5aab9aba8f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala
@@ -23,9 +23,10 @@ import scala.util.Random
import org.scalatest.FunSuite
-import org.jblas._
+import org.jblas.DoubleMatrix
import org.apache.spark.mllib.util.LocalSparkContext
+import org.apache.spark.SparkContext._
object ALSSuite {
@@ -115,6 +116,18 @@ class ALSSuite extends FunSuite with LocalSparkContext {
testALS(100, 200, 2, 15, 0.7, 0.4, true, false, true)
}
+ test("pseudorandomness") {
+ val ratings = sc.parallelize(ALSSuite.generateRatings(10, 20, 5, 0.5, false, false)._1, 2)
+ val model11 = ALS.train(ratings, 5, 1, 1.0, 2, 1)
+ val model12 = ALS.train(ratings, 5, 1, 1.0, 2, 1)
+ val u11 = model11.userFeatures.values.flatMap(_.toList).collect().toList
+ val u12 = model12.userFeatures.values.flatMap(_.toList).collect().toList
+ val model2 = ALS.train(ratings, 5, 1, 1.0, 2, 2)
+ val u2 = model2.userFeatures.values.flatMap(_.toList).collect().toList
+ assert(u11 == u12)
+ assert(u11 != u2)
+ }
+
/**
* Test if we can correctly factorize R = U * P where U and P are of known rank.
*