mllib/src/test/scala/spark/mllib/recommendation/ALSSuite.scala


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package spark.mllib.recommendation

import scala.util.Random

import org.scalatest.BeforeAndAfterAll
import org.scalatest.FunSuite

import spark.SparkContext
import spark.SparkContext._

import org.jblas._


/**
 * Tests for [[ALS]]: builds a low-rank ratings matrix from random factors, trains a model on a
 * random sample of its entries and checks that the model reproduces every entry of the matrix
 * to within a tolerance.
 *
 * A single local SparkContext is shared by all tests in the suite. It is created in
 * `beforeAll()` rather than in the constructor so that merely instantiating the suite does not
 * start a context that is never stopped.
 */
class ALSSuite extends FunSuite with BeforeAndAfterAll {
  // Assigned in beforeAll() and released in afterAll(); null outside of a suite run.
  var sc: SparkContext = _

  /** Starts the local SparkContext used by every test in this suite. */
  override def beforeAll(): Unit = {
    sc = new SparkContext("local", "test")
  }

  /**
   * Stops the shared SparkContext and clears the `spark.driver.port` system property.
   *
   * The property is cleared in a `finally` block so that it is removed even when stopping the
   * context throws.
   */
  override def afterAll(): Unit = {
    try {
      if (sc != null) {
        sc.stop()
        sc = null
      }
    } finally {
      // NOTE(review): presumably cleared so that a later SparkContext in the same JVM does not
      // pick up the port of the context that was just stopped -- confirm against Spark docs.
      System.clearProperty("spark.driver.port")
    }
  }

  test("rank-1 matrices") {
    testALS(10, 20, 1, 15, 0.7, 0.3)
  }

  test("rank-2 matrices") {
    testALS(20, 30, 2, 15, 0.7, 0.3)
  }

  /**
   * Test if we can correctly factorize R = U * P where U and P are of known rank.
   *
   * U (users x features) and P (features x products) are filled with random values from a
   * generator with a fixed seed, so every run sees the same data. Each entry of R is kept as a
   * training rating with probability `samplingRate`; the trained model is then checked against
   * every entry of R, including those that were not sampled.
   *
   * @param users          number of users
   * @param products       number of products
   * @param features       number of features (rank of problem)
   * @param iterations     number of iterations to run
   * @param samplingRate   what fraction of the user-product pairs are known
   * @param matchThreshold max difference allowed to consider a predicted rating correct
   */
  def testALS(users: Int, products: Int, features: Int, iterations: Int,
    samplingRate: Double, matchThreshold: Double): Unit =
  {
    // Fixed seed keeps the generated matrices and the sampled ratings reproducible.
    val rand = new Random(42)

    // Create a random matrix with uniform values from -1 to 1
    def randomMatrix(m: Int, n: Int) =
      new DoubleMatrix(m, n, Array.fill(m * n)(rand.nextDouble() * 2 - 1): _*)

    val userMatrix = randomMatrix(users, features)
    val productMatrix = randomMatrix(features, products)
    val trueRatings = userMatrix.mmul(productMatrix)

    // Keep each (user, product, rating) triple with probability samplingRate.
    val sampledRatings = {
      for (u <- 0 until users; p <- 0 until products if rand.nextDouble() < samplingRate)
        yield (u, p, trueRatings.get(u, p))
    }

    val model = ALS.train(sc.parallelize(sampledRatings), features, iterations)

    // Copy the learned per-user feature vectors into a dense users x features matrix.
    val predictedU = new DoubleMatrix(users, features)
    for ((u, vec) <- model.userFeatures.collect(); i <- 0 until features) {
      predictedU.put(u, i, vec(i))
    }
    // Same for products; stored as products x features, hence the transpose below.
    val predictedP = new DoubleMatrix(products, features)
    for ((p, vec) <- model.productFeatures.collect(); i <- 0 until features) {
      predictedP.put(p, i, vec(i))
    }
    val predictedRatings = predictedU.mmul(predictedP.transpose)

    // Every entry of the reconstructed matrix must be within matchThreshold of the truth;
    // the first entry that is not fails the test with a dump of all matrices involved.
    for (u <- 0 until users; p <- 0 until products) {
      val prediction = predictedRatings.get(u, p)
      val correct = trueRatings.get(u, p)
      if (math.abs(prediction - correct) > matchThreshold) {
        fail("Model failed to predict (%d, %d): %f vs %f\ncorr: %s\npred: %s\nU: %s\n P: %s".format(
          u, p, correct, prediction, trueRatings, predictedRatings, predictedU, predictedP))
      }
    }
  }
}