SPARK-1668: Add implicit preference as an option to examples/MovieLensALS

Add --implicitPrefs as a command-line option to the example app MovieLensALS under examples/ Author: Sandeep <sandeep@techaddict.me> Closes #597 from techaddict/SPARK-1668 and squashes the following commits: 8b371dc [Sandeep] Second Pass on reviews by mengxr eca9d37 [Sandeep] based on mengxr's suggestions 937e54c [Sandeep] Changes 5149d40 [Sandeep] Changes based on review 1dd7657 [Sandeep] use mean() 42444d7 [Sandeep] Based on Suggestions by mengxr e3082fa [Sandeep] SPARK-1668: Add implicit preference as an option to examples/MovieLensALS Add --implicitPrefs as a command-line option to the example app MovieLensALS under examples/
author: Sandeep <sandeep@techaddict.me> 2014-05-08 00:15:05 -0400
committer: Reynold Xin <rxin@apache.org> 2014-05-08 00:15:05 -0400
commit: 108c4c16cc82af2e161d569d2c23849bdbf4aadb (patch)
tree: 5e84b04397db7ae74a982e49f62ffa8ea1f55073 /examples/src
parent: f269b016acb17b24d106dc2b32a1be389489bb01 (diff)
download: spark-108c4c16cc82af2e161d569d2c23849bdbf4aadb.tar.gz
spark-108c4c16cc82af2e161d569d2c23849bdbf4aadb.tar.bz2
spark-108c4c16cc82af2e161d569d2c23849bdbf4aadb.zip
1 files changed, 46 insertions, 9 deletions
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala
index 703f02255b..0e4447e0de 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala
@@ -43,7 +43,8 @@ object MovieLensALS {
       kryo: Boolean = false,
       numIterations: Int = 20,
       lambda: Double = 1.0,
-      rank: Int = 10)
+      rank: Int = 10,
+      implicitPrefs: Boolean = false)
 
   def main(args: Array[String]) {
     val defaultParams = Params()
@@ -62,6 +63,9 @@ object MovieLensALS {
       opt[Unit]("kryo")
         .text(s"use Kryo serialization")
         .action((_, c) => c.copy(kryo = true))
+      opt[Unit]("implicitPrefs")
+        .text("use implicit preference")
+        .action((_, c) => c.copy(implicitPrefs = true))
       arg[String]("<input>")
         .required()
         .text("input paths to a MovieLens dataset of ratings")
@@ -88,7 +92,25 @@ object MovieLensALS {
 
     val ratings = sc.textFile(params.input).map { line =>
       val fields = line.split("::")
-      Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble)
+      if (params.implicitPrefs) {
+        /*
+         * MovieLens ratings are on a scale of 1-5:
+         * 5: Must see
+         * 4: Will enjoy
+         * 3: It's okay
+         * 2: Fairly bad
+         * 1: Awful
+         * So we should not recommend a movie if the predicted rating is less than 3.
+         * To map ratings to confidence scores, we use
+         * 5 -> 2.5, 4 -> 1.5, 3 -> 0.5, 2 -> -0.5, 1 -> -1.5. This mapping means unobserved
+         * entries are generally between It's okay and Fairly bad.
+         * The semantics of 0 in this expanded world of non-positive weights
+         * are "the same as never having interacted at all".
+         */
+        Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble - 2.5)
+      } else {
+        Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble)
+      }
     }.cache()
 
     val numRatings = ratings.count()
@@ -99,7 +121,18 @@ object MovieLensALS {
 
     val splits = ratings.randomSplit(Array(0.8, 0.2))
     val training = splits(0).cache()
-    val test = splits(1).cache()
+    val test = if (params.implicitPrefs) {
+      /*
+       * 0 means "don't know" and positive values mean "confident that the prediction should be 1".
+       * Negative values mean "confident that the prediction should be 0".
+       * We have in this case used some kind of weighted RMSE. The weight is the absolute value of
+       * the confidence. The error is the difference between prediction and either 1 or 0,
+       * depending on whether r is positive or negative.
+       */
+      splits(1).map(x => Rating(x.user, x.product, if (x.rating > 0) 1.0 else 0.0))
+    } else {
+      splits(1)
+    }.cache()
 
     val numTraining = training.count()
     val numTest = test.count()
@@ -111,9 +144,10 @@ object MovieLensALS {
       .setRank(params.rank)
       .setIterations(params.numIterations)
       .setLambda(params.lambda)
+      .setImplicitPrefs(params.implicitPrefs)
       .run(training)
 
-    val rmse = computeRmse(model, test, numTest)
+    val rmse = computeRmse(model, test, params.implicitPrefs)
 
     println(s"Test RMSE = $rmse.")
 
@@ -121,11 +155,14 @@ object MovieLensALS {
   }
 
   /** Compute RMSE (Root Mean Squared Error). */
-  def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating], n: Long) = {
+  def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating], implicitPrefs: Boolean) = {
+
+    def mapPredictedRating(r: Double) = if (implicitPrefs) math.max(math.min(r, 1.0), 0.0) else r
+
     val predictions: RDD[Rating] = model.predict(data.map(x => (x.user, x.product)))
-    val predictionsAndRatings = predictions.map(x => ((x.user, x.product), x.rating))
-      .join(data.map(x => ((x.user, x.product), x.rating)))
-      .values
-    math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / n)
+    val predictionsAndRatings = predictions.map{ x =>
+      ((x.user, x.product), mapPredictedRating(x.rating))
+    }.join(data.map(x => ((x.user, x.product), x.rating))).values
+    math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).mean())
   }
 }
author	Sandeep <sandeep@techaddict.me>	2014-05-08 00:15:05 -0400
committer	Reynold Xin <rxin@apache.org>	2014-05-08 00:15:05 -0400
commit	108c4c16cc82af2e161d569d2c23849bdbf4aadb (patch)
tree	5e84b04397db7ae74a982e49f62ffa8ea1f55073 /examples/src
parent	f269b016acb17b24d106dc2b32a1be389489bb01 (diff)
download	spark-108c4c16cc82af2e161d569d2c23849bdbf4aadb.tar.gz spark-108c4c16cc82af2e161d569d2c23849bdbf4aadb.tar.bz2 spark-108c4c16cc82af2e161d569d2c23849bdbf4aadb.zip