From f67ce3e2297352678371865b01acf3595443b2e1 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Mon, 27 Jan 2014 11:15:51 -0800 Subject: Merge pull request #460 from srowen/RandomInitialALSVectors Choose initial user/item vectors uniformly on the unit sphere ...rather than within the unit square to possibly avoid bias in the initial state and improve convergence. The current implementation picks the N vector elements uniformly at random from [0,1). This means they all point into one quadrant of the vector space. As N gets just a little large, the vectors tend strongly to point into the "corner", towards (1,1,1...,1). The vectors are not unit vectors either. I suggest choosing the elements as Gaussian ~ N(0,1) and normalizing. This gets you uniform random choices on the unit sphere which is more what's of interest here. It has worked a little better for me in the past. This is pretty minor but wanted to warm up suggesting a few tweaks to ALS. Please excuse my Scala, pretty new to it. Author: Sean Owen == Merge branch commits == commit 492b13a7469e5a4ed7591ee8e56d8bd7570dfab6 Author: Sean Owen Date: Mon Jan 27 08:05:25 2014 +0000 Style: spaces around binary operators commit ce2b5b5a4fefa0356875701f668f01f02ba4d87e Author: Sean Owen Date: Sun Jan 19 22:50:03 2014 +0000 Generate factors with all positive components, per discussion in https://github.com/apache/incubator-spark/pull/460 commit b6f7a8a61643a8209e8bc662e8e81f2d15c710c7 Author: Sean Owen Date: Sat Jan 18 15:54:42 2014 +0000 Choose initial user/item vectors uniformly on the unit sphere rather than within the unit square to possibly avoid bias in the initial state and improve convergence --- .../main/scala/org/apache/spark/mllib/recommendation/ALS.scala | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'mllib/src/main/scala') diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala index 
89ee07063d..c5f64b1350 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala @@ -18,6 +18,7 @@ package org.apache.spark.mllib.recommendation import scala.collection.mutable.{ArrayBuffer, BitSet} +import scala.math.{abs, sqrt} import scala.util.Random import scala.util.Sorting @@ -301,7 +302,14 @@ class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var l * Make a random factor vector with the given random. */ private def randomFactor(rank: Int, rand: Random): Array[Double] = { - Array.fill(rank)(rand.nextDouble) + // Choose a unit vector uniformly at random from the unit sphere, but from the + // "first quadrant" where all elements are nonnegative. This can be done by choosing + // elements distributed as Normal(0,1) and taking the absolute value, and then normalizing. + // This appears to create factorizations that have a slightly better reconstruction + // (<1%) compared to picking elements uniformly at random in [0,1]. + val factor = Array.fill(rank)(abs(rand.nextGaussian())) + val norm = sqrt(factor.map(x => x * x).sum) + factor.map(x => x / norm) } /** -- cgit v1.2.3