aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichelangelo D'Agostino <mdagostino@civisanalytics.com>2014-11-07 22:53:01 -0800
committerXiangrui Meng <meng@databricks.com>2014-11-07 22:53:01 -0800
commit7e9d975676d56ace0e84c2200137e4cd4eba074a (patch)
tree59f03936200f7a6a5502a5bf70e94631c03c63c7
parent7779109796c90d789464ab0be35917f963bbe867 (diff)
downloadspark-7e9d975676d56ace0e84c2200137e4cd4eba074a.tar.gz
spark-7e9d975676d56ace0e84c2200137e4cd4eba074a.tar.bz2
spark-7e9d975676d56ace0e84c2200137e4cd4eba074a.zip
[MLLIB] [PYTHON] SPARK-4221: Expose nonnegative ALS in the python API
SPARK-1553 added alternating nonnegative least squares to MLLib, however it's not possible to access it via the python API. This pull request resolves that. Author: Michelangelo D'Agostino <mdagostino@civisanalytics.com> Closes #3095 from mdagost/python_nmf and squashes the following commits: a6743ad [Michelangelo D'Agostino] Use setters instead of static methods in PythonMLLibAPI. Remove the new static methods I added. Set seed in tests. Change ratings to ratingsRDD in both train and trainImplicit for consistency. 7cffd39 [Michelangelo D'Agostino] Swapped nonnegative and seed in a few more places. 3fdc851 [Michelangelo D'Agostino] Moved seed to the end of the python parameter list. bdcc154 [Michelangelo D'Agostino] Change seed type to java.lang.Long so that it can handle null. cedf043 [Michelangelo D'Agostino] Added in ability to set the seed from python and made that play nice with the nonnegative changes. Also made the python ALS tests more exact. a72fdc9 [Michelangelo D'Agostino] Expose nonnegative ALS in the python API.
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala39
-rw-r--r--python/pyspark/mllib/recommendation.py40
2 files changed, 58 insertions, 21 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index d832ae34b5..70d7138e30 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -275,12 +275,25 @@ class PythonMLLibAPI extends Serializable {
* the Py4J documentation.
*/
def trainALSModel(
- ratings: JavaRDD[Rating],
+ ratingsJRDD: JavaRDD[Rating],
rank: Int,
iterations: Int,
lambda: Double,
- blocks: Int): MatrixFactorizationModel = {
- new MatrixFactorizationModelWrapper(ALS.train(ratings.rdd, rank, iterations, lambda, blocks))
+ blocks: Int,
+ nonnegative: Boolean,
+ seed: java.lang.Long): MatrixFactorizationModel = {
+
+ val als = new ALS()
+ .setRank(rank)
+ .setIterations(iterations)
+ .setLambda(lambda)
+ .setBlocks(blocks)
+ .setNonnegative(nonnegative)
+
+ if (seed != null) als.setSeed(seed)
+
+ val model = als.run(ratingsJRDD.rdd)
+ new MatrixFactorizationModelWrapper(model)
}
/**
@@ -295,9 +308,23 @@ class PythonMLLibAPI extends Serializable {
iterations: Int,
lambda: Double,
blocks: Int,
- alpha: Double): MatrixFactorizationModel = {
- new MatrixFactorizationModelWrapper(
- ALS.trainImplicit(ratingsJRDD.rdd, rank, iterations, lambda, blocks, alpha))
+ alpha: Double,
+ nonnegative: Boolean,
+ seed: java.lang.Long): MatrixFactorizationModel = {
+
+ val als = new ALS()
+ .setImplicitPrefs(true)
+ .setRank(rank)
+ .setIterations(iterations)
+ .setLambda(lambda)
+ .setBlocks(blocks)
+ .setAlpha(alpha)
+ .setNonnegative(nonnegative)
+
+ if (seed != null) als.setSeed(seed)
+
+ val model = als.run(ratingsJRDD.rdd)
+ new MatrixFactorizationModelWrapper(model)
}
/**
diff --git a/python/pyspark/mllib/recommendation.py b/python/pyspark/mllib/recommendation.py
index e8b998414d..e26b152e0c 100644
--- a/python/pyspark/mllib/recommendation.py
+++ b/python/pyspark/mllib/recommendation.py
@@ -44,31 +44,39 @@ class MatrixFactorizationModel(JavaModelWrapper):
>>> r2 = (1, 2, 2.0)
>>> r3 = (2, 1, 2.0)
>>> ratings = sc.parallelize([r1, r2, r3])
- >>> model = ALS.trainImplicit(ratings, 1)
- >>> model.predict(2,2) is not None
- True
+ >>> model = ALS.trainImplicit(ratings, 1, seed=10)
+ >>> model.predict(2,2)
+ 0.4473...
>>> testset = sc.parallelize([(1, 2), (1, 1)])
- >>> model = ALS.train(ratings, 1)
- >>> model.predictAll(testset).count() == 2
- True
+ >>> model = ALS.train(ratings, 1, seed=10)
+ >>> model.predictAll(testset).collect()
+ [Rating(1, 1, 1), Rating(1, 2, 1)]
- >>> model = ALS.train(ratings, 4)
- >>> model.userFeatures().count() == 2
- True
+ >>> model = ALS.train(ratings, 4, seed=10)
+ >>> model.userFeatures().collect()
+ [(2, array('d', [...])), (1, array('d', [...]))]
>>> first_user = model.userFeatures().take(1)[0]
>>> latents = first_user[1]
>>> len(latents) == 4
True
- >>> model.productFeatures().count() == 2
- True
+ >>> model.productFeatures().collect()
+ [(2, array('d', [...])), (1, array('d', [...]))]
>>> first_product = model.productFeatures().take(1)[0]
>>> latents = first_product[1]
>>> len(latents) == 4
True
+
+ >>> model = ALS.train(ratings, 1, nonnegative=True, seed=10)
+ >>> model.predict(2,2)
+ 3.735...
+
+ >>> model = ALS.trainImplicit(ratings, 1, nonnegative=True, seed=10)
+ >>> model.predict(2,2)
+ 0.4473...
"""
def predict(self, user, product):
return self._java_model.predict(user, product)
@@ -101,15 +109,17 @@ class ALS(object):
return _to_java_object_rdd(ratings, True)
@classmethod
- def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1):
+ def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative=False,
+ seed=None):
model = callMLlibFunc("trainALSModel", cls._prepare(ratings), rank, iterations,
- lambda_, blocks)
+ lambda_, blocks, nonnegative, seed)
return MatrixFactorizationModel(model)
@classmethod
- def trainImplicit(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alpha=0.01):
+ def trainImplicit(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alpha=0.01,
+ nonnegative=False, seed=None):
model = callMLlibFunc("trainImplicitALSModel", cls._prepare(ratings), rank,
- iterations, lambda_, blocks, alpha)
+ iterations, lambda_, blocks, alpha, nonnegative, seed)
return MatrixFactorizationModel(model)