From 427d7911f527e00e75dec0498b4bbdbe164db7ca Mon Sep 17 00:00:00 2001
From: Michelangelo D'Agostino
Date: Fri, 7 Nov 2014 22:53:01 -0800
Subject: [MLLIB] [PYTHON] SPARK-4221: Expose nonnegative ALS in the python API

SPARK-1553 added alternating nonnegative least squares to MLlib; however,
it is not possible to access it via the Python API. This pull request
resolves that.

Author: Michelangelo D'Agostino

Closes #3095 from mdagost/python_nmf and squashes the following commits:

a6743ad [Michelangelo D'Agostino] Use setters instead of static methods in PythonMLLibAPI. Remove the new static methods I added. Set seed in tests. Change ratings to ratingsRDD in both train and trainImplicit for consistency.
7cffd39 [Michelangelo D'Agostino] Swapped nonnegative and seed in a few more places.
3fdc851 [Michelangelo D'Agostino] Moved seed to the end of the python parameter list.
bdcc154 [Michelangelo D'Agostino] Change seed type to java.lang.Long so that it can handle null.
cedf043 [Michelangelo D'Agostino] Added in ability to set the seed from python and made that play nice with the nonnegative changes. Also made the python ALS tests more exact.
a72fdc9 [Michelangelo D'Agostino] Expose nonnegative ALS in the python API.

(cherry picked from commit 7e9d975676d56ace0e84c2200137e4cd4eba074a)
Signed-off-by: Xiangrui Meng
---
 python/pyspark/mllib/recommendation.py | 40 +++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 15 deletions(-)

(limited to 'python')

diff --git a/python/pyspark/mllib/recommendation.py b/python/pyspark/mllib/recommendation.py
index e8b998414d..e26b152e0c 100644
--- a/python/pyspark/mllib/recommendation.py
+++ b/python/pyspark/mllib/recommendation.py
@@ -44,31 +44,39 @@ class MatrixFactorizationModel(JavaModelWrapper):
     >>> r2 = (1, 2, 2.0)
     >>> r3 = (2, 1, 2.0)
     >>> ratings = sc.parallelize([r1, r2, r3])
-    >>> model = ALS.trainImplicit(ratings, 1)
-    >>> model.predict(2,2) is not None
-    True
+    >>> model = ALS.trainImplicit(ratings, 1, seed=10)
+    >>> model.predict(2,2)
+    0.4473...
 
     >>> testset = sc.parallelize([(1, 2), (1, 1)])
-    >>> model = ALS.train(ratings, 1)
-    >>> model.predictAll(testset).count() == 2
-    True
+    >>> model = ALS.train(ratings, 1, seed=10)
+    >>> model.predictAll(testset).collect()
+    [Rating(1, 1, 1), Rating(1, 2, 1)]
 
-    >>> model = ALS.train(ratings, 4)
-    >>> model.userFeatures().count() == 2
-    True
+    >>> model = ALS.train(ratings, 4, seed=10)
+    >>> model.userFeatures().collect()
+    [(2, array('d', [...])), (1, array('d', [...]))]
 
     >>> first_user = model.userFeatures().take(1)[0]
     >>> latents = first_user[1]
     >>> len(latents) == 4
     True
 
-    >>> model.productFeatures().count() == 2
-    True
+    >>> model.productFeatures().collect()
+    [(2, array('d', [...])), (1, array('d', [...]))]
 
     >>> first_product = model.productFeatures().take(1)[0]
     >>> latents = first_product[1]
     >>> len(latents) == 4
     True
+
+    >>> model = ALS.train(ratings, 1, nonnegative=True, seed=10)
+    >>> model.predict(2,2)
+    3.735...
+
+    >>> model = ALS.trainImplicit(ratings, 1, nonnegative=True, seed=10)
+    >>> model.predict(2,2)
+    0.4473...
     """
     def predict(self, user, product):
         return self._java_model.predict(user, product)
@@ -101,15 +109,17 @@ class ALS(object):
         return _to_java_object_rdd(ratings, True)
 
     @classmethod
-    def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1):
+    def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative=False,
+              seed=None):
         model = callMLlibFunc("trainALSModel", cls._prepare(ratings), rank, iterations,
-                              lambda_, blocks)
+                              lambda_, blocks, nonnegative, seed)
         return MatrixFactorizationModel(model)
 
     @classmethod
-    def trainImplicit(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alpha=0.01):
+    def trainImplicit(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alpha=0.01,
+                      nonnegative=False, seed=None):
         model = callMLlibFunc("trainImplicitALSModel", cls._prepare(ratings), rank,
-                              iterations, lambda_, blocks, alpha)
+                              iterations, lambda_, blocks, alpha, nonnegative, seed)
         return MatrixFactorizationModel(model)
 
 
-- 
cgit v1.2.3