diff options
author | Bryan Cutler <cutlerb@gmail.com> | 2016-02-22 12:48:37 +0200 |
---|---|---|
committer | Nick Pentreath <nick.pentreath@gmail.com> | 2016-02-22 12:48:37 +0200 |
commit | e298ac91e3f6177c6da83e2d8ee994d9037466da (patch) | |
tree | 8494149068bd94f2e2cfa46af761f4c9dcec6a25 /python | |
parent | 024482bf51e8158eed08a7dc0758f585baf86e1f (diff) | |
download | spark-e298ac91e3f6177c6da83e2d8ee994d9037466da.tar.gz spark-e298ac91e3f6177c6da83e2d8ee994d9037466da.tar.bz2 spark-e298ac91e3f6177c6da83e2d8ee994d9037466da.zip |
[SPARK-12632][PYSPARK][DOC] PySpark fpm and als parameter desc to consistent format
Part of the task for [SPARK-11219](https://issues.apache.org/jira/browse/SPARK-11219) to make PySpark MLlib parameter description formatting consistent. This covers the fpm and recommendation modules.
Closes #10602
Closes #10897
Author: Bryan Cutler <cutlerb@gmail.com>
Author: somideshmukh <somilde@us.ibm.com>
Closes #11186 from BryanCutler/param-desc-consistent-fpmrecc-SPARK-12632.
Diffstat (limited to 'python')
-rw-r--r-- | python/pyspark/mllib/fpm.py | 47 | ||||
-rw-r--r-- | python/pyspark/mllib/recommendation.py | 89 |
2 files changed, 102 insertions, 34 deletions
diff --git a/python/pyspark/mllib/fpm.py b/python/pyspark/mllib/fpm.py index 2039decc0c..7a2d77a4da 100644 --- a/python/pyspark/mllib/fpm.py +++ b/python/pyspark/mllib/fpm.py @@ -29,7 +29,6 @@ __all__ = ['FPGrowth', 'FPGrowthModel', 'PrefixSpan', 'PrefixSpanModel'] @inherit_doc @ignore_unicode_prefix class FPGrowthModel(JavaModelWrapper): - """ .. note:: Experimental @@ -68,11 +67,15 @@ class FPGrowth(object): """ Computes an FP-Growth model that contains frequent itemsets. - :param data: The input data set, each element contains a - transaction. - :param minSupport: The minimal support level (default: `0.3`). - :param numPartitions: The number of partitions used by - parallel FP-growth (default: same as input data). + :param data: + The input data set, each element contains a transaction. + :param minSupport: + The minimal support level. + (default: 0.3) + :param numPartitions: + The number of partitions used by parallel FP-growth. A value + of -1 will use the same number as input data. + (default: -1) """ model = callMLlibFunc("trainFPGrowthModel", data, float(minSupport), int(numPartitions)) return FPGrowthModel(model) @@ -128,17 +131,27 @@ class PrefixSpan(object): @since("1.6.0") def train(cls, data, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000): """ - Finds the complete set of frequent sequential patterns in the input sequences of itemsets. - - :param data: The input data set, each element contains a sequnce of itemsets. - :param minSupport: the minimal support level of the sequential pattern, any pattern appears - more than (minSupport * size-of-the-dataset) times will be output (default: `0.1`) - :param maxPatternLength: the maximal length of the sequential pattern, any pattern appears - less than maxPatternLength will be output. (default: `10`) - :param maxLocalProjDBSize: The maximum number of items (including delimiters used in - the internal storage format) allowed in a projected database before local - processing. 
If a projected database exceeds this size, another - iteration of distributed prefix growth is run. (default: `32000000`) + Finds the complete set of frequent sequential patterns in the + input sequences of itemsets. + + :param data: + The input data set, each element contains a sequence of + itemsets. + :param minSupport: + The minimal support level of the sequential pattern, any + pattern that appears more than (minSupport * + size-of-the-dataset) times will be output. + (default: 0.1) + :param maxPatternLength: + The maximal length of the sequential pattern, any pattern + that appears less than maxPatternLength will be output. + (default: 10) + :param maxLocalProjDBSize: + The maximum number of items (including delimiters used in the + internal storage format) allowed in a projected database before + local processing. If a projected database exceeds this size, + another iteration of distributed prefix growth is run. + (default: 32000000) """ model = callMLlibFunc("trainPrefixSpanModel", data, minSupport, maxPatternLength, maxLocalProjDBSize) diff --git a/python/pyspark/mllib/recommendation.py b/python/pyspark/mllib/recommendation.py index 93e47a797f..7e60255d43 100644 --- a/python/pyspark/mllib/recommendation.py +++ b/python/pyspark/mllib/recommendation.py @@ -138,7 +138,8 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader): @since("0.9.0") def predictAll(self, user_product): """ - Returns a list of predicted ratings for input user and product pairs. + Returns a list of predicted ratings for input user and product + pairs. 
""" assert isinstance(user_product, RDD), "user_product should be RDD of (user, product)" first = user_product.first() @@ -165,28 +166,33 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader): @since("1.4.0") def recommendUsers(self, product, num): """ - Recommends the top "num" number of users for a given product and returns a list - of Rating objects sorted by the predicted rating in descending order. + Recommends the top "num" number of users for a given product and + returns a list of Rating objects sorted by the predicted rating in + descending order. """ return list(self.call("recommendUsers", product, num)) @since("1.4.0") def recommendProducts(self, user, num): """ - Recommends the top "num" number of products for a given user and returns a list - of Rating objects sorted by the predicted rating in descending order. + Recommends the top "num" number of products for a given user and + returns a list of Rating objects sorted by the predicted rating in + descending order. """ return list(self.call("recommendProducts", user, num)) def recommendProductsForUsers(self, num): """ - Recommends top "num" products for all users. The number returned may be less than this. + Recommends the top "num" number of products for all users. The + number of recommendations returned per user may be less than "num". """ return self.call("wrappedRecommendProductsForUsers", num) def recommendUsersForProducts(self, num): """ - Recommends top "num" users for all products. The number returned may be less than this. + Recommends the top "num" number of users for all products. The + number of recommendations returned per product may be less than + "num". 
""" return self.call("wrappedRecommendUsersForProducts", num) @@ -234,11 +240,34 @@ class ALS(object): def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative=False, seed=None): """ - Train a matrix factorization model given an RDD of ratings given by users to some products, - in the form of (userID, productID, rating) pairs. We approximate the ratings matrix as the - product of two lower-rank matrices of a given rank (number of features). To solve for these - features, we run a given number of iterations of ALS. This is done using a level of - parallelism given by `blocks`. + Train a matrix factorization model given an RDD of ratings by users + for a subset of products. The ratings matrix is approximated as the + product of two lower-rank matrices of a given rank (number of + features). To solve for these features, ALS is run iteratively with + a configurable level of parallelism. + + :param ratings: + RDD of `Rating` or (userID, productID, rating) tuple. + :param rank: + Rank of the feature matrices computed (number of features). + :param iterations: + Number of iterations of ALS. + (default: 5) + :param lambda_: + Regularization parameter. + (default: 0.01) + :param blocks: + Number of blocks used to parallelize the computation. A value + of -1 will use an auto-configured number of blocks. + (default: -1) + :param nonnegative: + A value of True will solve least-squares with nonnegativity + constraints. + (default: False) + :param seed: + Random seed for initial matrix factorization model. A value + of None will use system time as the seed. 
+ (default: None) """ model = callMLlibFunc("trainALSModel", cls._prepare(ratings), rank, iterations, lambda_, blocks, nonnegative, seed) @@ -249,11 +278,37 @@ class ALS(object): def trainImplicit(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alpha=0.01, nonnegative=False, seed=None): """ - Train a matrix factorization model given an RDD of 'implicit preferences' given by users - to some products, in the form of (userID, productID, preference) pairs. We approximate the - ratings matrix as the product of two lower-rank matrices of a given rank (number of - features). To solve for these features, we run a given number of iterations of ALS. - This is done using a level of parallelism given by `blocks`. + Train a matrix factorization model given an RDD of 'implicit + preferences' of users for a subset of products. The ratings matrix + is approximated as the product of two lower-rank matrices of a + given rank (number of features). To solve for these features, ALS + is run iteratively with a configurable level of parallelism. + + :param ratings: + RDD of `Rating` or (userID, productID, rating) tuple. + :param rank: + Rank of the feature matrices computed (number of features). + :param iterations: + Number of iterations of ALS. + (default: 5) + :param lambda_: + Regularization parameter. + (default: 0.01) + :param blocks: + Number of blocks used to parallelize the computation. A value + of -1 will use an auto-configured number of blocks. + (default: -1) + :param alpha: + A constant used in computing confidence. + (default: 0.01) + :param nonnegative: + A value of True will solve least-squares with nonnegativity + constraints. + (default: False) + :param seed: + Random seed for initial matrix factorization model. A value + of None will use system time as the seed. + (default: None) """ model = callMLlibFunc("trainImplicitALSModel", cls._prepare(ratings), rank, iterations, lambda_, blocks, alpha, nonnegative, seed) |