aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
authorBryan Cutler <cutlerb@gmail.com>2016-02-22 12:48:37 +0200
committerNick Pentreath <nick.pentreath@gmail.com>2016-02-22 12:48:37 +0200
commite298ac91e3f6177c6da83e2d8ee994d9037466da (patch)
tree8494149068bd94f2e2cfa46af761f4c9dcec6a25 /mllib
parent024482bf51e8158eed08a7dc0758f585baf86e1f (diff)
downloadspark-e298ac91e3f6177c6da83e2d8ee994d9037466da.tar.gz
spark-e298ac91e3f6177c6da83e2d8ee994d9037466da.tar.bz2
spark-e298ac91e3f6177c6da83e2d8ee994d9037466da.zip
[SPARK-12632][PYSPARK][DOC] PySpark fpm and als parameter desc to consistent format
Part of task for [SPARK-11219](https://issues.apache.org/jira/browse/SPARK-11219) to make PySpark MLlib parameter description formatting consistent. This is for the fpm and recommendation modules. Closes #10602 Closes #10897 Author: Bryan Cutler <cutlerb@gmail.com> Author: somideshmukh <somilde@us.ibm.com> Closes #11186 from BryanCutler/param-desc-consistent-fpmrecc-SPARK-12632.
Diffstat (limited to 'mllib')
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala2
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala6
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala114
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala4
4 files changed, 60 insertions, 66 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
index 1250bc1a07..85d609386f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
@@ -152,7 +152,7 @@ object FPGrowthModel extends Loader[FPGrowthModel[_]] {
* [[http://dx.doi.org/10.1145/335191.335372 Han et al., Mining frequent patterns without candidate
* generation]].
*
- * @param minSupport the minimal support level of the frequent pattern, any pattern appears
+ * @param minSupport the minimal support level of the frequent pattern, any pattern that appears
* more than (minSupport * size-of-the-dataset) times will be output
* @param numPartitions number of partitions used by parallel FP-growth
*
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
index ed49c9492f..94a24b527b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
@@ -38,9 +38,9 @@ import org.apache.spark.storage.StorageLevel
* The PrefixSpan algorithm is described in J. Pei, et al., PrefixSpan: Mining Sequential Patterns
* Efficiently by Prefix-Projected Pattern Growth ([[http://doi.org/10.1109/ICDE.2001.914830]]).
*
- * @param minSupport the minimal support level of the sequential pattern, any pattern appears
- * more than (minSupport * size-of-the-dataset) times will be output
- * @param maxPatternLength the maximal length of the sequential pattern, any pattern appears
+ * @param minSupport the minimal support level of the sequential pattern, any pattern that appears
+ * more than (minSupport * size-of-the-dataset) times will be output
+ * @param maxPatternLength the maximal length of the sequential pattern, any pattern whose length
* does not exceed maxPatternLength will be output
* @param maxLocalProjDBSize The maximum number of items (including delimiters used in the internal
* storage format) allowed in a projected database before local
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
index 33aaf853e5..3e619c4264 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala
@@ -218,7 +218,7 @@ class ALS private (
}
/**
- * Run ALS with the configured parameters on an input RDD of (user, product, rating) triples.
+ * Run ALS with the configured parameters on an input RDD of [[Rating]] objects.
* Returns a MatrixFactorizationModel with feature vectors for each user and product.
*/
@Since("0.8.0")
@@ -279,18 +279,17 @@ class ALS private (
@Since("0.8.0")
object ALS {
/**
- * Train a matrix factorization model given an RDD of ratings given by users to some products,
- * in the form of (userID, productID, rating) pairs. We approximate the ratings matrix as the
- * product of two lower-rank matrices of a given rank (number of features). To solve for these
- * features, we run a given number of iterations of ALS. This is done using a level of
- * parallelism given by `blocks`.
+ * Train a matrix factorization model given an RDD of ratings by users for a subset of products.
+ * The ratings matrix is approximated as the product of two lower-rank matrices of a given rank
+ * (number of features). To solve for these features, ALS is run iteratively with a configurable
+ * level of parallelism.
*
- * @param ratings RDD of (userID, productID, rating) pairs
+ * @param ratings RDD of [[Rating]] objects with userID, productID, and rating
* @param rank number of features to use
- * @param iterations number of iterations of ALS (recommended: 10-20)
- * @param lambda regularization factor (recommended: 0.01)
+ * @param iterations number of iterations of ALS
+ * @param lambda regularization parameter
* @param blocks level of parallelism to split computation into
- * @param seed random seed
+ * @param seed random seed for initial matrix factorization model
*/
@Since("0.9.1")
def train(
@@ -305,16 +304,15 @@ object ALS {
}
/**
- * Train a matrix factorization model given an RDD of ratings given by users to some products,
- * in the form of (userID, productID, rating) pairs. We approximate the ratings matrix as the
- * product of two lower-rank matrices of a given rank (number of features). To solve for these
- * features, we run a given number of iterations of ALS. This is done using a level of
- * parallelism given by `blocks`.
+ * Train a matrix factorization model given an RDD of ratings by users for a subset of products.
+ * The ratings matrix is approximated as the product of two lower-rank matrices of a given rank
+ * (number of features). To solve for these features, ALS is run iteratively with a configurable
+ * level of parallelism.
*
- * @param ratings RDD of (userID, productID, rating) pairs
+ * @param ratings RDD of [[Rating]] objects with userID, productID, and rating
* @param rank number of features to use
- * @param iterations number of iterations of ALS (recommended: 10-20)
- * @param lambda regularization factor (recommended: 0.01)
+ * @param iterations number of iterations of ALS
+ * @param lambda regularization parameter
* @param blocks level of parallelism to split computation into
*/
@Since("0.8.0")
@@ -329,16 +327,15 @@ object ALS {
}
/**
- * Train a matrix factorization model given an RDD of ratings given by users to some products,
- * in the form of (userID, productID, rating) pairs. We approximate the ratings matrix as the
- * product of two lower-rank matrices of a given rank (number of features). To solve for these
- * features, we run a given number of iterations of ALS. The level of parallelism is determined
- * automatically based on the number of partitions in `ratings`.
+ * Train a matrix factorization model given an RDD of ratings by users for a subset of products.
+ * The ratings matrix is approximated as the product of two lower-rank matrices of a given rank
+ * (number of features). To solve for these features, ALS is run iteratively with a level of
+ * parallelism determined automatically based on the number of partitions in `ratings`.
*
- * @param ratings RDD of (userID, productID, rating) pairs
+ * @param ratings RDD of [[Rating]] objects with userID, productID, and rating
* @param rank number of features to use
- * @param iterations number of iterations of ALS (recommended: 10-20)
- * @param lambda regularization factor (recommended: 0.01)
+ * @param iterations number of iterations of ALS
+ * @param lambda regularization parameter
*/
@Since("0.8.0")
def train(ratings: RDD[Rating], rank: Int, iterations: Int, lambda: Double)
@@ -347,15 +344,14 @@ object ALS {
}
/**
- * Train a matrix factorization model given an RDD of ratings given by users to some products,
- * in the form of (userID, productID, rating) pairs. We approximate the ratings matrix as the
- * product of two lower-rank matrices of a given rank (number of features). To solve for these
- * features, we run a given number of iterations of ALS. The level of parallelism is determined
- * automatically based on the number of partitions in `ratings`.
+ * Train a matrix factorization model given an RDD of ratings by users for a subset of products.
+ * The ratings matrix is approximated as the product of two lower-rank matrices of a given rank
+ * (number of features). To solve for these features, ALS is run iteratively with a level of
+ * parallelism determined automatically based on the number of partitions in `ratings`.
*
- * @param ratings RDD of (userID, productID, rating) pairs
+ * @param ratings RDD of [[Rating]] objects with userID, productID, and rating
* @param rank number of features to use
- * @param iterations number of iterations of ALS (recommended: 10-20)
+ * @param iterations number of iterations of ALS
*/
@Since("0.8.0")
def train(ratings: RDD[Rating], rank: Int, iterations: Int)
@@ -372,11 +368,11 @@ object ALS {
*
* @param ratings RDD of (userID, productID, rating) pairs
* @param rank number of features to use
- * @param iterations number of iterations of ALS (recommended: 10-20)
- * @param lambda regularization factor (recommended: 0.01)
+ * @param iterations number of iterations of ALS
+ * @param lambda regularization parameter
* @param blocks level of parallelism to split computation into
* @param alpha confidence parameter
- * @param seed random seed
+ * @param seed random seed for initial matrix factorization model
*/
@Since("0.8.1")
def trainImplicit(
@@ -392,16 +388,15 @@ object ALS {
}
/**
- * Train a matrix factorization model given an RDD of 'implicit preferences' given by users
- * to some products, in the form of (userID, productID, preference) pairs. We approximate the
- * ratings matrix as the product of two lower-rank matrices of a given rank (number of features).
- * To solve for these features, we run a given number of iterations of ALS. This is done using
- * a level of parallelism given by `blocks`.
+ * Train a matrix factorization model given an RDD of 'implicit preferences' of users for a
+ * subset of products. The ratings matrix is approximated as the product of two lower-rank
+ * matrices of a given rank (number of features). To solve for these features, ALS is run
+ * iteratively with a configurable level of parallelism.
*
- * @param ratings RDD of (userID, productID, rating) pairs
+ * @param ratings RDD of [[Rating]] objects with userID, productID, and rating
* @param rank number of features to use
- * @param iterations number of iterations of ALS (recommended: 10-20)
- * @param lambda regularization factor (recommended: 0.01)
+ * @param iterations number of iterations of ALS
+ * @param lambda regularization parameter
* @param blocks level of parallelism to split computation into
* @param alpha confidence parameter
*/
@@ -418,16 +413,16 @@ object ALS {
}
/**
- * Train a matrix factorization model given an RDD of 'implicit preferences' given by users to
- * some products, in the form of (userID, productID, preference) pairs. We approximate the
- * ratings matrix as the product of two lower-rank matrices of a given rank (number of features).
- * To solve for these features, we run a given number of iterations of ALS. The level of
- * parallelism is determined automatically based on the number of partitions in `ratings`.
+ * Train a matrix factorization model given an RDD of 'implicit preferences' of users for a
+ * subset of products. The ratings matrix is approximated as the product of two lower-rank
+ * matrices of a given rank (number of features). To solve for these features, ALS is run
+ * iteratively with a level of parallelism determined automatically based on the number of
+ * partitions in `ratings`.
*
- * @param ratings RDD of (userID, productID, rating) pairs
+ * @param ratings RDD of [[Rating]] objects with userID, productID, and rating
* @param rank number of features to use
- * @param iterations number of iterations of ALS (recommended: 10-20)
- * @param lambda regularization factor (recommended: 0.01)
+ * @param iterations number of iterations of ALS
+ * @param lambda regularization parameter
* @param alpha confidence parameter
*/
@Since("0.8.1")
@@ -437,16 +432,15 @@ object ALS {
}
/**
- * Train a matrix factorization model given an RDD of 'implicit preferences' ratings given by
- * users to some products, in the form of (userID, productID, rating) pairs. We approximate the
- * ratings matrix as the product of two lower-rank matrices of a given rank (number of features).
- * To solve for these features, we run a given number of iterations of ALS. The level of
- * parallelism is determined automatically based on the number of partitions in `ratings`.
- * Model parameters `alpha` and `lambda` are set to reasonable default values
+ * Train a matrix factorization model given an RDD of 'implicit preferences' of users for a
+ * subset of products. The ratings matrix is approximated as the product of two lower-rank
+ * matrices of a given rank (number of features). To solve for these features, ALS is run
+ * iteratively with a level of parallelism determined automatically based on the number of
+ * partitions in `ratings`.
*
- * @param ratings RDD of (userID, productID, rating) pairs
+ * @param ratings RDD of [[Rating]] objects with userID, productID, and rating
* @param rank number of features to use
- * @param iterations number of iterations of ALS (recommended: 10-20)
+ * @param iterations number of iterations of ALS
*/
@Since("0.8.1")
def trainImplicit(ratings: RDD[Rating], rank: Int, iterations: Int)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala
index 0dc40483dd..628cf1dd57 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/MatrixFactorizationModel.scala
@@ -206,7 +206,7 @@ class MatrixFactorizationModel @Since("0.8.0") (
}
/**
- * Recommends topK products for all users.
+ * Recommends top products for all users.
*
* @param num how many products to return for every user.
* @return [(Int, Array[Rating])] objects, where every tuple contains a userID and an array of
@@ -224,7 +224,7 @@ class MatrixFactorizationModel @Since("0.8.0") (
/**
- * Recommends topK users for all products.
+ * Recommends top users for all products.
*
* @param num how many users to return for every product.
* @return [(Int, Array[Rating])] objects, where every tuple contains a productID and an array