about summary refs log tree commit diff
path: root/mllib
diff options
context:
space:
mode:
author: Bryan Cutler <cutlerb@gmail.com> 2016-02-02 10:50:22 -0800
committer: Xiangrui Meng <meng@databricks.com> 2016-02-02 10:50:22 -0800
commit: cba1d6b659288bfcd8db83a6d778155bab2bbecf (patch)
tree: 7d0b90cca15aff9ae77f0f0cd858b5909d7d948c /mllib
parent: b93830126cc59a26e2cfb5d7b3c17f9cfbf85988 (diff)
download: spark-cba1d6b659288bfcd8db83a6d778155bab2bbecf.tar.gz
spark-cba1d6b659288bfcd8db83a6d778155bab2bbecf.tar.bz2
spark-cba1d6b659288bfcd8db83a6d778155bab2bbecf.zip
[SPARK-12631][PYSPARK][DOC] PySpark clustering parameter desc to consistent format
Part of task for [SPARK-11219](https://issues.apache.org/jira/browse/SPARK-11219) to make PySpark MLlib parameter description formatting consistent. This is for the clustering module. Author: Bryan Cutler <cutlerb@gmail.com> Closes #10610 from BryanCutler/param-desc-consistent-cluster-SPARK-12631.
Diffstat (limited to 'mllib')
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala | 12
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala | 31
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala | 13
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala | 4
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala | 6
5 files changed, 37 insertions, 29 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index 7b203e2f40..88dbfe3fcc 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -45,10 +45,10 @@ import org.apache.spark.util.Utils
* This is due to high-dimensional data (a) making it difficult to cluster at all (based
* on statistical/theoretical arguments) and (b) numerical issues with Gaussian distributions.
*
- * @param k The number of independent Gaussians in the mixture model
- * @param convergenceTol The maximum change in log-likelihood at which convergence
- * is considered to have occurred.
- * @param maxIterations The maximum number of iterations to perform
+ * @param k Number of independent Gaussians in the mixture model.
+ * @param convergenceTol Maximum change in log-likelihood at which convergence
+ * is considered to have occurred.
+ * @param maxIterations Maximum number of iterations allowed.
*/
@Since("1.3.0")
class GaussianMixture private (
@@ -108,7 +108,7 @@ class GaussianMixture private (
def getK: Int = k
/**
- * Set the maximum number of iterations to run. Default: 100
+ * Set the maximum number of iterations allowed. Default: 100
*/
@Since("1.3.0")
def setMaxIterations(maxIterations: Int): this.type = {
@@ -117,7 +117,7 @@ class GaussianMixture private (
}
/**
- * Return the maximum number of iterations to run
+ * Return the maximum number of iterations allowed
*/
@Since("1.3.0")
def getMaxIterations: Int = maxIterations
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index ca11ede4cc..901164a391 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -70,13 +70,13 @@ class KMeans private (
}
/**
- * Maximum number of iterations to run.
+ * Maximum number of iterations allowed.
*/
@Since("1.4.0")
def getMaxIterations: Int = maxIterations
/**
- * Set maximum number of iterations to run. Default: 20.
+ * Set maximum number of iterations allowed. Default: 20.
*/
@Since("0.8.0")
def setMaxIterations(maxIterations: Int): this.type = {
@@ -482,12 +482,15 @@ object KMeans {
/**
* Trains a k-means model using the given set of parameters.
*
- * @param data training points stored as `RDD[Vector]`
- * @param k number of clusters
- * @param maxIterations max number of iterations
- * @param runs number of parallel runs, defaults to 1. The best model is returned.
- * @param initializationMode initialization model, either "random" or "k-means||" (default).
- * @param seed random seed value for cluster initialization
+ * @param data Training points as an `RDD` of `Vector` types.
+ * @param k Number of clusters to create.
+ * @param maxIterations Maximum number of iterations allowed.
+ * @param runs Number of runs to execute in parallel. The best model according to the cost
+ * function will be returned. (default: 1)
+ * @param initializationMode The initialization algorithm. This can either be "random" or
+ * "k-means||". (default: "k-means||")
+ * @param seed Random seed for cluster initialization. Default is to generate seed based
+ * on system time.
*/
@Since("1.3.0")
def train(
@@ -508,11 +511,13 @@ object KMeans {
/**
* Trains a k-means model using the given set of parameters.
*
- * @param data training points stored as `RDD[Vector]`
- * @param k number of clusters
- * @param maxIterations max number of iterations
- * @param runs number of parallel runs, defaults to 1. The best model is returned.
- * @param initializationMode initialization model, either "random" or "k-means||" (default).
+ * @param data Training points as an `RDD` of `Vector` types.
+ * @param k Number of clusters to create.
+ * @param maxIterations Maximum number of iterations allowed.
+ * @param runs Number of runs to execute in parallel. The best model according to the cost
+ * function will be returned. (default: 1)
+ * @param initializationMode The initialization algorithm. This can either be "random" or
+ * "k-means||". (default: "k-means||")
*/
@Since("0.8.0")
def train(
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
index eb802a365e..81566b4779 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
@@ -61,14 +61,13 @@ class LDA private (
ldaOptimizer = new EMLDAOptimizer)
/**
- * Number of topics to infer. I.e., the number of soft cluster centers.
- *
+ * Number of topics to infer, i.e., the number of soft cluster centers.
*/
@Since("1.3.0")
def getK: Int = k
/**
- * Number of topics to infer. I.e., the number of soft cluster centers.
+ * Set the number of topics to infer, i.e., the number of soft cluster centers.
* (default = 10)
*/
@Since("1.3.0")
@@ -222,13 +221,13 @@ class LDA private (
def setBeta(beta: Double): this.type = setTopicConcentration(beta)
/**
- * Maximum number of iterations for learning.
+ * Maximum number of iterations allowed.
*/
@Since("1.3.0")
def getMaxIterations: Int = maxIterations
/**
- * Maximum number of iterations for learning.
+ * Set the maximum number of iterations allowed.
* (default = 20)
*/
@Since("1.3.0")
@@ -238,13 +237,13 @@ class LDA private (
}
/**
- * Random seed
+ * Random seed for cluster initialization.
*/
@Since("1.3.0")
def getSeed: Long = seed
/**
- * Random seed
+ * Set the random seed for cluster initialization.
*/
@Since("1.3.0")
def setSeed(seed: Long): this.type = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
index 2ab0920b06..1ab7cb393b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
@@ -111,7 +111,9 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode
*
* @param k Number of clusters.
* @param maxIterations Maximum number of iterations of the PIC algorithm.
- * @param initMode Initialization mode.
+ * @param initMode Set the initialization mode. This can be either "random" to use a random vector
+ * as vertex properties, or "degree" to use normalized sum similarities.
+ * Default: random.
*
* @see [[http://en.wikipedia.org/wiki/Spectral_clustering Spectral clustering (Wikipedia)]]
*/
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
index 79d217e183..d99b89dc49 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
@@ -183,7 +183,7 @@ class StreamingKMeans @Since("1.2.0") (
}
/**
- * Set the decay factor directly (for forgetful algorithms).
+ * Set the forgetfulness of the previous centroids.
*/
@Since("1.2.0")
def setDecayFactor(a: Double): this.type = {
@@ -192,7 +192,9 @@ class StreamingKMeans @Since("1.2.0") (
}
/**
- * Set the half life and time unit ("batches" or "points") for forgetful algorithms.
+ * Set the half life and time unit ("batches" or "points"). If points, then the decay factor
+ * is raised to the power of number of new points and if batches, then decay factor will be
+ * used as is.
*/
@Since("1.2.0")
def setHalfLife(halfLife: Double, timeUnit: String): this.type = {