about summary refs log tree commit diff
path: root/mllib
diff options
context:
space:
mode:
author: Bryan Cutler <cutlerb@gmail.com> 2016-02-02 10:50:22 -0800
committer: Xiangrui Meng <meng@databricks.com> 2016-02-02 10:50:22 -0800
commit: cba1d6b659288bfcd8db83a6d778155bab2bbecf (patch)
tree: 7d0b90cca15aff9ae77f0f0cd858b5909d7d948c /mllib
parent: b93830126cc59a26e2cfb5d7b3c17f9cfbf85988 (diff)
download: spark-cba1d6b659288bfcd8db83a6d778155bab2bbecf.tar.gz
spark-cba1d6b659288bfcd8db83a6d778155bab2bbecf.tar.bz2
spark-cba1d6b659288bfcd8db83a6d778155bab2bbecf.zip
[SPARK-12631][PYSPARK][DOC] PySpark clustering parameter desc to consistent format
Part of task for [SPARK-11219](https://issues.apache.org/jira/browse/SPARK-11219) to make PySpark MLlib parameter description formatting consistent. This is for the clustering module. Author: Bryan Cutler <cutlerb@gmail.com> Closes #10610 from BryanCutler/param-desc-consistent-cluster-SPARK-12631.
Diffstat (limited to 'mllib')
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala | 12
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala | 31
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala | 13
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala | 4
-rw-r--r-- mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala | 6
5 files changed, 37 insertions, 29 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index 7b203e2f40..88dbfe3fcc 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -45,10 +45,10 @@ import org.apache.spark.util.Utils
* This is due to high-dimensional data (a) making it difficult to cluster at all (based
* on statistical/theoretical arguments) and (b) numerical issues with Gaussian distributions.
*
- * @param k The number of independent Gaussians in the mixture model
- * @param convergenceTol The maximum change in log-likelihood at which convergence
- * is considered to have occurred.
- * @param maxIterations The maximum number of iterations to perform
+ * @param k Number of independent Gaussians in the mixture model.
+ * @param convergenceTol Maximum change in log-likelihood at which convergence
+ * is considered to have occurred.
+ * @param maxIterations Maximum number of iterations allowed.
*/
@Since("1.3.0")
class GaussianMixture private (
@@ -108,7 +108,7 @@ class GaussianMixture private (
def getK: Int = k
/**
- * Set the maximum number of iterations to run. Default: 100
+ * Set the maximum number of iterations allowed. Default: 100
*/
@Since("1.3.0")
def setMaxIterations(maxIterations: Int): this.type = {
@@ -117,7 +117,7 @@ class GaussianMixture private (
}
/**
- * Return the maximum number of iterations to run
+ * Return the maximum number of iterations allowed
*/
@Since("1.3.0")
def getMaxIterations: Int = maxIterations
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index ca11ede4cc..901164a391 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -70,13 +70,13 @@ class KMeans private (
}
/**
- * Maximum number of iterations to run.
+ * Maximum number of iterations allowed.
*/
@Since("1.4.0")
def getMaxIterations: Int = maxIterations
/**
- * Set maximum number of iterations to run. Default: 20.
+ * Set maximum number of iterations allowed. Default: 20.
*/
@Since("0.8.0")
def setMaxIterations(maxIterations: Int): this.type = {
@@ -482,12 +482,15 @@ object KMeans {
/**
* Trains a k-means model using the given set of parameters.
*
- * @param data training points stored as `RDD[Vector]`
- * @param k number of clusters
- * @param maxIterations max number of iterations
- * @param runs number of parallel runs, defaults to 1. The best model is returned.
- * @param initializationMode initialization model, either "random" or "k-means||" (default).
- * @param seed random seed value for cluster initialization
+ * @param data Training points as an `RDD` of `Vector` types.
+ * @param k Number of clusters to create.
+ * @param maxIterations Maximum number of iterations allowed.
+ * @param runs Number of runs to execute in parallel. The best model according to the cost
+ * function will be returned. (default: 1)
+ * @param initializationMode The initialization algorithm. This can either be "random" or
+ * "k-means||". (default: "k-means||")
+ * @param seed Random seed for cluster initialization. Default is to generate seed based
+ * on system time.
*/
@Since("1.3.0")
def train(
@@ -508,11 +511,13 @@ object KMeans {
/**
* Trains a k-means model using the given set of parameters.
*
- * @param data training points stored as `RDD[Vector]`
- * @param k number of clusters
- * @param maxIterations max number of iterations
- * @param runs number of parallel runs, defaults to 1. The best model is returned.
- * @param initializationMode initialization model, either "random" or "k-means||" (default).
+ * @param data Training points as an `RDD` of `Vector` types.
+ * @param k Number of clusters to create.
+ * @param maxIterations Maximum number of iterations allowed.
+ * @param runs Number of runs to execute in parallel. The best model according to the cost
+ * function will be returned. (default: 1)
+ * @param initializationMode The initialization algorithm. This can either be "random" or
+ * "k-means||". (default: "k-means||")
*/
@Since("0.8.0")
def train(
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
index eb802a365e..81566b4779 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
@@ -61,14 +61,13 @@ class LDA private (
ldaOptimizer = new EMLDAOptimizer)
/**
- * Number of topics to infer. I.e., the number of soft cluster centers.
- *
+ * Number of topics to infer, i.e., the number of soft cluster centers.
*/
@Since("1.3.0")
def getK: Int = k
/**
- * Number of topics to infer. I.e., the number of soft cluster centers.
+ * Set the number of topics to infer, i.e., the number of soft cluster centers.
* (default = 10)
*/
@Since("1.3.0")
@@ -222,13 +221,13 @@ class LDA private (
def setBeta(beta: Double): this.type = setTopicConcentration(beta)
/**
- * Maximum number of iterations for learning.
+ * Maximum number of iterations allowed.
*/
@Since("1.3.0")
def getMaxIterations: Int = maxIterations
/**
- * Maximum number of iterations for learning.
+ * Set the maximum number of iterations allowed.
* (default = 20)
*/
@Since("1.3.0")
@@ -238,13 +237,13 @@ class LDA private (
}
/**
- * Random seed
+ * Random seed for cluster initialization.
*/
@Since("1.3.0")
def getSeed: Long = seed
/**
- * Random seed
+ * Set the random seed for cluster initialization.
*/
@Since("1.3.0")
def setSeed(seed: Long): this.type = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
index 2ab0920b06..1ab7cb393b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
@@ -111,7 +111,9 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode
*
* @param k Number of clusters.
* @param maxIterations Maximum number of iterations of the PIC algorithm.
- * @param initMode Initialization mode.
+ * @param initMode Set the initialization mode. This can be either "random" to use a random vector
+ * as vertex properties, or "degree" to use normalized sum similarities.
+ * Default: random.
*
* @see [[http://en.wikipedia.org/wiki/Spectral_clustering Spectral clustering (Wikipedia)]]
*/
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
index 79d217e183..d99b89dc49 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala
@@ -183,7 +183,7 @@ class StreamingKMeans @Since("1.2.0") (
}
/**
- * Set the decay factor directly (for forgetful algorithms).
+ * Set the forgetfulness of the previous centroids.
*/
@Since("1.2.0")
def setDecayFactor(a: Double): this.type = {
@@ -192,7 +192,9 @@ class StreamingKMeans @Since("1.2.0") (
}
/**
- * Set the half life and time unit ("batches" or "points") for forgetful algorithms.
+ * Set the half life and time unit ("batches" or "points"). If points, then the decay factor
+ * is raised to the power of number of new points and if batches, then decay factor will be
+ * used as is.
*/
@Since("1.2.0")
def setHalfLife(halfLife: Double, timeUnit: String): this.type = {