path: root/mllib/src/main/scala
author     Yanbo Liang <ybliang8@gmail.com>           2016-05-19 13:26:41 -0700
committer  Joseph K. Bradley <joseph@databricks.com>  2016-05-19 13:26:41 -0700
commit     59e6c5560d13def686091391aabe024ecb43174b (patch)
tree       f8e856ba6dec5b85bcab68611e3f2b60882032fe /mllib/src/main/scala
parent     5255e55c843c7b67fcb2abb4284b8b1a09bd6672 (diff)
[SPARK-15361][ML] ML 2.0 QA: Scala APIs audit for ml.clustering
## What changes were proposed in this pull request?

Audit the Scala API for ml.clustering. Fix some incorrect API documentation and update outdated entries.

## How was this patch tested?

Existing unit tests.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #13148 from yanboliang/spark-15361.
Diffstat (limited to 'mllib/src/main/scala')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala | 22
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala | 22
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala          | 14
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala             |  6
4 files changed, 43 insertions, 21 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
index 138e059f94..afb1080b9b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
@@ -41,23 +41,27 @@ private[clustering] trait BisectingKMeansParams extends Params
with HasMaxIter with HasFeaturesCol with HasSeed with HasPredictionCol {
/**
- * Set the number of clusters to create (k). Must be > 1. Default: 2.
+ * The desired number of leaf clusters. Must be > 1. Default: 4.
+ * The actual number could be smaller if there are no divisible leaf clusters.
* @group param
*/
@Since("2.0.0")
- final val k = new IntParam(this, "k", "number of clusters to create", (x: Int) => x > 1)
+ final val k = new IntParam(this, "k", "The desired number of leaf clusters. " +
+ "Must be > 1.", ParamValidators.gt(1))
/** @group getParam */
@Since("2.0.0")
def getK: Int = $(k)
- /** @group expertParam */
+ /**
+ * The minimum number of points (if >= 1.0) or the minimum proportion
+ * of points (if < 1.0) of a divisible cluster (default: 1.0).
+ * @group expertParam
+ */
@Since("2.0.0")
- final val minDivisibleClusterSize = new DoubleParam(
- this,
- "minDivisibleClusterSize",
- "the minimum number of points (if >= 1.0) or the minimum proportion",
- (value: Double) => value > 0)
+ final val minDivisibleClusterSize = new DoubleParam(this, "minDivisibleClusterSize",
+ "The minimum number of points (if >= 1.0) or the minimum proportion " +
+ "of points (if < 1.0) of a divisible cluster.", ParamValidators.gt(0.0))
/** @group expertGetParam */
@Since("2.0.0")
@@ -78,7 +82,7 @@ private[clustering] trait BisectingKMeansParams extends Params
* :: Experimental ::
* Model fitted by BisectingKMeans.
*
- * @param parentModel a model trained by spark.mllib.clustering.BisectingKMeans.
+ * @param parentModel a model trained by [[org.apache.spark.mllib.clustering.BisectingKMeans]].
*/
@Since("2.0.0")
@Experimental
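As a usage sketch of the two params documented above (`k` and `minDivisibleClusterSize`); this is not part of the patch, and it assumes a Spark 2.0 `spark-shell` session (so `spark: SparkSession` is in scope) with toy data made up for illustration:

```scala
import org.apache.spark.ml.clustering.BisectingKMeans
import org.apache.spark.ml.linalg.Vectors

// Toy dataset with the default "features" vector column.
val data = spark.createDataFrame(Seq(
  Tuple1(Vectors.dense(0.0, 0.0)), Tuple1(Vectors.dense(0.1, 0.1)),
  Tuple1(Vectors.dense(9.0, 9.0)), Tuple1(Vectors.dense(9.1, 9.1))
)).toDF("features")

val bkm = new BisectingKMeans()
  .setK(2)                         // desired number of leaf clusters; must be > 1 (default 4)
  .setMinDivisibleClusterSize(1.0) // absolute count if >= 1.0, otherwise a proportion of points
  .setMaxIter(20)
  .setSeed(1L)

val model = bkm.fit(data)          // wraps a spark.mllib BisectingKMeansModel as parentModel
model.clusterCenters.foreach(println)
```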
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 63ca812609..d81b337607 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -25,7 +25,7 @@ import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.impl.Utils.EPSILON
import org.apache.spark.ml.linalg._
-import org.apache.spark.ml.param.{IntParam, ParamMap, Params}
+import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.stat.distribution.MultivariateGaussian
import org.apache.spark.ml.util._
@@ -45,11 +45,12 @@ private[clustering] trait GaussianMixtureParams extends Params with HasMaxIter w
with HasSeed with HasPredictionCol with HasProbabilityCol with HasTol {
/**
- * Set the number of clusters to create (k). Must be > 1. Default: 2.
+ * Number of independent Gaussians in the mixture model. Must be > 1. Default: 2.
* @group param
*/
@Since("2.0.0")
- final val k = new IntParam(this, "k", "number of clusters to create", (x: Int) => x > 1)
+ final val k = new IntParam(this, "k", "Number of independent Gaussians in the mixture model. " +
+ "Must be > 1.", ParamValidators.gt(1))
/** @group getParam */
@Since("2.0.0")
@@ -249,6 +250,21 @@ object GaussianMixtureModel extends MLReadable[GaussianMixtureModel] {
/**
* :: Experimental ::
* Gaussian Mixture clustering.
+ *
+ * This class performs expectation maximization for multivariate Gaussian
+ * Mixture Models (GMMs). A GMM represents a composite distribution of
+ * independent Gaussian distributions with associated "mixing" weights
+ * specifying each's contribution to the composite.
+ *
+ * Given a set of sample points, this class will maximize the log-likelihood
+ * for a mixture of k Gaussians, iterating until the log-likelihood changes by
+ * less than convergenceTol, or until it has reached the max number of iterations.
+ * While this process is generally guaranteed to converge, it is not guaranteed
+ * to find a global optimum.
+ *
+ * Note: For high-dimensional data (with many features), this algorithm may perform poorly.
+ * This is due to high-dimensional data (a) making it difficult to cluster at all (based
+ * on statistical/theoretical arguments) and (b) numerical issues with Gaussian distributions.
*/
@Since("2.0.0")
@Experimental
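To make the new class doc concrete (EM over k independent Gaussians, iterating until the log-likelihood changes by less than the convergence tolerance), here is a minimal sketch; it is not part of the patch and assumes a Spark 2.0 `spark-shell` session with `spark` in scope and made-up toy data:

```scala
import org.apache.spark.ml.clustering.GaussianMixture
import org.apache.spark.ml.linalg.Vectors

val data = spark.createDataFrame(Seq(
  Tuple1(Vectors.dense(-0.1)), Tuple1(Vectors.dense(0.0)),  Tuple1(Vectors.dense(0.1)),
  Tuple1(Vectors.dense(9.9)),  Tuple1(Vectors.dense(10.0)), Tuple1(Vectors.dense(10.1))
)).toDF("features")

val gmm = new GaussianMixture()
  .setK(2)         // number of independent Gaussians in the mixture; must be > 1
  .setTol(0.01)    // convergence tolerance on the change in log-likelihood
  .setMaxIter(100)
  .setSeed(1L)

val model = gmm.fit(data)
// Each component is a mixing weight plus a MultivariateGaussian (mean, covariance).
model.weights.zip(model.gaussians).foreach { case (w, g) =>
  println(s"weight=$w mean=${g.mean} cov=${g.cov}")
}
```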
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
index 986f7e0fb0..0ab370e3b4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -23,7 +23,7 @@ import org.apache.spark.SparkException
import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
-import org.apache.spark.ml.param.{IntParam, Param, ParamMap, Params}
+import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util._
import org.apache.spark.mllib.clustering.{KMeans => MLlibKMeans, KMeansModel => MLlibKMeansModel}
@@ -41,11 +41,12 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe
with HasSeed with HasPredictionCol with HasTol {
/**
- * Set the number of clusters to create (k). Must be > 1. Default: 2.
+ * The number of clusters to create (k). Must be > 1. Default: 2.
* @group param
*/
@Since("1.5.0")
- final val k = new IntParam(this, "k", "number of clusters to create", (x: Int) => x > 1)
+ final val k = new IntParam(this, "k", "The number of clusters to create. " +
+ "Must be > 1.", ParamValidators.gt(1))
/** @group getParam */
@Since("1.5.0")
@@ -58,7 +59,8 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe
* @group expertParam
*/
@Since("1.5.0")
- final val initMode = new Param[String](this, "initMode", "initialization algorithm",
+ final val initMode = new Param[String](this, "initMode", "The initialization algorithm. " +
+ "Supported options: 'random' and 'k-means||'.",
(value: String) => MLlibKMeans.validateInitMode(value))
/** @group expertGetParam */
@@ -71,8 +73,8 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe
* @group expertParam
*/
@Since("1.5.0")
- final val initSteps = new IntParam(this, "initSteps", "number of steps for k-means||",
- (value: Int) => value > 0)
+ final val initSteps = new IntParam(this, "initSteps", "The number of steps for k-means|| " +
+ "initialization mode. Must be > 0.", ParamValidators.gt(0))
/** @group expertGetParam */
@Since("1.5.0")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
index 5a83b28700..ec60991af6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
@@ -50,8 +50,8 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
* @group param
*/
@Since("1.6.0")
- final val k = new IntParam(this, "k", "number of topics (clusters) to infer",
- ParamValidators.gt(1))
+ final val k = new IntParam(this, "k", "The number of topics (clusters) to infer. " +
+ "Must be > 1.", ParamValidators.gt(1))
/** @group getParam */
@Since("1.6.0")
@@ -165,7 +165,7 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
*/
@Since("1.6.0")
final val optimizer = new Param[String](this, "optimizer", "Optimizer or inference" +
- " algorithm used to estimate the LDA model. Supported: " + supportedOptimizers.mkString(", "),
+ " algorithm used to estimate the LDA model. Supported: " + supportedOptimizers.mkString(", "),
(o: String) => ParamValidators.inArray(supportedOptimizers).apply(o.toLowerCase))
/** @group getParam */
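Finally, a sketch of the LDA params touched here (`k` and `optimizer`); not part of the patch, assuming a Spark 2.0 `spark-shell` session with `spark` in scope and a made-up toy corpus of term-count vectors (e.g. CountVectorizer output):

```scala
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.ml.linalg.Vectors

val corpus = spark.createDataFrame(Seq(
  Tuple1(Vectors.dense(1.0, 2.0, 0.0, 0.0)),
  Tuple1(Vectors.dense(0.0, 0.0, 3.0, 1.0)),
  Tuple1(Vectors.dense(2.0, 1.0, 0.0, 1.0))
)).toDF("features")

val lda = new LDA()
  .setK(2)                // number of topics (clusters) to infer; must be > 1
  .setOptimizer("online") // one of supportedOptimizers: "online" or "em"
  .setMaxIter(10)

val model = lda.fit(corpus)
model.describeTopics(3).show(false)  // top terms per topic
println(s"log-likelihood: ${model.logLikelihood(corpus)}")
```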