path: root/mllib/src/main/scala
author     Yanbo Liang <ybliang8@gmail.com>           2016-05-19 13:26:41 -0700
committer  Joseph K. Bradley <joseph@databricks.com>  2016-05-19 13:26:41 -0700
commit     59e6c5560d13def686091391aabe024ecb43174b (patch)
tree       f8e856ba6dec5b85bcab68611e3f2b60882032fe /mllib/src/main/scala
parent     5255e55c843c7b67fcb2abb4284b8b1a09bd6672 (diff)
[SPARK-15361][ML] ML 2.0 QA: Scala APIs audit for ml.clustering
## What changes were proposed in this pull request?

Audit the Scala API for ml.clustering. Fix some incorrect API documentation and update outdated entries.

## How was this patch tested?

Existing unit tests.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #13148 from yanboliang/spark-15361.
Diffstat (limited to 'mllib/src/main/scala')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala | 22
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala | 22
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala          | 14
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala             |  6
4 files changed, 43 insertions, 21 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
index 138e059f94..afb1080b9b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
@@ -41,23 +41,27 @@ private[clustering] trait BisectingKMeansParams extends Params
with HasMaxIter with HasFeaturesCol with HasSeed with HasPredictionCol {
/**
- * Set the number of clusters to create (k). Must be > 1. Default: 2.
+ * The desired number of leaf clusters. Must be > 1. Default: 4.
+ * The actual number could be smaller if there are no divisible leaf clusters.
* @group param
*/
@Since("2.0.0")
- final val k = new IntParam(this, "k", "number of clusters to create", (x: Int) => x > 1)
+ final val k = new IntParam(this, "k", "The desired number of leaf clusters. " +
+ "Must be > 1.", ParamValidators.gt(1))
/** @group getParam */
@Since("2.0.0")
def getK: Int = $(k)
- /** @group expertParam */
+ /**
+ * The minimum number of points (if >= 1.0) or the minimum proportion
+ * of points (if < 1.0) of a divisible cluster (default: 1.0).
+ * @group expertParam
+ */
@Since("2.0.0")
- final val minDivisibleClusterSize = new DoubleParam(
- this,
- "minDivisibleClusterSize",
- "the minimum number of points (if >= 1.0) or the minimum proportion",
- (value: Double) => value > 0)
+ final val minDivisibleClusterSize = new DoubleParam(this, "minDivisibleClusterSize",
+ "The minimum number of points (if >= 1.0) or the minimum proportion " +
+ "of points (if < 1.0) of a divisible cluster.", ParamValidators.gt(0.0))
/** @group expertGetParam */
@Since("2.0.0")
@@ -78,7 +82,7 @@ private[clustering] trait BisectingKMeansParams extends Params
* :: Experimental ::
* Model fitted by BisectingKMeans.
*
- * @param parentModel a model trained by spark.mllib.clustering.BisectingKMeans.
+ * @param parentModel a model trained by [[org.apache.spark.mllib.clustering.BisectingKMeans]].
*/
@Since("2.0.0")
@Experimental
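As a usage sketch of the two params documented above (`k` and `minDivisibleClusterSize`); this is not part of the patch, and it assumes a Spark 2.0 `spark-shell` session (so `spark: SparkSession` is in scope) with toy data made up for illustration:

```scala
import org.apache.spark.ml.clustering.BisectingKMeans
import org.apache.spark.ml.linalg.Vectors

// Toy dataset with the default "features" vector column.
val data = spark.createDataFrame(Seq(
  Tuple1(Vectors.dense(0.0, 0.0)), Tuple1(Vectors.dense(0.1, 0.1)),
  Tuple1(Vectors.dense(9.0, 9.0)), Tuple1(Vectors.dense(9.1, 9.1))
)).toDF("features")

val bkm = new BisectingKMeans()
  .setK(2)                         // desired number of leaf clusters; must be > 1 (default 4)
  .setMinDivisibleClusterSize(1.0) // absolute count if >= 1.0, otherwise a proportion of points
  .setMaxIter(20)
  .setSeed(1L)

val model = bkm.fit(data)          // wraps a spark.mllib BisectingKMeansModel as parentModel
model.clusterCenters.foreach(println)
```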
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 63ca812609..d81b337607 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -25,7 +25,7 @@ import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.impl.Utils.EPSILON
import org.apache.spark.ml.linalg._
-import org.apache.spark.ml.param.{IntParam, ParamMap, Params}
+import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.stat.distribution.MultivariateGaussian
import org.apache.spark.ml.util._
@@ -45,11 +45,12 @@ private[clustering] trait GaussianMixtureParams extends Params with HasMaxIter w
with HasSeed with HasPredictionCol with HasProbabilityCol with HasTol {
/**
- * Set the number of clusters to create (k). Must be > 1. Default: 2.
+ * Number of independent Gaussians in the mixture model. Must be > 1. Default: 2.
* @group param
*/
@Since("2.0.0")
- final val k = new IntParam(this, "k", "number of clusters to create", (x: Int) => x > 1)
+ final val k = new IntParam(this, "k", "Number of independent Gaussians in the mixture model. " +
+ "Must be > 1.", ParamValidators.gt(1))
/** @group getParam */
@Since("2.0.0")
@@ -249,6 +250,21 @@ object GaussianMixtureModel extends MLReadable[GaussianMixtureModel] {
/**
* :: Experimental ::
* Gaussian Mixture clustering.
+ *
+ * This class performs expectation maximization for multivariate Gaussian
+ * Mixture Models (GMMs). A GMM represents a composite distribution of
+ * independent Gaussian distributions with associated "mixing" weights
+ * specifying each's contribution to the composite.
+ *
+ * Given a set of sample points, this class will maximize the log-likelihood
+ * for a mixture of k Gaussians, iterating until the log-likelihood changes by
+ * less than convergenceTol, or until it has reached the max number of iterations.
+ * While this process is generally guaranteed to converge, it is not guaranteed
+ * to find a global optimum.
+ *
+ * Note: For high-dimensional data (with many features), this algorithm may perform poorly.
+ * This is due to high-dimensional data (a) making it difficult to cluster at all (based
+ * on statistical/theoretical arguments) and (b) numerical issues with Gaussian distributions.
*/
@Since("2.0.0")
@Experimental
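To make the new class doc concrete (EM over k independent Gaussians, iterating until the log-likelihood changes by less than the convergence tolerance), here is a minimal sketch; it is not part of the patch and assumes a Spark 2.0 `spark-shell` session with `spark` in scope and made-up toy data:

```scala
import org.apache.spark.ml.clustering.GaussianMixture
import org.apache.spark.ml.linalg.Vectors

val data = spark.createDataFrame(Seq(
  Tuple1(Vectors.dense(-0.1)), Tuple1(Vectors.dense(0.0)),  Tuple1(Vectors.dense(0.1)),
  Tuple1(Vectors.dense(9.9)),  Tuple1(Vectors.dense(10.0)), Tuple1(Vectors.dense(10.1))
)).toDF("features")

val gmm = new GaussianMixture()
  .setK(2)         // number of independent Gaussians in the mixture; must be > 1
  .setTol(0.01)    // convergence tolerance on the change in log-likelihood
  .setMaxIter(100)
  .setSeed(1L)

val model = gmm.fit(data)
// Each component is a mixing weight plus a MultivariateGaussian (mean, covariance).
model.weights.zip(model.gaussians).foreach { case (w, g) =>
  println(s"weight=$w mean=${g.mean} cov=${g.cov}")
}
```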
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
index 986f7e0fb0..0ab370e3b4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -23,7 +23,7 @@ import org.apache.spark.SparkException
import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
-import org.apache.spark.ml.param.{IntParam, Param, ParamMap, Params}
+import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util._
import org.apache.spark.mllib.clustering.{KMeans => MLlibKMeans, KMeansModel => MLlibKMeansModel}
@@ -41,11 +41,12 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe
with HasSeed with HasPredictionCol with HasTol {
/**
- * Set the number of clusters to create (k). Must be > 1. Default: 2.
+ * The number of clusters to create (k). Must be > 1. Default: 2.
* @group param
*/
@Since("1.5.0")
- final val k = new IntParam(this, "k", "number of clusters to create", (x: Int) => x > 1)
+ final val k = new IntParam(this, "k", "The number of clusters to create. " +
+ "Must be > 1.", ParamValidators.gt(1))
/** @group getParam */
@Since("1.5.0")
@@ -58,7 +59,8 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe
* @group expertParam
*/
@Since("1.5.0")
- final val initMode = new Param[String](this, "initMode", "initialization algorithm",
+ final val initMode = new Param[String](this, "initMode", "The initialization algorithm. " +
+ "Supported options: 'random' and 'k-means||'.",
(value: String) => MLlibKMeans.validateInitMode(value))
/** @group expertGetParam */
@@ -71,8 +73,8 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe
* @group expertParam
*/
@Since("1.5.0")
- final val initSteps = new IntParam(this, "initSteps", "number of steps for k-means||",
- (value: Int) => value > 0)
+ final val initSteps = new IntParam(this, "initSteps", "The number of steps for k-means|| " +
+ "initialization mode. Must be > 0.", ParamValidators.gt(0))
/** @group expertGetParam */
@Since("1.5.0")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
index 5a83b28700..ec60991af6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
@@ -50,8 +50,8 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
* @group param
*/
@Since("1.6.0")
- final val k = new IntParam(this, "k", "number of topics (clusters) to infer",
- ParamValidators.gt(1))
+ final val k = new IntParam(this, "k", "The number of topics (clusters) to infer. " +
+ "Must be > 1.", ParamValidators.gt(1))
/** @group getParam */
@Since("1.6.0")
@@ -165,7 +165,7 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
*/
@Since("1.6.0")
final val optimizer = new Param[String](this, "optimizer", "Optimizer or inference" +
- " algorithm used to estimate the LDA model. Supported: " + supportedOptimizers.mkString(", "),
+ " algorithm used to estimate the LDA model. Supported: " + supportedOptimizers.mkString(", "),
(o: String) => ParamValidators.inArray(supportedOptimizers).apply(o.toLowerCase))
/** @group getParam */
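Finally, a sketch of the LDA params touched here (`k` and `optimizer`); not part of the patch, assuming a Spark 2.0 `spark-shell` session with `spark` in scope and a made-up toy corpus of term-count vectors (e.g. CountVectorizer output):

```scala
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.ml.linalg.Vectors

val corpus = spark.createDataFrame(Seq(
  Tuple1(Vectors.dense(1.0, 2.0, 0.0, 0.0)),
  Tuple1(Vectors.dense(0.0, 0.0, 3.0, 1.0)),
  Tuple1(Vectors.dense(2.0, 1.0, 0.0, 1.0))
)).toDF("features")

val lda = new LDA()
  .setK(2)                // number of topics (clusters) to infer; must be > 1
  .setOptimizer("online") // one of supportedOptimizers: "online" or "em"
  .setMaxIter(10)

val model = lda.fit(corpus)
model.describeTopics(3).show(false)  // top terms per topic
println(s"log-likelihood: ${model.logLikelihood(corpus)}")
```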