author     Yanbo Liang <ybliang8@gmail.com>      2016-05-31 14:56:43 -0700
committer  Nick Pentreath <nickp@za.ibm.com>     2016-05-31 14:56:43 -0700
commit     594484cd8343f870c53fbc829ed4fb889016a8cf (patch)
tree       9ca6cc63d47a1fada0f6d2c8d395c7543e1ec3c9 /python
parent     9a74de18a13d84805e1a448f858bb05ce30de87e (diff)
[MINOR][DOC][ML] ml.clustering scala & python api doc sync
## What changes were proposed in this pull request?

Since we completed the Scala API audit for ml.clustering in #13148, we should also fix and update the corresponding Python API docs to keep them in sync.

## How was this patch tested?

Docs change, no tests.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #13291 from yanboliang/spark-15361-followup.
Diffstat (limited to 'python')
-rw-r--r--  python/pyspark/ml/clustering.py | 35
1 file changed, 25 insertions(+), 10 deletions(-)
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index a457904e78..92df19e804 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -64,6 +64,21 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
.. note:: Experimental
GaussianMixture clustering.
+ This class performs expectation maximization for multivariate Gaussian
+ Mixture Models (GMMs). A GMM represents a composite distribution of
+ independent Gaussian distributions with associated "mixing" weights
+ specifying each's contribution to the composite.
+
+ Given a set of sample points, this class will maximize the log-likelihood
+ for a mixture of k Gaussians, iterating until the log-likelihood changes by
+ less than convergenceTol, or until it has reached the max number of iterations.
+ While this process is generally guaranteed to converge, it is not guaranteed
+ to find a global optimum.
+
+ Note: For high-dimensional data (with many features), this algorithm may perform poorly.
+ This is due to high-dimensional data (a) making it difficult to cluster at all
+ (based on statistical/theoretical arguments) and (b) numerical issues with
+ Gaussian distributions.
>>> from pyspark.ml.linalg import Vectors
@@ -118,8 +133,8 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
.. versionadded:: 2.0.0
"""
- k = Param(Params._dummy(), "k", "number of clusters to create",
- typeConverter=TypeConverters.toInt)
+ k = Param(Params._dummy(), "k", "Number of independent Gaussians in the mixture model. " +
+ "Must be > 1.", typeConverter=TypeConverters.toInt)
@keyword_only
def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
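
(For context only, not part of this patch: a minimal, hypothetical usage sketch of pyspark.ml.clustering.GaussianMixture with the parameters whose docs are updated above. The local SparkSession, app name, and toy 1-D dataset are illustrative assumptions.)

    # Hypothetical usage sketch, not part of this patch.
    from pyspark.sql import SparkSession
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.clustering import GaussianMixture

    spark = SparkSession.builder.master("local[2]").appName("gmm-sketch").getOrCreate()

    # Two well-separated groups of 1-D points as a toy "features" column.
    df = spark.createDataFrame([(Vectors.dense([-0.1]),), (Vectors.dense([-0.05]),),
                                (Vectors.dense([10.1]),), (Vectors.dense([10.2]),)],
                               ["features"])

    # Per the updated doc, k is the number of Gaussians and must be > 1;
    # tol is the log-likelihood convergence tolerance mentioned in the docstring.
    gm = GaussianMixture(k=2, tol=0.01, maxIter=100, seed=1)
    model = gm.fit(df)

    print(model.weights)             # mixing weights, one per Gaussian
    model.gaussiansDF.show()         # mean and covariance of each fitted Gaussian
    model.transform(df).select("features", "prediction").show()
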
@@ -227,15 +242,15 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
.. versionadded:: 1.5.0
"""
- k = Param(Params._dummy(), "k", "number of clusters to create",
+ k = Param(Params._dummy(), "k", "The number of clusters to create. Must be > 1.",
typeConverter=TypeConverters.toInt)
initMode = Param(Params._dummy(), "initMode",
- "the initialization algorithm. This can be either \"random\" to " +
+ "The initialization algorithm. This can be either \"random\" to " +
"choose random points as initial cluster centers, or \"k-means||\" " +
"to use a parallel variant of k-means++",
typeConverter=TypeConverters.toString)
- initSteps = Param(Params._dummy(), "initSteps", "steps for k-means initialization mode",
- typeConverter=TypeConverters.toInt)
+ initSteps = Param(Params._dummy(), "initSteps", "The number of steps for k-means|| " +
+ "initialization mode. Must be > 0.", typeConverter=TypeConverters.toInt)
@keyword_only
def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
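
(Similarly, a hypothetical sketch of KMeans using the parameters reworded above: k, initMode, and initSteps. The SparkSession and toy 2-D dataset are again assumptions for illustration.)

    # Hypothetical usage sketch, not part of this patch.
    from pyspark.sql import SparkSession
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.clustering import KMeans

    spark = SparkSession.builder.master("local[2]").appName("kmeans-sketch").getOrCreate()
    df = spark.createDataFrame([(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
                                (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)],
                               ["features"])

    # initMode is "random" or "k-means||" (a parallel k-means++ variant);
    # initSteps must be > 0, and k must be > 1.
    km = KMeans(k=2, initMode="k-means||", initSteps=5, maxIter=20, seed=1)
    kmodel = km.fit(df)

    print(kmodel.clusterCenters())   # list of arrays, one center per cluster
    kmodel.transform(df).show()
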
@@ -380,11 +395,11 @@ class BisectingKMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
.. versionadded:: 2.0.0
"""
- k = Param(Params._dummy(), "k", "number of clusters to create",
+ k = Param(Params._dummy(), "k", "The desired number of leaf clusters. Must be > 1.",
typeConverter=TypeConverters.toInt)
minDivisibleClusterSize = Param(Params._dummy(), "minDivisibleClusterSize",
- "the minimum number of points (if >= 1.0) " +
- "or the minimum proportion",
+ "The minimum number of points (if >= 1.0) or the minimum " +
+ "proportion of points (if < 1.0) of a divisible cluster.",
typeConverter=TypeConverters.toFloat)
@keyword_only
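
(For the BisectingKMeans parameters above, where k is the desired number of leaf clusters and minDivisibleClusterSize is a point count if >= 1.0 or a proportion if < 1.0, a hypothetical sketch under the same illustrative setup assumptions:)

    # Hypothetical usage sketch, not part of this patch.
    from pyspark.sql import SparkSession
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.clustering import BisectingKMeans

    spark = SparkSession.builder.master("local[2]").appName("bkm-sketch").getOrCreate()
    df = spark.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([1.0]),),
                                (Vectors.dense([9.0]),), (Vectors.dense([10.0]),)],
                               ["features"])

    # minDivisibleClusterSize=1.0: a cluster needs at least one point to be split further.
    bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0, maxIter=20, seed=1)
    bmodel = bkm.fit(df)

    print(bmodel.clusterCenters())
    print(bmodel.computeCost(df))    # sum of squared distances to nearest centers
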
@@ -661,7 +676,7 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInter
.. versionadded:: 2.0.0
"""
- k = Param(Params._dummy(), "k", "number of topics (clusters) to infer",
+ k = Param(Params._dummy(), "k", "The number of topics (clusters) to infer. Must be > 1.",
typeConverter=TypeConverters.toInt)
optimizer = Param(Params._dummy(), "optimizer",
"Optimizer or inference algorithm used to estimate the LDA model. "