[SPARK-9766] [ML] [PySpark] Check and add missing docs for PySpark ML

Check and add missing docs for PySpark ML (this issue only checks for missing docs in o.a.s.ml, not in o.a.s.mllib). Author: Yanbo Liang <ybliang8@gmail.com> Closes #8059 from yanboliang/SPARK-9766.
author: Yanbo Liang <ybliang8@gmail.com> 2015-08-12 13:24:18 -0700
committer: Joseph K. Bradley <joseph@databricks.com> 2015-08-12 13:24:18 -0700
commit: 762bacc16ac5e74c8b05a7c1e3e367d1d1633cef (patch)
tree: da72f2717842672fcdbe092947284c0b4f009cf2 /python
parent: 60103ecd3d9c92709a5878be7ebd57012813ab48 (diff)
download: spark-762bacc16ac5e74c8b05a7c1e3e367d1d1633cef.tar.gz
spark-762bacc16ac5e74c8b05a7c1e3e367d1d1633cef.tar.bz2
spark-762bacc16ac5e74c8b05a7c1e3e367d1d1633cef.zip
4 files changed, 20 insertions, 8 deletions
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 5978d8f4d3..6702dce554 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -34,6 +34,7 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
                          HasRegParam, HasTol, HasProbabilityCol, HasRawPredictionCol):
     """
     Logistic regression.
+    Currently, this class only supports binary classification.
 
     >>> from pyspark.sql import Row
     >>> from pyspark.mllib.linalg import Vectors
@@ -96,8 +97,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
         #  is an L2 penalty. For alpha = 1, it is an L1 penalty.
         self.elasticNetParam = \
             Param(self, "elasticNetParam",
-                  "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty " +
-                  "is an L2 penalty. For alpha = 1, it is an L1 penalty.")
+                  "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " +
+                  "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.")
         #: param for whether to fit an intercept term.
         self.fitIntercept = Param(self, "fitIntercept", "whether to fit an intercept term.")
         #: param for threshold in binary classification prediction, in range [0, 1].
@@ -656,6 +657,13 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H
                  HasRawPredictionCol):
     """
     Naive Bayes Classifiers.
+    It supports both Multinomial and Bernoulli NB. Multinomial NB
+    (`http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html`)
+    can handle finitely supported discrete data. For example, by converting documents into
+    TF-IDF vectors, it can be used for document classification. By making every vector a
+    binary (0/1) data, it can also be used as Bernoulli NB
+    (`http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html`).
+    The input feature values must be nonnegative.
 
     >>> from pyspark.sql import Row
     >>> from pyspark.mllib.linalg import Vectors
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index b5e9b6549d..48338713a2 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -37,7 +37,9 @@ class KMeansModel(JavaModel):
 @inherit_doc
 class KMeans(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed):
     """
-    K-means Clustering
+    K-means clustering with support for multiple parallel runs and a k-means++ like initialization
+    mode (the k-means|| algorithm by Bahmani et al). When multiple concurrent runs are requested,
+    they are executed together with joint passes over the data for efficiency.
 
     >>> from pyspark.mllib.linalg import Vectors
     >>> data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index 06e8093522..2734092575 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -23,7 +23,8 @@ from pyspark.ml.param.shared import HasLabelCol, HasPredictionCol, HasRawPredict
 from pyspark.ml.util import keyword_only
 from pyspark.mllib.common import inherit_doc
 
-__all__ = ['Evaluator', 'BinaryClassificationEvaluator', 'RegressionEvaluator']
+__all__ = ['Evaluator', 'BinaryClassificationEvaluator', 'RegressionEvaluator',
+           'MulticlassClassificationEvaluator']
 
 
 @inherit_doc
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index cb4dfa2129..535d553266 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -26,10 +26,11 @@ from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer
 from pyspark.mllib.common import inherit_doc
 from pyspark.mllib.linalg import _convert_to_vector
 
-__all__ = ['Binarizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', 'OneHotEncoder',
-           'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', 'StandardScalerModel',
-           'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer',
-           'Word2Vec', 'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel']
+__all__ = ['Binarizer', 'Bucketizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer',
+           'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler',
+           'StandardScalerModel', 'StringIndexer', 'StringIndexerModel', 'Tokenizer',
+           'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', 'PCA',
+           'PCAModel', 'RFormula', 'RFormulaModel']
 
 
 @inherit_doc
author	Yanbo Liang <ybliang8@gmail.com>	2015-08-12 13:24:18 -0700
committer	Joseph K. Bradley <joseph@databricks.com>	2015-08-12 13:24:18 -0700
commit	762bacc16ac5e74c8b05a7c1e3e367d1d1633cef (patch)
tree	da72f2717842672fcdbe092947284c0b4f009cf2 /python
parent	60103ecd3d9c92709a5878be7ebd57012813ab48 (diff)
download	spark-762bacc16ac5e74c8b05a7c1e3e367d1d1633cef.tar.gz spark-762bacc16ac5e74c8b05a7c1e3e367d1d1633cef.tar.bz2 spark-762bacc16ac5e74c8b05a7c1e3e367d1d1633cef.zip