about | summary | refs | log | tree | commit | diff
path: root/python
diff options
context:
space:
mode:
author Yanbo Liang <ybliang8@gmail.com> 2015-08-12 13:24:18 -0700
committer Joseph K. Bradley <joseph@databricks.com> 2015-08-12 13:24:18 -0700
commit 762bacc16ac5e74c8b05a7c1e3e367d1d1633cef (patch)
tree da72f2717842672fcdbe092947284c0b4f009cf2 /python
parent 60103ecd3d9c92709a5878be7ebd57012813ab48 (diff)
download spark-762bacc16ac5e74c8b05a7c1e3e367d1d1633cef.tar.gz
spark-762bacc16ac5e74c8b05a7c1e3e367d1d1633cef.tar.bz2
spark-762bacc16ac5e74c8b05a7c1e3e367d1d1633cef.zip
[SPARK-9766] [ML] [PySpark] check and add miss docs for PySpark ML
Check and add missing docs for PySpark ML (this issue only checks for missing docs in o.a.s.ml, not o.a.s.mllib). Author: Yanbo Liang <ybliang8@gmail.com>. Closes #8059 from yanboliang/SPARK-9766.
Diffstat (limited to 'python')
-rw-r--r-- python/pyspark/ml/classification.py 12
-rw-r--r-- python/pyspark/ml/clustering.py 4
-rw-r--r-- python/pyspark/ml/evaluation.py 3
-rw-r--r-- python/pyspark/ml/feature.py 9
4 files changed, 20 insertions, 8 deletions
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 5978d8f4d3..6702dce554 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -34,6 +34,7 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
HasRegParam, HasTol, HasProbabilityCol, HasRawPredictionCol):
"""
Logistic regression.
+ Currently, this class only supports binary classification.
>>> from pyspark.sql import Row
>>> from pyspark.mllib.linalg import Vectors
@@ -96,8 +97,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
# is an L2 penalty. For alpha = 1, it is an L1 penalty.
self.elasticNetParam = \
Param(self, "elasticNetParam",
- "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty " +
- "is an L2 penalty. For alpha = 1, it is an L1 penalty.")
+ "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " +
+ "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.")
#: param for whether to fit an intercept term.
self.fitIntercept = Param(self, "fitIntercept", "whether to fit an intercept term.")
#: param for threshold in binary classification prediction, in range [0, 1].
@@ -656,6 +657,13 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H
HasRawPredictionCol):
"""
Naive Bayes Classifiers.
+ It supports both Multinomial and Bernoulli NB. Multinomial NB
+ (`http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html`)
+ can handle finitely supported discrete data. For example, by converting documents into
+ TF-IDF vectors, it can be used for document classification. By making every vector a
+ binary (0/1) data, it can also be used as Bernoulli NB
+ (`http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html`).
+ The input feature values must be nonnegative.
>>> from pyspark.sql import Row
>>> from pyspark.mllib.linalg import Vectors
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index b5e9b6549d..48338713a2 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -37,7 +37,9 @@ class KMeansModel(JavaModel):
@inherit_doc
class KMeans(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed):
"""
- K-means Clustering
+ K-means clustering with support for multiple parallel runs and a k-means++ like initialization
+ mode (the k-means|| algorithm by Bahmani et al). When multiple concurrent runs are requested,
+ they are executed together with joint passes over the data for efficiency.
>>> from pyspark.mllib.linalg import Vectors
>>> data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index 06e8093522..2734092575 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -23,7 +23,8 @@ from pyspark.ml.param.shared import HasLabelCol, HasPredictionCol, HasRawPredict
from pyspark.ml.util import keyword_only
from pyspark.mllib.common import inherit_doc
-__all__ = ['Evaluator', 'BinaryClassificationEvaluator', 'RegressionEvaluator']
+__all__ = ['Evaluator', 'BinaryClassificationEvaluator', 'RegressionEvaluator',
+ 'MulticlassClassificationEvaluator']
@inherit_doc
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index cb4dfa2129..535d553266 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -26,10 +26,11 @@ from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer
from pyspark.mllib.common import inherit_doc
from pyspark.mllib.linalg import _convert_to_vector
-__all__ = ['Binarizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', 'OneHotEncoder',
- 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', 'StandardScalerModel',
- 'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer',
- 'Word2Vec', 'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel']
+__all__ = ['Binarizer', 'Bucketizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer',
+ 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler',
+ 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel', 'Tokenizer',
+ 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', 'PCA',
+ 'PCAModel', 'RFormula', 'RFormulaModel']
@inherit_doc