aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorvijaykiran <mail@vijaykiran.com>2016-02-12 14:24:24 -0800
committerXiangrui Meng <meng@databricks.com>2016-02-12 14:24:24 -0800
commit42d656814f756599a2bc426f0e1f32bd4cc4470f (patch)
tree076f619d2902cbd4a4f13de51e8fb9a1445f4a7e /python
parent90de6b2fae71d05415610be70300625c409f6092 (diff)
downloadspark-42d656814f756599a2bc426f0e1f32bd4cc4470f.tar.gz
spark-42d656814f756599a2bc426f0e1f32bd4cc4470f.tar.bz2
spark-42d656814f756599a2bc426f0e1f32bd4cc4470f.zip
[SPARK-12630][PYSPARK] [DOC] PySpark classification parameter desc to consistent format
Part of task for [SPARK-11219](https://issues.apache.org/jira/browse/SPARK-11219) to make PySpark MLlib parameter description formatting consistent. This is for the classification module. Author: vijaykiran <mail@vijaykiran.com> Author: Bryan Cutler <cutlerb@gmail.com> Closes #11183 from BryanCutler/pyspark-consistent-param-classification-SPARK-12630.
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/mllib/classification.py261
1 files changed, 143 insertions, 118 deletions
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index 9e6f17ef6e..b24592c379 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -94,16 +94,19 @@ class LogisticRegressionModel(LinearClassificationModel):
Classification model trained using Multinomial/Binary Logistic
Regression.
- :param weights: Weights computed for every feature.
- :param intercept: Intercept computed for this model. (Only used
- in Binary Logistic Regression. In Multinomial Logistic
- Regression, the intercepts will not be a single value,
- so the intercepts will be part of the weights.)
- :param numFeatures: the dimension of the features.
- :param numClasses: the number of possible outcomes for k classes
- classification problem in Multinomial Logistic Regression.
- By default, it is binary logistic regression so numClasses
- will be set to 2.
+ :param weights:
+ Weights computed for every feature.
+ :param intercept:
+ Intercept computed for this model. (Only used in Binary Logistic
+ Regression. In Multinomial Logistic Regression, the intercepts will
+ not be a single value, so the intercepts will be part of the
+ weights.)
+ :param numFeatures:
+ The dimension of the features.
+ :param numClasses:
+ The number of possible outcomes for k classes classification problem
+ in Multinomial Logistic Regression. By default, it is binary
+ logistic regression so numClasses will be set to 2.
>>> data = [
... LabeledPoint(0.0, [0.0, 1.0]),
@@ -189,8 +192,8 @@ class LogisticRegressionModel(LinearClassificationModel):
@since('1.4.0')
def numClasses(self):
"""
- Number of possible outcomes for k classes classification problem in Multinomial
- Logistic Regression.
+ Number of possible outcomes for k classes classification problem
+ in Multinomial Logistic Regression.
"""
return self._numClasses
@@ -272,37 +275,42 @@ class LogisticRegressionWithSGD(object):
"""
Train a logistic regression model on the given data.
- :param data: The training data, an RDD of
- LabeledPoint.
- :param iterations: The number of iterations
- (default: 100).
- :param step: The step parameter used in SGD
- (default: 1.0).
- :param miniBatchFraction: Fraction of data to be used for each
- SGD iteration (default: 1.0).
- :param initialWeights: The initial weights (default: None).
- :param regParam: The regularizer parameter
- (default: 0.01).
- :param regType: The type of regularizer used for
- training our model.
-
- :Allowed values:
- - "l1" for using L1 regularization
- - "l2" for using L2 regularization
- - None for no regularization
-
- (default: "l2")
-
- :param intercept: Boolean parameter which indicates the
- use or not of the augmented representation
- for training data (i.e. whether bias
- features are activated or not,
- default: False).
- :param validateData: Boolean parameter which indicates if
- the algorithm should validate data
- before training. (default: True)
- :param convergenceTol: A condition which decides iteration termination.
- (default: 0.001)
+ :param data:
+ The training data, an RDD of LabeledPoint.
+ :param iterations:
+ The number of iterations.
+ (default: 100)
+ :param step:
+ The step parameter used in SGD.
+ (default: 1.0)
+ :param miniBatchFraction:
+ Fraction of data to be used for each SGD iteration.
+ (default: 1.0)
+ :param initialWeights:
+ The initial weights.
+ (default: None)
+ :param regParam:
+ The regularizer parameter.
+ (default: 0.01)
+ :param regType:
+ The type of regularizer used for training our model.
+ Allowed values:
+
+ - "l1" for using L1 regularization
+ - "l2" for using L2 regularization (default)
+ - None for no regularization
+ :param intercept:
+ Boolean parameter which indicates the use or not of the
+ augmented representation for training data (i.e., whether bias
+ features are activated or not).
+ (default: False)
+ :param validateData:
+ Boolean parameter which indicates if the algorithm should
+ validate data before training.
+ (default: True)
+ :param convergenceTol:
+ A condition which decides iteration termination.
+ (default: 0.001)
"""
def train(rdd, i):
return callMLlibFunc("trainLogisticRegressionModelWithSGD", rdd, int(iterations),
@@ -323,38 +331,43 @@ class LogisticRegressionWithLBFGS(object):
"""
Train a logistic regression model on the given data.
- :param data: The training data, an RDD of
- LabeledPoint.
- :param iterations: The number of iterations
- (default: 100).
- :param initialWeights: The initial weights (default: None).
- :param regParam: The regularizer parameter
- (default: 0.01).
- :param regType: The type of regularizer used for
- training our model.
-
- :Allowed values:
- - "l1" for using L1 regularization
- - "l2" for using L2 regularization
- - None for no regularization
-
- (default: "l2")
-
- :param intercept: Boolean parameter which indicates the
- use or not of the augmented representation
- for training data (i.e. whether bias
- features are activated or not,
- default: False).
- :param corrections: The number of corrections used in the
- LBFGS update (default: 10).
- :param tolerance: The convergence tolerance of iterations
- for L-BFGS (default: 1e-4).
- :param validateData: Boolean parameter which indicates if the
- algorithm should validate data before
- training. (default: True)
- :param numClasses: The number of classes (i.e., outcomes) a
- label can take in Multinomial Logistic
- Regression (default: 2).
+ :param data:
+ The training data, an RDD of LabeledPoint.
+ :param iterations:
+ The number of iterations.
+ (default: 100)
+ :param initialWeights:
+ The initial weights.
+ (default: None)
+ :param regParam:
+ The regularizer parameter.
+ (default: 0.01)
+ :param regType:
+ The type of regularizer used for training our model.
+ Allowed values:
+
+ - "l1" for using L1 regularization
+ - "l2" for using L2 regularization (default)
+ - None for no regularization
+ :param intercept:
+ Boolean parameter which indicates the use or not of the
+ augmented representation for training data (i.e., whether bias
+ features are activated or not).
+ (default: False)
+ :param corrections:
+ The number of corrections used in the L-BFGS update.
+ (default: 10)
+ :param tolerance:
+ The convergence tolerance of iterations for L-BFGS.
+ (default: 1e-4)
+ :param validateData:
+ Boolean parameter which indicates if the algorithm should
+ validate data before training.
+ (default: True)
+ :param numClasses:
+ The number of classes (i.e., outcomes) a label can take in
+ Multinomial Logistic Regression.
+ (default: 2)
>>> data = [
... LabeledPoint(0.0, [0.0, 1.0]),
@@ -387,8 +400,10 @@ class SVMModel(LinearClassificationModel):
"""
Model for Support Vector Machines (SVMs).
- :param weights: Weights computed for every feature.
- :param intercept: Intercept computed for this model.
+ :param weights:
+ Weights computed for every feature.
+ :param intercept:
+ Intercept computed for this model.
>>> data = [
... LabeledPoint(0.0, [0.0]),
@@ -490,37 +505,42 @@ class SVMWithSGD(object):
"""
Train a support vector machine on the given data.
- :param data: The training data, an RDD of
- LabeledPoint.
- :param iterations: The number of iterations
- (default: 100).
- :param step: The step parameter used in SGD
- (default: 1.0).
- :param regParam: The regularizer parameter
- (default: 0.01).
- :param miniBatchFraction: Fraction of data to be used for each
- SGD iteration (default: 1.0).
- :param initialWeights: The initial weights (default: None).
- :param regType: The type of regularizer used for
- training our model.
-
- :Allowed values:
- - "l1" for using L1 regularization
- - "l2" for using L2 regularization
- - None for no regularization
-
- (default: "l2")
-
- :param intercept: Boolean parameter which indicates the
- use or not of the augmented representation
- for training data (i.e. whether bias
- features are activated or not,
- default: False).
- :param validateData: Boolean parameter which indicates if
- the algorithm should validate data
- before training. (default: True)
- :param convergenceTol: A condition which decides iteration termination.
- (default: 0.001)
+ :param data:
+ The training data, an RDD of LabeledPoint.
+ :param iterations:
+ The number of iterations.
+ (default: 100)
+ :param step:
+ The step parameter used in SGD.
+ (default: 1.0)
+ :param regParam:
+ The regularizer parameter.
+ (default: 0.01)
+ :param miniBatchFraction:
+ Fraction of data to be used for each SGD iteration.
+ (default: 1.0)
+ :param initialWeights:
+ The initial weights.
+ (default: None)
+ :param regType:
+ The type of regularizer used for training our model.
+ Allowed values:
+
+ - "l1" for using L1 regularization
+ - "l2" for using L2 regularization (default)
+ - None for no regularization
+ :param intercept:
+ Boolean parameter which indicates the use or not of the
+ augmented representation for training data (i.e., whether bias
+ features are activated or not).
+ (default: False)
+ :param validateData:
+ Boolean parameter which indicates if the algorithm should
+ validate data before training.
+ (default: True)
+ :param convergenceTol:
+ A condition which decides iteration termination.
+ (default: 0.001)
"""
def train(rdd, i):
return callMLlibFunc("trainSVMModelWithSGD", rdd, int(iterations), float(step),
@@ -536,11 +556,13 @@ class NaiveBayesModel(Saveable, Loader):
"""
Model for Naive Bayes classifiers.
- :param labels: list of labels.
- :param pi: log of class priors, whose dimension is C,
- number of labels.
- :param theta: log of class conditional probabilities, whose
- dimension is C-by-D, where D is number of features.
+ :param labels:
+ List of labels.
+ :param pi:
+ Log of class priors, whose dimension is C, number of labels.
+ :param theta:
+ Log of class conditional probabilities, whose dimension is C-by-D,
+ where D is number of features.
>>> data = [
... LabeledPoint(0.0, [0.0, 0.0]),
@@ -639,8 +661,11 @@ class NaiveBayes(object):
it can also be used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}).
The input feature values must be nonnegative.
- :param data: RDD of LabeledPoint.
- :param lambda_: The smoothing parameter (default: 1.0).
+ :param data:
+ RDD of LabeledPoint.
+ :param lambda_:
+ The smoothing parameter.
+ (default: 1.0)
"""
first = data.first()
if not isinstance(first, LabeledPoint):
@@ -652,9 +677,9 @@ class NaiveBayes(object):
@inherit_doc
class StreamingLogisticRegressionWithSGD(StreamingLinearAlgorithm):
"""
- Train or predict a logistic regression model on streaming data. Training uses
- Stochastic Gradient Descent to update the model based on each new batch of
- incoming data from a DStream.
+ Train or predict a logistic regression model on streaming data.
+ Training uses Stochastic Gradient Descent to update the model based on
+ each new batch of incoming data from a DStream.
Each batch of data is assumed to be an RDD of LabeledPoints.
The number of data points per batch can vary, but the number