about | summary | refs | log | tree | commit | diff
path: root/python/pyspark/mllib
diff options
context:
space:
mode:
author: vijaykiran <mail@vijaykiran.com> 2016-02-29 15:52:41 +0200
committer: Nick Pentreath <nick.pentreath@gmail.com> 2016-02-29 15:52:41 +0200
commit: 236e3c8fbc887e4da4f143cbf533f016f21c10d4 (patch)
tree: d22553f78118133af4d2ee4e3b284ecc65e35b5b /python/pyspark/mllib
parent: 99fe8993f51d3c72cd95eb0825b090dd4d4cd2cd (diff)
download: spark-236e3c8fbc887e4da4f143cbf533f016f21c10d4.tar.gz
spark-236e3c8fbc887e4da4f143cbf533f016f21c10d4.tar.bz2
spark-236e3c8fbc887e4da4f143cbf533f016f21c10d4.zip
[SPARK-12633][PYSPARK] [DOC] PySpark regression parameter desc to consistent format
Part of task for [SPARK-11219](https://issues.apache.org/jira/browse/SPARK-11219) to make PySpark MLlib parameter description formatting consistent. This is for the regression module. Also, updated 2 params in classification to read as `Supported values:` to be consistent.

closes #10600

Author: vijaykiran <mail@vijaykiran.com>
Author: Bryan Cutler <cutlerb@gmail.com>

Closes #11404 from BryanCutler/param-desc-consistent-regression-SPARK-12633.
Diffstat (limited to 'python/pyspark/mllib')
-rw-r--r-- python/pyspark/mllib/classification.py 4
-rw-r--r-- python/pyspark/mllib/regression.py 326
2 files changed, 166 insertions, 164 deletions
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index 53a0df27ca..57106f8690 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -294,7 +294,7 @@ class LogisticRegressionWithSGD(object):
(default: 0.01)
:param regType:
The type of regularizer used for training our model.
- Allowed values:
+ Supported values:
- "l1" for using L1 regularization
- "l2" for using L2 regularization (default)
@@ -344,7 +344,7 @@ class LogisticRegressionWithLBFGS(object):
(default: 0.0)
:param regType:
The type of regularizer used for training our model.
- Allowed values:
+ Supported values:
- "l1" for using L1 regularization
- "l2" for using L2 regularization (default)
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index 4dd7083d79..3b77a62000 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -37,10 +37,11 @@ class LabeledPoint(object):
"""
Class that represents the features and labels of a data point.
- :param label: Label for this data point.
- :param features: Vector of features for this point (NumPy array,
- list, pyspark.mllib.linalg.SparseVector, or scipy.sparse
- column matrix)
+ :param label:
+ Label for this data point.
+ :param features:
+ Vector of features for this point (NumPy array, list,
+ pyspark.mllib.linalg.SparseVector, or scipy.sparse column matrix).
Note: 'label' and 'features' are accessible as class attributes.
@@ -66,8 +67,10 @@ class LinearModel(object):
"""
A linear model that has a vector of coefficients and an intercept.
- :param weights: Weights computed for every feature.
- :param intercept: Intercept computed for this model.
+ :param weights:
+ Weights computed for every feature.
+ :param intercept:
+ Intercept computed for this model.
.. versionadded:: 0.9.0
"""
@@ -217,19 +220,8 @@ def _regression_train_wrapper(train_func, modelClass, data, initial_weights):
class LinearRegressionWithSGD(object):
"""
- Train a linear regression model with no regularization using Stochastic Gradient Descent.
- This solves the least squares regression formulation
-
- f(weights) = 1/n ||A weights-y||^2
-
- which is the mean squared error.
- Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with
- its corresponding right hand side label y.
- See also the documentation for the precise formulation.
-
.. versionadded:: 0.9.0
"""
-
@classmethod
@since("0.9.0")
def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,
@@ -237,47 +229,52 @@ class LinearRegressionWithSGD(object):
validateData=True, convergenceTol=0.001):
"""
Train a linear regression model using Stochastic Gradient
- Descent (SGD).
- This solves the least squares regression formulation
-
- f(weights) = 1/(2n) ||A weights - y||^2,
-
- which is the mean squared error.
- Here the data matrix has n rows, and the input RDD holds the
- set of rows of A, each with its corresponding right hand side
- label y. See also the documentation for the precise formulation.
-
- :param data: The training data, an RDD of
- LabeledPoint.
- :param iterations: The number of iterations
- (default: 100).
- :param step: The step parameter used in SGD
- (default: 1.0).
- :param miniBatchFraction: Fraction of data to be used for each
- SGD iteration (default: 1.0).
- :param initialWeights: The initial weights (default: None).
- :param regParam: The regularizer parameter
- (default: 0.0).
- :param regType: The type of regularizer used for
- training our model.
-
- :Allowed values:
- - "l1" for using L1 regularization (lasso),
- - "l2" for using L2 regularization (ridge),
- - None for no regularization
-
- (default: None)
-
- :param intercept: Boolean parameter which indicates the
- use or not of the augmented representation
- for training data (i.e. whether bias
- features are activated or not,
- default: False).
- :param validateData: Boolean parameter which indicates if
- the algorithm should validate data
- before training. (default: True)
- :param convergenceTol: A condition which decides iteration termination.
- (default: 0.001)
+ Descent (SGD). This solves the least squares regression
+ formulation
+
+ f(weights) = 1/(2n) ||A weights - y||^2
+
+ which is the mean squared error. Here the data matrix has n rows,
+ and the input RDD holds the set of rows of A, each with its
+ corresponding right hand side label y.
+ See also the documentation for the precise formulation.
+
+ :param data:
+ The training data, an RDD of LabeledPoint.
+ :param iterations:
+ The number of iterations.
+ (default: 100)
+ :param step:
+ The step parameter used in SGD.
+ (default: 1.0)
+ :param miniBatchFraction:
+ Fraction of data to be used for each SGD iteration.
+ (default: 1.0)
+ :param initialWeights:
+ The initial weights.
+ (default: None)
+ :param regParam:
+ The regularizer parameter.
+ (default: 0.0)
+ :param regType:
+ The type of regularizer used for training our model.
+ Supported values:
+
+ - "l1" for using L1 regularization
+ - "l2" for using L2 regularization
+ - None for no regularization (default)
+ :param intercept:
+ Boolean parameter which indicates the use or not of the
+ augmented representation for training data (i.e., whether bias
+ features are activated or not).
+ (default: False)
+ :param validateData:
+ Boolean parameter which indicates if the algorithm should
+ validate data before training.
+ (default: True)
+ :param convergenceTol:
+ A condition which decides iteration termination.
+ (default: 0.001)
"""
def train(rdd, i):
return callMLlibFunc("trainLinearRegressionModelWithSGD", rdd, int(iterations),
@@ -368,56 +365,53 @@ class LassoModel(LinearRegressionModelBase):
class LassoWithSGD(object):
"""
- Train a regression model with L1-regularization using Stochastic Gradient Descent.
- This solves the L1-regularized least squares regression formulation
-
- f(weights) = 1/2n ||A weights-y||^2 + regParam ||weights||_1
-
- Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with
- its corresponding right hand side label y.
- See also the documentation for the precise formulation.
-
.. versionadded:: 0.9.0
"""
-
@classmethod
@since("0.9.0")
def train(cls, data, iterations=100, step=1.0, regParam=0.01,
miniBatchFraction=1.0, initialWeights=None, intercept=False,
validateData=True, convergenceTol=0.001):
"""
- Train a regression model with L1-regularization using
- Stochastic Gradient Descent.
- This solves the l1-regularized least squares regression
- formulation
-
- f(weights) = 1/(2n) ||A weights - y||^2 + regParam ||weights||_1.
-
- Here the data matrix has n rows, and the input RDD holds the
- set of rows of A, each with its corresponding right hand side
- label y. See also the documentation for the precise formulation.
-
- :param data: The training data, an RDD of
- LabeledPoint.
- :param iterations: The number of iterations
- (default: 100).
- :param step: The step parameter used in SGD
- (default: 1.0).
- :param regParam: The regularizer parameter
- (default: 0.01).
- :param miniBatchFraction: Fraction of data to be used for each
- SGD iteration (default: 1.0).
- :param initialWeights: The initial weights (default: None).
- :param intercept: Boolean parameter which indicates the
- use or not of the augmented representation
- for training data (i.e. whether bias
- features are activated or not,
- default: False).
- :param validateData: Boolean parameter which indicates if
- the algorithm should validate data
- before training. (default: True)
- :param convergenceTol: A condition which decides iteration termination.
- (default: 0.001)
+ Train a regression model with L1-regularization using Stochastic
+ Gradient Descent. This solves the l1-regularized least squares
+ regression formulation
+
+ f(weights) = 1/(2n) ||A weights - y||^2 + regParam ||weights||_1
+
+ Here the data matrix has n rows, and the input RDD holds the set
+ of rows of A, each with its corresponding right hand side label y.
+ See also the documentation for the precise formulation.
+
+ :param data:
+ The training data, an RDD of LabeledPoint.
+ :param iterations:
+ The number of iterations.
+ (default: 100)
+ :param step:
+ The step parameter used in SGD.
+ (default: 1.0)
+ :param regParam:
+ The regularizer parameter.
+ (default: 0.01)
+ :param miniBatchFraction:
+ Fraction of data to be used for each SGD iteration.
+ (default: 1.0)
+ :param initialWeights:
+ The initial weights.
+ (default: None)
+ :param intercept:
+ Boolean parameter which indicates the use or not of the
+ augmented representation for training data (i.e. whether bias
+ features are activated or not).
+ (default: False)
+ :param validateData:
+ Boolean parameter which indicates if the algorithm should
+ validate data before training.
+ (default: True)
+ :param convergenceTol:
+ A condition which decides iteration termination.
+ (default: 0.001)
"""
def train(rdd, i):
return callMLlibFunc("trainLassoModelWithSGD", rdd, int(iterations), float(step),
@@ -508,56 +502,53 @@ class RidgeRegressionModel(LinearRegressionModelBase):
class RidgeRegressionWithSGD(object):
"""
- Train a regression model with L2-regularization using Stochastic Gradient Descent.
- This solves the L2-regularized least squares regression formulation
-
- f(weights) = 1/2n ||A weights-y||^2 + regParam/2 ||weights||^2
-
- Here the data matrix has n rows, and the input RDD holds the set of rows of A, each with
- its corresponding right hand side label y.
- See also the documentation for the precise formulation.
-
.. versionadded:: 0.9.0
"""
-
@classmethod
@since("0.9.0")
def train(cls, data, iterations=100, step=1.0, regParam=0.01,
miniBatchFraction=1.0, initialWeights=None, intercept=False,
validateData=True, convergenceTol=0.001):
"""
- Train a regression model with L2-regularization using
- Stochastic Gradient Descent.
- This solves the l2-regularized least squares regression
- formulation
-
- f(weights) = 1/(2n) ||A weights - y||^2 + regParam/2 ||weights||^2.
-
- Here the data matrix has n rows, and the input RDD holds the
- set of rows of A, each with its corresponding right hand side
- label y. See also the documentation for the precise formulation.
-
- :param data: The training data, an RDD of
- LabeledPoint.
- :param iterations: The number of iterations
- (default: 100).
- :param step: The step parameter used in SGD
- (default: 1.0).
- :param regParam: The regularizer parameter
- (default: 0.01).
- :param miniBatchFraction: Fraction of data to be used for each
- SGD iteration (default: 1.0).
- :param initialWeights: The initial weights (default: None).
- :param intercept: Boolean parameter which indicates the
- use or not of the augmented representation
- for training data (i.e. whether bias
- features are activated or not,
- default: False).
- :param validateData: Boolean parameter which indicates if
- the algorithm should validate data
- before training. (default: True)
- :param convergenceTol: A condition which decides iteration termination.
- (default: 0.001)
+ Train a regression model with L2-regularization using Stochastic
+ Gradient Descent. This solves the l2-regularized least squares
+ regression formulation
+
+ f(weights) = 1/(2n) ||A weights - y||^2 + regParam/2 ||weights||^2
+
+ Here the data matrix has n rows, and the input RDD holds the set
+ of rows of A, each with its corresponding right hand side label y.
+ See also the documentation for the precise formulation.
+
+ :param data:
+ The training data, an RDD of LabeledPoint.
+ :param iterations:
+ The number of iterations.
+ (default: 100)
+ :param step:
+ The step parameter used in SGD.
+ (default: 1.0)
+ :param regParam:
+ The regularizer parameter.
+ (default: 0.01)
+ :param miniBatchFraction:
+ Fraction of data to be used for each SGD iteration.
+ (default: 1.0)
+ :param initialWeights:
+ The initial weights.
+ (default: None)
+ :param intercept:
+ Boolean parameter which indicates the use or not of the
+ augmented representation for training data (i.e. whether bias
+ features are activated or not).
+ (default: False)
+ :param validateData:
+ Boolean parameter which indicates if the algorithm should
+ validate data before training.
+ (default: True)
+ :param convergenceTol:
+ A condition which decides iteration termination.
+ (default: 0.001)
"""
def train(rdd, i):
return callMLlibFunc("trainRidgeModelWithSGD", rdd, int(iterations), float(step),
@@ -572,12 +563,14 @@ class IsotonicRegressionModel(Saveable, Loader):
"""
Regression model for isotonic regression.
- :param boundaries: Array of boundaries for which predictions are
- known. Boundaries must be sorted in increasing order.
- :param predictions: Array of predictions associated to the
- boundaries at the same index. Results of isotonic
- regression and therefore monotone.
- :param isotonic: indicates whether this is isotonic or antitonic.
+ :param boundaries:
+ Array of boundaries for which predictions are known. Boundaries
+ must be sorted in increasing order.
+ :param predictions:
+ Array of predictions associated to the boundaries at the same
+ index. Results of isotonic regression and therefore monotone.
+ :param isotonic:
+ Indicates whether this is isotonic or antitonic.
>>> data = [(1, 0, 1), (2, 1, 1), (3, 2, 1), (1, 3, 1), (6, 4, 1), (17, 5, 1), (16, 6, 1)]
>>> irm = IsotonicRegression.train(sc.parallelize(data))
@@ -628,7 +621,8 @@ class IsotonicRegressionModel(Saveable, Loader):
values with the same boundary then the same rules as in 2)
are used.
- :param x: Feature or RDD of Features to be labeled.
+ :param x:
+ Feature or RDD of Features to be labeled.
"""
if isinstance(x, RDD):
return x.map(lambda v: self.predict(v))
@@ -657,8 +651,8 @@ class IsotonicRegressionModel(Saveable, Loader):
class IsotonicRegression(object):
"""
Isotonic regression.
- Currently implemented using parallelized pool adjacent violators algorithm.
- Only univariate (single feature) algorithm supported.
+ Currently implemented using parallelized pool adjacent violators
+ algorithm. Only univariate (single feature) algorithm supported.
Sequential PAV implementation based on:
@@ -684,8 +678,11 @@ class IsotonicRegression(object):
"""
Train a isotonic regression model on the given data.
- :param data: RDD of (label, feature, weight) tuples.
- :param isotonic: Whether this is isotonic or antitonic.
+ :param data:
+ RDD of (label, feature, weight) tuples.
+ :param isotonic:
+ Whether this is isotonic (which is default) or antitonic.
+ (default: True)
"""
boundaries, predictions = callMLlibFunc("trainIsotonicRegressionModel",
data.map(_convert_to_vector), bool(isotonic))
@@ -721,9 +718,11 @@ class StreamingLinearAlgorithm(object):
@since("1.5.0")
def predictOn(self, dstream):
"""
- Make predictions on a dstream.
+ Use the model to make predictions on batches of data from a
+ DStream.
- :return: Transformed dstream object.
+ :return:
+ DStream containing predictions.
"""
self._validate(dstream)
return dstream.map(lambda x: self._model.predict(x))
@@ -731,9 +730,11 @@ class StreamingLinearAlgorithm(object):
@since("1.5.0")
def predictOnValues(self, dstream):
"""
- Make predictions on a keyed dstream.
+ Use the model to make predictions on the values of a DStream and
+ carry over its keys.
- :return: Transformed dstream object.
+ :return:
+ DStream containing the input keys and the predictions as values.
"""
self._validate(dstream)
return dstream.mapValues(lambda x: self._model.predict(x))
@@ -742,14 +743,15 @@ class StreamingLinearAlgorithm(object):
@inherit_doc
class StreamingLinearRegressionWithSGD(StreamingLinearAlgorithm):
"""
- Train or predict a linear regression model on streaming data. Training uses
- Stochastic Gradient Descent to update the model based on each new batch of
- incoming data from a DStream (see `LinearRegressionWithSGD` for model equation).
+ Train or predict a linear regression model on streaming data.
+ Training uses Stochastic Gradient Descent to update the model
+ based on each new batch of incoming data from a DStream
+ (see `LinearRegressionWithSGD` for model equation).
Each batch of data is assumed to be an RDD of LabeledPoints.
The number of data points per batch can vary, but the number
- of features must be constant. An initial weight
- vector must be provided.
+ of features must be constant. An initial weight vector must
+ be provided.
:param stepSize:
Step size for each iteration of gradient descent.