about | summary | refs | log | tree | commit | diff
path: root/python/pyspark/mllib/regression.py
diff options
context:
space:
mode:
author: Yanbo Liang <ybliang8@gmail.com> 2015-06-16 14:30:30 -0700
committer: Joseph K. Bradley <joseph@databricks.com> 2015-06-16 14:30:30 -0700
commit: ca998757e8ff2bdca2c7e88055c389161521d604 (patch)
tree: 971436419afa30cd08f251dcec429f8396bd20c7 /python/pyspark/mllib/regression.py
parent: cebf2411847706a98dc8df9c754ef53d6d12a87c (diff)
download: spark-ca998757e8ff2bdca2c7e88055c389161521d604.tar.gz
spark-ca998757e8ff2bdca2c7e88055c389161521d604.tar.bz2
spark-ca998757e8ff2bdca2c7e88055c389161521d604.zip
[SPARK-7916] [MLLIB] MLlib Python doc parity check for classification and regression
Check then make the MLlib Python classification and regression doc to be as complete as the Scala doc.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #6460 from yanboliang/spark-7916 and squashes the following commits:

f8deda4 [Yanbo Liang] trigger jenkins
6dc4d99 [Yanbo Liang] address comments
ce2a43e [Yanbo Liang] truncate too long line and remove extra sparse
3eaf6ad [Yanbo Liang] MLlib Python doc parity check for classification and regression
Diffstat (limited to 'python/pyspark/mllib/regression.py')
-rw-r--r-- python/pyspark/mllib/regression.py 167
1 files changed, 134 insertions, 33 deletions
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index 41bde2ce3e..0c4d7d3bbe 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -33,12 +33,12 @@ __all__ = ['LabeledPoint', 'LinearModel',
class LabeledPoint(object):
"""
- The features and labels of a data point.
+ Class that represents the features and labels of a data point.
:param label: Label for this data point.
:param features: Vector of features for this point (NumPy array,
- list, pyspark.mllib.linalg.SparseVector, or scipy.sparse
- column matrix)
+ list, pyspark.mllib.linalg.SparseVector, or scipy.sparse
+ column matrix)
Note: 'label' and 'features' are accessible as class attributes.
"""
@@ -59,7 +59,12 @@ class LabeledPoint(object):
class LinearModel(object):
- """A linear model that has a vector of coefficients and an intercept."""
+ """
+ A linear model that has a vector of coefficients and an intercept.
+
+ :param weights: Weights computed for every feature.
+ :param intercept: Intercept computed for this model.
+ """
def __init__(self, weights, intercept):
self._coeff = _convert_to_vector(weights)
@@ -193,18 +198,28 @@ class LinearRegressionWithSGD(object):
initialWeights=None, regParam=0.0, regType=None, intercept=False,
validateData=True):
"""
- Train a linear regression model on the given data.
-
- :param data: The training data.
- :param iterations: The number of iterations (default: 100).
+ Train a linear regression model using Stochastic Gradient
+ Descent (SGD).
+ This solves the least squares regression formulation
+ f(weights) = 1/n ||A weights - y||^2
+ (which is the mean squared error).
+ Here the data matrix has n rows, and the input RDD holds the
+ set of rows of A, each with its corresponding right hand side
+ label y. See also the documentation for the precise formulation.
+
+ :param data: The training data, an RDD of
+ LabeledPoint.
+ :param iterations: The number of iterations
+ (default: 100).
:param step: The step parameter used in SGD
(default: 1.0).
- :param miniBatchFraction: Fraction of data to be used for each SGD
- iteration.
+ :param miniBatchFraction: Fraction of data to be used for each
+ SGD iteration (default: 1.0).
:param initialWeights: The initial weights (default: None).
- :param regParam: The regularizer parameter (default: 0.0).
- :param regType: The type of regularizer used for training
- our model.
+ :param regParam: The regularizer parameter
+ (default: 0.0).
+ :param regType: The type of regularizer used for
+ training our model.
:Allowed values:
- "l1" for using L1 regularization (lasso),
@@ -213,13 +228,14 @@ class LinearRegressionWithSGD(object):
(default: None)
- :param intercept: Boolean parameter which indicates the use
- or not of the augmented representation for
- training data (i.e. whether bias features
- are activated or not). (default: False)
- :param validateData: Boolean parameter which indicates if the
- algorithm should validate data before training.
- (default: True)
+ :param intercept: Boolean parameter which indicates the
+ use or not of the augmented representation
+ for training data (i.e. whether bias
+ features are activated or not,
+ default: False).
+ :param validateData: Boolean parameter which indicates if
+ the algorithm should validate data
+ before training. (default: True)
"""
def train(rdd, i):
return callMLlibFunc("trainLinearRegressionModelWithSGD", rdd, int(iterations),
@@ -232,8 +248,8 @@ class LinearRegressionWithSGD(object):
@inherit_doc
class LassoModel(LinearRegressionModelBase):
- """A linear regression model derived from a least-squares fit with an
- l_1 penalty term.
+ """A linear regression model derived from a least-squares fit with
+ an l_1 penalty term.
>>> from pyspark.mllib.regression import LabeledPoint
>>> data = [
@@ -304,7 +320,36 @@ class LassoWithSGD(object):
def train(cls, data, iterations=100, step=1.0, regParam=0.01,
miniBatchFraction=1.0, initialWeights=None, intercept=False,
validateData=True):
- """Train a Lasso regression model on the given data."""
+ """
+ Train a regression model with L1-regularization using
+ Stochastic Gradient Descent.
+ This solves the l1-regularized least squares regression
+ formulation
+ f(weights) = 1/(2n) ||A weights - y||^2 + regParam ||weights||_1
+ Here the data matrix has n rows, and the input RDD holds the
+ set of rows of A, each with its corresponding right hand side
+ label y. See also the documentation for the precise formulation.
+
+ :param data: The training data, an RDD of
+ LabeledPoint.
+ :param iterations: The number of iterations
+ (default: 100).
+ :param step: The step parameter used in SGD
+ (default: 1.0).
+ :param regParam: The regularizer parameter
+ (default: 0.01).
+ :param miniBatchFraction: Fraction of data to be used for each
+ SGD iteration (default: 1.0).
+ :param initialWeights: The initial weights (default: None).
+ :param intercept: Boolean parameter which indicates the
+ use or not of the augmented representation
+ for training data (i.e. whether bias
+ features are activated or not,
+ default: False).
+ :param validateData: Boolean parameter which indicates if
+ the algorithm should validate data
+ before training. (default: True)
+ """
def train(rdd, i):
return callMLlibFunc("trainLassoModelWithSGD", rdd, int(iterations), float(step),
float(regParam), float(miniBatchFraction), i, bool(intercept),
@@ -316,8 +361,8 @@ class LassoWithSGD(object):
@inherit_doc
class RidgeRegressionModel(LinearRegressionModelBase):
- """A linear regression model derived from a least-squares fit with an
- l_2 penalty term.
+ """A linear regression model derived from a least-squares fit with
+ an l_2 penalty term.
>>> from pyspark.mllib.regression import LabeledPoint
>>> data = [
@@ -389,7 +434,36 @@ class RidgeRegressionWithSGD(object):
def train(cls, data, iterations=100, step=1.0, regParam=0.01,
miniBatchFraction=1.0, initialWeights=None, intercept=False,
validateData=True):
- """Train a ridge regression model on the given data."""
+ """
+ Train a regression model with L2-regularization using
+ Stochastic Gradient Descent.
+ This solves the l2-regularized least squares regression
+ formulation
+ f(weights) = 1/(2n) ||A weights - y||^2 + regParam/2 ||weights||^2
+ Here the data matrix has n rows, and the input RDD holds the
+ set of rows of A, each with its corresponding right hand side
+ label y. See also the documentation for the precise formulation.
+
+ :param data: The training data, an RDD of
+ LabeledPoint.
+ :param iterations: The number of iterations
+ (default: 100).
+ :param step: The step parameter used in SGD
+ (default: 1.0).
+ :param regParam: The regularizer parameter
+ (default: 0.01).
+ :param miniBatchFraction: Fraction of data to be used for each
+ SGD iteration (default: 1.0).
+ :param initialWeights: The initial weights (default: None).
+ :param intercept: Boolean parameter which indicates the
+ use or not of the augmented representation
+ for training data (i.e. whether bias
+ features are activated or not,
+ default: False).
+ :param validateData: Boolean parameter which indicates if
+ the algorithm should validate data
+ before training. (default: True)
+ """
def train(rdd, i):
return callMLlibFunc("trainRidgeModelWithSGD", rdd, int(iterations), float(step),
float(regParam), float(miniBatchFraction), i, bool(intercept),
@@ -400,7 +474,15 @@ class RidgeRegressionWithSGD(object):
class IsotonicRegressionModel(Saveable, Loader):
- """Regression model for isotonic regression.
+ """
+ Regression model for isotonic regression.
+
+ :param boundaries: Array of boundaries for which predictions are
+ known. Boundaries must be sorted in increasing order.
+ :param predictions: Array of predictions associated with the
+ boundaries at the same index. These are the results of isotonic
+ regression and are therefore monotone.
+ :param isotonic: indicates whether this is isotonic or antitonic.
>>> data = [(1, 0, 1), (2, 1, 1), (3, 2, 1), (1, 3, 1), (6, 4, 1), (17, 5, 1), (16, 6, 1)]
>>> irm = IsotonicRegression.train(sc.parallelize(data))
@@ -430,6 +512,25 @@ class IsotonicRegressionModel(Saveable, Loader):
self.isotonic = isotonic
def predict(self, x):
+ """
+ Predict labels for provided features
+ using a piecewise linear function.
+ 1) If x exactly matches a boundary then associated prediction
+ is returned. In case there are multiple predictions with the
+ same boundary then one of them is returned. Which one is
+ undefined (same as java.util.Arrays.binarySearch).
+ 2) If x is lower or higher than all boundaries then first or
+ last prediction is returned respectively. In case there are
+ multiple predictions with the same boundary then the lowest
+ or highest is returned respectively.
+ 3) If x falls between two values in boundary array then
+ prediction is treated as piecewise linear function and
+ interpolated value is returned. In case there are multiple
+ values with the same boundary then the same rules as in 2)
+ are used.
+
+ :param x: Feature or RDD of Features to be labeled.
+ """
if isinstance(x, RDD):
return x.map(lambda v: self.predict(v))
return np.interp(x, self.boundaries, self.predictions)
@@ -451,15 +552,15 @@ class IsotonicRegressionModel(Saveable, Loader):
class IsotonicRegression(object):
- """
- Run IsotonicRegression algorithm to obtain isotonic regression model.
- :param data: RDD of (label, feature, weight) tuples.
- :param isotonic: Whether this is isotonic or antitonic.
- """
@classmethod
def train(cls, data, isotonic=True):
- """Train a isotonic regression model on the given data."""
+ """
+ Train an isotonic regression model on the given data.
+
+ :param data: RDD of (label, feature, weight) tuples.
+ :param isotonic: Whether this is isotonic or antitonic.
+ """
boundaries, predictions = callMLlibFunc("trainIsotonicRegressionModel",
data.map(_convert_to_vector), bool(isotonic))
return IsotonicRegressionModel(boundaries.toArray(), predictions.toArray(), isotonic)