author     Yanbo Liang <ybliang8@gmail.com>    2015-03-25 13:38:33 -0700
committer  Joseph K. Bradley <joseph@databricks.com>    2015-03-25 13:38:33 -0700
commit     435337381f093f95248c8f0204e60c0b366edc81 (patch)
tree       60e820022e209f5bc5771d71d0a147bdd391c545 /python
parent     c1b74df6042b33b2b061cb07c2fbd82dba9074bb (diff)
[SPARK-6256] [MLlib] MLlib Python API parity check for regression
MLlib Python API parity check for regression. The following major disparities need to be added to the Python API:
```scala
LinearRegressionWithSGD
  setValidateData
LassoWithSGD
  setIntercept
  setValidateData
RidgeRegressionWithSGD
  setIntercept
  setValidateData
```
setFeatureScaling is a private MLlib function and does not need to be exposed in PySpark.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #4997 from yanboliang/spark-6256 and squashes the following commits:

102f498 [Yanbo Liang] fix intercept issue & add doc test
1fb7b4f [Yanbo Liang] change 'intercept' to 'addIntercept'
de5ecbc [Yanbo Liang] MLlib Python API parity check for regression
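In practical terms, the Scala setters listed above become plain keyword arguments on the Python `train` methods. A minimal sketch of the mapping (assuming a live SparkContext `sc`; the toy data below is invented for illustration):

```python
from numpy import array
from pyspark.mllib.regression import LabeledPoint, LassoWithSGD

# Invented toy data: the label roughly tracks the single feature.
data = sc.parallelize([LabeledPoint(0.0, [0.0]), LabeledPoint(1.0, [1.0]),
                       LabeledPoint(3.0, [2.0]), LabeledPoint(2.0, [3.0])])

# intercept=True    corresponds to Scala's setIntercept(true)
# validateData=True corresponds to Scala's setValidateData(true)
model = LassoWithSGD.train(data, iterations=100, step=1.0, regParam=0.01,
                           initialWeights=array([1.0]),
                           intercept=True, validateData=True)
print(model.weights)
print(model.intercept)
```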
Diffstat (limited to 'python')
-rw-r--r--  python/pyspark/mllib/regression.py  |  43
1 file changed, 36 insertions(+), 7 deletions(-)
diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py
index 414a0ada80..209f1ee473 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -140,6 +140,13 @@ class LinearRegressionModel(LinearRegressionModelBase):
True
>>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
True
+ >>> lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=100, step=1.0,
+ ... miniBatchFraction=1.0, initialWeights=array([1.0]), regParam=0.1, regType="l2",
+ ... intercept=True, validateData=True)
+ >>> abs(lrm.predict(array([0.0])) - 0) < 0.5
+ True
+ >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
+ True
"""
def save(self, sc, path):
java_model = sc._jvm.org.apache.spark.mllib.regression.LinearRegressionModel(
@@ -173,7 +180,8 @@ class LinearRegressionWithSGD(object):
@classmethod
def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,
- initialWeights=None, regParam=0.0, regType=None, intercept=False):
+ initialWeights=None, regParam=0.0, regType=None, intercept=False,
+ validateData=True):
"""
Train a linear regression model on the given data.
@@ -195,15 +203,18 @@ class LinearRegressionWithSGD(object):
(default: None)
- @param intercept: Boolean parameter which indicates the use
+ :param intercept: Boolean parameter which indicates the use
or not of the augmented representation for
training data (i.e. whether bias features
are activated or not). (default: False)
+ :param validateData: Boolean parameter which indicates if the
+ algorithm should validate data before training.
+ (default: True)
"""
def train(rdd, i):
return callMLlibFunc("trainLinearRegressionModelWithSGD", rdd, int(iterations),
float(step), float(miniBatchFraction), i, float(regParam),
- regType, bool(intercept))
+ regType, bool(intercept), bool(validateData))
return _regression_train_wrapper(train, LinearRegressionModel, data, initialWeights)
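For context, here is a minimal sketch of the extended `LinearRegressionWithSGD.train` signature in use (again assuming a live SparkContext `sc`; the data is a toy set invented for this example):

```python
from numpy import array
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD

# Invented toy data: y is approximately x on a single feature.
points = sc.parallelize([LabeledPoint(float(x), [float(x)]) for x in range(4)])

# validateData=True (the default) has MLlib sanity-check the input RDD
# before training; intercept=True additionally fits a bias term.
lrm = LinearRegressionWithSGD.train(points, iterations=100, step=1.0,
                                    miniBatchFraction=1.0,
                                    initialWeights=array([1.0]),
                                    regParam=0.1, regType="l2",
                                    intercept=True, validateData=True)
```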
@@ -253,6 +264,13 @@ class LassoModel(LinearRegressionModelBase):
True
>>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
True
+ >>> lrm = LassoWithSGD.train(sc.parallelize(data), iterations=100, step=1.0,
+ ... regParam=0.01, miniBatchFraction=1.0, initialWeights=array([1.0]), intercept=True,
+ ... validateData=True)
+ >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
+ True
+ >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
+ True
"""
def save(self, sc, path):
java_model = sc._jvm.org.apache.spark.mllib.regression.LassoModel(
@@ -273,11 +291,13 @@ class LassoWithSGD(object):
@classmethod
def train(cls, data, iterations=100, step=1.0, regParam=0.01,
- miniBatchFraction=1.0, initialWeights=None):
+ miniBatchFraction=1.0, initialWeights=None, intercept=False,
+ validateData=True):
"""Train a Lasso regression model on the given data."""
def train(rdd, i):
return callMLlibFunc("trainLassoModelWithSGD", rdd, int(iterations), float(step),
- float(regParam), float(miniBatchFraction), i)
+ float(regParam), float(miniBatchFraction), i, bool(intercept),
+ bool(validateData))
return _regression_train_wrapper(train, LassoModel, data, initialWeights)
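Before this patch, the Python `LassoWithSGD.train` had no way to request a bias term. A sketch of the new keywords (same assumptions as above: a live `sc` and invented toy data):

```python
from numpy import array
from pyspark.mllib.regression import LabeledPoint, LassoWithSGD

# Invented toy data with a constant offset, so a fitted intercept matters.
points = sc.parallelize([LabeledPoint(float(x) + 2.0, [float(x)])
                         for x in range(4)])

lasso = LassoWithSGD.train(points, iterations=100, step=1.0, regParam=0.01,
                           initialWeights=array([1.0]),
                           intercept=True, validateData=True)
# With intercept=True the fitted bias is exposed on the model.
print(lasso.intercept)
```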
@@ -327,6 +347,13 @@ class RidgeRegressionModel(LinearRegressionModelBase):
True
>>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
True
+ >>> lrm = RidgeRegressionWithSGD.train(sc.parallelize(data), iterations=100, step=1.0,
+ ... regParam=0.01, miniBatchFraction=1.0, initialWeights=array([1.0]), intercept=True,
+ ... validateData=True)
+ >>> abs(lrm.predict(np.array([0.0])) - 0) < 0.5
+ True
+ >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
+ True
"""
def save(self, sc, path):
java_model = sc._jvm.org.apache.spark.mllib.regression.RidgeRegressionModel(
@@ -347,11 +374,13 @@ class RidgeRegressionWithSGD(object):
@classmethod
def train(cls, data, iterations=100, step=1.0, regParam=0.01,
- miniBatchFraction=1.0, initialWeights=None):
+ miniBatchFraction=1.0, initialWeights=None, intercept=False,
+ validateData=True):
"""Train a ridge regression model on the given data."""
def train(rdd, i):
return callMLlibFunc("trainRidgeModelWithSGD", rdd, int(iterations), float(step),
- float(regParam), float(miniBatchFraction), i)
+ float(regParam), float(miniBatchFraction), i, bool(intercept),
+ bool(validateData))
return _regression_train_wrapper(train, RidgeRegressionModel, data, initialWeights)
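The same pattern applies to ridge regression; one plausible use of the new flag is skipping validation when the input is already known to be well-formed (again a sketch, assuming a live `sc` and invented data):

```python
from numpy import array
from pyspark.mllib.regression import LabeledPoint, RidgeRegressionWithSGD

points = sc.parallelize([LabeledPoint(float(x), [float(x)]) for x in range(4)])

# validateData=False skips MLlib's pre-training data validation pass.
ridge = RidgeRegressionWithSGD.train(points, iterations=100, step=1.0,
                                     regParam=0.01,
                                     initialWeights=array([1.0]),
                                     intercept=False, validateData=False)
```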