diff options
author | Xiangrui Meng <meng@databricks.com> | 2014-11-13 13:54:16 -0800 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2014-11-13 13:54:16 -0800 |
commit | 32218307edc6de2b08d5f7a0db6d566081d27197 (patch) | |
tree | 08be5a64c506f6d8f403dfae3f49ab50e8ac6e01 /python | |
parent | 4b0c1edfdf457cde0e39083c47961184059efded (diff) | |
download | spark-32218307edc6de2b08d5f7a0db6d566081d27197.tar.gz spark-32218307edc6de2b08d5f7a0db6d566081d27197.tar.bz2 spark-32218307edc6de2b08d5f7a0db6d566081d27197.zip |
[SPARK-4372][MLLIB] Make LR and SVM's default parameters consistent in Scala and Python
In Python, the current default regParam is 1.0 and the default regType is documented as "none" (although L2 is what is actually applied), while in Scala the default regParam is 0.0 and the default regType is L2. We should make the default values consistent. This PR sets the default regType to L2 and the default regParam to 0.01. Note that the default regParam value in LIBLINEAR (and hence scikit-learn) is 1.0. However, our formulation uses the average loss instead of the total loss, so the same regParam weighs the penalty more heavily relative to the loss term (roughly by a factor of the number of training examples). Hence regParam=1.0 is definitely too heavy.
In LinearRegression, we set regParam=0.0 and regType=None, because we have separate classes for Lasso and Ridge, both of which use regParam=0.01 as the default.
davies atalwalkar
Author: Xiangrui Meng <meng@databricks.com>
Closes #3232 from mengxr/SPARK-4372 and squashes the following commits:
9979837 [Xiangrui Meng] update Ridge/Lasso to use default regParam 0.01; cast input arguments
d3ba096 [Xiangrui Meng] change 'none' back to None
1909a6e [Xiangrui Meng] change default regParam to 0.01 and regType to L2 in LR and SVM
Diffstat (limited to 'python')
-rw-r--r-- | python/pyspark/mllib/classification.py | 36 | ||||
-rw-r--r-- | python/pyspark/mllib/regression.py | 36 |
2 files changed, 37 insertions, 35 deletions
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index 5d90dddb5d..b654813fb4 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -76,7 +76,7 @@ class LogisticRegressionWithSGD(object): @classmethod def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, - initialWeights=None, regParam=1.0, regType="none", intercept=False): + initialWeights=None, regParam=0.01, regType="l2", intercept=False): """ Train a logistic regression model on the given data. @@ -87,16 +87,16 @@ class LogisticRegressionWithSGD(object): :param miniBatchFraction: Fraction of data to be used for each SGD iteration. :param initialWeights: The initial weights (default: None). - :param regParam: The regularizer parameter (default: 1.0). + :param regParam: The regularizer parameter (default: 0.01). :param regType: The type of regularizer used for training our model. :Allowed values: - - "l1" for using L1Updater - - "l2" for using SquaredL2Updater - - "none" for no regularizer + - "l1" for using L1 regularization + - "l2" for using L2 regularization + - None for no regularization - (default: "none") + (default: "l2") @param intercept: Boolean parameter which indicates the use or not of the augmented representation for @@ -104,8 +104,9 @@ class LogisticRegressionWithSGD(object): are activated or not). 
""" def train(rdd, i): - return callMLlibFunc("trainLogisticRegressionModelWithSGD", rdd, iterations, step, - miniBatchFraction, i, regParam, regType, intercept) + return callMLlibFunc("trainLogisticRegressionModelWithSGD", rdd, int(iterations), + float(step), float(miniBatchFraction), i, float(regParam), regType, + bool(intercept)) return _regression_train_wrapper(train, LogisticRegressionModel, data, initialWeights) @@ -145,8 +146,8 @@ class SVMModel(LinearModel): class SVMWithSGD(object): @classmethod - def train(cls, data, iterations=100, step=1.0, regParam=1.0, - miniBatchFraction=1.0, initialWeights=None, regType="none", intercept=False): + def train(cls, data, iterations=100, step=1.0, regParam=0.01, + miniBatchFraction=1.0, initialWeights=None, regType="l2", intercept=False): """ Train a support vector machine on the given data. @@ -154,7 +155,7 @@ class SVMWithSGD(object): :param iterations: The number of iterations (default: 100). :param step: The step parameter used in SGD (default: 1.0). - :param regParam: The regularizer parameter (default: 1.0). + :param regParam: The regularizer parameter (default: 0.01). :param miniBatchFraction: Fraction of data to be used for each SGD iteration. :param initialWeights: The initial weights (default: None). @@ -162,11 +163,11 @@ class SVMWithSGD(object): our model. :Allowed values: - - "l1" for using L1Updater - - "l2" for using SquaredL2Updater, - - "none" for no regularizer. + - "l1" for using L1 regularization + - "l2" for using L2 regularization + - None for no regularization - (default: "none") + (default: "l2") @param intercept: Boolean parameter which indicates the use or not of the augmented representation for @@ -174,8 +175,9 @@ class SVMWithSGD(object): are activated or not). 
""" def train(rdd, i): - return callMLlibFunc("trainSVMModelWithSGD", rdd, iterations, step, regParam, - miniBatchFraction, i, regType, intercept) + return callMLlibFunc("trainSVMModelWithSGD", rdd, int(iterations), float(step), + float(regParam), float(miniBatchFraction), i, regType, + bool(intercept)) return _regression_train_wrapper(train, SVMModel, data, initialWeights) diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 66e25a48df..f4f5e615fa 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -138,7 +138,7 @@ class LinearRegressionWithSGD(object): @classmethod def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, - initialWeights=None, regParam=1.0, regType="none", intercept=False): + initialWeights=None, regParam=0.0, regType=None, intercept=False): """ Train a linear regression model on the given data. @@ -149,16 +149,16 @@ class LinearRegressionWithSGD(object): :param miniBatchFraction: Fraction of data to be used for each SGD iteration. :param initialWeights: The initial weights (default: None). - :param regParam: The regularizer parameter (default: 1.0). + :param regParam: The regularizer parameter (default: 0.0). :param regType: The type of regularizer used for training our model. :Allowed values: - - "l1" for using L1Updater, - - "l2" for using SquaredL2Updater, - - "none" for no regularizer. + - "l1" for using L1 regularization (lasso), + - "l2" for using L2 regularization (ridge), + - None for no regularization - (default: "none") + (default: None) @param intercept: Boolean parameter which indicates the use or not of the augmented representation for @@ -166,11 +166,11 @@ class LinearRegressionWithSGD(object): are activated or not). 
""" def train(rdd, i): - return callMLlibFunc("trainLinearRegressionModelWithSGD", rdd, iterations, step, - miniBatchFraction, i, regParam, regType, intercept) + return callMLlibFunc("trainLinearRegressionModelWithSGD", rdd, int(iterations), + float(step), float(miniBatchFraction), i, float(regParam), + regType, bool(intercept)) - return _regression_train_wrapper(train, LinearRegressionModel, - data, initialWeights) + return _regression_train_wrapper(train, LinearRegressionModel, data, initialWeights) class LassoModel(LinearRegressionModelBase): @@ -209,12 +209,13 @@ class LassoModel(LinearRegressionModelBase): class LassoWithSGD(object): @classmethod - def train(cls, data, iterations=100, step=1.0, regParam=1.0, + def train(cls, data, iterations=100, step=1.0, regParam=0.01, miniBatchFraction=1.0, initialWeights=None): """Train a Lasso regression model on the given data.""" def train(rdd, i): - return callMLlibFunc("trainLassoModelWithSGD", rdd, iterations, step, regParam, - miniBatchFraction, i) + return callMLlibFunc("trainLassoModelWithSGD", rdd, int(iterations), float(step), + float(regParam), float(miniBatchFraction), i) + return _regression_train_wrapper(train, LassoModel, data, initialWeights) @@ -254,15 +255,14 @@ class RidgeRegressionModel(LinearRegressionModelBase): class RidgeRegressionWithSGD(object): @classmethod - def train(cls, data, iterations=100, step=1.0, regParam=1.0, + def train(cls, data, iterations=100, step=1.0, regParam=0.01, miniBatchFraction=1.0, initialWeights=None): """Train a ridge regression model on the given data.""" def train(rdd, i): - return callMLlibFunc("trainRidgeModelWithSGD", rdd, iterations, step, regParam, - miniBatchFraction, i) + return callMLlibFunc("trainRidgeModelWithSGD", rdd, int(iterations), float(step), + float(regParam), float(miniBatchFraction), i) - return _regression_train_wrapper(train, RidgeRegressionModel, - data, initialWeights) + return _regression_train_wrapper(train, RidgeRegressionModel, data, 
initialWeights) def _test(): |