[SPARK-2550][MLLIB][APACHE SPARK] Support regularization and intercept in pyspark's linear methods

Related to Jira Issue: [SPARK-2550](https://issues.apache.org/jira/browse/SPARK-2550?jql=project%20%3D%20SPARK%20AND%20resolution%20%3D%20Unresolved%20AND%20priority%20%3D%20Major%20ORDER%20BY%20key%20DESC)
Author: Michael Giannakopoulos <miccagiann@gmail.com>
Closes #1775 from miccagiann/linearMethodsReg and squashes the following commits:
cb774c3 [Michael Giannakopoulos] MiniBatchFraction added in related PythonMLLibAPI java stubs.
81fcbc6 [Michael Giannakopoulos] Fixing a typo-error.
8ad263e [Michael Giannakopoulos] Adding regularizer type and intercept parameters to LogisticRegressionWithSGD and SVMWithSGD.
author: Michael Giannakopoulos <miccagiann@gmail.com> 2014-08-05 16:30:32 -0700
committer: Xiangrui Meng <meng@databricks.com> 2014-08-05 16:30:32 -0700
commit: 1aad9114c93c5763030c14a2328f6426d9e5bcb6 (patch)
tree: 8a8085d64428993c23961042c8b430baaa61b204 /python/pyspark/mllib/classification.py
parent: acff9a7f13b98f10a08aea1d11cfa685c3419367 (diff)
download: spark-1aad9114c93c5763030c14a2328f6426d9e5bcb6.tar.gz
spark-1aad9114c93c5763030c14a2328f6426d9e5bcb6.tar.bz2
spark-1aad9114c93c5763030c14a2328f6426d9e5bcb6.zip
1 files changed, 55 insertions, 6 deletions
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index 2bbb9c3fca..5ec1a8084d 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -73,11 +73,36 @@ class LogisticRegressionModel(LinearModel):
 
 class LogisticRegressionWithSGD(object):
     @classmethod
-    def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0, initialWeights=None):
-        """Train a logistic regression model on the given data."""
+    def train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,
+              initialWeights=None, regParam=1.0, regType=None, intercept=False):
+        """
+        Train a logistic regression model on the given data.
+
+        @param data:              The training data.
+        @param iterations:        The number of iterations (default: 100).
+        @param step:              The step parameter used in SGD
+                                  (default: 1.0).
+        @param miniBatchFraction: Fraction of data to be used for each SGD
+                                  iteration (default: 1.0).
+        @param initialWeights:    The initial weights (default: None).
+        @param regParam:          The regularizer parameter (default: 1.0).
+        @param regType:           The type of regularizer used for training
+                                  our model.
+                                  Allowed values: "l1" for using L1Updater,
+                                                  "l2" for using
+                                                       SquaredL2Updater,
+                                                  "none" for no regularizer.
+                                  (default: None, which is treated as "none")
+        @param intercept:         Boolean parameter which indicates whether
+                                  to use the augmented representation for
+                                  training data (i.e. whether bias features
+                                  are activated or not). (default: False)
+        """
         sc = data.context
+        if regType is None:
+            regType = "none"
         train_func = lambda d, i: sc._jvm.PythonMLLibAPI().trainLogisticRegressionModelWithSGD(
-            d._jrdd, iterations, step, miniBatchFraction, i)
+            d._jrdd, iterations, step, miniBatchFraction, i, regParam, regType, intercept)
         return _regression_train_wrapper(sc, train_func, LogisticRegressionModel, data,
                                          initialWeights)
 
@@ -115,11 +140,35 @@ class SVMModel(LinearModel):
 class SVMWithSGD(object):
     @classmethod
     def train(cls, data, iterations=100, step=1.0, regParam=1.0,
-              miniBatchFraction=1.0, initialWeights=None):
-        """Train a support vector machine on the given data."""
+              miniBatchFraction=1.0, initialWeights=None, regType=None, intercept=False):
+        """
+        Train a support vector machine on the given data.
+
+        @param data:              The training data.
+        @param iterations:        The number of iterations (default: 100).
+        @param step:              The step parameter used in SGD
+                                  (default: 1.0).
+        @param regParam:          The regularizer parameter (default: 1.0).
+        @param miniBatchFraction: Fraction of data to be used for each SGD
+                                  iteration (default: 1.0).
+        @param initialWeights:    The initial weights (default: None).
+        @param regType:           The type of regularizer used for training
+                                  our model.
+                                  Allowed values: "l1" for using L1Updater,
+                                                  "l2" for using
+                                                       SquaredL2Updater,
+                                                  "none" for no regularizer.
+                                  (default: None, which is treated as "none")
+        @param intercept:         Boolean parameter which indicates whether
+                                  to use the augmented representation for
+                                  training data (i.e. whether bias features
+                                  are activated or not). (default: False)
+        """
         sc = data.context
+        if regType is None:
+            regType = "none"
         train_func = lambda d, i: sc._jvm.PythonMLLibAPI().trainSVMModelWithSGD(
-            d._jrdd, iterations, step, regParam, miniBatchFraction, i)
+            d._jrdd, iterations, step, regParam, miniBatchFraction, i, regType, intercept)
         return _regression_train_wrapper(sc, train_func, SVMModel, data, initialWeights)
author	Michael Giannakopoulos <miccagiann@gmail.com>	2014-08-05 16:30:32 -0700
committer	Xiangrui Meng <meng@databricks.com>	2014-08-05 16:30:32 -0700
commit	1aad9114c93c5763030c14a2328f6426d9e5bcb6 (patch)
tree	8a8085d64428993c23961042c8b430baaa61b204 /python/pyspark/mllib/classification.py
parent	acff9a7f13b98f10a08aea1d11cfa685c3419367 (diff)
download	spark-1aad9114c93c5763030c14a2328f6426d9e5bcb6.tar.gz spark-1aad9114c93c5763030c14a2328f6426d9e5bcb6.tar.bz2 spark-1aad9114c93c5763030c14a2328f6426d9e5bcb6.zip