From 8a4ed78869e99c7de7062c3baa0ddb9d28c8e9b1 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Sun, 21 Feb 2016 20:20:41 -0800
Subject: [SPARK-13379][MLLIB] Fix MLlib LogisticRegressionWithLBFGS set
 regularization incorrectly

## What changes were proposed in this pull request?
Fix MLlib LogisticRegressionWithLBFGS regularization map as:
```SquaredL2Updater``` -> ```elasticNetParam = 0.0```
```L1Updater``` -> ```elasticNetParam = 1.0```
cc dbtsai
## How was the this patch tested?
unit tests

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #11258 from yanboliang/spark-13379.
---
 .../classification/LogisticRegressionSuite.scala   | 348 +++++++++++++++++++++
 1 file changed, 348 insertions(+)

(limited to 'mllib/src/test/scala')

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala
index 8fef1316cd..d140545e37 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala
@@ -29,6 +29,7 @@ import org.apache.spark.mllib.optimization._
 import org.apache.spark.mllib.regression._
 import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext}
 import org.apache.spark.mllib.util.TestingUtils._
+import org.apache.spark.rdd.RDD
 import org.apache.spark.util.Utils
 
 
@@ -171,6 +172,37 @@ object LogisticRegressionSuite {
 
 
 class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext with Matchers {
+
+  @transient var binaryDataset: RDD[LabeledPoint] = _
+
+  override def beforeAll(): Unit = {
+    super.beforeAll()
+    /*
+       Here is the instruction describing how to export the test data into CSV format
+       so we can validate the training accuracy compared with R's glmnet package.
+
+       val nPoints = 10000
+       val coefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191)
+       val xMean = Array(5.843, 3.057, 3.758, 1.199)
+       val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
+       val data = sc.parallelize(LogisticRegressionSuite.generateMultinomialLogisticInput(
+         coefficients, xMean, xVariance, true, nPoints, 42), 1)
+       data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1) + ", "
+         + x.features(2) + ", " + x.features(3)).saveAsTextFile("path")
+     */
+    binaryDataset = {
+      val nPoints = 10000
+      val coefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191)
+      val xMean = Array(5.843, 3.057, 3.758, 1.199)
+      val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
+
+      val testData = LogisticRegressionSuite.generateMultinomialLogisticInput(
+        coefficients, xMean, xVariance, true, nPoints, 42)
+
+      sc.parallelize(testData, 2)
+    }
+  }
+
   def validatePrediction(
       predictions: Seq[Double],
       input: Seq[LabeledPoint],
@@ -555,6 +587,322 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext w
     }
   }
 
+  /**
+   * From Spark 2.0, MLlib LogisticRegressionWithLBFGS will call the LogisticRegression
+   * implementation in ML to train model. We copies test cases from ML to guarantee
+   * they produce the same result.
+   */
+  test("binary logistic regression with intercept without regularization") {
+    val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(true)
+    val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(false)
+
+    val model1 = trainer1.run(binaryDataset)
+    val model2 = trainer2.run(binaryDataset)
+
+    /*
+       Using the following R code to load the data and train the model using glmnet package.
+
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0))
+       coefficients
+
+       5 x 1 sparse Matrix of class "dgCMatrix"
+                           s0
+       (Intercept)  2.8366423
+       data.V2     -0.5895848
+       data.V3      0.8931147
+       data.V4     -0.3925051
+       data.V5     -0.7996864
+     */
+    val interceptR = 2.8366423
+    val coefficientsR = Vectors.dense(-0.5895848, 0.8931147, -0.3925051, -0.7996864)
+
+    assert(model1.intercept ~== interceptR relTol 1E-3)
+    assert(model1.weights ~= coefficientsR relTol 1E-3)
+
+    // Without regularization, with or without feature scaling will converge to the same solution.
+    assert(model2.intercept ~== interceptR relTol 1E-3)
+    assert(model2.weights ~= coefficientsR relTol 1E-3)
+  }
+
+  test("binary logistic regression without intercept without regularization") {
+    val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(true)
+    val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(false)
+
+    val model1 = trainer1.run(binaryDataset)
+    val model2 = trainer2.run(binaryDataset)
+
+    /*
+       Using the following R code to load the data and train the model using glmnet package.
+
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       coefficients =
+           coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0, intercept=FALSE))
+       coefficients
+
+       5 x 1 sparse Matrix of class "dgCMatrix"
+                           s0
+       (Intercept)   .
+       data.V2     -0.3534996
+       data.V3      1.2964482
+       data.V4     -0.3571741
+       data.V5     -0.7407946
+     */
+    val interceptR = 0.0
+    val coefficientsR = Vectors.dense(-0.3534996, 1.2964482, -0.3571741, -0.7407946)
+
+    assert(model1.intercept ~== interceptR relTol 1E-3)
+    assert(model1.weights ~= coefficientsR relTol 1E-2)
+
+    // Without regularization, with or without feature scaling should converge to the same solution.
+    assert(model2.intercept ~== interceptR relTol 1E-3)
+    assert(model2.weights ~= coefficientsR relTol 1E-2)
+  }
+
+  test("binary logistic regression with intercept with L1 regularization") {
+    val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(true)
+    trainer1.optimizer.setUpdater(new L1Updater).setRegParam(0.12).setConvergenceTol(1E-6)
+    val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(false)
+    trainer2.optimizer.setUpdater(new L1Updater).setRegParam(0.12).setConvergenceTol(1E-6)
+
+    val model1 = trainer1.run(binaryDataset)
+    val model2 = trainer2.run(binaryDataset)
+
+    /*
+       Using the following R code to load the data and train the model using glmnet package.
+
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12))
+       coefficients
+
+       5 x 1 sparse Matrix of class "dgCMatrix"
+                            s0
+       (Intercept) -0.05627428
+       data.V2       .
+       data.V3       .
+       data.V4     -0.04325749
+       data.V5     -0.02481551
+     */
+    val interceptR1 = -0.05627428
+    val coefficientsR1 = Vectors.dense(0.0, 0.0, -0.04325749, -0.02481551)
+
+    assert(model1.intercept ~== interceptR1 relTol 1E-2)
+    assert(model1.weights ~= coefficientsR1 absTol 2E-2)
+
+    /*
+       Using the following R code to load the data and train the model using glmnet package.
+
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
+           standardize=FALSE))
+       coefficients
+
+       5 x 1 sparse Matrix of class "dgCMatrix"
+                           s0
+       (Intercept)  0.3722152
+       data.V2       .
+       data.V3       .
+       data.V4     -0.1665453
+       data.V5       .
+     */
+    val interceptR2 = 0.3722152
+    val coefficientsR2 = Vectors.dense(0.0, 0.0, -0.1665453, 0.0)
+
+    assert(model2.intercept ~== interceptR2 relTol 1E-2)
+    assert(model2.weights ~= coefficientsR2 absTol 1E-3)
+  }
+
+  test("binary logistic regression without intercept with L1 regularization") {
+    val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(true)
+    trainer1.optimizer.setUpdater(new L1Updater).setRegParam(0.12).setConvergenceTol(1E-6)
+    val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(false)
+    trainer2.optimizer.setUpdater(new L1Updater).setRegParam(0.12).setConvergenceTol(1E-6)
+
+    val model1 = trainer1.run(binaryDataset)
+    val model2 = trainer2.run(binaryDataset)
+
+    /*
+       Using the following R code to load the data and train the model using glmnet package.
+
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
+           intercept=FALSE))
+       coefficients
+
+       5 x 1 sparse Matrix of class "dgCMatrix"
+                            s0
+       (Intercept)   .
+       data.V2       .
+       data.V3       .
+       data.V4     -0.05189203
+       data.V5     -0.03891782
+     */
+    val interceptR1 = 0.0
+    val coefficientsR1 = Vectors.dense(0.0, 0.0, -0.05189203, -0.03891782)
+
+    assert(model1.intercept ~== interceptR1 relTol 1E-3)
+    assert(model1.weights ~= coefficientsR1 absTol 1E-3)
+
+    /*
+       Using the following R code to load the data and train the model using glmnet package.
+
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
+           intercept=FALSE, standardize=FALSE))
+       coefficients
+
+       5 x 1 sparse Matrix of class "dgCMatrix"
+                            s0
+       (Intercept)   .
+       data.V2       .
+       data.V3       .
+       data.V4     -0.08420782
+       data.V5       .
+     */
+    val interceptR2 = 0.0
+    val coefficientsR2 = Vectors.dense(0.0, 0.0, -0.08420782, 0.0)
+
+    assert(model2.intercept ~== interceptR2 absTol 1E-3)
+    assert(model2.weights ~= coefficientsR2 absTol 1E-3)
+  }
+
+  test("binary logistic regression with intercept with L2 regularization") {
+    val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(true)
+    trainer1.optimizer.setUpdater(new SquaredL2Updater).setRegParam(1.37).setConvergenceTol(1E-6)
+    val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(false)
+    trainer2.optimizer.setUpdater(new SquaredL2Updater).setRegParam(1.37).setConvergenceTol(1E-6)
+
+    val model1 = trainer1.run(binaryDataset)
+    val model2 = trainer2.run(binaryDataset)
+
+    /*
+       Using the following R code to load the data and train the model using glmnet package.
+
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37))
+       coefficients
+
+       5 x 1 sparse Matrix of class "dgCMatrix"
+                            s0
+       (Intercept)  0.15021751
+       data.V2     -0.07251837
+       data.V3      0.10724191
+       data.V4     -0.04865309
+       data.V5     -0.10062872
+     */
+    val interceptR1 = 0.15021751
+    val coefficientsR1 = Vectors.dense(-0.07251837, 0.10724191, -0.04865309, -0.10062872)
+
+    assert(model1.intercept ~== interceptR1 relTol 1E-3)
+    assert(model1.weights ~= coefficientsR1 relTol 1E-3)
+
+    /*
+       Using the following R code to load the data and train the model using glmnet package.
+
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
+           standardize=FALSE))
+       coefficients
+
+       5 x 1 sparse Matrix of class "dgCMatrix"
+                            s0
+       (Intercept)  0.48657516
+       data.V2     -0.05155371
+       data.V3      0.02301057
+       data.V4     -0.11482896
+       data.V5     -0.06266838
+     */
+    val interceptR2 = 0.48657516
+    val coefficientsR2 = Vectors.dense(-0.05155371, 0.02301057, -0.11482896, -0.06266838)
+
+    assert(model2.intercept ~== interceptR2 relTol 1E-3)
+    assert(model2.weights ~= coefficientsR2 relTol 1E-3)
+  }
+
+  test("binary logistic regression without intercept with L2 regularization") {
+    val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(true)
+    trainer1.optimizer.setUpdater(new SquaredL2Updater).setRegParam(1.37).setConvergenceTol(1E-6)
+    val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(false)
+    trainer2.optimizer.setUpdater(new SquaredL2Updater).setRegParam(1.37).setConvergenceTol(1E-6)
+
+    val model1 = trainer1.run(binaryDataset)
+    val model2 = trainer2.run(binaryDataset)
+
+    /*
+       Using the following R code to load the data and train the model using glmnet package.
+
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
+           intercept=FALSE))
+       coefficients
+
+       5 x 1 sparse Matrix of class "dgCMatrix"
+                            s0
+       (Intercept)   .
+       data.V2     -0.06099165
+       data.V3      0.12857058
+       data.V4     -0.04708770
+       data.V5     -0.09799775
+     */
+    val interceptR1 = 0.0
+    val coefficientsR1 = Vectors.dense(-0.06099165, 0.12857058, -0.04708770, -0.09799775)
+
+    assert(model1.intercept ~== interceptR1 absTol 1E-3)
+    assert(model1.weights ~= coefficientsR1 relTol 1E-2)
+
+    /*
+       Using the following R code to load the data and train the model using glmnet package.
+
+       library("glmnet")
+       data <- read.csv("path", header=FALSE)
+       label = factor(data$V1)
+       features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+       coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
+           intercept=FALSE, standardize=FALSE))
+       coefficients
+
+       5 x 1 sparse Matrix of class "dgCMatrix"
+                             s0
+       (Intercept)   .
+       data.V2     -0.005679651
+       data.V3      0.048967094
+       data.V4     -0.093714016
+       data.V5     -0.053314311
+     */
+    val interceptR2 = 0.0
+    val coefficientsR2 = Vectors.dense(-0.005679651, 0.048967094, -0.093714016, -0.053314311)
+
+    assert(model2.intercept ~== interceptR2 absTol 1E-3)
+    assert(model2.weights ~= coefficientsR2 relTol 1E-2)
+  }
+
 }
 
 class LogisticRegressionClusterSuite extends SparkFunSuite with LocalClusterSparkContext {
-- 
cgit v1.2.3