From 8a4ed78869e99c7de7062c3baa0ddb9d28c8e9b1 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Sun, 21 Feb 2016 20:20:41 -0800 Subject: [SPARK-13379][MLLIB] Fix MLlib LogisticRegressionWithLBFGS set regularization incorrectly ## What changes were proposed in this pull request? Fix MLlib LogisticRegressionWithLBFGS regularization map as: ```SquaredL2Updater``` -> ```elasticNetParam = 0.0``` ```L1Updater``` -> ```elasticNetParam = 1.0``` cc dbtsai ## How was this patch tested? unit tests Author: Yanbo Liang Closes #11258 from yanboliang/spark-13379. --- .../classification/LogisticRegressionSuite.scala | 348 +++++++++++++++++++++ 1 file changed, 348 insertions(+) (limited to 'mllib/src/test/scala') diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala index 8fef1316cd..d140545e37 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala @@ -29,6 +29,7 @@ import org.apache.spark.mllib.optimization._ import org.apache.spark.mllib.regression._ import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext} import org.apache.spark.mllib.util.TestingUtils._ +import org.apache.spark.rdd.RDD import org.apache.spark.util.Utils @@ -171,6 +172,37 @@ object LogisticRegressionSuite { class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext with Matchers { + + @transient var binaryDataset: RDD[LabeledPoint] = _ + + override def beforeAll(): Unit = { + super.beforeAll() + /* + Here is the instruction describing how to export the test data into CSV format + so we can validate the training accuracy compared with R's glmnet package. 
+ + val nPoints = 10000 + val coefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191) + val xMean = Array(5.843, 3.057, 3.758, 1.199) + val xVariance = Array(0.6856, 0.1899, 3.116, 0.581) + val data = sc.parallelize(LogisticRegressionSuite.generateMultinomialLogisticInput( + coefficients, xMean, xVariance, true, nPoints, 42), 1) + data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1) + ", " + + x.features(2) + ", " + x.features(3)).saveAsTextFile("path") + */ + binaryDataset = { + val nPoints = 10000 + val coefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191) + val xMean = Array(5.843, 3.057, 3.758, 1.199) + val xVariance = Array(0.6856, 0.1899, 3.116, 0.581) + + val testData = LogisticRegressionSuite.generateMultinomialLogisticInput( + coefficients, xMean, xVariance, true, nPoints, 42) + + sc.parallelize(testData, 2) + } + } + def validatePrediction( predictions: Seq[Double], input: Seq[LabeledPoint], @@ -555,6 +587,322 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext w } } + /** + * From Spark 2.0, MLlib LogisticRegressionWithLBFGS will call the LogisticRegression + * implementation in ML to train the model. We copy test cases from ML to guarantee + * they produce the same result. + */ + test("binary logistic regression with intercept without regularization") { + val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(true) + val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(false) + + val model1 = trainer1.run(binaryDataset) + val model2 = trainer2.run(binaryDataset) + + /* + Using the following R code to load the data and train the model using glmnet package. 
+ + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0)) + coefficients + + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 2.8366423 + data.V2 -0.5895848 + data.V3 0.8931147 + data.V4 -0.3925051 + data.V5 -0.7996864 + */ + val interceptR = 2.8366423 + val coefficientsR = Vectors.dense(-0.5895848, 0.8931147, -0.3925051, -0.7996864) + + assert(model1.intercept ~== interceptR relTol 1E-3) + assert(model1.weights ~= coefficientsR relTol 1E-3) + + // Without regularization, with or without feature scaling will converge to the same solution. + assert(model2.intercept ~== interceptR relTol 1E-3) + assert(model2.weights ~= coefficientsR relTol 1E-3) + } + + test("binary logistic regression without intercept without regularization") { + val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(true) + val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(false) + + val model1 = trainer1.run(binaryDataset) + val model2 = trainer2.run(binaryDataset) + + /* + Using the following R code to load the data and train the model using glmnet package. + + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficients = + coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0, intercept=FALSE)) + coefficients + + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . 
+ data.V2 -0.3534996 + data.V3 1.2964482 + data.V4 -0.3571741 + data.V5 -0.7407946 + */ + val interceptR = 0.0 + val coefficientsR = Vectors.dense(-0.3534996, 1.2964482, -0.3571741, -0.7407946) + + assert(model1.intercept ~== interceptR relTol 1E-3) + assert(model1.weights ~= coefficientsR relTol 1E-2) + + // Without regularization, with or without feature scaling should converge to the same solution. + assert(model2.intercept ~== interceptR relTol 1E-3) + assert(model2.weights ~= coefficientsR relTol 1E-2) + } + + test("binary logistic regression with intercept with L1 regularization") { + val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(true) + trainer1.optimizer.setUpdater(new L1Updater).setRegParam(0.12).setConvergenceTol(1E-6) + val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(false) + trainer2.optimizer.setUpdater(new L1Updater).setRegParam(0.12).setConvergenceTol(1E-6) + + val model1 = trainer1.run(binaryDataset) + val model2 = trainer2.run(binaryDataset) + + /* + Using the following R code to load the data and train the model using glmnet package. + + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12)) + coefficients + + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) -0.05627428 + data.V2 . + data.V3 . + data.V4 -0.04325749 + data.V5 -0.02481551 + */ + val interceptR1 = -0.05627428 + val coefficientsR1 = Vectors.dense(0.0, 0.0, -0.04325749, -0.02481551) + + assert(model1.intercept ~== interceptR1 relTol 1E-2) + assert(model1.weights ~= coefficientsR1 absTol 2E-2) + + /* + Using the following R code to load the data and train the model using glmnet package. 
+ + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12, + standardize=FALSE)) + coefficients + + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 0.3722152 + data.V2 . + data.V3 . + data.V4 -0.1665453 + data.V5 . + */ + val interceptR2 = 0.3722152 + val coefficientsR2 = Vectors.dense(0.0, 0.0, -0.1665453, 0.0) + + assert(model2.intercept ~== interceptR2 relTol 1E-2) + assert(model2.weights ~= coefficientsR2 absTol 1E-3) + } + + test("binary logistic regression without intercept with L1 regularization") { + val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(true) + trainer1.optimizer.setUpdater(new L1Updater).setRegParam(0.12).setConvergenceTol(1E-6) + val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(false) + trainer2.optimizer.setUpdater(new L1Updater).setRegParam(0.12).setConvergenceTol(1E-6) + + val model1 = trainer1.run(binaryDataset) + val model2 = trainer2.run(binaryDataset) + + /* + Using the following R code to load the data and train the model using glmnet package. + + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12, + intercept=FALSE)) + coefficients + + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + data.V2 . + data.V3 . + data.V4 -0.05189203 + data.V5 -0.03891782 + */ + val interceptR1 = 0.0 + val coefficientsR1 = Vectors.dense(0.0, 0.0, -0.05189203, -0.03891782) + + assert(model1.intercept ~== interceptR1 relTol 1E-3) + assert(model1.weights ~= coefficientsR1 absTol 1E-3) + + /* + Using the following R code to load the data and train the model using glmnet package. 
+ + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12, + intercept=FALSE, standardize=FALSE)) + coefficients + + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + data.V2 . + data.V3 . + data.V4 -0.08420782 + data.V5 . + */ + val interceptR2 = 0.0 + val coefficientsR2 = Vectors.dense(0.0, 0.0, -0.08420782, 0.0) + + assert(model2.intercept ~== interceptR2 absTol 1E-3) + assert(model2.weights ~= coefficientsR2 absTol 1E-3) + } + + test("binary logistic regression with intercept with L2 regularization") { + val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(true) + trainer1.optimizer.setUpdater(new SquaredL2Updater).setRegParam(1.37).setConvergenceTol(1E-6) + val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(false) + trainer2.optimizer.setUpdater(new SquaredL2Updater).setRegParam(1.37).setConvergenceTol(1E-6) + + val model1 = trainer1.run(binaryDataset) + val model2 = trainer2.run(binaryDataset) + + /* + Using the following R code to load the data and train the model using glmnet package. 
+ + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37)) + coefficients + + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 0.15021751 + data.V2 -0.07251837 + data.V3 0.10724191 + data.V4 -0.04865309 + data.V5 -0.10062872 + */ + val interceptR1 = 0.15021751 + val coefficientsR1 = Vectors.dense(-0.07251837, 0.10724191, -0.04865309, -0.10062872) + + assert(model1.intercept ~== interceptR1 relTol 1E-3) + assert(model1.weights ~= coefficientsR1 relTol 1E-3) + + /* + Using the following R code to load the data and train the model using glmnet package. + + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37, + standardize=FALSE)) + coefficients + + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 0.48657516 + data.V2 -0.05155371 + data.V3 0.02301057 + data.V4 -0.11482896 + data.V5 -0.06266838 + */ + val interceptR2 = 0.48657516 + val coefficientsR2 = Vectors.dense(-0.05155371, 0.02301057, -0.11482896, -0.06266838) + + assert(model2.intercept ~== interceptR2 relTol 1E-3) + assert(model2.weights ~= coefficientsR2 relTol 1E-3) + } + + test("binary logistic regression without intercept with L2 regularization") { + val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(true) + trainer1.optimizer.setUpdater(new SquaredL2Updater).setRegParam(1.37).setConvergenceTol(1E-6) + val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(false) + trainer2.optimizer.setUpdater(new SquaredL2Updater).setRegParam(1.37).setConvergenceTol(1E-6) + + val model1 = trainer1.run(binaryDataset) + val model2 = 
trainer2.run(binaryDataset) + + /* + Using the following R code to load the data and train the model using glmnet package. + + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37, + intercept=FALSE)) + coefficients + + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + data.V2 -0.06099165 + data.V3 0.12857058 + data.V4 -0.04708770 + data.V5 -0.09799775 + */ + val interceptR1 = 0.0 + val coefficientsR1 = Vectors.dense(-0.06099165, 0.12857058, -0.04708770, -0.09799775) + + assert(model1.intercept ~== interceptR1 absTol 1E-3) + assert(model1.weights ~= coefficientsR1 relTol 1E-2) + + /* + Using the following R code to load the data and train the model using glmnet package. + + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37, + intercept=FALSE, standardize=FALSE)) + coefficients + + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + data.V2 -0.005679651 + data.V3 0.048967094 + data.V4 -0.093714016 + data.V5 -0.053314311 + */ + val interceptR2 = 0.0 + val coefficientsR2 = Vectors.dense(-0.005679651, 0.048967094, -0.093714016, -0.053314311) + + assert(model2.intercept ~== interceptR2 absTol 1E-3) + assert(model2.weights ~= coefficientsR2 relTol 1E-2) + } + } class LogisticRegressionClusterSuite extends SparkFunSuite with LocalClusterSparkContext { -- cgit v1.2.3