aboutsummaryrefslogtreecommitdiff
path: root/mllib/src/test/scala/org/apache
diff options
context:
space:
mode:
author: Yanbo Liang <ybliang8@gmail.com> 2016-02-21 20:20:41 -0800
committer: DB Tsai <dbt@netflix.com> 2016-02-21 20:20:41 -0800
commit: 8a4ed78869e99c7de7062c3baa0ddb9d28c8e9b1 (patch)
tree: 366cc15fbffd65feb502bc3c9de946f443df2165 /mllib/src/test/scala/org/apache
parent: 9bf6a926a1071fa59f76ab12b23df8de618f1cae (diff)
download: spark-8a4ed78869e99c7de7062c3baa0ddb9d28c8e9b1.tar.gz
spark-8a4ed78869e99c7de7062c3baa0ddb9d28c8e9b1.tar.bz2
spark-8a4ed78869e99c7de7062c3baa0ddb9d28c8e9b1.zip
[SPARK-13379][MLLIB] Fix MLlib LogisticRegressionWithLBFGS set regularization incorrectly
## What changes were proposed in this pull request? Fix MLlib LogisticRegressionWithLBFGS regularization map as: ```SquaredL2Updater``` -> ```elasticNetParam = 0.0``` ```L1Updater``` -> ```elasticNetParam = 1.0``` cc dbtsai ## How was this patch tested? unit tests Author: Yanbo Liang <ybliang8@gmail.com> Closes #11258 from yanboliang/spark-13379.
Diffstat (limited to 'mllib/src/test/scala/org/apache')
-rw-r--r-- mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala | 348
1 file changed, 348 insertions, 0 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala
index 8fef1316cd..d140545e37 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala
@@ -29,6 +29,7 @@ import org.apache.spark.mllib.optimization._
import org.apache.spark.mllib.regression._
import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext}
import org.apache.spark.mllib.util.TestingUtils._
+import org.apache.spark.rdd.RDD
import org.apache.spark.util.Utils
@@ -171,6 +172,37 @@ object LogisticRegressionSuite {
class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext with Matchers {
+
+ @transient var binaryDataset: RDD[LabeledPoint] = _
+
+ override def beforeAll(): Unit = {
+ super.beforeAll()
+ /*
+ The commented-out snippet below shows how to export the test data in CSV format so that
+ the trained model can be validated against the results of R's glmnet package.
+
+ val nPoints = 10000
+ val coefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191)
+ val xMean = Array(5.843, 3.057, 3.758, 1.199)
+ val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
+ val data = sc.parallelize(LogisticRegressionSuite.generateMultinomialLogisticInput(
+ coefficients, xMean, xVariance, true, nPoints, 42), 1)
+ data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1) + ", "
+ + x.features(2) + ", " + x.features(3)).saveAsTextFile("path")
+ */
+ binaryDataset = {
+ val nPoints = 10000
+ val coefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191)
+ val xMean = Array(5.843, 3.057, 3.758, 1.199)
+ val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
+
+ val testData = LogisticRegressionSuite.generateMultinomialLogisticInput(
+ coefficients, xMean, xVariance, true, nPoints, 42)
+
+ sc.parallelize(testData, 2)
+ }
+ }
+
def validatePrediction(
predictions: Seq[Double],
input: Seq[LabeledPoint],
@@ -555,6 +587,322 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext w
}
}
+ /**
+ * Starting with Spark 2.0, MLlib's LogisticRegressionWithLBFGS calls the LogisticRegression
+ * implementation in ML to train the model. The test cases below are copied from ML to
+ * guarantee that both implementations produce the same results.
+ */
+ test("binary logistic regression with intercept without regularization") {
+ val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(true)
+ val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(false)
+
+ val model1 = trainer1.run(binaryDataset)
+ val model2 = trainer2.run(binaryDataset)
+
+ /*
+ Using the following R code to load the data and train the model using glmnet package.
+
+ library("glmnet")
+ data <- read.csv("path", header=FALSE)
+ label = factor(data$V1)
+ features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+ coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0))
+ coefficients
+
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ (Intercept) 2.8366423
+ data.V2 -0.5895848
+ data.V3 0.8931147
+ data.V4 -0.3925051
+ data.V5 -0.7996864
+ */
+ val interceptR = 2.8366423
+ val coefficientsR = Vectors.dense(-0.5895848, 0.8931147, -0.3925051, -0.7996864)
+
+ assert(model1.intercept ~== interceptR relTol 1E-3)
+ assert(model1.weights ~= coefficientsR relTol 1E-3)
+
+ // Without regularization, the solution is the same with or without feature scaling.
+ assert(model2.intercept ~== interceptR relTol 1E-3)
+ assert(model2.weights ~= coefficientsR relTol 1E-3)
+ }
+
+ test("binary logistic regression without intercept without regularization") {
+ val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(true)
+ val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(false)
+
+ val model1 = trainer1.run(binaryDataset)
+ val model2 = trainer2.run(binaryDataset)
+
+ /*
+ Using the following R code to load the data and train the model using glmnet package.
+
+ library("glmnet")
+ data <- read.csv("path", header=FALSE)
+ label = factor(data$V1)
+ features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+ coefficients =
+ coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0, intercept=FALSE))
+ coefficients
+
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ (Intercept) .
+ data.V2 -0.3534996
+ data.V3 1.2964482
+ data.V4 -0.3571741
+ data.V5 -0.7407946
+ */
+ val interceptR = 0.0
+ val coefficientsR = Vectors.dense(-0.3534996, 1.2964482, -0.3571741, -0.7407946)
+
+ assert(model1.intercept ~== interceptR relTol 1E-3)
+ assert(model1.weights ~= coefficientsR relTol 1E-2)
+
+ // Without regularization, the solution is the same with or without feature scaling.
+ assert(model2.intercept ~== interceptR relTol 1E-3)
+ assert(model2.weights ~= coefficientsR relTol 1E-2)
+ }
+
+ test("binary logistic regression with intercept with L1 regularization") {
+ val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(true)
+ trainer1.optimizer.setUpdater(new L1Updater).setRegParam(0.12).setConvergenceTol(1E-6)
+ val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(false)
+ trainer2.optimizer.setUpdater(new L1Updater).setRegParam(0.12).setConvergenceTol(1E-6)
+
+ val model1 = trainer1.run(binaryDataset)
+ val model2 = trainer2.run(binaryDataset)
+
+ /*
+ Using the following R code to load the data and train the model using glmnet package.
+
+ library("glmnet")
+ data <- read.csv("path", header=FALSE)
+ label = factor(data$V1)
+ features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+ coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12))
+ coefficients
+
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ (Intercept) -0.05627428
+ data.V2 .
+ data.V3 .
+ data.V4 -0.04325749
+ data.V5 -0.02481551
+ */
+ val interceptR1 = -0.05627428
+ val coefficientsR1 = Vectors.dense(0.0, 0.0, -0.04325749, -0.02481551)
+
+ assert(model1.intercept ~== interceptR1 relTol 1E-2)
+ assert(model1.weights ~= coefficientsR1 absTol 2E-2)
+
+ /*
+ Using the following R code to load the data and train the model using glmnet package.
+
+ library("glmnet")
+ data <- read.csv("path", header=FALSE)
+ label = factor(data$V1)
+ features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+ coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
+ standardize=FALSE))
+ coefficients
+
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ (Intercept) 0.3722152
+ data.V2 .
+ data.V3 .
+ data.V4 -0.1665453
+ data.V5 .
+ */
+ val interceptR2 = 0.3722152
+ val coefficientsR2 = Vectors.dense(0.0, 0.0, -0.1665453, 0.0)
+
+ assert(model2.intercept ~== interceptR2 relTol 1E-2)
+ assert(model2.weights ~= coefficientsR2 absTol 1E-3)
+ }
+
+ test("binary logistic regression without intercept with L1 regularization") {
+ val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(true)
+ trainer1.optimizer.setUpdater(new L1Updater).setRegParam(0.12).setConvergenceTol(1E-6)
+ val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(false)
+ trainer2.optimizer.setUpdater(new L1Updater).setRegParam(0.12).setConvergenceTol(1E-6)
+
+ val model1 = trainer1.run(binaryDataset)
+ val model2 = trainer2.run(binaryDataset)
+
+ /*
+ Using the following R code to load the data and train the model using glmnet package.
+
+ library("glmnet")
+ data <- read.csv("path", header=FALSE)
+ label = factor(data$V1)
+ features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+ coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
+ intercept=FALSE))
+ coefficients
+
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ (Intercept) .
+ data.V2 .
+ data.V3 .
+ data.V4 -0.05189203
+ data.V5 -0.03891782
+ */
+ val interceptR1 = 0.0
+ val coefficientsR1 = Vectors.dense(0.0, 0.0, -0.05189203, -0.03891782)
+
+ assert(model1.intercept ~== interceptR1 relTol 1E-3)
+ assert(model1.weights ~= coefficientsR1 absTol 1E-3)
+
+ /*
+ Using the following R code to load the data and train the model using glmnet package.
+
+ library("glmnet")
+ data <- read.csv("path", header=FALSE)
+ label = factor(data$V1)
+ features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+ coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
+ intercept=FALSE, standardize=FALSE))
+ coefficients
+
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ (Intercept) .
+ data.V2 .
+ data.V3 .
+ data.V4 -0.08420782
+ data.V5 .
+ */
+ val interceptR2 = 0.0
+ val coefficientsR2 = Vectors.dense(0.0, 0.0, -0.08420782, 0.0)
+
+ assert(model2.intercept ~== interceptR2 absTol 1E-3)
+ assert(model2.weights ~= coefficientsR2 absTol 1E-3)
+ }
+
+ test("binary logistic regression with intercept with L2 regularization") {
+ val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(true)
+ trainer1.optimizer.setUpdater(new SquaredL2Updater).setRegParam(1.37).setConvergenceTol(1E-6)
+ val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(false)
+ trainer2.optimizer.setUpdater(new SquaredL2Updater).setRegParam(1.37).setConvergenceTol(1E-6)
+
+ val model1 = trainer1.run(binaryDataset)
+ val model2 = trainer2.run(binaryDataset)
+
+ /*
+ Using the following R code to load the data and train the model using glmnet package.
+
+ library("glmnet")
+ data <- read.csv("path", header=FALSE)
+ label = factor(data$V1)
+ features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+ coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37))
+ coefficients
+
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ (Intercept) 0.15021751
+ data.V2 -0.07251837
+ data.V3 0.10724191
+ data.V4 -0.04865309
+ data.V5 -0.10062872
+ */
+ val interceptR1 = 0.15021751
+ val coefficientsR1 = Vectors.dense(-0.07251837, 0.10724191, -0.04865309, -0.10062872)
+
+ assert(model1.intercept ~== interceptR1 relTol 1E-3)
+ assert(model1.weights ~= coefficientsR1 relTol 1E-3)
+
+ /*
+ Using the following R code to load the data and train the model using glmnet package.
+
+ library("glmnet")
+ data <- read.csv("path", header=FALSE)
+ label = factor(data$V1)
+ features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+ coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
+ standardize=FALSE))
+ coefficients
+
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ (Intercept) 0.48657516
+ data.V2 -0.05155371
+ data.V3 0.02301057
+ data.V4 -0.11482896
+ data.V5 -0.06266838
+ */
+ val interceptR2 = 0.48657516
+ val coefficientsR2 = Vectors.dense(-0.05155371, 0.02301057, -0.11482896, -0.06266838)
+
+ assert(model2.intercept ~== interceptR2 relTol 1E-3)
+ assert(model2.weights ~= coefficientsR2 relTol 1E-3)
+ }
+
+ test("binary logistic regression without intercept with L2 regularization") {
+ val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(true)
+ trainer1.optimizer.setUpdater(new SquaredL2Updater).setRegParam(1.37).setConvergenceTol(1E-6)
+ val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(false)
+ trainer2.optimizer.setUpdater(new SquaredL2Updater).setRegParam(1.37).setConvergenceTol(1E-6)
+
+ val model1 = trainer1.run(binaryDataset)
+ val model2 = trainer2.run(binaryDataset)
+
+ /*
+ Using the following R code to load the data and train the model using glmnet package.
+
+ library("glmnet")
+ data <- read.csv("path", header=FALSE)
+ label = factor(data$V1)
+ features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+ coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
+ intercept=FALSE))
+ coefficients
+
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ (Intercept) .
+ data.V2 -0.06099165
+ data.V3 0.12857058
+ data.V4 -0.04708770
+ data.V5 -0.09799775
+ */
+ val interceptR1 = 0.0
+ val coefficientsR1 = Vectors.dense(-0.06099165, 0.12857058, -0.04708770, -0.09799775)
+
+ assert(model1.intercept ~== interceptR1 absTol 1E-3)
+ assert(model1.weights ~= coefficientsR1 relTol 1E-2)
+
+ /*
+ Using the following R code to load the data and train the model using glmnet package.
+
+ library("glmnet")
+ data <- read.csv("path", header=FALSE)
+ label = factor(data$V1)
+ features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+ coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
+ intercept=FALSE, standardize=FALSE))
+ coefficients
+
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ (Intercept) .
+ data.V2 -0.005679651
+ data.V3 0.048967094
+ data.V4 -0.093714016
+ data.V5 -0.053314311
+ */
+ val interceptR2 = 0.0
+ val coefficientsR2 = Vectors.dense(-0.005679651, 0.048967094, -0.093714016, -0.053314311)
+
+ assert(model2.intercept ~== interceptR2 absTol 1E-3)
+ assert(model2.weights ~= coefficientsR2 relTol 1E-2)
+ }
+
}
class LogisticRegressionClusterSuite extends SparkFunSuite with LocalClusterSparkContext {