diff options
5 files changed, 384 insertions, 117 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 3967151f76..8fc9199fb4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.classification import scala.collection.mutable -import breeze.linalg.{DenseVector => BDV, norm => brzNorm} +import breeze.linalg.{DenseVector => BDV} import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN} import org.apache.spark.{Logging, SparkException} @@ -41,7 +41,7 @@ import org.apache.spark.storage.StorageLevel */ private[classification] trait LogisticRegressionParams extends ProbabilisticClassifierParams with HasRegParam with HasElasticNetParam with HasMaxIter with HasFitIntercept with HasTol - with HasThreshold + with HasThreshold with HasStandardization /** * :: Experimental :: @@ -98,6 +98,18 @@ class LogisticRegression(override val uid: String) def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) setDefault(fitIntercept -> true) + /** + * Whether to standardize the training features before fitting the model. + * The coefficients of models will be always returned on the original scale, + * so it will be transparent for users. Note that when no regularization, + * with or without standardization, the models should be always converged to + * the same solution. + * Default is true. + * @group setParam + * */ + def setStandardization(value: Boolean): this.type = set(standardization, value) + setDefault(standardization -> true) + /** @group setParam */ def setThreshold(value: Double): this.type = set(threshold, value) setDefault(threshold -> 0.5) @@ -149,15 +161,28 @@ class LogisticRegression(override val uid: String) val regParamL1 = $(elasticNetParam) * $(regParam) val regParamL2 = (1.0 - $(elasticNetParam)) * $(regParam) - val costFun = new LogisticCostFun(instances, numClasses, $(fitIntercept), + val costFun = new LogisticCostFun(instances, numClasses, $(fitIntercept), $(standardization), featuresStd, featuresMean, regParamL2) val optimizer = if ($(elasticNetParam) == 0.0 || $(regParam) == 0.0) { new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol)) } else { - // Remove the L1 penalization on the intercept def regParamL1Fun = (index: Int) => { - if (index == numFeatures) 0.0 else regParamL1 + // Remove the L1 penalization on the intercept + if (index == numFeatures) { + 0.0 + } else { + if ($(standardization)) { + regParamL1 + } else { + // If `standardization` is false, we still standardize the data + // to improve the rate of convergence; as a result, we have to + // perform this reverse standardization by penalizing each component + // differently to get effectively the same objective function when + // the training dataset is not standardized. + if (featuresStd(index) != 0.0) regParamL1 / featuresStd(index) else 0.0 + } + } } new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, regParamL1Fun, $(tol)) } @@ -523,11 +548,13 @@ private class LogisticCostFun( data: RDD[(Double, Vector)], numClasses: Int, fitIntercept: Boolean, + standardization: Boolean, featuresStd: Array[Double], featuresMean: Array[Double], regParamL2: Double) extends DiffFunction[BDV[Double]] { override def calculate(weights: BDV[Double]): (Double, BDV[Double]) = { + val numFeatures = featuresStd.length val w = Vectors.fromBreeze(weights) val logisticAggregator = data.treeAggregate(new LogisticAggregator(w, numClasses, fitIntercept, @@ -539,27 +566,43 @@ private class LogisticCostFun( case (aggregator1, aggregator2) => aggregator1.merge(aggregator2) }) - // regVal is the sum of weight squares for L2 regularization - val norm = if (regParamL2 == 0.0) { - 0.0 - } else if (fitIntercept) { - brzNorm(Vectors.dense(weights.toArray.slice(0, weights.size -1)).toBreeze, 2.0) - } else { - brzNorm(weights, 2.0) - } - val regVal = 0.5 * regParamL2 * norm * norm + val totalGradientArray = logisticAggregator.gradient.toArray - val loss = logisticAggregator.loss + regVal - val gradient = logisticAggregator.gradient - - if (fitIntercept) { - val wArray = w.toArray.clone() - wArray(wArray.length - 1) = 0.0 - axpy(regParamL2, Vectors.dense(wArray), gradient) + // regVal is the sum of weight squares excluding intercept for L2 regularization. + val regVal = if (regParamL2 == 0.0) { + 0.0 } else { - axpy(regParamL2, w, gradient) + var sum = 0.0 + w.foreachActive { (index, value) => + // If `fitIntercept` is true, the last term which is intercept doesn't + // contribute to the regularization. + if (index != numFeatures) { + // The following code will compute the loss of the regularization; also + // the gradient of the regularization, and add back to totalGradientArray. + sum += { + if (standardization) { + totalGradientArray(index) += regParamL2 * value + value * value + } else { + if (featuresStd(index) != 0.0) { + // If `standardization` is false, we still standardize the data + // to improve the rate of convergence; as a result, we have to + // perform this reverse standardization by penalizing each component + // differently to get effectively the same objective function when + // the training dataset is not standardized. + val temp = value / (featuresStd(index) * featuresStd(index)) + totalGradientArray(index) += regParamL2 * temp + value * temp + } else { + 0.0 + } + } + } + } + } + 0.5 * regParamL2 * sum } - (loss, gradient.toBreeze.asInstanceOf[BDV[Double]]) + (logisticAggregator.loss + regVal, new BDV(totalGradientArray)) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index b0a6af171c..66b751a1b0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -54,8 +54,7 @@ private[shared] object SharedParamsCodeGen { isValid = "ParamValidators.gtEq(1)"), ParamDesc[Boolean]("fitIntercept", "whether to fit an intercept term", Some("true")), ParamDesc[Boolean]("standardization", "whether to standardize the training features" + - " prior to fitting the model sequence. Note that the coefficients of models are" + - " always returned on the original scale.", Some("true")), + " before fitting the model.", Some("true")), ParamDesc[Long]("seed", "random seed", Some("this.getClass.getName.hashCode.toLong")), ParamDesc[Double]("elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]." + " For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index bbe08939b6..f81bd76c22 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -239,10 +239,10 @@ private[ml] trait HasFitIntercept extends Params { private[ml] trait HasStandardization extends Params { /** - * Param for whether to standardize the training features prior to fitting the model sequence. Note that the coefficients of models are always returned on the original scale.. + * Param for whether to standardize the training features before fitting the model.. * @group param */ - final val standardization: BooleanParam = new BooleanParam(this, "standardization", "whether to standardize the training features prior to fitting the model sequence. Note that the coefficients of models are always returned on the original scale.") + final val standardization: BooleanParam = new BooleanParam(this, "standardization", "whether to standardize the training features before fitting the model.") setDefault(standardization, true) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index ba8fbee841..27253c1db2 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -77,6 +77,7 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(lr.getRawPredictionCol === "rawPrediction") assert(lr.getProbabilityCol === "probability") assert(lr.getFitIntercept) + assert(lr.getStandardization) val model = lr.fit(dataset) model.transform(dataset) .select("label", "probability", "prediction", "rawPrediction") @@ -208,8 +209,11 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { } test("binary logistic regression with intercept without regularization") { - val trainer = (new LogisticRegression).setFitIntercept(true) - val model = trainer.fit(binaryDataset) + val trainer1 = (new LogisticRegression).setFitIntercept(true).setStandardization(true) + val trainer2 = (new LogisticRegression).setFitIntercept(true).setStandardization(false) + + val model1 = trainer1.fit(binaryDataset) + val model2 = trainer2.fit(binaryDataset) /* Using the following R code to load the data and train the model using glmnet package. @@ -232,16 +236,26 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val interceptR = 2.8366423 val weightsR = Array(-0.5895848, 0.8931147, -0.3925051, -0.7996864) - assert(model.intercept ~== interceptR relTol 1E-3) - assert(model.weights(0) ~== weightsR(0) relTol 1E-3) - assert(model.weights(1) ~== weightsR(1) relTol 1E-3) - assert(model.weights(2) ~== weightsR(2) relTol 1E-3) - assert(model.weights(3) ~== weightsR(3) relTol 1E-3) + assert(model1.intercept ~== interceptR relTol 1E-3) + assert(model1.weights(0) ~== weightsR(0) relTol 1E-3) + assert(model1.weights(1) ~== weightsR(1) relTol 1E-3) + assert(model1.weights(2) ~== weightsR(2) relTol 1E-3) + assert(model1.weights(3) ~== weightsR(3) relTol 1E-3) + + // Without regularization, with or without standardization will converge to the same solution. + assert(model2.intercept ~== interceptR relTol 1E-3) + assert(model2.weights(0) ~== weightsR(0) relTol 1E-3) + assert(model2.weights(1) ~== weightsR(1) relTol 1E-3) + assert(model2.weights(2) ~== weightsR(2) relTol 1E-3) + assert(model2.weights(3) ~== weightsR(3) relTol 1E-3) } test("binary logistic regression without intercept without regularization") { - val trainer = (new LogisticRegression).setFitIntercept(false) - val model = trainer.fit(binaryDataset) + val trainer1 = (new LogisticRegression).setFitIntercept(false).setStandardization(true) + val trainer2 = (new LogisticRegression).setFitIntercept(false).setStandardization(false) + + val model1 = trainer1.fit(binaryDataset) + val model2 = trainer2.fit(binaryDataset) /* Using the following R code to load the data and train the model using glmnet package. @@ -265,17 +279,28 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val interceptR = 0.0 val weightsR = Array(-0.3534996, 1.2964482, -0.3571741, -0.7407946) - assert(model.intercept ~== interceptR relTol 1E-3) - assert(model.weights(0) ~== weightsR(0) relTol 1E-2) - assert(model.weights(1) ~== weightsR(1) relTol 1E-2) - assert(model.weights(2) ~== weightsR(2) relTol 1E-3) - assert(model.weights(3) ~== weightsR(3) relTol 1E-3) + assert(model1.intercept ~== interceptR relTol 1E-3) + assert(model1.weights(0) ~== weightsR(0) relTol 1E-2) + assert(model1.weights(1) ~== weightsR(1) relTol 1E-2) + assert(model1.weights(2) ~== weightsR(2) relTol 1E-3) + assert(model1.weights(3) ~== weightsR(3) relTol 1E-3) + + // Without regularization, with or without standardization should converge to the same solution. + assert(model2.intercept ~== interceptR relTol 1E-3) + assert(model2.weights(0) ~== weightsR(0) relTol 1E-2) + assert(model2.weights(1) ~== weightsR(1) relTol 1E-2) + assert(model2.weights(2) ~== weightsR(2) relTol 1E-3) + assert(model2.weights(3) ~== weightsR(3) relTol 1E-3) } test("binary logistic regression with intercept with L1 regularization") { - val trainer = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(1.0).setRegParam(0.12) - val model = trainer.fit(binaryDataset) + val trainer1 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(true) + val trainer2 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(false) + + val model1 = trainer1.fit(binaryDataset) + val model2 = trainer2.fit(binaryDataset) /* Using the following R code to load the data and train the model using glmnet package. @@ -295,20 +320,52 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { data.V4 -0.04325749 data.V5 -0.02481551 */ - val interceptR = -0.05627428 - val weightsR = Array(0.0, 0.0, -0.04325749, -0.02481551) - - assert(model.intercept ~== interceptR relTol 1E-2) - assert(model.weights(0) ~== weightsR(0) relTol 1E-3) - assert(model.weights(1) ~== weightsR(1) relTol 1E-3) - assert(model.weights(2) ~== weightsR(2) relTol 1E-2) - assert(model.weights(3) ~== weightsR(3) relTol 2E-2) + val interceptR1 = -0.05627428 + val weightsR1 = Array(0.0, 0.0, -0.04325749, -0.02481551) + + assert(model1.intercept ~== interceptR1 relTol 1E-2) + assert(model1.weights(0) ~== weightsR1(0) absTol 1E-3) + assert(model1.weights(1) ~== weightsR1(1) absTol 1E-3) + assert(model1.weights(2) ~== weightsR1(2) relTol 1E-2) + assert(model1.weights(3) ~== weightsR1(3) relTol 2E-2) + + /* + Using the following R code to load the data and train the model using glmnet package. + + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12, + standardize=FALSE)) + weights + + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 0.3722152 + data.V2 . + data.V3 . + data.V4 -0.1665453 + data.V5 . + */ + val interceptR2 = 0.3722152 + val weightsR2 = Array(0.0, 0.0, -0.1665453, 0.0) + + assert(model2.intercept ~== interceptR2 relTol 1E-2) + assert(model2.weights(0) ~== weightsR2(0) absTol 1E-3) + assert(model2.weights(1) ~== weightsR2(1) absTol 1E-3) + assert(model2.weights(2) ~== weightsR2(2) relTol 1E-2) + assert(model2.weights(3) ~== weightsR2(3) absTol 1E-3) } test("binary logistic regression without intercept with L1 regularization") { - val trainer = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(1.0).setRegParam(0.12) - val model = trainer.fit(binaryDataset) + val trainer1 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(true) + val trainer2 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(false) + + val model1 = trainer1.fit(binaryDataset) + val model2 = trainer2.fit(binaryDataset) /* Using the following R code to load the data and train the model using glmnet package. @@ -329,20 +386,52 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { data.V4 -0.05189203 data.V5 -0.03891782 */ - val interceptR = 0.0 - val weightsR = Array(0.0, 0.0, -0.05189203, -0.03891782) + val interceptR1 = 0.0 + val weightsR1 = Array(0.0, 0.0, -0.05189203, -0.03891782) + + assert(model1.intercept ~== interceptR1 relTol 1E-3) + assert(model1.weights(0) ~== weightsR1(0) absTol 1E-3) + assert(model1.weights(1) ~== weightsR1(1) absTol 1E-3) + assert(model1.weights(2) ~== weightsR1(2) relTol 1E-2) + assert(model1.weights(3) ~== weightsR1(3) relTol 1E-2) + + /* + Using the following R code to load the data and train the model using glmnet package. + + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12, + intercept=FALSE, standardize=FALSE)) + weights - assert(model.intercept ~== interceptR relTol 1E-3) - assert(model.weights(0) ~== weightsR(0) relTol 1E-3) - assert(model.weights(1) ~== weightsR(1) relTol 1E-3) - assert(model.weights(2) ~== weightsR(2) relTol 1E-2) - assert(model.weights(3) ~== weightsR(3) relTol 1E-2) + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + data.V2 . + data.V3 . + data.V4 -0.08420782 + data.V5 . + */ + val interceptR2 = 0.0 + val weightsR2 = Array(0.0, 0.0, -0.08420782, 0.0) + + assert(model2.intercept ~== interceptR2 relTol 1E-3) + assert(model2.weights(0) ~== weightsR2(0) absTol 1E-3) + assert(model2.weights(1) ~== weightsR2(1) absTol 1E-3) + assert(model2.weights(2) ~== weightsR2(2) relTol 1E-2) + assert(model2.weights(3) ~== weightsR2(3) absTol 1E-3) } test("binary logistic regression with intercept with L2 regularization") { - val trainer = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(0.0).setRegParam(1.37) - val model = trainer.fit(binaryDataset) + val trainer1 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.0).setRegParam(1.37).setStandardization(true) + val trainer2 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.0).setRegParam(1.37).setStandardization(false) + + val model1 = trainer1.fit(binaryDataset) + val model2 = trainer2.fit(binaryDataset) /* Using the following R code to load the data and train the model using glmnet package. @@ -362,20 +451,52 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { data.V4 -0.04865309 data.V5 -0.10062872 */ - val interceptR = 0.15021751 - val weightsR = Array(-0.07251837, 0.10724191, -0.04865309, -0.10062872) - - assert(model.intercept ~== interceptR relTol 1E-3) - assert(model.weights(0) ~== weightsR(0) relTol 1E-3) - assert(model.weights(1) ~== weightsR(1) relTol 1E-3) - assert(model.weights(2) ~== weightsR(2) relTol 1E-3) - assert(model.weights(3) ~== weightsR(3) relTol 1E-3) + val interceptR1 = 0.15021751 + val weightsR1 = Array(-0.07251837, 0.10724191, -0.04865309, -0.10062872) + + assert(model1.intercept ~== interceptR1 relTol 1E-3) + assert(model1.weights(0) ~== weightsR1(0) relTol 1E-3) + assert(model1.weights(1) ~== weightsR1(1) relTol 1E-3) + assert(model1.weights(2) ~== weightsR1(2) relTol 1E-3) + assert(model1.weights(3) ~== weightsR1(3) relTol 1E-3) + + /* + Using the following R code to load the data and train the model using glmnet package. + + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37, + standardize=FALSE)) + weights + + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 0.48657516 + data.V2 -0.05155371 + data.V3 0.02301057 + data.V4 -0.11482896 + data.V5 -0.06266838 + */ + val interceptR2 = 0.48657516 + val weightsR2 = Array(-0.05155371, 0.02301057, -0.11482896, -0.06266838) + + assert(model2.intercept ~== interceptR2 relTol 1E-3) + assert(model2.weights(0) ~== weightsR2(0) relTol 1E-3) + assert(model2.weights(1) ~== weightsR2(1) relTol 1E-3) + assert(model2.weights(2) ~== weightsR2(2) relTol 1E-3) + assert(model2.weights(3) ~== weightsR2(3) relTol 1E-3) } test("binary logistic regression without intercept with L2 regularization") { - val trainer = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(0.0).setRegParam(1.37) - val model = trainer.fit(binaryDataset) + val trainer1 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.0).setRegParam(1.37).setStandardization(true) + val trainer2 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.0).setRegParam(1.37).setStandardization(false) + + val model1 = trainer1.fit(binaryDataset) + val model2 = trainer2.fit(binaryDataset) /* Using the following R code to load the data and train the model using glmnet package. @@ -396,20 +517,52 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { data.V4 -0.04708770 data.V5 -0.09799775 */ - val interceptR = 0.0 - val weightsR = Array(-0.06099165, 0.12857058, -0.04708770, -0.09799775) + val interceptR1 = 0.0 + val weightsR1 = Array(-0.06099165, 0.12857058, -0.04708770, -0.09799775) + + assert(model1.intercept ~== interceptR1 relTol 1E-3) + assert(model1.weights(0) ~== weightsR1(0) relTol 1E-2) + assert(model1.weights(1) ~== weightsR1(1) relTol 1E-2) + assert(model1.weights(2) ~== weightsR1(2) relTol 1E-3) + assert(model1.weights(3) ~== weightsR1(3) relTol 1E-3) + + /* + Using the following R code to load the data and train the model using glmnet package. + + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + weights = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37, + intercept=FALSE, standardize=FALSE)) + weights - assert(model.intercept ~== interceptR relTol 1E-3) - assert(model.weights(0) ~== weightsR(0) relTol 1E-2) - assert(model.weights(1) ~== weightsR(1) relTol 1E-2) - assert(model.weights(2) ~== weightsR(2) relTol 1E-3) - assert(model.weights(3) ~== weightsR(3) relTol 1E-3) + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + data.V2 -0.005679651 + data.V3 0.048967094 + data.V4 -0.093714016 + data.V5 -0.053314311 + */ + val interceptR2 = 0.0 + val weightsR2 = Array(-0.005679651, 0.048967094, -0.093714016, -0.053314311) + + assert(model2.intercept ~== interceptR2 relTol 1E-3) + assert(model2.weights(0) ~== weightsR2(0) relTol 1E-2) + assert(model2.weights(1) ~== weightsR2(1) relTol 1E-2) + assert(model2.weights(2) ~== weightsR2(2) relTol 1E-3) + assert(model2.weights(3) ~== weightsR2(3) relTol 1E-3) } test("binary logistic regression with intercept with ElasticNet regularization") { - val trainer = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(0.38).setRegParam(0.21) - val model = trainer.fit(binaryDataset) + val trainer1 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true) + val trainer2 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false) + + val model1 = trainer1.fit(binaryDataset) + val model2 = trainer2.fit(binaryDataset) /* Using the following R code to load the data and train the model using glmnet package. @@ -429,20 +582,52 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { data.V4 -0.08849250 data.V5 -0.15458796 */ - val interceptR = 0.57734851 - val weightsR = Array(-0.05310287, 0.0, -0.08849250, -0.15458796) - - assert(model.intercept ~== interceptR relTol 6E-3) - assert(model.weights(0) ~== weightsR(0) relTol 5E-3) - assert(model.weights(1) ~== weightsR(1) relTol 1E-3) - assert(model.weights(2) ~== weightsR(2) relTol 5E-3) - assert(model.weights(3) ~== weightsR(3) relTol 1E-3) + val interceptR1 = 0.57734851 + val weightsR1 = Array(-0.05310287, 0.0, -0.08849250, -0.15458796) + + assert(model1.intercept ~== interceptR1 relTol 6E-3) + assert(model1.weights(0) ~== weightsR1(0) relTol 5E-3) + assert(model1.weights(1) ~== weightsR1(1) absTol 1E-3) + assert(model1.weights(2) ~== weightsR1(2) relTol 5E-3) + assert(model1.weights(3) ~== weightsR1(3) relTol 1E-3) + + /* + Using the following R code to load the data and train the model using glmnet package. + + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21, + standardize=FALSE)) + weights + + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 0.51555993 + data.V2 . + data.V3 . + data.V4 -0.18807395 + data.V5 -0.05350074 + */ + val interceptR2 = 0.51555993 + val weightsR2 = Array(0.0, 0.0, -0.18807395, -0.05350074) + + assert(model2.intercept ~== interceptR2 relTol 6E-3) + assert(model2.weights(0) ~== weightsR2(0) absTol 1E-3) + assert(model2.weights(1) ~== weightsR2(1) absTol 1E-3) + assert(model2.weights(2) ~== weightsR2(2) relTol 5E-3) + assert(model2.weights(3) ~== weightsR2(3) relTol 1E-2) } test("binary logistic regression without intercept with ElasticNet regularization") { - val trainer = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(0.38).setRegParam(0.21) - val model = trainer.fit(binaryDataset) + val trainer1 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true) + val trainer2 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false) + + val model1 = trainer1.fit(binaryDataset) + val model2 = trainer2.fit(binaryDataset) /* Using the following R code to load the data and train the model using glmnet package. @@ -463,20 +648,52 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { data.V4 -0.081203769 data.V5 -0.142534158 */ - val interceptR = 0.0 - val weightsR = Array(-0.001005743, 0.072577857, -0.081203769, -0.142534158) + val interceptR1 = 0.0 + val weightsR1 = Array(-0.001005743, 0.072577857, -0.081203769, -0.142534158) + + assert(model1.intercept ~== interceptR1 relTol 1E-3) + assert(model1.weights(0) ~== weightsR1(0) absTol 1E-2) + assert(model1.weights(1) ~== weightsR1(1) absTol 1E-2) + assert(model1.weights(2) ~== weightsR1(2) relTol 1E-3) + assert(model1.weights(3) ~== weightsR1(3) relTol 1E-2) + + /* + Using the following R code to load the data and train the model using glmnet package. + + library("glmnet") + data <- read.csv("path", header=FALSE) + label = factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + weights = coef(glmnet(features,label, family="binomial", alpha = 0.38, lambda = 0.21, + intercept=FALSE, standardize=FALSE)) + weights - assert(model.intercept ~== interceptR relTol 1E-3) - assert(model.weights(0) ~== weightsR(0) absTol 1E-3) - assert(model.weights(1) ~== weightsR(1) absTol 1E-2) - assert(model.weights(2) ~== weightsR(2) relTol 1E-3) - assert(model.weights(3) ~== weightsR(3) relTol 1E-2) + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + data.V2 . + data.V3 0.03345223 + data.V4 -0.11304532 + data.V5 . + */ + val interceptR2 = 0.0 + val weightsR2 = Array(0.0, 0.03345223, -0.11304532, 0.0) + + assert(model2.intercept ~== interceptR2 relTol 1E-3) + assert(model2.weights(0) ~== weightsR2(0) absTol 1E-3) + assert(model2.weights(1) ~== weightsR2(1) relTol 1E-2) + assert(model2.weights(2) ~== weightsR2(2) relTol 1E-2) + assert(model2.weights(3) ~== weightsR2(3) absTol 1E-3) } test("binary logistic regression with intercept with strong L1 regularization") { - val trainer = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(1.0).setRegParam(6.0) - val model = trainer.fit(binaryDataset) + val trainer1 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(1.0).setRegParam(6.0).setStandardization(true) + val trainer2 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(1.0).setRegParam(6.0).setStandardization(false) + + val model1 = trainer1.fit(binaryDataset) + val model2 = trainer2.fit(binaryDataset) val histogram = binaryDataset.map { case Row(label: Double, features: Vector) => label } .treeAggregate(new MultiClassSummarizer)( @@ -502,11 +719,17 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val interceptTheory = math.log(histogram(1).toDouble / histogram(0).toDouble) val weightsTheory = Array(0.0, 0.0, 0.0, 0.0) - assert(model.intercept ~== interceptTheory relTol 1E-5) - assert(model.weights(0) ~== weightsTheory(0) absTol 1E-6) - assert(model.weights(1) ~== weightsTheory(1) absTol 1E-6) - assert(model.weights(2) ~== weightsTheory(2) absTol 1E-6) - assert(model.weights(3) ~== weightsTheory(3) absTol 1E-6) + assert(model1.intercept ~== interceptTheory relTol 1E-5) + assert(model1.weights(0) ~== weightsTheory(0) absTol 1E-6) + assert(model1.weights(1) ~== weightsTheory(1) absTol 1E-6) + assert(model1.weights(2) ~== weightsTheory(2) absTol 1E-6) + assert(model1.weights(3) ~== weightsTheory(3) absTol 1E-6) + + assert(model2.intercept ~== interceptTheory relTol 1E-5) + assert(model2.weights(0) ~== weightsTheory(0) absTol 1E-6) + assert(model2.weights(1) ~== weightsTheory(1) absTol 1E-6) + assert(model2.weights(2) ~== weightsTheory(2) absTol 1E-6) + assert(model2.weights(3) ~== weightsTheory(3) absTol 1E-6) /* Using the following R code to load the data and train the model using glmnet package. @@ -529,10 +752,10 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val interceptR = -0.248065 val weightsR = Array(0.0, 0.0, 0.0, 0.0) - assert(model.intercept ~== interceptR relTol 1E-5) - assert(model.weights(0) ~== weightsR(0) absTol 1E-6) - assert(model.weights(1) ~== weightsR(1) absTol 1E-6) - assert(model.weights(2) ~== weightsR(2) absTol 1E-6) - assert(model.weights(3) ~== weightsR(3) absTol 1E-6) + assert(model1.intercept ~== interceptR relTol 1E-5) + assert(model1.weights(0) ~== weightsR(0) absTol 1E-6) + assert(model1.weights(1) ~== weightsR(1) absTol 1E-6) + assert(model1.weights(2) ~== weightsR(2) absTol 1E-6) + assert(model1.weights(3) ~== weightsR(3) absTol 1E-6) } } diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 680b699e9e..41e19fd9cc 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -58,6 +58,8 @@ object MimaExcludes { "org.apache.spark.ml.regression.LeastSquaresAggregator.this"), ProblemFilters.exclude[MissingMethodProblem]( "org.apache.spark.ml.regression.LeastSquaresCostFun.this"), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.ml.classification.LogisticCostFun.this"), // SQL execution is considered private. excludePackage("org.apache.spark.sql.execution"), // NanoTime and CatalystTimestampConverter is only used inside catalyst, |