/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.ml.regression

import scala.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.Instance
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.{DataFrame, Row}

class LinearRegressionSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  private val seed: Int = 42
  @transient var datasetWithDenseFeature: DataFrame = _
  @transient var datasetWithDenseFeatureWithoutIntercept: DataFrame = _
  @transient var datasetWithSparseFeature: DataFrame = _
  @transient var datasetWithWeight: DataFrame = _
  @transient var datasetWithWeightConstantLabel: DataFrame = _
  @transient var datasetWithWeightZeroLabel: DataFrame = _

  /*
     In `LinearRegressionSuite`, we make sure that the model trained by SparkML is the
     same as the one trained by R's glmnet package. The following instructions describe
     how to reproduce the data in R. In a spark-shell, use the following code:

     import org.apache.spark.mllib.util.LinearDataGenerator
     val data =
       sc.parallelize(LinearDataGenerator.generateLinearInput(6.3, Array(4.7, 7.2),
         Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2)
     data.map(x => x.label + ", " + x.features(0) + ", " + x.features(1)).coalesce(1)
       .saveAsTextFile("path")
   */
  override def beforeAll(): Unit = {
    super.beforeAll()
    datasetWithDenseFeature = sqlContext.createDataFrame(
      sc.parallelize(LinearDataGenerator.generateLinearInput(
        intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3),
        xVariance = Array(0.7, 1.2), nPoints = 10000, seed, eps = 0.1), 2))

    /*
       datasetWithDenseFeatureWithoutIntercept is not needed for correctness testing
       but is useful for illustrating training a model without an intercept.
     */
    datasetWithDenseFeatureWithoutIntercept = sqlContext.createDataFrame(
      sc.parallelize(LinearDataGenerator.generateLinearInput(
        intercept = 0.0, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3),
        xVariance = Array(0.7, 1.2), nPoints = 10000, seed, eps = 0.1), 2))

    val r = new Random(seed)
    // When the feature size is larger than 4096, the normal solver cannot be used, so
    // L-BFGS is chosen as the optimizer of linear regression in the case of "auto" mode
    // (see the "l-bfgs with big feature datasets" test below).
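    // Background for the threshold above (a sketch of the tradeoff, not tied to exact
    // internals): the normal-equation path solves (A^T A + lambda I) w = A^T b, which
    // materializes a numFeatures x numFeatures Gram matrix, so its cost grows
    // quadratically with the feature count; L-BFGS only needs gradient vectors of
    // length numFeatures per iteration, which is why it is the practical choice for
    // wide data such as the 4100-feature sparse dataset generated below.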
    val featureSize = 4100
    datasetWithSparseFeature = sqlContext.createDataFrame(
      sc.parallelize(LinearDataGenerator.generateLinearInput(
        intercept = 0.0, weights = Seq.fill(featureSize)(r.nextDouble).toArray,
        xMean = Seq.fill(featureSize)(r.nextDouble).toArray,
        xVariance = Seq.fill(featureSize)(r.nextDouble).toArray, nPoints = 200,
        seed, eps = 0.1, sparsity = 0.7), 2))

    /*
       R code:

       A <- matrix(c(0, 1, 2, 3, 5, 7, 11, 13), 4, 2)
       b <- c(17, 19, 23, 29)
       w <- c(1, 2, 3, 4)
       df <- as.data.frame(cbind(A, b))
     */
    datasetWithWeight = sqlContext.createDataFrame(
      sc.parallelize(Seq(
        Instance(17.0, 1.0, Vectors.dense(0.0, 5.0).toSparse),
        Instance(19.0, 2.0, Vectors.dense(1.0, 7.0)),
        Instance(23.0, 3.0, Vectors.dense(2.0, 11.0)),
        Instance(29.0, 4.0, Vectors.dense(3.0, 13.0))
      ), 2))

    /*
       R code:

       A <- matrix(c(0, 1, 2, 3, 5, 7, 11, 13), 4, 2)
       b.const <- c(17, 17, 17, 17)
       w <- c(1, 2, 3, 4)
       df.const.label <- as.data.frame(cbind(A, b.const))
     */
    datasetWithWeightConstantLabel = sqlContext.createDataFrame(
      sc.parallelize(Seq(
        Instance(17.0, 1.0, Vectors.dense(0.0, 5.0).toSparse),
        Instance(17.0, 2.0, Vectors.dense(1.0, 7.0)),
        Instance(17.0, 3.0, Vectors.dense(2.0, 11.0)),
        Instance(17.0, 4.0, Vectors.dense(3.0, 13.0))
      ), 2))
    datasetWithWeightZeroLabel = sqlContext.createDataFrame(
      sc.parallelize(Seq(
        Instance(0.0, 1.0, Vectors.dense(0.0, 5.0).toSparse),
        Instance(0.0, 2.0, Vectors.dense(1.0, 7.0)),
        Instance(0.0, 3.0, Vectors.dense(2.0, 11.0)),
        Instance(0.0, 4.0, Vectors.dense(3.0, 13.0))
      ), 2))
  }

  test("params") {
    ParamsSuite.checkParams(new LinearRegression)
    val model = new LinearRegressionModel("linearReg", Vectors.dense(0.0), 0.0)
    ParamsSuite.checkParams(model)
  }

  test("linear regression: default params") {
    val lir = new LinearRegression
    assert(lir.getLabelCol === "label")
    assert(lir.getFeaturesCol === "features")
    assert(lir.getPredictionCol === "prediction")
    assert(lir.getRegParam === 0.0)
    assert(lir.getElasticNetParam === 0.0)
    assert(lir.getFitIntercept)
    assert(lir.getStandardization)
    assert(lir.getSolver == "auto")
    val model = lir.fit(datasetWithDenseFeature)

    // copied model must have the same parent.
    MLTestingUtils.checkCopy(model)

    model.transform(datasetWithDenseFeature)
      .select("label", "prediction")
      .collect()
    assert(model.getFeaturesCol === "features")
    assert(model.getPredictionCol === "prediction")
    assert(model.intercept !== 0.0)
    assert(model.hasParent)
    val numFeatures = datasetWithDenseFeature.select("features").first().getAs[Vector](0).size
    assert(model.numFeatures === numFeatures)
  }

  test("linear regression with intercept without regularization") {
    Seq("auto", "l-bfgs", "normal").foreach { solver =>
      val trainer1 = new LinearRegression().setSolver(solver)
      // The result should be the same regardless of standardization without regularization
      val trainer2 = (new LinearRegression).setStandardization(false).setSolver(solver)
      val model1 = trainer1.fit(datasetWithDenseFeature)
      val model2 = trainer2.fit(datasetWithDenseFeature)

      /*
         Use the following R code to load the data and train the model with the glmnet
         package.

         library("glmnet")
         data <- read.csv("path", header=FALSE, stringsAsFactors=FALSE)
         features <- as.matrix(data.frame(as.numeric(data$V2), as.numeric(data$V3)))
         label <- as.numeric(data$V1)
         coefficients <- coef(glmnet(features, label, family="gaussian",
           alpha = 0, lambda = 0))
         > coefficients
          3 x 1 sparse Matrix of class "dgCMatrix"
                                    s0
         (Intercept)         6.298698
         as.numeric.data.V2. 4.700706
         as.numeric.data.V3. 7.199082
       */
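      /*
         Note on the comparison: with alpha = 0 and lambda = 0 the glmnet fit above is
         plain least squares, so both Spark solvers (L-BFGS and normal equations) and
         both standardization settings should recover the same solution. The data were
         generated with intercept 6.3 and weights (4.7, 7.2), which the estimates
         approximate up to the eps = 0.1 noise.
       */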
      val interceptR = 6.298698
      val coefficientsR = Vectors.dense(4.700706, 7.199082)

      assert(model1.intercept ~== interceptR relTol 1E-3)
      assert(model1.coefficients ~= coefficientsR relTol 1E-3)
      assert(model2.intercept ~== interceptR relTol 1E-3)
      assert(model2.coefficients ~= coefficientsR relTol 1E-3)

      model1.transform(datasetWithDenseFeature).select("features", "prediction").collect()
        .foreach {
          case Row(features: DenseVector, prediction1: Double) =>
            val prediction2 =
              features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) +
                model1.intercept
            assert(prediction1 ~== prediction2 relTol 1E-5)
        }
    }
  }

  test("linear regression without intercept without regularization") {
    Seq("auto", "l-bfgs", "normal").foreach { solver =>
      val trainer1 = (new LinearRegression).setFitIntercept(false).setSolver(solver)
      // Without regularization the results should be the same
      val trainer2 = (new LinearRegression).setFitIntercept(false).setStandardization(false)
        .setSolver(solver)
      val model1 = trainer1.fit(datasetWithDenseFeature)
      val modelWithoutIntercept1 = trainer1.fit(datasetWithDenseFeatureWithoutIntercept)
      val model2 = trainer2.fit(datasetWithDenseFeature)
      val modelWithoutIntercept2 = trainer2.fit(datasetWithDenseFeatureWithoutIntercept)

      /*
         coefficients <- coef(glmnet(features, label, family="gaussian",
           alpha = 0, lambda = 0, intercept = FALSE))
         > coefficients
          3 x 1 sparse Matrix of class "dgCMatrix"
                                    s0
         (Intercept)         .
         as.numeric.data.V2. 6.973403
         as.numeric.data.V3. 5.284370
       */
      val coefficientsR = Vectors.dense(6.973403, 5.284370)

      assert(model1.intercept ~== 0 absTol 1E-2)
      assert(model1.coefficients ~= coefficientsR relTol 1E-2)
      assert(model2.intercept ~== 0 absTol 1E-2)
      assert(model2.coefficients ~= coefficientsR relTol 1E-2)

      /*
         Then again with the data with no intercept:
         > coefficientsWithoutIntercept
          3 x 1 sparse Matrix of class "dgCMatrix"
                                     s0
         (Intercept)          .
         as.numeric.data3.V2. 4.70011
         as.numeric.data3.V3. 7.19943
       */
      val coefficientsWithoutInterceptR = Vectors.dense(4.70011, 7.19943)

      assert(modelWithoutIntercept1.intercept ~== 0 absTol 1E-3)
      assert(modelWithoutIntercept1.coefficients ~= coefficientsWithoutInterceptR relTol 1E-3)
      assert(modelWithoutIntercept2.intercept ~== 0 absTol 1E-3)
      assert(modelWithoutIntercept2.coefficients ~= coefficientsWithoutInterceptR relTol 1E-3)
    }
  }

  test("linear regression with intercept with L1 regularization") {
    Seq("auto", "l-bfgs", "normal").foreach { solver =>
      val trainer1 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57)
        .setSolver(solver)
      val trainer2 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57)
        .setSolver(solver).setStandardization(false)

      // The normal solver does not support an L1 penalty, so fitting must throw.
      if (solver == "normal") {
        intercept[IllegalArgumentException] {
          trainer1.fit(datasetWithDenseFeature)
          trainer2.fit(datasetWithDenseFeature)
        }
      } else {
        val model1 = trainer1.fit(datasetWithDenseFeature)
        val model2 = trainer2.fit(datasetWithDenseFeature)

        /*
           coefficients <- coef(glmnet(features, label, family="gaussian",
             alpha = 1.0, lambda = 0.57))
           > coefficients
            3 x 1 sparse Matrix of class "dgCMatrix"
                                    s0
           (Intercept)       6.242284
           as.numeric.d1.V2. 4.019605
           as.numeric.d1.V3. 6.679538
         */
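        /*
           With alpha = 1.0 this is the lasso: glmnet penalizes lambda * ||w||_1, which
           shrinks both coefficients below the unregularized estimates from the earlier
           test (4.019605 < 4.700706 and 6.679538 < 7.199082).
         */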
        val interceptR1 = 6.242284
        val coefficientsR1 = Vectors.dense(4.019605, 6.679538)

        assert(model1.intercept ~== interceptR1 relTol 1E-2)
        assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)

        /*
           coefficients <- coef(glmnet(features, label, family="gaussian",
             alpha = 1.0, lambda = 0.57, standardize=FALSE))
           > coefficients
            3 x 1 sparse Matrix of class "dgCMatrix"
                                      s0
           (Intercept)         6.416948
           as.numeric.data.V2. 3.893869
           as.numeric.data.V3. 6.724286
         */
        val interceptR2 = 6.416948
        val coefficientsR2 = Vectors.dense(3.893869, 6.724286)

        assert(model2.intercept ~== interceptR2 relTol 1E-3)
        assert(model2.coefficients ~= coefficientsR2 relTol 1E-3)

        model1.transform(datasetWithDenseFeature).select("features", "prediction")
          .collect().foreach {
            case Row(features: DenseVector, prediction1: Double) =>
              val prediction2 =
                features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) +
                  model1.intercept
              assert(prediction1 ~== prediction2 relTol 1E-5)
          }
      }
    }
  }

  test("linear regression without intercept with L1 regularization") {
    Seq("auto", "l-bfgs", "normal").foreach { solver =>
      val trainer1 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57)
        .setFitIntercept(false).setSolver(solver)
      val trainer2 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57)
        .setFitIntercept(false).setStandardization(false).setSolver(solver)

      // The normal solver does not support an L1 penalty, so fitting must throw.
      if (solver == "normal") {
        intercept[IllegalArgumentException] {
          trainer1.fit(datasetWithDenseFeature)
          trainer2.fit(datasetWithDenseFeature)
        }
      } else {
        val model1 = trainer1.fit(datasetWithDenseFeature)
        val model2 = trainer2.fit(datasetWithDenseFeature)

        /*
           coefficients <- coef(glmnet(features, label, family="gaussian",
             alpha = 1.0, lambda = 0.57, intercept=FALSE))
           > coefficients
            3 x 1 sparse Matrix of class "dgCMatrix"
                                      s0
           (Intercept)          .
           as.numeric.data.V2. 6.272927
           as.numeric.data.V3. 4.782604
         */
        val interceptR1 = 0.0
        val coefficientsR1 = Vectors.dense(6.272927, 4.782604)

        assert(model1.intercept ~== interceptR1 absTol 1E-2)
        assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)

        /*
           coefficients <- coef(glmnet(features, label, family="gaussian",
             alpha = 1.0, lambda = 0.57, intercept=FALSE, standardize=FALSE))
           > coefficients
            3 x 1 sparse Matrix of class "dgCMatrix"
                                      s0
           (Intercept)         .
           as.numeric.data.V2. 6.207817
           as.numeric.data.V3. 4.775780
         */
        val interceptR2 = 0.0
        val coefficientsR2 = Vectors.dense(6.207817, 4.775780)

        assert(model2.intercept ~== interceptR2 absTol 1E-2)
        assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)

        model1.transform(datasetWithDenseFeature).select("features", "prediction")
          .collect().foreach {
            case Row(features: DenseVector, prediction1: Double) =>
              val prediction2 =
                features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) +
                  model1.intercept
              assert(prediction1 ~== prediction2 relTol 1E-5)
          }
      }
    }
  }

  test("linear regression with intercept with L2 regularization") {
    Seq("auto", "l-bfgs", "normal").foreach { solver =>
      val trainer1 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3)
        .setSolver(solver)
      val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3)
        .setStandardization(false).setSolver(solver)
      val model1 = trainer1.fit(datasetWithDenseFeature)
      val model2 = trainer2.fit(datasetWithDenseFeature)

      /*
         coefficients <- coef(glmnet(features, label, family="gaussian",
           alpha = 0.0, lambda = 2.3))
         > coefficients
          3 x 1 sparse Matrix of class "dgCMatrix"
                                  s0
         (Intercept)       5.260103
         as.numeric.d1.V2. 3.725522
         as.numeric.d1.V3. 5.711203
       */
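      /*
         With alpha = 0.0 this is ridge regression. Unlike the pure-L1 tests above, no
         exception branch is needed here: a quadratic penalty can be folded directly
         into the normal equations (A^T A + lambda I remains a linear system), so all
         three solver settings are exercised.
       */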
      val interceptR1 = 5.260103
      val coefficientsR1 = Vectors.dense(3.725522, 5.711203)

      assert(model1.intercept ~== interceptR1 relTol 1E-2)
      assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)

      /*
         coefficients <- coef(glmnet(features, label, family="gaussian",
           alpha = 0.0, lambda = 2.3, standardize=FALSE))
         > coefficients
          3 x 1 sparse Matrix of class "dgCMatrix"
                                  s0
         (Intercept)       5.790885
         as.numeric.d1.V2. 3.432373
         as.numeric.d1.V3. 5.919196
       */
      val interceptR2 = 5.790885
      val coefficientsR2 = Vectors.dense(3.432373, 5.919196)

      assert(model2.intercept ~== interceptR2 relTol 1E-2)
      assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)

      model1.transform(datasetWithDenseFeature).select("features", "prediction").collect()
        .foreach {
          case Row(features: DenseVector, prediction1: Double) =>
            val prediction2 =
              features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) +
                model1.intercept
            assert(prediction1 ~== prediction2 relTol 1E-5)
        }
    }
  }

  test("linear regression without intercept with L2 regularization") {
    Seq("auto", "l-bfgs", "normal").foreach { solver =>
      val trainer1 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3)
        .setFitIntercept(false).setSolver(solver)
      val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3)
        .setFitIntercept(false).setStandardization(false).setSolver(solver)
      val model1 = trainer1.fit(datasetWithDenseFeature)
      val model2 = trainer2.fit(datasetWithDenseFeature)

      /*
         coefficients <- coef(glmnet(features, label, family="gaussian",
           alpha = 0.0, lambda = 2.3, intercept = FALSE))
         > coefficients
          3 x 1 sparse Matrix of class "dgCMatrix"
                                  s0
         (Intercept)        .
         as.numeric.d1.V2. 5.493430
         as.numeric.d1.V3. 4.223082
       */
      val interceptR1 = 0.0
      val coefficientsR1 = Vectors.dense(5.493430, 4.223082)

      assert(model1.intercept ~== interceptR1 absTol 1E-2)
      assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)

      /*
         coefficients <- coef(glmnet(features, label, family="gaussian",
           alpha = 0.0, lambda = 2.3, intercept = FALSE, standardize=FALSE))
         > coefficients
          3 x 1 sparse Matrix of class "dgCMatrix"
                                  s0
         (Intercept)        .
         as.numeric.d1.V2. 5.244324
         as.numeric.d1.V3. 4.203106
       */
      val interceptR2 = 0.0
      val coefficientsR2 = Vectors.dense(5.244324, 4.203106)

      assert(model2.intercept ~== interceptR2 absTol 1E-2)
      assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)

      model1.transform(datasetWithDenseFeature).select("features", "prediction").collect()
        .foreach {
          case Row(features: DenseVector, prediction1: Double) =>
            val prediction2 =
              features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) +
                model1.intercept
            assert(prediction1 ~== prediction2 relTol 1E-5)
        }
    }
  }

  test("linear regression with intercept with ElasticNet regularization") {
    Seq("auto", "l-bfgs", "normal").foreach { solver =>
      val trainer1 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6)
        .setSolver(solver)
      val trainer2 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6)
        .setStandardization(false).setSolver(solver)

      // The normal solver does not support a non-zero elastic-net (L1) component.
      if (solver == "normal") {
        intercept[IllegalArgumentException] {
          trainer1.fit(datasetWithDenseFeature)
          trainer2.fit(datasetWithDenseFeature)
        }
      } else {
        val model1 = trainer1.fit(datasetWithDenseFeature)
        val model2 = trainer2.fit(datasetWithDenseFeature)

        /*
           coefficients <- coef(glmnet(features, label, family="gaussian",
             alpha = 0.3, lambda = 1.6))
           > coefficients
            3 x 1 sparse Matrix of class "dgCMatrix"
                                    s0
           (Intercept)       5.689855
           as.numeric.d1.V2. 3.661181
           as.numeric.d1.V3. 6.000274
         */
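        /*
           For reference, glmnet's elastic-net penalty is
             lambda * ((1 - alpha) / 2 * ||w||_2^2 + alpha * ||w||_1),
           so alpha = 0.3, lambda = 1.6 mixes 30% lasso with 70% ridge. Because the
           penalty has an L1 component, the normal solver again rejects it above.
         */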
        val interceptR1 = 5.689855
        val coefficientsR1 = Vectors.dense(3.661181, 6.000274)

        assert(model1.intercept ~== interceptR1 relTol 1E-2)
        assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)

        /*
           coefficients <- coef(glmnet(features, label, family="gaussian",
             alpha = 0.3, lambda = 1.6, standardize=FALSE))
           > coefficients
            3 x 1 sparse Matrix of class "dgCMatrix"
                                    s0
           (Intercept)       6.113890
           as.numeric.d1.V2. 3.407021
           as.numeric.d1.V3. 6.152512
         */
        val interceptR2 = 6.113890
        val coefficientsR2 = Vectors.dense(3.407021, 6.152512)

        assert(model2.intercept ~== interceptR2 relTol 1E-2)
        assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)

        model1.transform(datasetWithDenseFeature).select("features", "prediction")
          .collect().foreach {
            case Row(features: DenseVector, prediction1: Double) =>
              val prediction2 =
                features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) +
                  model1.intercept
              assert(prediction1 ~== prediction2 relTol 1E-5)
          }
      }
    }
  }

  test("linear regression without intercept with ElasticNet regularization") {
    Seq("auto", "l-bfgs", "normal").foreach { solver =>
      val trainer1 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6)
        .setFitIntercept(false).setSolver(solver)
      val trainer2 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6)
        .setFitIntercept(false).setStandardization(false).setSolver(solver)

      // The normal solver does not support a non-zero elastic-net (L1) component.
      if (solver == "normal") {
        intercept[IllegalArgumentException] {
          trainer1.fit(datasetWithDenseFeature)
          trainer2.fit(datasetWithDenseFeature)
        }
      } else {
        val model1 = trainer1.fit(datasetWithDenseFeature)
        val model2 = trainer2.fit(datasetWithDenseFeature)

        /*
           coefficients <- coef(glmnet(features, label, family="gaussian",
             alpha = 0.3, lambda = 1.6, intercept=FALSE))
           > coefficients
            3 x 1 sparse Matrix of class "dgCMatrix"
                                    s0
           (Intercept)        .
           as.numeric.d1.V2. 5.643748
           as.numeric.d1.V3. 4.331519
         */
        val interceptR1 = 0.0
        val coefficientsR1 = Vectors.dense(5.643748, 4.331519)

        assert(model1.intercept ~== interceptR1 absTol 1E-2)
        assert(model1.coefficients ~= coefficientsR1 relTol 1E-2)

        /*
           coefficients <- coef(glmnet(features, label, family="gaussian",
             alpha = 0.3, lambda = 1.6, intercept=FALSE, standardize=FALSE))
           > coefficients
            3 x 1 sparse Matrix of class "dgCMatrix"
                                    s0
           (Intercept)        .
           as.numeric.d1.V2. 5.455902
           as.numeric.d1.V3. 4.312266
         */
        val interceptR2 = 0.0
        val coefficientsR2 = Vectors.dense(5.455902, 4.312266)

        assert(model2.intercept ~== interceptR2 absTol 1E-2)
        assert(model2.coefficients ~= coefficientsR2 relTol 1E-2)

        model1.transform(datasetWithDenseFeature).select("features", "prediction")
          .collect().foreach {
            case Row(features: DenseVector, prediction1: Double) =>
              val prediction2 =
                features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) +
                  model1.intercept
              assert(prediction1 ~== prediction2 relTol 1E-5)
          }
      }
    }
  }

  test("linear regression model with constant label") {
    /*
       R code:

       for (formula in c(b.const ~ . -1, b.const ~ .)) {
         model <- lm(formula, data=df.const.label, weights=w)
         print(as.vector(coef(model)))
       }

       [1] -9.221298  3.394343
       [1] 17  0  0
     */
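    /*
       Reading the expected values below: without an intercept the model is forced
       through the origin and can only approximate the constant label 17 (first row),
       while with an intercept the fit is exact: intercept 17 and zero coefficients
       (second row). The all-zero-label dataset should always yield the zero model.
     */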
    val expected = Seq(
      Vectors.dense(0.0, -9.221298, 3.394343),
      Vectors.dense(17.0, 0.0, 0.0))

    Seq("auto", "l-bfgs", "normal").foreach { solver =>
      var idx = 0
      for (fitIntercept <- Seq(false, true)) {
        val model1 = new LinearRegression()
          .setFitIntercept(fitIntercept)
          .setWeightCol("weight")
          .setSolver(solver)
          .fit(datasetWithWeightConstantLabel)
        val actual1 = Vectors.dense(model1.intercept, model1.coefficients(0),
          model1.coefficients(1))
        assert(actual1 ~== expected(idx) absTol 1e-4)

        val model2 = new LinearRegression()
          .setFitIntercept(fitIntercept)
          .setWeightCol("weight")
          .setSolver(solver)
          .fit(datasetWithWeightZeroLabel)
        val actual2 = Vectors.dense(model2.intercept, model2.coefficients(0),
          model2.coefficients(1))
        assert(actual2 ~== Vectors.dense(0.0, 0.0, 0.0) absTol 1e-4)
        idx += 1
      }
    }
  }

  test("regularized linear regression through origin with constant label") {
    // The problem is ill-defined if fitIntercept=false and regParam is non-zero,
    // so an exception is thrown in this case.
    Seq("auto", "l-bfgs", "normal").foreach { solver =>
      for (standardization <- Seq(false, true)) {
        val model = new LinearRegression().setFitIntercept(false)
          .setRegParam(0.1).setStandardization(standardization).setSolver(solver)
        intercept[IllegalArgumentException] {
          model.fit(datasetWithWeightConstantLabel)
        }
      }
    }
  }

  test("linear regression with l-bfgs when training is not needed") {
    // When the label is constant, the l-bfgs solver returns results without training.
    // There are two possibilities: if the label is non-zero but constant and
    // fitIntercept is true, the model returns yMean as the intercept without training.
    // If the label is all zeros, all coefficients are zero regardless of fitIntercept,
    // so no training is needed.
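    // In both shortcut cases the loss is already minimal before any iteration, so the
    // first (and only meaningful) entry of objectiveHistory should be 0.0, which is
    // what the assertions below check.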
    for (fitIntercept <- Seq(false, true)) {
      for (standardization <- Seq(false, true)) {
        val model1 = new LinearRegression()
          .setFitIntercept(fitIntercept)
          .setStandardization(standardization)
          .setWeightCol("weight")
          .setSolver("l-bfgs")
          .fit(datasetWithWeightConstantLabel)
        if (fitIntercept) {
          assert(model1.summary.objectiveHistory(0) ~== 0.0 absTol 1e-4)
        }
        val model2 = new LinearRegression()
          .setFitIntercept(fitIntercept)
          .setWeightCol("weight")
          .setSolver("l-bfgs")
          .fit(datasetWithWeightZeroLabel)
        assert(model2.summary.objectiveHistory(0) ~== 0.0 absTol 1e-4)
      }
    }
  }

  test("linear regression model training summary") {
    Seq("auto", "l-bfgs", "normal").foreach { solver =>
      val trainer = new LinearRegression().setSolver(solver)
      val model = trainer.fit(datasetWithDenseFeature)
      val trainerNoPredictionCol = trainer.setPredictionCol("")
      val modelNoPredictionCol = trainerNoPredictionCol.fit(datasetWithDenseFeature)

      // Training results for the model should be available
      assert(model.hasSummary)
      assert(modelNoPredictionCol.hasSummary)

      // Schema should be a superset of the input dataset
      assert((datasetWithDenseFeature.schema.fieldNames.toSet + "prediction").subsetOf(
        model.summary.predictions.schema.fieldNames.toSet))
      // Validate that we re-insert a prediction column for evaluation
      val modelNoPredictionColFieldNames =
        modelNoPredictionCol.summary.predictions.schema.fieldNames
      assert(datasetWithDenseFeature.schema.fieldNames.toSet.subsetOf(
        modelNoPredictionColFieldNames.toSet))
      assert(modelNoPredictionColFieldNames.exists(s => s.startsWith("prediction_")))

      // Residuals in [[LinearRegressionResults]] should equal those manually computed
      datasetWithDenseFeature.select("features", "label")
        .map { case Row(features: DenseVector, label: Double) =>
          val prediction =
            features(0) * model.coefficients(0) + features(1) * model.coefficients(1) +
              model.intercept
          label - prediction
        }
        .zip(model.summary.residuals.map(_.getDouble(0)))
        .collect()
        .foreach { case (manualResidual: Double, resultResidual: Double) =>
          assert(manualResidual ~== resultResidual relTol 1E-5)
        }

      /*
         # Use the following R code to generate model training results.

         # path/part-00000 is the file generated by running
         # LinearDataGenerator.generateLinearInput as described before the
         # beforeAll() method.
         d1 <- read.csv("path/part-00000", header=FALSE, stringsAsFactors=FALSE)
         fit <- glm(V1 ~ V2 + V3, data = d1, family = "gaussian")
         f1 <- data.frame(as.numeric(d1$V2), as.numeric(d1$V3))
         names(f1)[1] = c("V2")
         names(f1)[2] = c("V3")
         predictions <- predict(fit, newdata=f1)
         l1 <- as.numeric(d1$V1)

         residuals <- l1 - predictions
         > mean(residuals^2)           # MSE
         [1] 0.00985449
         > mean(abs(residuals))        # MAE
         [1] 0.07961668
         > cor(predictions, l1)^2      # r^2
         [1] 0.9998737

         > summary(fit)

         Call:
         glm(formula = V1 ~ V2 + V3, family = "gaussian", data = d1)

         Deviance Residuals:
              Min        1Q    Median        3Q       Max
         -0.47082  -0.06797   0.00002   0.06725   0.34635

         Coefficients:
                      Estimate Std. Error t value Pr(>|t|)
         (Intercept) 6.3022157  0.0018600    3388   <2e-16 ***
         V2          4.6982442  0.0011805    3980   <2e-16 ***
         V3          7.1994344  0.0009044    7961   <2e-16 ***
         ---
         ....
       */
      assert(model.summary.meanSquaredError ~== 0.00985449 relTol 1E-4)
      assert(model.summary.meanAbsoluteError ~== 0.07961668 relTol 1E-4)
      assert(model.summary.r2 ~== 0.9998737 relTol 1E-4)

      // The normal solver uses "WeightedLeastSquares". This algorithm does not generate
      // a meaningful objective history because it does not run through iterations.
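      // For the iterative solver, each accepted L-BFGS step must not increase the
      // loss, so the recorded objective history should be non-increasing; the
      // sliding(2) check below verifies exactly that, pairwise.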
if (solver == "l-bfgs") { // Objective function should be monotonically decreasing for linear regression assert( model.summary .objectiveHistory .sliding(2) .forall(x => x(0) >= x(1))) } else { // To clalify that the normal solver is used here. assert(model.summary.objectiveHistory.length == 1) assert(model.summary.objectiveHistory(0) == 0.0) val devianceResidualsR = Array(-0.47082, 0.34635) val seCoefR = Array(0.0011805, 0.0009044, 0.0018600) val tValsR = Array(3980, 7961, 3388) val pValsR = Array(0, 0, 0) model.summary.devianceResiduals.zip(devianceResidualsR).foreach { x => assert(x._1 ~== x._2 absTol 1E-4) } model.summary.coefficientStandardErrors.zip(seCoefR).foreach{ x => assert(x._1 ~== x._2 absTol 1E-4) } model.summary.tValues.map(_.round).zip(tValsR).foreach{ x => assert(x._1 === x._2) } model.summary.pValues.map(_.round).zip(pValsR).foreach{ x => assert(x._1 === x._2) } } } } test("linear regression model testset evaluation summary") { Seq("auto", "l-bfgs", "normal").foreach { solver => val trainer = new LinearRegression().setSolver(solver) val model = trainer.fit(datasetWithDenseFeature) // Evaluating on training dataset should yield results summary equal to training summary val testSummary = model.evaluate(datasetWithDenseFeature) assert(model.summary.meanSquaredError ~== testSummary.meanSquaredError relTol 1E-5) assert(model.summary.r2 ~== testSummary.r2 relTol 1E-5) model.summary.residuals.select("residuals").collect() .zip(testSummary.residuals.select("residuals").collect()) .forall { case (Row(r1: Double), Row(r2: Double)) => r1 ~== r2 relTol 1E-5 } } } test("linear regression with weighted samples") { Seq("auto", "l-bfgs", "normal").foreach { solver => val (data, weightedData) = { val activeData = LinearDataGenerator.generateLinearInput( 6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 500, 1, 0.1) val rnd = new Random(8392) val signedData = activeData.map { case p: LabeledPoint => (rnd.nextGaussian() > 0.0, p) } val data1 = signedData.flatMap { case (true, p) => Iterator(p, p) case (false, p) => Iterator(p) } val weightedSignedData = signedData.flatMap { case (true, LabeledPoint(label, features)) => Iterator( Instance(label, weight = 1.2, features), Instance(label, weight = 0.8, features) ) case (false, LabeledPoint(label, features)) => Iterator( Instance(label, weight = 0.3, features), Instance(label, weight = 0.1, features), Instance(label, weight = 0.6, features) ) } val noiseData = LinearDataGenerator.generateLinearInput( 2, Array(1, 3), Array(0.9, -1.3), Array(0.7, 1.2), 500, 1, 0.1) val weightedNoiseData = noiseData.map { case LabeledPoint(label, features) => Instance(label, weight = 0, features) } val data2 = weightedSignedData ++ weightedNoiseData (sqlContext.createDataFrame(sc.parallelize(data1, 4)), sqlContext.createDataFrame(sc.parallelize(data2, 4))) } val trainer1a = (new LinearRegression).setFitIntercept(true) .setElasticNetParam(0.0).setRegParam(0.21).setStandardization(true).setSolver(solver) val trainer1b = (new LinearRegression).setFitIntercept(true).setWeightCol("weight") .setElasticNetParam(0.0).setRegParam(0.21).setStandardization(true).setSolver(solver) // Normal optimizer is not supported with non-zero elasticnet parameter. 
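      // How the two datasets line up: points flagged true appear twice in `data` and
      // carry weights 1.2 + 0.8 = 2.0 in `weightedData`; points flagged false appear
      // once and carry weights 0.3 + 0.1 + 0.6 = 1.0; the extra noise points carry
      // weight 0 and should be ignored. A weighted fit on `weightedData` should
      // therefore match an unweighted fit on `data`, while an unweighted fit on
      // `weightedData` (which sees the noise points) should not.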
      val model1a0 = trainer1a.fit(data)
      val model1a1 = trainer1a.fit(weightedData)
      val model1b = trainer1b.fit(weightedData)

      assert(model1a0.coefficients !~= model1a1.coefficients absTol 1E-3)
      assert(model1a0.intercept !~= model1a1.intercept absTol 1E-3)
      assert(model1a0.coefficients ~== model1b.coefficients absTol 1E-3)
      assert(model1a0.intercept ~== model1b.intercept absTol 1E-3)

      val trainer2a = (new LinearRegression).setFitIntercept(true)
        .setElasticNetParam(0.0).setRegParam(0.21).setStandardization(false).setSolver(solver)
      val trainer2b = (new LinearRegression).setFitIntercept(true).setWeightCol("weight")
        .setElasticNetParam(0.0).setRegParam(0.21).setStandardization(false).setSolver(solver)
      val model2a0 = trainer2a.fit(data)
      val model2a1 = trainer2a.fit(weightedData)
      val model2b = trainer2b.fit(weightedData)
      assert(model2a0.coefficients !~= model2a1.coefficients absTol 1E-3)
      assert(model2a0.intercept !~= model2a1.intercept absTol 1E-3)
      assert(model2a0.coefficients ~== model2b.coefficients absTol 1E-3)
      assert(model2a0.intercept ~== model2b.intercept absTol 1E-3)

      val trainer3a = (new LinearRegression).setFitIntercept(false)
        .setElasticNetParam(0.0).setRegParam(0.21).setStandardization(true).setSolver(solver)
      val trainer3b = (new LinearRegression).setFitIntercept(false).setWeightCol("weight")
        .setElasticNetParam(0.0).setRegParam(0.21).setStandardization(true).setSolver(solver)
      val model3a0 = trainer3a.fit(data)
      val model3a1 = trainer3a.fit(weightedData)
      val model3b = trainer3b.fit(weightedData)
      assert(model3a0.coefficients !~= model3a1.coefficients absTol 1E-3)
      assert(model3a0.coefficients ~== model3b.coefficients absTol 1E-3)

      val trainer4a = (new LinearRegression).setFitIntercept(false)
        .setElasticNetParam(0.0).setRegParam(0.21).setStandardization(false).setSolver(solver)
      val trainer4b = (new LinearRegression).setFitIntercept(false).setWeightCol("weight")
        .setElasticNetParam(0.0).setRegParam(0.21).setStandardization(false).setSolver(solver)
      val model4a0 = trainer4a.fit(data)
      val model4a1 = trainer4a.fit(weightedData)
      val model4b = trainer4b.fit(weightedData)
      assert(model4a0.coefficients !~= model4a1.coefficients absTol 1E-3)
      assert(model4a0.coefficients ~== model4b.coefficients absTol 1E-3)
    }
  }

  test("linear regression model with l-bfgs with big feature datasets") {
    val trainer = new LinearRegression().setSolver("auto")
    val model = trainer.fit(datasetWithSparseFeature)

    // Training results for the model should be available
    assert(model.hasSummary)
    // Because this dataset has more than 4096 features, "auto" falls back to L-BFGS,
    // whose objective history can be restored.
    assert(
      model.summary
        .objectiveHistory
        .sliding(2)
        .forall(x => x(0) >= x(1)))
  }
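  // The two summary tests below run only the "normal" solver: standard errors,
  // t-values and p-values are available from the closed-form least-squares fit, and
  // here they are checked against R's glm() with the same case weights
  // w = (1, 2, 3, 4) defined in beforeAll().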
  test("linear regression summary with weighted samples and intercept by normal solver") {
    /*
       R code:

       model <- glm(formula = "b ~ .", data = df, weights = w)
       summary(model)

       Call:
       glm(formula = "b ~ .", data = df, weights = w)

       Deviance Residuals:
           1      2      3      4
       1.920 -1.358 -1.109  0.960

       Coefficients:
                   Estimate Std. Error t value Pr(>|t|)
       (Intercept)   18.080      9.608   1.882    0.311
       V1             6.080      5.556   1.094    0.471
       V2            -0.600      1.960  -0.306    0.811

       (Dispersion parameter for gaussian family taken to be 7.68)

           Null deviance: 202.00  on 3  degrees of freedom
       Residual deviance:   7.68  on 1  degrees of freedom
       AIC: 18.783

       Number of Fisher Scoring iterations: 2
     */
    val model = new LinearRegression()
      .setWeightCol("weight")
      .setSolver("normal")
      .fit(datasetWithWeight)
    val coefficientsR = Vectors.dense(Array(6.080, -0.600))
    val interceptR = 18.080
    val devianceResidualsR = Array(-1.358, 1.920)
    val seCoefR = Array(5.556, 1.960, 9.608)
    val tValsR = Array(1.094, -0.306, 1.882)
    val pValsR = Array(0.471, 0.811, 0.311)

    assert(model.coefficients ~== coefficientsR absTol 1E-3)
    assert(model.intercept ~== interceptR absTol 1E-3)
    model.summary.devianceResiduals.zip(devianceResidualsR).foreach { x =>
      assert(x._1 ~== x._2 absTol 1E-3) }
    model.summary.coefficientStandardErrors.zip(seCoefR).foreach { x =>
      assert(x._1 ~== x._2 absTol 1E-3) }
    model.summary.tValues.zip(tValsR).foreach { x => assert(x._1 ~== x._2 absTol 1E-3) }
    model.summary.pValues.zip(pValsR).foreach { x => assert(x._1 ~== x._2 absTol 1E-3) }
  }

  test("linear regression summary with weighted samples and w/o intercept by normal solver") {
    /*
       R code:

       model <- glm(formula = "b ~ . -1", data = df, weights = w)
       summary(model)

       Call:
       glm(formula = "b ~ . -1", data = df, weights = w)

       Deviance Residuals:
            1       2       3       4
        1.950   2.344  -4.600   2.103

       Coefficients:
          Estimate Std. Error t value Pr(>|t|)
       V1  -3.7271     2.9032  -1.284   0.3279
       V2   3.0100     0.6022   4.998   0.0378 *
       ---
       Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

       (Dispersion parameter for gaussian family taken to be 17.4376)

           Null deviance: 5962.000  on 4  degrees of freedom
       Residual deviance:   34.875  on 2  degrees of freedom
       AIC: 22.835

       Number of Fisher Scoring iterations: 2
     */
    val model = new LinearRegression()
      .setWeightCol("weight")
      .setSolver("normal")
      .setFitIntercept(false)
      .fit(datasetWithWeight)
    val coefficientsR = Vectors.dense(Array(-3.7271, 3.0100))
    val interceptR = 0.0
    val devianceResidualsR = Array(-4.600, 2.344)
    val seCoefR = Array(2.9032, 0.6022)
    val tValsR = Array(-1.284, 4.998)
    val pValsR = Array(0.3279, 0.0378)

    assert(model.coefficients ~== coefficientsR absTol 1E-3)
    assert(model.intercept === interceptR)
    model.summary.devianceResiduals.zip(devianceResidualsR).foreach { x =>
      assert(x._1 ~== x._2 absTol 1E-3) }
    model.summary.coefficientStandardErrors.zip(seCoefR).foreach { x =>
      assert(x._1 ~== x._2 absTol 1E-3) }
    model.summary.tValues.zip(tValsR).foreach { x => assert(x._1 ~== x._2 absTol 1E-3) }
    model.summary.pValues.zip(pValsR).foreach { x => assert(x._1 ~== x._2 absTol 1E-3) }
  }

  test("read/write") {
    def checkModelData(model: LinearRegressionModel, model2: LinearRegressionModel): Unit = {
      assert(model.intercept === model2.intercept)
      assert(model.coefficients === model2.coefficients)
    }
    val lr = new LinearRegression()
    testEstimatorAndModelReadWrite(lr, datasetWithWeight,
      LinearRegressionSuite.allParamSettings, checkModelData)
  }
}

object LinearRegressionSuite {

  /**
   * Mapping from all Params to valid settings which differ from the defaults.
   * This is useful for tests which need to exercise all Params, such as save/load.
   * This excludes input columns, to simplify some tests.
   */
  val allParamSettings: Map[String, Any] = Map(
    "predictionCol" -> "myPrediction",
    "regParam" -> 0.01,
    "elasticNetParam" -> 0.1,
    "maxIter" -> 2,  // intentionally small
    "fitIntercept" -> true,
    "tol" -> 0.8,
    "standardization" -> false,
    "solver" -> "l-bfgs"
  )
}