diff options
author | Lewuathe <lewuathe@me.com> | 2015-10-30 02:59:05 -0700 |
---|---|---|
committer | DB Tsai <dbt@netflix.com> | 2015-10-30 02:59:05 -0700 |
commit | 86d65265fcab7edab88a7bdb10acba47da95bcb3 (patch) | |
tree | bd634fc275041e52bb056a5c58b77117c7ccc7b8 /mllib | |
parent | eb59b94c450fe6391d24d44ff7ea9bd4c6893af8 (diff) | |
download | spark-86d65265fcab7edab88a7bdb10acba47da95bcb3.tar.gz spark-86d65265fcab7edab88a7bdb10acba47da95bcb3.tar.bz2 spark-86d65265fcab7edab88a7bdb10acba47da95bcb3.zip |
[SPARK-11207] [ML] Add test cases for solver selection of LinearRegression as followup.
This is the follow-up work of SPARK-10668.
* Fix minor style issues.
* Add test case for checking whether solver is selected properly.
Author: Lewuathe <lewuathe@me.com>
Author: lewuathe <lewuathe@me.com>
Closes #9180 from Lewuathe/SPARK-11207.
Diffstat (limited to 'mllib')
-rw-r--r-- | mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala | 54 | ||||
-rw-r--r-- | mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala | 172 |
2 files changed, 144 insertions, 82 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index d0ba454f37..6ff07eed6c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -77,13 +77,11 @@ object LinearDataGenerator { nPoints: Int, seed: Int, eps: Double = 0.1): Seq[LabeledPoint] = { - generateLinearInput(intercept, weights, - Array.fill[Double](weights.length)(0.0), - Array.fill[Double](weights.length)(1.0 / 3.0), - nPoints, seed, eps)} + generateLinearInput(intercept, weights, Array.fill[Double](weights.length)(0.0), + Array.fill[Double](weights.length)(1.0 / 3.0), nPoints, seed, eps) + } /** - * * @param intercept Data intercept * @param weights Weights to be applied. * @param xMean the mean of the generated features. Lots of time, if the features are not properly @@ -104,16 +102,49 @@ object LinearDataGenerator { nPoints: Int, seed: Int, eps: Double): Seq[LabeledPoint] = { + generateLinearInput(intercept, weights, xMean, xVariance, nPoints, seed, eps, 0.0) + } + + /** + * @param intercept Data intercept + * @param weights Weights to be applied. + * @param xMean the mean of the generated features. Lots of time, if the features are not properly + * standardized, the algorithm with poor implementation will have difficulty + * to converge. + * @param xVariance the variance of the generated features. + * @param nPoints Number of points in sample. + * @param seed Random seed + * @param eps Epsilon scaling factor. + * @param sparsity The ratio of zero elements. If it is 0.0, LabeledPoints with + * DenseVector is returned. + * @return Seq of input. 
+ */ + @Since("1.6.0") + def generateLinearInput( + intercept: Double, + weights: Array[Double], + xMean: Array[Double], + xVariance: Array[Double], + nPoints: Int, + seed: Int, + eps: Double, + sparsity: Double): Seq[LabeledPoint] = { + require(0.0 <= sparsity && sparsity <= 1.0) val rnd = new Random(seed) val x = Array.fill[Array[Double]](nPoints)( Array.fill[Double](weights.length)(rnd.nextDouble())) + val sparseRnd = new Random(seed) x.foreach { v => var i = 0 val len = v.length while (i < len) { - v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i) + if (sparseRnd.nextDouble() < sparsity) { + v(i) = 0.0 + } else { + v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i) + } i += 1 } } @@ -121,7 +152,16 @@ object LinearDataGenerator { val y = x.map { xi => blas.ddot(weights.length, xi, 1, weights, 1) + intercept + eps * rnd.nextGaussian() } - y.zip(x).map(p => LabeledPoint(p._1, Vectors.dense(p._2))) + + y.zip(x).map { p => + if (sparsity == 0.0) { + // Return LabeledPoints with DenseVector + LabeledPoint(p._1, Vectors.dense(p._2)) + } else { + // Return LabeledPoints with SparseVector + LabeledPoint(p._1, Vectors.dense(p._2).toSparse) + } + } } /** diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index a6e0c72ba9..a2a5c0bbdc 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -32,8 +32,9 @@ import org.apache.spark.sql.{DataFrame, Row} class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { private val seed: Int = 42 - @transient var dataset: DataFrame = _ - @transient var datasetWithoutIntercept: DataFrame = _ + @transient var datasetWithDenseFeature: DataFrame = _ + @transient var datasetWithDenseFeatureWithoutIntercept: DataFrame = _ + @transient var 
datasetWithSparseFeature: DataFrame = _ /* In `LinearRegressionSuite`, we will make sure that the model trained by SparkML @@ -49,16 +50,29 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { */ override def beforeAll(): Unit = { super.beforeAll() - dataset = sqlContext.createDataFrame( + datasetWithDenseFeature = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( - 6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, seed, 0.1), 2)) + intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3), + xVariance = Array(0.7, 1.2), nPoints = 10000, seed, eps = 0.1), 2)) /* datasetWithoutIntercept is not needed for correctness testing but is useful for illustrating training model without intercept */ - datasetWithoutIntercept = sqlContext.createDataFrame( + datasetWithDenseFeatureWithoutIntercept = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( - 0.0, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, seed, 0.1), 2)) + intercept = 0.0, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3), + xVariance = Array(0.7, 1.2), nPoints = 10000, seed, eps = 0.1), 2)) + + val r = new Random(seed) + // When feature size is larger than 4096, normal optimizer is choosed + // as the solver of linear regression in the case of "auto" mode. 
+ val featureSize = 4100 + datasetWithSparseFeature = sqlContext.createDataFrame( + sc.parallelize(LinearDataGenerator.generateLinearInput( + intercept = 0.0, weights = Seq.fill(featureSize)(r.nextDouble).toArray, + xMean = Seq.fill(featureSize)(r.nextDouble).toArray, + xVariance = Seq.fill(featureSize)(r.nextDouble).toArray, nPoints = 200, + seed, eps = 0.1, sparsity = 0.7), 2)) } test("params") { @@ -77,19 +91,19 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(lir.getFitIntercept) assert(lir.getStandardization) assert(lir.getSolver == "auto") - val model = lir.fit(dataset) + val model = lir.fit(datasetWithDenseFeature) // copied model must have the same parent. MLTestingUtils.checkCopy(model) - model.transform(dataset) + model.transform(datasetWithDenseFeature) .select("label", "prediction") .collect() assert(model.getFeaturesCol === "features") assert(model.getPredictionCol === "prediction") assert(model.intercept !== 0.0) assert(model.hasParent) - val numFeatures = dataset.select("features").first().getAs[Vector](0).size + val numFeatures = datasetWithDenseFeature.select("features").first().getAs[Vector](0).size assert(model.numFeatures === numFeatures) } @@ -98,8 +112,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val trainer1 = new LinearRegression().setSolver(solver) // The result should be the same regardless of standardization without regularization val trainer2 = (new LinearRegression).setStandardization(false).setSolver(solver) - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* Using the following R code to load the data and train the model using glmnet package. 
@@ -124,7 +138,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR relTol 1E-3) assert(model2.weights ~= weightsR relTol 1E-3) - model1.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept @@ -139,10 +153,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { // Without regularization the results should be the same val trainer2 = (new LinearRegression).setFitIntercept(false).setStandardization(false) .setSolver(solver) - val model1 = trainer1.fit(dataset) - val modelWithoutIntercept1 = trainer1.fit(datasetWithoutIntercept) - val model2 = trainer2.fit(dataset) - val modelWithoutIntercept2 = trainer2.fit(datasetWithoutIntercept) + val model1 = trainer1.fit(datasetWithDenseFeature) + val modelWithoutIntercept1 = trainer1.fit(datasetWithDenseFeatureWithoutIntercept) + val model2 = trainer2.fit(datasetWithDenseFeature) + val modelWithoutIntercept2 = trainer2.fit(datasetWithDenseFeatureWithoutIntercept) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0, @@ -186,19 +200,15 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val trainer2 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) .setSolver(solver).setStandardization(false) - var model1: LinearRegressionModel = null - var model2: LinearRegressionModel = null - // Normal optimizer is not supported with only L1 regularization case. 
if (solver == "normal") { intercept[IllegalArgumentException] { - trainer1.fit(dataset) - trainer2.fit(dataset) + trainer1.fit(datasetWithDenseFeature) + trainer2.fit(datasetWithDenseFeature) } } else { - model1 = trainer1.fit(dataset) - model2 = trainer2.fit(dataset) - + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57)) @@ -230,11 +240,12 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 relTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(dataset).select("features", "prediction").collect().foreach { - case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept - assert(prediction1 ~== prediction2 relTol 1E-5) + model1.transform(datasetWithDenseFeature).select("features", "prediction") + .collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + + model1.intercept + assert(prediction1 ~== prediction2 relTol 1E-5) } } } @@ -247,18 +258,15 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val trainer2 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) .setFitIntercept(false).setStandardization(false).setSolver(solver) - var model1: LinearRegressionModel = null - var model2: LinearRegressionModel = null - // Normal optimizer is not supported with only L1 regularization case. 
if (solver == "normal") { intercept[IllegalArgumentException] { - trainer1.fit(dataset) - trainer2.fit(dataset) + trainer1.fit(datasetWithDenseFeature) + trainer2.fit(datasetWithDenseFeature) } } else { - model1 = trainer1.fit(dataset) - model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57, @@ -292,11 +300,12 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 absTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(dataset).select("features", "prediction").collect().foreach { - case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept - assert(prediction1 ~== prediction2 relTol 1E-5) + model1.transform(datasetWithDenseFeature).select("features", "prediction") + .collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + + model1.intercept + assert(prediction1 ~== prediction2 relTol 1E-5) } } } @@ -308,8 +317,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { .setSolver(solver) val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) .setStandardization(false).setSolver(solver) - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3)) @@ -342,7 +351,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 relTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - 
model1.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept @@ -357,8 +366,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { .setFitIntercept(false).setSolver(solver) val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) .setFitIntercept(false).setStandardization(false).setSolver(solver) - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3, @@ -392,7 +401,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 absTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept @@ -408,18 +417,15 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val trainer2 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) .setStandardization(false).setSolver(solver) - var model1: LinearRegressionModel = null - var model2: LinearRegressionModel = null - // Normal optimizer is not supported with non-zero elasticnet parameter. 
if (solver == "normal") { intercept[IllegalArgumentException] { - trainer1.fit(dataset) - trainer2.fit(dataset) + trainer1.fit(datasetWithDenseFeature) + trainer2.fit(datasetWithDenseFeature) } } else { - model1 = trainer1.fit(dataset) - model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6)) @@ -452,10 +458,11 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 relTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction") + .collect().foreach { case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept + val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + + model1.intercept assert(prediction1 ~== prediction2 relTol 1E-5) } } @@ -469,18 +476,15 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val trainer2 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) .setFitIntercept(false).setStandardization(false).setSolver(solver) - var model1: LinearRegressionModel = null - var model2: LinearRegressionModel = null - // Normal optimizer is not supported with non-zero elasticnet parameter. 
if (solver == "normal") { intercept[IllegalArgumentException] { - trainer1.fit(dataset) - trainer2.fit(dataset) + trainer1.fit(datasetWithDenseFeature) + trainer2.fit(datasetWithDenseFeature) } } else { - model1 = trainer1.fit(dataset) - model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6, @@ -514,10 +518,11 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 absTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction") + .collect().foreach { case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept + val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + + model1.intercept assert(prediction1 ~== prediction2 relTol 1E-5) } } @@ -527,27 +532,26 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { test("linear regression model training summary") { Seq("auto", "l-bfgs", "normal").foreach { solver => val trainer = new LinearRegression().setSolver(solver) - val model = trainer.fit(dataset) + val model = trainer.fit(datasetWithDenseFeature) val trainerNoPredictionCol = trainer.setPredictionCol("") - val modelNoPredictionCol = trainerNoPredictionCol.fit(dataset) - + val modelNoPredictionCol = trainerNoPredictionCol.fit(datasetWithDenseFeature) // Training results for the model should be available assert(model.hasSummary) assert(modelNoPredictionCol.hasSummary) // Schema should be a superset of the input dataset - assert((dataset.schema.fieldNames.toSet + "prediction").subsetOf( + 
assert((datasetWithDenseFeature.schema.fieldNames.toSet + "prediction").subsetOf( model.summary.predictions.schema.fieldNames.toSet)) // Validate that we re-insert a prediction column for evaluation val modelNoPredictionColFieldNames = modelNoPredictionCol.summary.predictions.schema.fieldNames - assert((dataset.schema.fieldNames.toSet).subsetOf( + assert((datasetWithDenseFeature.schema.fieldNames.toSet).subsetOf( modelNoPredictionColFieldNames.toSet)) assert(modelNoPredictionColFieldNames.exists(s => s.startsWith("prediction_"))) // Residuals in [[LinearRegressionResults]] should equal those manually computed - val expectedResiduals = dataset.select("features", "label") + val expectedResiduals = datasetWithDenseFeature.select("features", "label") .map { case Row(features: DenseVector, label: Double) => val prediction = features(0) * model.weights(0) + features(1) * model.weights(1) + model.intercept @@ -585,6 +589,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { .objectiveHistory .sliding(2) .forall(x => x(0) >= x(1))) + } else { + // To clalify that the normal solver is used here. 
+ assert(model.summary.objectiveHistory.length == 1) + assert(model.summary.objectiveHistory(0) == 0.0) } } } @@ -592,10 +600,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { test("linear regression model testset evaluation summary") { Seq("auto", "l-bfgs", "normal").foreach { solver => val trainer = new LinearRegression().setSolver(solver) - val model = trainer.fit(dataset) + val model = trainer.fit(datasetWithDenseFeature) // Evaluating on training dataset should yield results summary equal to training summary - val testSummary = model.evaluate(dataset) + val testSummary = model.evaluate(datasetWithDenseFeature) assert(model.summary.meanSquaredError ~== testSummary.meanSquaredError relTol 1E-5) assert(model.summary.r2 ~== testSummary.r2 relTol 1E-5) model.summary.residuals.select("residuals").collect() @@ -693,4 +701,18 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model4a0.weights ~== model4b.weights absTol 1E-3) } } + + test("linear regression model with l-bfgs with big feature datasets") { + val trainer = new LinearRegression().setSolver("auto") + val model = trainer.fit(datasetWithSparseFeature) + + // Training results for the model should be available + assert(model.hasSummary) + // When LBFGS is used as optimizer, objective history can be restored. + assert( + model.summary + .objectiveHistory + .sliding(2) + .forall(x => x(0) >= x(1))) + } } |