From 3f06eb72ca0c3e5779a702c7c677229e0c480751 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Wed, 16 Mar 2016 14:14:15 -0700
Subject: [SPARK-13613][ML] Provide ignored tests to export test dataset into
 CSV format

## What changes were proposed in this pull request?

Provide ignored test cases to export the test dataset into CSV format in
```LinearRegressionSuite```, ```LogisticRegressionSuite```,
```AFTSurvivalRegressionSuite``` and ```GeneralizedLinearRegressionSuite```,
so users can validate the training accuracy against R's glm, glmnet and
survival packages.

cc mengxr

## How was this patch tested?

The test cases are ignored, but I have enabled all of them offline and they
work as expected.

Author: Yanbo Liang

Closes #11463 from yanboliang/spark-13613.
---
 .../classification/LogisticRegressionSuite.scala   | 29 +++++------
 .../ml/regression/AFTSurvivalRegressionSuite.scala | 13 ++++++
 .../GeneralizedLinearRegressionSuite.scala         | 49 ++++++++++++++++++++-
 .../ml/regression/LinearRegressionSuite.scala      | 39 +++++++++-------
 4 files changed, 97 insertions(+), 33 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index cfb9bbfd41..afeeaf7fb5 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -23,7 +23,7 @@ import scala.util.Random
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.feature.Instance
 import org.apache.spark.ml.param.ParamsSuite
-import org.apache.spark.ml.util.{DefaultReadWriteTest, Identifiable, MLTestingUtils}
+import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
 import org.apache.spark.mllib.classification.LogisticRegressionSuite._
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -44,20 +44,6 @@ class LogisticRegressionSuite
 
     dataset = sqlContext.createDataFrame(generateLogisticInput(1.0, 1.0, nPoints = 100, seed = 42))
 
-    /*
-       Here is the instruction describing how to export the test data into CSV format
-       so we can validate the training accuracy compared with R's glmnet package.
-
-       import org.apache.spark.mllib.classification.LogisticRegressionSuite
-       val nPoints = 10000
-       val coefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191)
-       val xMean = Array(5.843, 3.057, 3.758, 1.199)
-       val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
-       val data = sc.parallelize(LogisticRegressionSuite.generateMultinomialLogisticInput(
-         coefficients, xMean, xVariance, true, nPoints, 42), 1)
-       data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1) + ", "
-         + x.features(2) + ", " + x.features(3)).saveAsTextFile("path")
-     */
     binaryDataset = {
       val nPoints = 10000
       val coefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191)
@@ -65,12 +51,23 @@ class LogisticRegressionSuite
       val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
 
       val testData =
-        generateMultinomialLogisticInput(coefficients, xMean, xVariance, true, nPoints, 42)
+        generateMultinomialLogisticInput(coefficients, xMean, xVariance,
+          addIntercept = true, nPoints, 42)
 
       sqlContext.createDataFrame(sc.parallelize(testData, 4))
     }
   }
 
+  /**
+   * Enable the ignored test to export the dataset into CSV format,
+   * so we can validate the training accuracy compared with R's glmnet package.
+   */
+  ignore("export test data into CSV format") {
+    binaryDataset.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/binaryDataset")
+  }
+
   test("params") {
     ParamsSuite.checkParams(new LogisticRegression)
     val model = new LogisticRegressionModel("logReg", Vectors.dense(0.0), 0.0)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala
index d718ef63b5..dbd752d2aa 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala
@@ -44,6 +44,19 @@ class AFTSurvivalRegressionSuite
         2, Array(0.9, -1.3), Array(0.7, 1.2), 1000, 42, 1.5, 2.5, 2.0)))
   }
 
+  /**
+   * Enable the ignored test to export the dataset into CSV format,
+   * so we can validate the training accuracy compared with R's survival package.
+   */
+  ignore("export test data into CSV format") {
+    datasetUnivariate.rdd.map { case Row(features: Vector, label: Double, censor: Double) =>
+      features.toArray.mkString(",") + "," + censor + "," + label
+    }.repartition(1).saveAsTextFile("target/tmp/AFTSurvivalRegressionSuite/datasetUnivariate")
+    datasetMultivariate.rdd.map { case Row(features: Vector, label: Double, censor: Double) =>
+      features.toArray.mkString(",") + "," + censor + "," + label
+    }.repartition(1).saveAsTextFile("target/tmp/AFTSurvivalRegressionSuite/datasetMultivariate")
+  }
+
   test("params") {
     ParamsSuite.checkParams(new AFTSurvivalRegression)
     val model = new AFTSurvivalRegressionModel("aftSurvReg", Vectors.dense(0.0), 0.0, 0.0)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
index 6d570f7bde..4ebdbf2213 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
@@ -24,7 +24,7 @@ import org.apache.spark.ml.feature.Instance
 import org.apache.spark.ml.param.ParamsSuite
 import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
 import org.apache.spark.mllib.classification.LogisticRegressionSuite._
-import org.apache.spark.mllib.linalg.{BLAS, DenseVector, Vectors}
+import org.apache.spark.mllib.linalg.{BLAS, DenseVector, Vector, Vectors}
 import org.apache.spark.mllib.random._
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.util.MLlibTestSparkContext
@@ -120,6 +120,53 @@ class GeneralizedLinearRegressionSuite
         family = "gamma", link = "log"), 2))
   }
 
+  /**
+   * Enable the ignored test to export the dataset into CSV format,
+   * so we can validate the training accuracy compared with R's glm and glmnet packages.
+   */
+  ignore("export test data into CSV format") {
+    datasetGaussianIdentity.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetGaussianIdentity")
+    datasetGaussianLog.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetGaussianLog")
+    datasetGaussianInverse.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetGaussianInverse")
+    datasetBinomial.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetBinomial")
+    datasetPoissonLog.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetPoissonLog")
+    datasetPoissonIdentity.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetPoissonIdentity")
+    datasetPoissonSqrt.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetPoissonSqrt")
+    datasetGammaInverse.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetGammaInverse")
+    datasetGammaIdentity.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetGammaIdentity")
+    datasetGammaLog.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetGammaLog")
+  }
+
   test("params") {
     ParamsSuite.checkParams(new GeneralizedLinearRegression)
     val model = new GeneralizedLinearRegressionModel("genLinReg", Vectors.dense(0.0), 0.0)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index 9dee04c877..bd45d21e8d 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -40,19 +40,6 @@ class LinearRegressionSuite
   @transient var datasetWithWeightConstantLabel: DataFrame = _
   @transient var datasetWithWeightZeroLabel: DataFrame = _
 
-  /*
-     In `LinearRegressionSuite`, we will make sure that the model trained by SparkML
-     is the same as the one trained by R's glmnet package. The following instruction
-     describes how to reproduce the data in R.
-     In a spark-shell, use the following code:
-
-     import org.apache.spark.mllib.util.LinearDataGenerator
-     val data =
-       sc.parallelize(LinearDataGenerator.generateLinearInput(6.3, Array(4.7, 7.2),
-         Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2)
-     data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1)).coalesce(1)
-       .saveAsTextFile("path")
-   */
   override def beforeAll(): Unit = {
     super.beforeAll()
     datasetWithDenseFeature = sqlContext.createDataFrame(
@@ -60,8 +47,8 @@ class LinearRegressionSuite
         intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3),
         xVariance = Array(0.7, 1.2), nPoints = 10000, seed, eps = 0.1), 2))
     /*
-       datasetWithoutIntercept is not needed for correctness testing but is useful for illustrating
-       training model without intercept
+       datasetWithDenseFeatureWithoutIntercept is not needed for correctness testing
+       but is useful for illustrating training model without intercept
     */
     datasetWithDenseFeatureWithoutIntercept = sqlContext.createDataFrame(
       sc.parallelize(LinearDataGenerator.generateLinearInput(
@@ -119,6 +106,26 @@ class LinearRegressionSuite
       ), 2))
   }
 
+  /**
+   * Enable the ignored test to export the dataset into CSV format,
+   * so we can validate the training accuracy compared with R's glmnet package.
+   */
+  ignore("export test data into CSV format") {
+    datasetWithDenseFeature.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile("target/tmp/LinearRegressionSuite/datasetWithDenseFeature")
+
+    datasetWithDenseFeatureWithoutIntercept.rdd.map {
+      case Row(label: Double, features: Vector) =>
+        label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/LinearRegressionSuite/datasetWithDenseFeatureWithoutIntercept")
+
+    datasetWithSparseFeature.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile("target/tmp/LinearRegressionSuite/datasetWithSparseFeature")
+  }
+
   test("params") {
     ParamsSuite.checkParams(new LinearRegression)
     val model = new LinearRegressionModel("linearReg", Vectors.dense(0.0), 0.0)
@@ -222,7 +229,7 @@ class LinearRegressionSuite
 
     /*
        Then again with the data with no intercept:
-       > coefficientsWithourIntercept
+       > coefficientsWithoutIntercept
        3 x 1 sparse Matrix of class "dgCMatrix"
                                 s0
        (Intercept)         .
-- 
cgit v1.2.3
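
---

A usage note on the export pattern the new ignored tests share: each test maps a DataFrame of (label, features) rows to one CSV line per instance and writes a single output part so R can read one file. The sketch below reproduces that pattern from a spark-shell of the same vintage, using `LinearDataGenerator.generateLinearInput` with the exact arguments `LinearRegressionSuite` uses. It is a minimal sketch, not part of the patch: it assumes `sc` and `sqlContext` are provided by the shell (as in Spark 1.6), and the output path `target/tmp/example/linearRegressionData` is illustrative; `saveAsTextFile` fails if that directory already exists.

```scala
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.util.LinearDataGenerator
import org.apache.spark.sql.Row

// Generate the same synthetic data LinearRegressionSuite uses:
// intercept 6.3, weights (4.7, 7.2), feature means (0.9, -1.3),
// feature variances (0.7, 1.2), 10000 points, seed 42, noise eps 0.1.
val df = sqlContext.createDataFrame(
  sc.parallelize(LinearDataGenerator.generateLinearInput(
    6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2))

// One CSV line per instance: label first, then the raw feature values.
// repartition(1) collapses the output to a single part file.
df.rdd.map { case Row(label: Double, features: Vector) =>
  label + "," + features.toArray.mkString(",")
}.repartition(1).saveAsTextFile("target/tmp/example/linearRegressionData")
```

The resulting part-00000 file can then be loaded on the R side (for example with `read.csv`) and fed to glmnet for the accuracy comparison the commit message describes.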