[SPARK-13613][ML] Provide ignored tests to export test dataset into CSV format

## What changes were proposed in this pull request? Provide ignored test cases to export the test datasets into CSV format in ```LinearRegressionSuite```, ```LogisticRegressionSuite```, ```AFTSurvivalRegressionSuite``` and ```GeneralizedLinearRegressionSuite```, so users can validate the training accuracy against R's glm, glmnet and survival packages. cc mengxr ## How was this patch tested? The test cases are ignored by default, but I have enabled all of them offline and they work as expected. Author: Yanbo Liang <ybliang8@gmail.com> Closes #11463 from yanboliang/spark-13613.
author: Yanbo Liang <ybliang8@gmail.com> 2016-03-16 14:14:15 -0700
committer: Xiangrui Meng <meng@databricks.com> 2016-03-16 14:14:15 -0700
commit: 3f06eb72ca0c3e5779a702c7c677229e0c480751 (patch)
tree: b67196b2211feb9f7f882a78c7d51c63d5bf163a /mllib/src/test/scala/org
parent: ae6c677c8a03174787be99af6238a5e1fbe4e389 (diff)
download: spark-3f06eb72ca0c3e5779a702c7c677229e0c480751.tar.gz
spark-3f06eb72ca0c3e5779a702c7c677229e0c480751.tar.bz2
spark-3f06eb72ca0c3e5779a702c7c677229e0c480751.zip
4 files changed, 97 insertions, 33 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index cfb9bbfd41..afeeaf7fb5 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -23,7 +23,7 @@ import scala.util.Random
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.feature.Instance
 import org.apache.spark.ml.param.ParamsSuite
-import org.apache.spark.ml.util.{DefaultReadWriteTest, Identifiable, MLTestingUtils}
+import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
 import org.apache.spark.mllib.classification.LogisticRegressionSuite._
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -44,20 +44,6 @@ class LogisticRegressionSuite
 
     dataset = sqlContext.createDataFrame(generateLogisticInput(1.0, 1.0, nPoints = 100, seed = 42))
 
-    /*
-       Here is the instruction describing how to export the test data into CSV format
-       so we can validate the training accuracy compared with R's glmnet package.
-
-       import org.apache.spark.mllib.classification.LogisticRegressionSuite
-       val nPoints = 10000
-       val coefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191)
-       val xMean = Array(5.843, 3.057, 3.758, 1.199)
-       val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
-       val data = sc.parallelize(LogisticRegressionSuite.generateMultinomialLogisticInput(
-         coefficients, xMean, xVariance, true, nPoints, 42), 1)
-       data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1) + ", "
-         + x.features(2) + ", " + x.features(3)).saveAsTextFile("path")
-     */
     binaryDataset = {
       val nPoints = 10000
       val coefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191)
@@ -65,12 +51,23 @@ class LogisticRegressionSuite
       val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
 
       val testData =
-        generateMultinomialLogisticInput(coefficients, xMean, xVariance, true, nPoints, 42)
+        generateMultinomialLogisticInput(coefficients, xMean, xVariance,
+          addIntercept = true, nPoints, 42)
 
       sqlContext.createDataFrame(sc.parallelize(testData, 4))
     }
   }
 
+  /**
+   * Enable the ignored test to export the dataset into CSV format,
+   * so we can validate the training accuracy compared with R's glmnet package.
+   */
+  ignore("export test data into CSV format") {
+    binaryDataset.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/binaryDataset")
+  }
+
   test("params") {
     ParamsSuite.checkParams(new LogisticRegression)
     val model = new LogisticRegressionModel("logReg", Vectors.dense(0.0), 0.0)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala
index d718ef63b5..dbd752d2aa 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala
@@ -44,6 +44,19 @@ class AFTSurvivalRegressionSuite
         2, Array(0.9, -1.3), Array(0.7, 1.2), 1000, 42, 1.5, 2.5, 2.0)))
   }
 
+  /**
+   * Enable the ignored test to export the dataset into CSV format,
+   * so we can validate the training accuracy compared with R's survival package.
+   */
+  ignore("export test data into CSV format") {
+    datasetUnivariate.rdd.map { case Row(features: Vector, label: Double, censor: Double) =>
+      features.toArray.mkString(",") + "," + censor + "," + label
+    }.repartition(1).saveAsTextFile("target/tmp/AFTSurvivalRegressionSuite/datasetUnivariate")
+    datasetMultivariate.rdd.map { case Row(features: Vector, label: Double, censor: Double) =>
+      features.toArray.mkString(",") + "," + censor + "," + label
+    }.repartition(1).saveAsTextFile("target/tmp/AFTSurvivalRegressionSuite/datasetMultivariate")
+  }
+
   test("params") {
     ParamsSuite.checkParams(new AFTSurvivalRegression)
     val model = new AFTSurvivalRegressionModel("aftSurvReg", Vectors.dense(0.0), 0.0, 0.0)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
index 6d570f7bde..4ebdbf2213 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
@@ -24,7 +24,7 @@ import org.apache.spark.ml.feature.Instance
 import org.apache.spark.ml.param.ParamsSuite
 import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
 import org.apache.spark.mllib.classification.LogisticRegressionSuite._
-import org.apache.spark.mllib.linalg.{BLAS, DenseVector, Vectors}
+import org.apache.spark.mllib.linalg.{BLAS, DenseVector, Vector, Vectors}
 import org.apache.spark.mllib.random._
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.util.MLlibTestSparkContext
@@ -120,6 +120,53 @@ class GeneralizedLinearRegressionSuite
         family = "gamma", link = "log"), 2))
   }
 
+  /**
+   * Enable the ignored test to export the dataset into CSV format,
+   * so we can validate the training accuracy compared with R's glm and glmnet package.
+   */
+  ignore("export test data into CSV format") {
+    datasetGaussianIdentity.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetGaussianIdentity")
+    datasetGaussianLog.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetGaussianLog")
+    datasetGaussianInverse.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetGaussianInverse")
+    datasetBinomial.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetBinomial")
+    datasetPoissonLog.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetPoissonLog")
+    datasetPoissonIdentity.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetPoissonIdentity")
+    datasetPoissonSqrt.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetPoissonSqrt")
+    datasetGammaInverse.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetGammaInverse")
+    datasetGammaIdentity.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetGammaIdentity")
+    datasetGammaLog.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetGammaLog")
+  }
+
   test("params") {
     ParamsSuite.checkParams(new GeneralizedLinearRegression)
     val model = new GeneralizedLinearRegressionModel("genLinReg", Vectors.dense(0.0), 0.0)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index 9dee04c877..bd45d21e8d 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -40,19 +40,6 @@ class LinearRegressionSuite
   @transient var datasetWithWeightConstantLabel: DataFrame = _
   @transient var datasetWithWeightZeroLabel: DataFrame = _
 
-  /*
-     In `LinearRegressionSuite`, we will make sure that the model trained by SparkML
-     is the same as the one trained by R's glmnet package. The following instruction
-     describes how to reproduce the data in R.
-     In a spark-shell, use the following code:
-
-     import org.apache.spark.mllib.util.LinearDataGenerator
-     val data =
-       sc.parallelize(LinearDataGenerator.generateLinearInput(6.3, Array(4.7, 7.2),
-         Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2)
-     data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1)).coalesce(1)
-       .saveAsTextFile("path")
-   */
   override def beforeAll(): Unit = {
     super.beforeAll()
     datasetWithDenseFeature = sqlContext.createDataFrame(
@@ -60,8 +47,8 @@ class LinearRegressionSuite
         intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3),
         xVariance = Array(0.7, 1.2), nPoints = 10000, seed, eps = 0.1), 2))
     /*
-       datasetWithoutIntercept is not needed for correctness testing but is useful for illustrating
-       training model without intercept
+       datasetWithDenseFeatureWithoutIntercept is not needed for correctness testing
+       but is useful for illustrating training model without intercept
      */
     datasetWithDenseFeatureWithoutIntercept = sqlContext.createDataFrame(
       sc.parallelize(LinearDataGenerator.generateLinearInput(
@@ -119,6 +106,26 @@ class LinearRegressionSuite
       ), 2))
   }
 
+  /**
+   * Enable the ignored test to export the dataset into CSV format,
+   * so we can validate the training accuracy compared with R's glmnet package.
+   */
+  ignore("export test data into CSV format") {
+    datasetWithDenseFeature.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile("target/tmp/LinearRegressionSuite/datasetWithDenseFeature")
+
+    datasetWithDenseFeatureWithoutIntercept.rdd.map {
+      case Row(label: Double, features: Vector) =>
+        label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/LinearRegressionSuite/datasetWithDenseFeatureWithoutIntercept")
+
+    datasetWithSparseFeature.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile("target/tmp/LinearRegressionSuite/datasetWithSparseFeature")
+  }
+
   test("params") {
     ParamsSuite.checkParams(new LinearRegression)
     val model = new LinearRegressionModel("linearReg", Vectors.dense(0.0), 0.0)
@@ -222,7 +229,7 @@ class LinearRegressionSuite
 
       /*
          Then again with the data with no intercept:
-         > coefficientsWithourIntercept
+         > coefficientsWithoutIntercept
           3 x 1 sparse Matrix of class "dgCMatrix"
                                    s0
          (Intercept)           .
author	Yanbo Liang <ybliang8@gmail.com>	2016-03-16 14:14:15 -0700
committer	Xiangrui Meng <meng@databricks.com>	2016-03-16 14:14:15 -0700
commit	3f06eb72ca0c3e5779a702c7c677229e0c480751 (patch)
tree	b67196b2211feb9f7f882a78c7d51c63d5bf163a /mllib/src/test/scala/org
parent	ae6c677c8a03174787be99af6238a5e1fbe4e389 (diff)
download	spark-3f06eb72ca0c3e5779a702c7c677229e0c480751.tar.gz spark-3f06eb72ca0c3e5779a702c7c677229e0c480751.tar.bz2 spark-3f06eb72ca0c3e5779a702c7c677229e0c480751.zip