aboutsummaryrefslogtreecommitdiff
path: root/mllib/src/test/scala/org
diff options
context:
space:
mode:
authorYanbo Liang <ybliang8@gmail.com>2016-03-16 14:14:15 -0700
committerXiangrui Meng <meng@databricks.com>2016-03-16 14:14:15 -0700
commit3f06eb72ca0c3e5779a702c7c677229e0c480751 (patch)
treeb67196b2211feb9f7f882a78c7d51c63d5bf163a /mllib/src/test/scala/org
parentae6c677c8a03174787be99af6238a5e1fbe4e389 (diff)
downloadspark-3f06eb72ca0c3e5779a702c7c677229e0c480751.tar.gz
spark-3f06eb72ca0c3e5779a702c7c677229e0c480751.tar.bz2
spark-3f06eb72ca0c3e5779a702c7c677229e0c480751.zip
[SPARK-13613][ML] Provide ignored tests to export test dataset into CSV format
## What changes were proposed in this pull request? Provide ignored test cases to export the test datasets into CSV format in ```LinearRegressionSuite```, ```LogisticRegressionSuite```, ```AFTSurvivalRegressionSuite``` and ```GeneralizedLinearRegressionSuite```, so users can validate the training accuracy compared with R's glm, glmnet and survival packages. cc mengxr ## How was this patch tested? The test cases are ignored, but I have enabled all these cases offline and they work as expected. Author: Yanbo Liang <ybliang8@gmail.com> Closes #11463 from yanboliang/spark-13613.
Diffstat (limited to 'mllib/src/test/scala/org')
-rw-r--r--mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala29
-rw-r--r--mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala13
-rw-r--r--mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala49
-rw-r--r--mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala39
4 files changed, 97 insertions, 33 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index cfb9bbfd41..afeeaf7fb5 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -23,7 +23,7 @@ import scala.util.Random
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.Instance
import org.apache.spark.ml.param.ParamsSuite
-import org.apache.spark.ml.util.{DefaultReadWriteTest, Identifiable, MLTestingUtils}
+import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.classification.LogisticRegressionSuite._
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
@@ -44,20 +44,6 @@ class LogisticRegressionSuite
dataset = sqlContext.createDataFrame(generateLogisticInput(1.0, 1.0, nPoints = 100, seed = 42))
- /*
- Here is the instruction describing how to export the test data into CSV format
- so we can validate the training accuracy compared with R's glmnet package.
-
- import org.apache.spark.mllib.classification.LogisticRegressionSuite
- val nPoints = 10000
- val coefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191)
- val xMean = Array(5.843, 3.057, 3.758, 1.199)
- val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
- val data = sc.parallelize(LogisticRegressionSuite.generateMultinomialLogisticInput(
- coefficients, xMean, xVariance, true, nPoints, 42), 1)
- data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1) + ", "
- + x.features(2) + ", " + x.features(3)).saveAsTextFile("path")
- */
binaryDataset = {
val nPoints = 10000
val coefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191)
@@ -65,12 +51,23 @@ class LogisticRegressionSuite
val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
val testData =
- generateMultinomialLogisticInput(coefficients, xMean, xVariance, true, nPoints, 42)
+ generateMultinomialLogisticInput(coefficients, xMean, xVariance,
+ addIntercept = true, nPoints, 42)
sqlContext.createDataFrame(sc.parallelize(testData, 4))
}
}
+ /**
+ * Enable the ignored test to export the dataset into CSV format,
+ * so we can validate the training accuracy compared with R's glmnet package.
+ */
+ ignore("export test data into CSV format") {
+ binaryDataset.rdd.map { case Row(label: Double, features: Vector) =>
+ label + "," + features.toArray.mkString(",")
+ }.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/binaryDataset")
+ }
+
test("params") {
ParamsSuite.checkParams(new LogisticRegression)
val model = new LogisticRegressionModel("logReg", Vectors.dense(0.0), 0.0)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala
index d718ef63b5..dbd752d2aa 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala
@@ -44,6 +44,19 @@ class AFTSurvivalRegressionSuite
2, Array(0.9, -1.3), Array(0.7, 1.2), 1000, 42, 1.5, 2.5, 2.0)))
}
+ /**
+ * Enable the ignored test to export the dataset into CSV format,
+ * so we can validate the training accuracy compared with R's survival package.
+ */
+ ignore("export test data into CSV format") {
+ datasetUnivariate.rdd.map { case Row(features: Vector, label: Double, censor: Double) =>
+ features.toArray.mkString(",") + "," + censor + "," + label
+ }.repartition(1).saveAsTextFile("target/tmp/AFTSurvivalRegressionSuite/datasetUnivariate")
+ datasetMultivariate.rdd.map { case Row(features: Vector, label: Double, censor: Double) =>
+ features.toArray.mkString(",") + "," + censor + "," + label
+ }.repartition(1).saveAsTextFile("target/tmp/AFTSurvivalRegressionSuite/datasetMultivariate")
+ }
+
test("params") {
ParamsSuite.checkParams(new AFTSurvivalRegression)
val model = new AFTSurvivalRegressionModel("aftSurvReg", Vectors.dense(0.0), 0.0, 0.0)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
index 6d570f7bde..4ebdbf2213 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
@@ -24,7 +24,7 @@ import org.apache.spark.ml.feature.Instance
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.classification.LogisticRegressionSuite._
-import org.apache.spark.mllib.linalg.{BLAS, DenseVector, Vectors}
+import org.apache.spark.mllib.linalg.{BLAS, DenseVector, Vector, Vectors}
import org.apache.spark.mllib.random._
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLlibTestSparkContext
@@ -120,6 +120,53 @@ class GeneralizedLinearRegressionSuite
family = "gamma", link = "log"), 2))
}
+ /**
+ * Enable the ignored test to export the dataset into CSV format,
+ * so we can validate the training accuracy compared with R's glm and glmnet package.
+ */
+ ignore("export test data into CSV format") {
+ datasetGaussianIdentity.rdd.map { case Row(label: Double, features: Vector) =>
+ label + "," + features.toArray.mkString(",")
+ }.repartition(1).saveAsTextFile(
+ "target/tmp/GeneralizedLinearRegressionSuite/datasetGaussianIdentity")
+ datasetGaussianLog.rdd.map { case Row(label: Double, features: Vector) =>
+ label + "," + features.toArray.mkString(",")
+ }.repartition(1).saveAsTextFile(
+ "target/tmp/GeneralizedLinearRegressionSuite/datasetGaussianLog")
+ datasetGaussianInverse.rdd.map { case Row(label: Double, features: Vector) =>
+ label + "," + features.toArray.mkString(",")
+ }.repartition(1).saveAsTextFile(
+ "target/tmp/GeneralizedLinearRegressionSuite/datasetGaussianInverse")
+ datasetBinomial.rdd.map { case Row(label: Double, features: Vector) =>
+ label + "," + features.toArray.mkString(",")
+ }.repartition(1).saveAsTextFile(
+ "target/tmp/GeneralizedLinearRegressionSuite/datasetBinomial")
+ datasetPoissonLog.rdd.map { case Row(label: Double, features: Vector) =>
+ label + "," + features.toArray.mkString(",")
+ }.repartition(1).saveAsTextFile(
+ "target/tmp/GeneralizedLinearRegressionSuite/datasetPoissonLog")
+ datasetPoissonIdentity.rdd.map { case Row(label: Double, features: Vector) =>
+ label + "," + features.toArray.mkString(",")
+ }.repartition(1).saveAsTextFile(
+ "target/tmp/GeneralizedLinearRegressionSuite/datasetPoissonIdentity")
+ datasetPoissonSqrt.rdd.map { case Row(label: Double, features: Vector) =>
+ label + "," + features.toArray.mkString(",")
+ }.repartition(1).saveAsTextFile(
+ "target/tmp/GeneralizedLinearRegressionSuite/datasetPoissonSqrt")
+ datasetGammaInverse.rdd.map { case Row(label: Double, features: Vector) =>
+ label + "," + features.toArray.mkString(",")
+ }.repartition(1).saveAsTextFile(
+ "target/tmp/GeneralizedLinearRegressionSuite/datasetGammaInverse")
+ datasetGammaIdentity.rdd.map { case Row(label: Double, features: Vector) =>
+ label + "," + features.toArray.mkString(",")
+ }.repartition(1).saveAsTextFile(
+ "target/tmp/GeneralizedLinearRegressionSuite/datasetGammaIdentity")
+ datasetGammaLog.rdd.map { case Row(label: Double, features: Vector) =>
+ label + "," + features.toArray.mkString(",")
+ }.repartition(1).saveAsTextFile(
+ "target/tmp/GeneralizedLinearRegressionSuite/datasetGammaLog")
+ }
+
test("params") {
ParamsSuite.checkParams(new GeneralizedLinearRegression)
val model = new GeneralizedLinearRegressionModel("genLinReg", Vectors.dense(0.0), 0.0)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index 9dee04c877..bd45d21e8d 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -40,19 +40,6 @@ class LinearRegressionSuite
@transient var datasetWithWeightConstantLabel: DataFrame = _
@transient var datasetWithWeightZeroLabel: DataFrame = _
- /*
- In `LinearRegressionSuite`, we will make sure that the model trained by SparkML
- is the same as the one trained by R's glmnet package. The following instruction
- describes how to reproduce the data in R.
- In a spark-shell, use the following code:
-
- import org.apache.spark.mllib.util.LinearDataGenerator
- val data =
- sc.parallelize(LinearDataGenerator.generateLinearInput(6.3, Array(4.7, 7.2),
- Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2)
- data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1)).coalesce(1)
- .saveAsTextFile("path")
- */
override def beforeAll(): Unit = {
super.beforeAll()
datasetWithDenseFeature = sqlContext.createDataFrame(
@@ -60,8 +47,8 @@ class LinearRegressionSuite
intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3),
xVariance = Array(0.7, 1.2), nPoints = 10000, seed, eps = 0.1), 2))
/*
- datasetWithoutIntercept is not needed for correctness testing but is useful for illustrating
- training model without intercept
+ datasetWithDenseFeatureWithoutIntercept is not needed for correctness testing
+ but is useful for illustrating training model without intercept
*/
datasetWithDenseFeatureWithoutIntercept = sqlContext.createDataFrame(
sc.parallelize(LinearDataGenerator.generateLinearInput(
@@ -119,6 +106,26 @@ class LinearRegressionSuite
), 2))
}
+ /**
+ * Enable the ignored test to export the dataset into CSV format,
+ * so we can validate the training accuracy compared with R's glmnet package.
+ */
+ ignore("export test data into CSV format") {
+ datasetWithDenseFeature.rdd.map { case Row(label: Double, features: Vector) =>
+ label + "," + features.toArray.mkString(",")
+ }.repartition(1).saveAsTextFile("target/tmp/LinearRegressionSuite/datasetWithDenseFeature")
+
+ datasetWithDenseFeatureWithoutIntercept.rdd.map {
+ case Row(label: Double, features: Vector) =>
+ label + "," + features.toArray.mkString(",")
+ }.repartition(1).saveAsTextFile(
+ "target/tmp/LinearRegressionSuite/datasetWithDenseFeatureWithoutIntercept")
+
+ datasetWithSparseFeature.rdd.map { case Row(label: Double, features: Vector) =>
+ label + "," + features.toArray.mkString(",")
+ }.repartition(1).saveAsTextFile("target/tmp/LinearRegressionSuite/datasetWithSparseFeature")
+ }
+
test("params") {
ParamsSuite.checkParams(new LinearRegression)
val model = new LinearRegressionModel("linearReg", Vectors.dense(0.0), 0.0)
@@ -222,7 +229,7 @@ class LinearRegressionSuite
/*
Then again with the data with no intercept:
- > coefficientsWithourIntercept
+ > coefficientsWithoutIntercept
3 x 1 sparse Matrix of class "dgCMatrix"
s0
(Intercept) .