path: root/mllib/src/test
author      Xiangrui Meng <meng@databricks.com>    2015-08-06 13:29:31 -0700
committer   Joseph K. Bradley <joseph@databricks.com>    2015-08-06 13:29:31 -0700
commit      54c0789a05a783ce90e0e9848079be442a82966b (patch)
tree        1108e74753dca0409b4e75f481556224a8d4cde0 /mllib/src/test
parent      1f62f104c7a2aeac625b17d9e5ac62f1f10a2b21 (diff)
[SPARK-9493] [ML] add featureIndex to handle vector features in IsotonicRegression
This PR contains the following changes:

* add `featureIndex` to handle vector features (in order to chain isotonic regression easily with output from logistic regression)
* make getter/setter names consistent with params
* remove inheritance from Regressor because it is tricky to handle both `DoubleType` and `VectorType`
* simplify test data generation

jkbradley zapletal-martin

Author: Xiangrui Meng <meng@databricks.com>

Closes #7952 from mengxr/SPARK-9493 and squashes the following commits:

8818ac3 [Xiangrui Meng] address comments
05e2216 [Xiangrui Meng] address comments
8d08090 [Xiangrui Meng] add featureIndex to handle vector features; make getter/setter names consistent with params; remove inheritance from Regressor
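To illustrate the chaining use case from the first bullet, here is a minimal sketch (not part of this patch) of calibrating logistic-regression output with the new vector-feature support. It assumes a `training` DataFrame with "label" and "features" columns and relies on the default "probability" output column of `ml.classification.LogisticRegression`:

    import org.apache.spark.ml.classification.LogisticRegression
    import org.apache.spark.ml.regression.IsotonicRegression

    // Fit a logistic model; transform() appends a vector-valued "probability" column.
    val lrModel = new LogisticRegression().fit(training)  // `training` is an assumed DataFrame
    val scored = lrModel.transform(training)

    // Calibrate the scores: point featuresCol at the probability vector and
    // select the positive-class component with the new featureIndex param.
    val ir = new IsotonicRegression()
      .setFeaturesCol("probability")
      .setFeatureIndex(1)
      .setLabelCol("label")
    val calibrated = ir.fit(scored).transform(scored)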
Diffstat (limited to 'mllib/src/test')
-rw-r--r--  mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala  82
1 file changed, 48 insertions(+), 34 deletions(-)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala
index 66e4b170ba..c0ab00b68a 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala
@@ -19,57 +19,46 @@ package org.apache.spark.ml.regression
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
+import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLlibTestSparkContext
-import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row}
class IsotonicRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
- private val schema = StructType(
- Array(
- StructField("label", DoubleType),
- StructField("features", DoubleType),
- StructField("weight", DoubleType)))
-
- private val predictionSchema = StructType(Array(StructField("features", DoubleType)))
-
private def generateIsotonicInput(labels: Seq[Double]): DataFrame = {
- val data = Seq.tabulate(labels.size)(i => Row(labels(i), i.toDouble, 1d))
- val parallelData = sc.parallelize(data)
-
- sqlContext.createDataFrame(parallelData, schema)
+ sqlContext.createDataFrame(
+ labels.zipWithIndex.map { case (label, i) => (label, i.toDouble, 1.0) }
+ ).toDF("label", "features", "weight")
}
private def generatePredictionInput(features: Seq[Double]): DataFrame = {
- val data = Seq.tabulate(features.size)(i => Row(features(i)))
-
- val parallelData = sc.parallelize(data)
- sqlContext.createDataFrame(parallelData, predictionSchema)
+ sqlContext.createDataFrame(features.map(Tuple1.apply))
+ .toDF("features")
}
test("isotonic regression predictions") {
val dataset = generateIsotonicInput(Seq(1, 2, 3, 1, 6, 17, 16, 17, 18))
- val trainer = new IsotonicRegression().setIsotonicParam(true)
+ val ir = new IsotonicRegression().setIsotonic(true)
- val model = trainer.fit(dataset)
+ val model = ir.fit(dataset)
val predictions = model
.transform(dataset)
- .select("prediction").map {
- case Row(pred) => pred
+ .select("prediction").map { case Row(pred) =>
+ pred
}.collect()
assert(predictions === Array(1, 2, 2, 2, 6, 16.5, 16.5, 17, 18))
- assert(model.parentModel.boundaries === Array(0, 1, 3, 4, 5, 6, 7, 8))
- assert(model.parentModel.predictions === Array(1, 2, 2, 6, 16.5, 16.5, 17.0, 18.0))
- assert(model.parentModel.isotonic)
+ assert(model.boundaries === Vectors.dense(0, 1, 3, 4, 5, 6, 7, 8))
+ assert(model.predictions === Vectors.dense(1, 2, 2, 6, 16.5, 16.5, 17.0, 18.0))
+ assert(model.getIsotonic)
}
test("antitonic regression predictions") {
val dataset = generateIsotonicInput(Seq(7, 5, 3, 5, 1))
- val trainer = new IsotonicRegression().setIsotonicParam(false)
+ val ir = new IsotonicRegression().setIsotonic(false)
- val model = trainer.fit(dataset)
+ val model = ir.fit(dataset)
val features = generatePredictionInput(Seq(-2.0, -1.0, 0.5, 0.75, 1.0, 2.0, 9.0))
val predictions = model
@@ -94,9 +83,10 @@ class IsotonicRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
val ir = new IsotonicRegression()
assert(ir.getLabelCol === "label")
assert(ir.getFeaturesCol === "features")
- assert(ir.getWeightCol === "weight")
assert(ir.getPredictionCol === "prediction")
- assert(ir.getIsotonicParam === true)
+ assert(!ir.isDefined(ir.weightCol))
+ assert(ir.getIsotonic)
+ assert(ir.getFeatureIndex === 0)
val model = ir.fit(dataset)
model.transform(dataset)
@@ -105,21 +95,22 @@ class IsotonicRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
assert(model.getLabelCol === "label")
assert(model.getFeaturesCol === "features")
- assert(model.getWeightCol === "weight")
assert(model.getPredictionCol === "prediction")
- assert(model.getIsotonicParam === true)
+ assert(!model.isDefined(model.weightCol))
+ assert(model.getIsotonic)
+ assert(model.getFeatureIndex === 0)
assert(model.hasParent)
}
test("set parameters") {
val isotonicRegression = new IsotonicRegression()
- .setIsotonicParam(false)
- .setWeightParam("w")
+ .setIsotonic(false)
+ .setWeightCol("w")
.setFeaturesCol("f")
.setLabelCol("l")
.setPredictionCol("p")
- assert(isotonicRegression.getIsotonicParam === false)
+ assert(!isotonicRegression.getIsotonic)
assert(isotonicRegression.getWeightCol === "w")
assert(isotonicRegression.getFeaturesCol === "f")
assert(isotonicRegression.getLabelCol === "l")
@@ -130,7 +121,7 @@ class IsotonicRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
val dataset = generateIsotonicInput(Seq(1, 2, 3))
intercept[IllegalArgumentException] {
- new IsotonicRegression().setWeightParam("w").fit(dataset)
+ new IsotonicRegression().setWeightCol("w").fit(dataset)
}
intercept[IllegalArgumentException] {
@@ -145,4 +136,27 @@ class IsotonicRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
new IsotonicRegression().fit(dataset).setFeaturesCol("f").transform(dataset)
}
}
+
+ test("vector features column with feature index") {
+ val dataset = sqlContext.createDataFrame(Seq(
+ (4.0, Vectors.dense(0.0, 1.0)),
+ (3.0, Vectors.dense(0.0, 2.0)),
+ (5.0, Vectors.sparse(2, Array(1), Array(3.0))))
+ ).toDF("label", "features")
+
+ val ir = new IsotonicRegression()
+ .setFeatureIndex(1)
+
+ val model = ir.fit(dataset)
+
+ val features = generatePredictionInput(Seq(2.0, 3.0, 4.0, 5.0))
+
+ val predictions = model
+ .transform(features)
+ .select("prediction").map {
+ case Row(pred) => pred
+ }.collect()
+
+ assert(predictions === Array(3.5, 5.0, 5.0, 5.0))
+ }
}
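For reference, the "simplify test data generation" change boils down to letting `createDataFrame` infer the schema from Scala tuples instead of building `Row`s against an explicit `StructType`. A minimal sketch, assuming a `SQLContext` named `sqlContext` is in scope (as it is in the suite via MLlibTestSparkContext):

    // Old style (removed above): explicit schema plus an RDD of Rows.
    //   val schema = StructType(Array(
    //     StructField("label", DoubleType),
    //     StructField("features", DoubleType),
    //     StructField("weight", DoubleType)))
    //   val df = sqlContext.createDataFrame(sc.parallelize(rows), schema)

    // New style: schema inferred from the tuple types, columns named via toDF.
    val labels = Seq(1.0, 2.0, 3.0)
    val df = sqlContext.createDataFrame(
      labels.zipWithIndex.map { case (label, i) => (label, i.toDouble, 1.0) }
    ).toDF("label", "features", "weight")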