path: root/mllib/src/test
author      Xiangrui Meng <meng@databricks.com>    2015-08-06 13:29:31 -0700
committer   Joseph K. Bradley <joseph@databricks.com>    2015-08-06 13:29:31 -0700
commit      54c0789a05a783ce90e0e9848079be442a82966b (patch)
tree        1108e74753dca0409b4e75f481556224a8d4cde0 /mllib/src/test
parent      1f62f104c7a2aeac625b17d9e5ac62f1f10a2b21 (diff)
[SPARK-9493] [ML] add featureIndex to handle vector features in IsotonicRegression
This PR contains the following changes:

* add `featureIndex` to handle vector features (in order to chain isotonic regression easily with output from logistic regression)
* make getter/setter names consistent with params
* remove inheritance from Regressor because it is tricky to handle both `DoubleType` and `VectorType`
* simplify test data generation

jkbradley zapletal-martin

Author: Xiangrui Meng <meng@databricks.com>

Closes #7952 from mengxr/SPARK-9493 and squashes the following commits:

8818ac3 [Xiangrui Meng] address comments
05e2216 [Xiangrui Meng] address comments
8d08090 [Xiangrui Meng] add featureIndex to handle vector features; make getter/setter names consistent with params; remove inheritance from Regressor
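To illustrate the chaining use case from the first bullet, here is a minimal sketch (not part of this patch) of calibrating logistic-regression output with the new vector-feature support. It assumes a `training` DataFrame with "label" and "features" columns and relies on the default "probability" output column of `ml.classification.LogisticRegression`:

    import org.apache.spark.ml.classification.LogisticRegression
    import org.apache.spark.ml.regression.IsotonicRegression

    // Fit a logistic model; transform() appends a vector-valued "probability" column.
    val lrModel = new LogisticRegression().fit(training)  // `training` is an assumed DataFrame
    val scored = lrModel.transform(training)

    // Calibrate the scores: point featuresCol at the probability vector and
    // select the positive-class component with the new featureIndex param.
    val ir = new IsotonicRegression()
      .setFeaturesCol("probability")
      .setFeatureIndex(1)
      .setLabelCol("label")
    val calibrated = ir.fit(scored).transform(scored)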
Diffstat (limited to 'mllib/src/test')
-rw-r--r--  mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala  82
1 file changed, 48 insertions(+), 34 deletions(-)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala
index 66e4b170ba..c0ab00b68a 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala
@@ -19,57 +19,46 @@ package org.apache.spark.ml.regression
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
+import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLlibTestSparkContext
-import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row}
class IsotonicRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
- private val schema = StructType(
- Array(
- StructField("label", DoubleType),
- StructField("features", DoubleType),
- StructField("weight", DoubleType)))
-
- private val predictionSchema = StructType(Array(StructField("features", DoubleType)))
-
private def generateIsotonicInput(labels: Seq[Double]): DataFrame = {
- val data = Seq.tabulate(labels.size)(i => Row(labels(i), i.toDouble, 1d))
- val parallelData = sc.parallelize(data)
-
- sqlContext.createDataFrame(parallelData, schema)
+ sqlContext.createDataFrame(
+ labels.zipWithIndex.map { case (label, i) => (label, i.toDouble, 1.0) }
+ ).toDF("label", "features", "weight")
}
private def generatePredictionInput(features: Seq[Double]): DataFrame = {
- val data = Seq.tabulate(features.size)(i => Row(features(i)))
-
- val parallelData = sc.parallelize(data)
- sqlContext.createDataFrame(parallelData, predictionSchema)
+ sqlContext.createDataFrame(features.map(Tuple1.apply))
+ .toDF("features")
}
test("isotonic regression predictions") {
val dataset = generateIsotonicInput(Seq(1, 2, 3, 1, 6, 17, 16, 17, 18))
- val trainer = new IsotonicRegression().setIsotonicParam(true)
+ val ir = new IsotonicRegression().setIsotonic(true)
- val model = trainer.fit(dataset)
+ val model = ir.fit(dataset)
val predictions = model
.transform(dataset)
- .select("prediction").map {
- case Row(pred) => pred
+ .select("prediction").map { case Row(pred) =>
+ pred
}.collect()
assert(predictions === Array(1, 2, 2, 2, 6, 16.5, 16.5, 17, 18))
- assert(model.parentModel.boundaries === Array(0, 1, 3, 4, 5, 6, 7, 8))
- assert(model.parentModel.predictions === Array(1, 2, 2, 6, 16.5, 16.5, 17.0, 18.0))
- assert(model.parentModel.isotonic)
+ assert(model.boundaries === Vectors.dense(0, 1, 3, 4, 5, 6, 7, 8))
+ assert(model.predictions === Vectors.dense(1, 2, 2, 6, 16.5, 16.5, 17.0, 18.0))
+ assert(model.getIsotonic)
}
test("antitonic regression predictions") {
val dataset = generateIsotonicInput(Seq(7, 5, 3, 5, 1))
- val trainer = new IsotonicRegression().setIsotonicParam(false)
+ val ir = new IsotonicRegression().setIsotonic(false)
- val model = trainer.fit(dataset)
+ val model = ir.fit(dataset)
val features = generatePredictionInput(Seq(-2.0, -1.0, 0.5, 0.75, 1.0, 2.0, 9.0))
val predictions = model
@@ -94,9 +83,10 @@ class IsotonicRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
val ir = new IsotonicRegression()
assert(ir.getLabelCol === "label")
assert(ir.getFeaturesCol === "features")
- assert(ir.getWeightCol === "weight")
assert(ir.getPredictionCol === "prediction")
- assert(ir.getIsotonicParam === true)
+ assert(!ir.isDefined(ir.weightCol))
+ assert(ir.getIsotonic)
+ assert(ir.getFeatureIndex === 0)
val model = ir.fit(dataset)
model.transform(dataset)
@@ -105,21 +95,22 @@ class IsotonicRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
assert(model.getLabelCol === "label")
assert(model.getFeaturesCol === "features")
- assert(model.getWeightCol === "weight")
assert(model.getPredictionCol === "prediction")
- assert(model.getIsotonicParam === true)
+ assert(!model.isDefined(model.weightCol))
+ assert(model.getIsotonic)
+ assert(model.getFeatureIndex === 0)
assert(model.hasParent)
}
test("set parameters") {
val isotonicRegression = new IsotonicRegression()
- .setIsotonicParam(false)
- .setWeightParam("w")
+ .setIsotonic(false)
+ .setWeightCol("w")
.setFeaturesCol("f")
.setLabelCol("l")
.setPredictionCol("p")
- assert(isotonicRegression.getIsotonicParam === false)
+ assert(!isotonicRegression.getIsotonic)
assert(isotonicRegression.getWeightCol === "w")
assert(isotonicRegression.getFeaturesCol === "f")
assert(isotonicRegression.getLabelCol === "l")
@@ -130,7 +121,7 @@ class IsotonicRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
val dataset = generateIsotonicInput(Seq(1, 2, 3))
intercept[IllegalArgumentException] {
- new IsotonicRegression().setWeightParam("w").fit(dataset)
+ new IsotonicRegression().setWeightCol("w").fit(dataset)
}
intercept[IllegalArgumentException] {
@@ -145,4 +136,27 @@ class IsotonicRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
new IsotonicRegression().fit(dataset).setFeaturesCol("f").transform(dataset)
}
}
+
+ test("vector features column with feature index") {
+ val dataset = sqlContext.createDataFrame(Seq(
+ (4.0, Vectors.dense(0.0, 1.0)),
+ (3.0, Vectors.dense(0.0, 2.0)),
+ (5.0, Vectors.sparse(2, Array(1), Array(3.0))))
+ ).toDF("label", "features")
+
+ val ir = new IsotonicRegression()
+ .setFeatureIndex(1)
+
+ val model = ir.fit(dataset)
+
+ val features = generatePredictionInput(Seq(2.0, 3.0, 4.0, 5.0))
+
+ val predictions = model
+ .transform(features)
+ .select("prediction").map {
+ case Row(pred) => pred
+ }.collect()
+
+ assert(predictions === Array(3.5, 5.0, 5.0, 5.0))
+ }
}
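For reference, the "simplify test data generation" change boils down to letting `createDataFrame` infer the schema from Scala tuples instead of building `Row`s against an explicit `StructType`. A minimal sketch, assuming a `SQLContext` named `sqlContext` is in scope (as it is in the suite via MLlibTestSparkContext):

    // Old style (removed above): explicit schema plus an RDD of Rows.
    //   val schema = StructType(Array(
    //     StructField("label", DoubleType),
    //     StructField("features", DoubleType),
    //     StructField("weight", DoubleType)))
    //   val df = sqlContext.createDataFrame(sc.parallelize(rows), schema)

    // New style: schema inferred from the tuple types, columns named via toDF.
    val labels = Seq(1.0, 2.0, 3.0)
    val df = sqlContext.createDataFrame(
      labels.zipWithIndex.map { case (label, i) => (label, i.toDouble, 1.0) }
    ).toDF("label", "features", "weight")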