about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Joseph K. Bradley <joseph@databricks.com> 2015-06-30 14:02:50 -0700
committer Xiangrui Meng <meng@databricks.com> 2015-06-30 14:02:57 -0700
commit bc355e24368123baca5335ddf5560ded1da11141 (patch)
tree fd2c880cf7fccb35e524e3b3a62c3482fcaa5536
parent 80b0fe2009dea98a6a09b7cf43590a555c638cad (diff)
download spark-bc355e24368123baca5335ddf5560ded1da11141.tar.gz
spark-bc355e24368123baca5335ddf5560ded1da11141.tar.bz2
spark-bc355e24368123baca5335ddf5560ded1da11141.zip
[SPARK-8736] [ML] GBTRegressor should not threshold prediction
Changed GBTRegressor so it does NOT threshold the prediction. Added test which fails with bug but works after fix.

CC: feynmanliang mengxr

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #7134 from jkbradley/gbrt-fix and squashes the following commits:

613b90e [Joseph K. Bradley] Changed GBTRegressor so it does NOT threshold the prediction

(cherry picked from commit 3ba23ffd377d12383d923d1550ac8e2b916090fc)
Signed-off-by: Xiangrui Meng <meng@databricks.com>
-rw-r--r-- mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala 3
-rw-r--r-- mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala 23
2 files changed, 23 insertions, 3 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
index 036e3acb07..47c110d027 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
@@ -172,8 +172,7 @@ final class GBTRegressionModel(
// TODO: When we add a generic Boosting class, handle transform there? SPARK-7129
// Predicts by summing the weighted tree predictions (regression: the sum is NOT thresholded)
val treePredictions = _trees.map(_.rootNode.predict(features))
- val prediction = blas.ddot(numTrees, treePredictions, 1, _treeWeights, 1)
- if (prediction > 0.0) 1.0 else 0.0
+ blas.ddot(numTrees, treePredictions, 1, _treeWeights, 1)
}
override def copy(extra: ParamMap): GBTRegressionModel = {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
index 98fb3d3f5f..9682edcd9b 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
@@ -19,12 +19,13 @@ package org.apache.spark.ml.regression
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.impl.TreeTests
+import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.{EnsembleTestHelper, GradientBoostedTrees => OldGBT}
import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.{DataFrame, Row}
/**
@@ -67,6 +68,26 @@ class GBTRegressorSuite extends SparkFunSuite with MLlibTestSparkContext {
}
}
+ test("GBTRegressor behaves reasonably on toy data") { // regression test for SPARK-8736
+ val df = sqlContext.createDataFrame(Seq( // six toy rows; labels are either 9..11 or -6..-4
+ LabeledPoint(10, Vectors.dense(1, 2, 3, 4)),
+ LabeledPoint(-5, Vectors.dense(6, 3, 2, 1)),
+ LabeledPoint(11, Vectors.dense(2, 2, 3, 4)),
+ LabeledPoint(-6, Vectors.dense(6, 4, 2, 1)),
+ LabeledPoint(9, Vectors.dense(1, 2, 6, 4)),
+ LabeledPoint(-4, Vectors.dense(6, 3, 2, 2))
+ ))
+ val gbt = new GBTRegressor()
+ .setMaxDepth(2)
+ .setMaxIter(2)
+ val model = gbt.fit(df)
+ val preds = model.transform(df) // scores the same rows the model was fit on
+ val predictions = preds.select("prediction").map(_.getDouble(0)) // column 0 of the selection is "prediction"
+ // Checks based on SPARK-8736: ensure the model returns real-valued predictions rather than a thresholded 0.0/1.0 class
+ assert(predictions.max() > 2) // a thresholded output could be at most 1.0
+ assert(predictions.min() < -1) // a thresholded output could be at least 0.0
+ }
+
// TODO: Reinstate test once runWithValidation is implemented SPARK-7132
/*
test("runWithValidation stops early and performs better on a validation dataset") {