author     Holden Karau <holden@pigscanfly.ca>  2015-10-08 11:16:20 -0700
committer  Joseph K. Bradley <joseph@databricks.com>  2015-10-08 11:16:20 -0700
commit     0903c6489e9fa39db9575dace22a64015b9cd4c5 (patch)
tree       99f20da94f24788d753fb3d0ce9771b49b9eecb9 /mllib/src
parent     dcbd58a929be0058b1cfa59b14898c4c428a7680 (diff)
[SPARK-9718] [ML] linear regression training summary all columns
LinearRegression training summary: the transformed dataset should hold all columns, not just selected ones like prediction and label. There is no real need to drop any of them, and the user may find them useful.

Author: Holden Karau <holden@pigscanfly.ca>

Closes #8564 from holdenk/SPARK-9718-LinearRegressionTrainingSummary-all-columns.
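As a rough illustration of the effect (a minimal sketch, not part of the patch; it assumes a DataFrame `training` with "features" and "label" columns plus extra columns such as an "id"):

    import org.apache.spark.ml.regression.LinearRegression

    // `training` is assumed to exist with "features", "label" and extra columns.
    val lr = new LinearRegression()
    val model = lr.fit(training)

    // After this change, the summary keeps every input column and adds the
    // prediction column, instead of holding only prediction and label.
    model.summary.predictions.printSchema()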
Diffstat (limited to 'mllib/src')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala       | 35
-rw-r--r--  mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala  | 13
2 files changed, 40 insertions(+), 8 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 0dc084fdd1..dd09667ef5 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -170,9 +170,12 @@ class LinearRegression(override val uid: String)
val intercept = yMean
val model = new LinearRegressionModel(uid, coefficients, intercept)
+ // Handle possible missing or invalid prediction columns
+ val (summaryModel, predictionColName) = model.findSummaryModelAndPredictionCol()
+
val trainingSummary = new LinearRegressionTrainingSummary(
- model.transform(dataset),
- $(predictionCol),
+ summaryModel.transform(dataset),
+ predictionColName,
$(labelCol),
$(featuresCol),
Array(0D))
@@ -262,9 +265,12 @@ class LinearRegression(override val uid: String)
if (handlePersistence) instances.unpersist()
val model = copyValues(new LinearRegressionModel(uid, coefficients, intercept))
+ // Handle possible missing or invalid prediction columns
+ val (summaryModel, predictionColName) = model.findSummaryModelAndPredictionCol()
+
val trainingSummary = new LinearRegressionTrainingSummary(
- model.transform(dataset),
- $(predictionCol),
+ summaryModel.transform(dataset),
+ predictionColName,
$(labelCol),
$(featuresCol),
objectiveHistory)
@@ -316,13 +322,26 @@ class LinearRegressionModel private[ml] (
*/
// TODO: decide on a good name before exposing to public API
private[regression] def evaluate(dataset: DataFrame): LinearRegressionSummary = {
- val t = udf { features: Vector => predict(features) }
- val predictionAndObservations = dataset
- .select(col($(labelCol)), t(col($(featuresCol))).as($(predictionCol)))
+ // Handle possible missing or invalid prediction columns
+ val (summaryModel, predictionColName) = findSummaryModelAndPredictionCol()
+ new LinearRegressionSummary(summaryModel.transform(dataset), predictionColName, $(labelCol))
+ }
- new LinearRegressionSummary(predictionAndObservations, $(predictionCol), $(labelCol))
+ /**
+ * If the prediction column is set returns the current model and prediction column,
+ * otherwise generates a new column and sets it as the prediction column on a new copy
+ * of the current model.
+ */
+ private[regression] def findSummaryModelAndPredictionCol(): (LinearRegressionModel, String) = {
+ $(predictionCol) match {
+ case "" =>
+ val predictionColName = "prediction_" + java.util.UUID.randomUUID.toString()
+ (copy(ParamMap.empty).setPredictionCol(predictionColName), predictionColName)
+ case p => (this, p)
+ }
}
+
override protected def predict(features: Vector): Double = {
dot(features, weights) + intercept
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index 32729470d5..73a0a5caf8 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -462,9 +462,22 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
test("linear regression model training summary") {
val trainer = new LinearRegression
val model = trainer.fit(dataset)
+ val trainerNoPredictionCol = trainer.setPredictionCol("")
+ val modelNoPredictionCol = trainerNoPredictionCol.fit(dataset)
+
// Training results for the model should be available
assert(model.hasSummary)
+ assert(modelNoPredictionCol.hasSummary)
+
+ // Schema should be a superset of the input dataset
+ assert((dataset.schema.fieldNames.toSet + "prediction").subsetOf(
+ model.summary.predictions.schema.fieldNames.toSet))
+ // Validate that we re-insert a prediction column for evaluation
+ val modelNoPredictionColFieldNames = modelNoPredictionCol.summary.predictions.schema.fieldNames
+ assert((dataset.schema.fieldNames.toSet).subsetOf(
+ modelNoPredictionColFieldNames.toSet))
+ assert(modelNoPredictionColFieldNames.exists(s => s.startsWith("prediction_")))
// Residuals in [[LinearRegressionResults]] should equal those manually computed
val expectedResiduals = dataset.select("features", "label")