Diffstat (limited to 'mllib/src/main')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala |  9
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala                         | 17
2 files changed, 23 insertions, 3 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
index f71726f110..a1d36c4bec 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
@@ -29,6 +29,8 @@ import org.apache.spark.sql.types.DoubleType
/**
* :: Experimental ::
* Evaluator for binary classification, which expects two input columns: rawPrediction and label.
+ * The rawPrediction column can be of type double (binary 0/1 prediction, or probability of label 1)
+ * or of type vector (length-2 vector of raw predictions, scores, or label probabilities).
*/
@Since("1.2.0")
@Experimental
@@ -78,13 +80,14 @@ class BinaryClassificationEvaluator @Since("1.4.0") (@Since("1.4.0") override va
@Since("1.2.0")
override def evaluate(dataset: DataFrame): Double = {
val schema = dataset.schema
- SchemaUtils.checkColumnType(schema, $(rawPredictionCol), new VectorUDT)
+ SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT))
SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)
// TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2.
val scoreAndLabels = dataset.select($(rawPredictionCol), $(labelCol))
- .map { case Row(rawPrediction: Vector, label: Double) =>
- (rawPrediction(1), label)
+ .map {
+ case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label)
+ case Row(rawPrediction: Double, label: Double) => (rawPrediction, label)
}
val metrics = new BinaryClassificationMetrics(scoreAndLabels)
val metric = $(metricName) match {
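With this change, the evaluator accepts a plain double score in rawPredictionCol in addition to the previous vector-only input. A minimal usage sketch of the new behavior (assumes an existing SQLContext named `sqlContext`; the data and column names are illustrative, not part of the patch):

    import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

    // Double scores (e.g. probability of label 1), not length-2 vectors.
    val scored = sqlContext.createDataFrame(Seq(
      (0.9, 1.0), (0.2, 0.0), (0.7, 1.0), (0.4, 0.0)
    )).toDF("rawPrediction", "label")

    val evaluator = new BinaryClassificationEvaluator()
      .setRawPredictionCol("rawPrediction")
      .setMetricName("areaUnderROC")
    // Before this patch, a DoubleType rawPrediction column failed the schema check.
    val auc = evaluator.evaluate(scored)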
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
index 76f651488a..e71dd9eee0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
@@ -44,6 +44,23 @@ private[spark] object SchemaUtils {
}
/**
+ * Check whether the given schema contains a column of one of the required data types.
+ * @param colName column name
+ * @param dataTypes required column data types
+ */
+ def checkColumnTypes(
+ schema: StructType,
+ colName: String,
+ dataTypes: Seq[DataType],
+ msg: String = ""): Unit = {
+ val actualDataType = schema(colName).dataType
+ val message = if (msg != null && msg.trim.length > 0) " " + msg else ""
+ require(dataTypes.exists(actualDataType.equals),
+ s"Column $colName must be of type equal to one of the following types: " +
+ s"${dataTypes.mkString("[", ", ", "]")} but was actually of type $actualDataType.$message")
+ }
+
+ /**
* Appends a new column to the input schema. This fails if the given output column already exists.
* @param schema input schema
* @param colName new column name. If this column name is an empty string "", this method returns
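checkColumnTypes generalizes the existing checkColumnType to a set of accepted types. A hedged sketch of how it behaves (SchemaUtils is private[spark], so it is only callable from Spark's own code; the schema below is illustrative):

    import org.apache.spark.ml.util.SchemaUtils
    import org.apache.spark.mllib.linalg.VectorUDT
    import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

    val schema = StructType(Seq(StructField("rawPrediction", DoubleType)))
    // Passes: DoubleType is one of the accepted types.
    SchemaUtils.checkColumnTypes(schema, "rawPrediction", Seq(DoubleType, new VectorUDT))
    // A StringType column would instead throw IllegalArgumentException with a
    // message listing the accepted types and the actual type.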