author     BenFradet <benjamin.fradet@gmail.com>    2016-04-01 18:25:43 -0700
committer  Joseph K. Bradley <joseph@databricks.com>    2016-04-01 18:25:43 -0700
commit     36e8fb8005eccea67a9dea8cf68ec3105aa43351 (patch)
tree       98cc84e42e9c5472752e6f73022ed3e2ec2b7778
parent     abc6c42c2d76d6aa4a8cac605cb4214e8f8f3328 (diff)
[SPARK-7425][ML] spark.ml Predictor should support other numeric types for label
Currently, the Predictor abstraction expects the input labelCol type to be DoubleType, but we should support other numeric types. This will involve updating the PredictorParams.validateAndTransformSchema method.

Author: BenFradet <benjamin.fradet@gmail.com>

Closes #10355 from BenFradet/SPARK-7425.
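In effect, any NumericType label column is now accepted at fit time and cast to DoubleType internally. A minimal sketch of the new behavior (assuming the pre-2.0 DataFrame API and an in-scope sqlContext, as used throughout this patch; the example itself is not part of the commit):

    import org.apache.spark.ml.regression.LinearRegression
    import org.apache.spark.mllib.linalg.Vectors

    // Labels are Int, not Double; before this patch, fit() failed the schema
    // check with "Column label must be of type DoubleType but was actually IntegerType."
    val training = sqlContext.createDataFrame(Seq(
      (0, Vectors.dense(0.0, 1.1)),
      (1, Vectors.dense(2.0, 1.0)),
      (0, Vectors.dense(2.0, 1.3))
    )).toDF("label", "features")

    // With this patch, the IntegerType label passes checkNumericType and is
    // cast to DoubleType inside extractLabeledPoints.
    val model = new LinearRegression().setMaxIter(5).fit(training)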
 mllib/src/main/scala/org/apache/spark/ml/Predictor.scala                                          |  9
 mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala                  |  7
 mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala                           |  4
 mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala                   | 11
 mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala             | 13
 mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala                      |  4
 mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala                        | 11
 mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala                                   | 24
 mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala         | 15
 mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala                  |  9
 mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala             | 11
 mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala | 12
 mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala                     | 14
 mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala                      | 16
 mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala         |  8
 mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala              |  9
 mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala              |  8
 mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala                       |  8
 mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala        | 12
 mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala                 |  9
 mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala                   | 17
 mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala              |  8
 mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala                                | 18
 mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala                                | 86
 24 files changed, 294 insertions(+), 49 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
index ebe48700f8..d23ae6f794 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
@@ -36,6 +36,7 @@ private[ml] trait PredictorParams extends Params
/**
* Validates and transforms the input schema with the provided param map.
+ *
* @param schema input schema
* @param fitting whether this is in fitting
* @param featuresDataType SQL DataType for FeaturesType.
@@ -49,8 +50,7 @@ private[ml] trait PredictorParams extends Params
// TODO: Support casting Array[Double] and Array[Float] to Vector when FeaturesType = Vector
SchemaUtils.checkColumnType(schema, $(featuresCol), featuresDataType)
if (fitting) {
- // TODO: Allow other numeric types
- SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)
+ SchemaUtils.checkNumericType(schema, $(labelCol))
}
SchemaUtils.appendColumn(schema, $(predictionCol), DoubleType)
}
@@ -121,9 +121,8 @@ abstract class Predictor[
* and put it in an RDD with strong types.
*/
protected def extractLabeledPoints(dataset: DataFrame): RDD[LabeledPoint] = {
- dataset.select($(labelCol), $(featuresCol)).rdd.map {
- case Row(label: Double, features: Vector) =>
- LabeledPoint(label, features)
+ dataset.select(col($(labelCol)).cast(DoubleType), col($(featuresCol))).rdd.map {
+ case Row(label: Double, features: Vector) => LabeledPoint(label, features)
}
}
}
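This cast-at-the-boundary idiom recurs throughout the patch: rather than widening the pattern match to every numeric type, the label column is cast to DoubleType once in the select, so the existing Row(label: Double, ...) extractor keeps working. A sketch of the idiom in isolation (df and the column names are illustrative, not part of the patch):

    import org.apache.spark.mllib.linalg.Vector
    import org.apache.spark.sql.Row
    import org.apache.spark.sql.functions.col
    import org.apache.spark.sql.types.DoubleType

    // Cast once at the boundary; everything downstream sees plain Doubles.
    val pairs = df.select(col("label").cast(DoubleType), col("features")).rdd.map {
      case Row(label: Double, features: Vector) => (label, features)
    }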
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 3d1d5b6892..aeb94a6600 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -38,6 +38,7 @@ import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions.{col, lit}
+import org.apache.spark.sql.types.DoubleType
import org.apache.spark.storage.StorageLevel
/**
@@ -265,7 +266,7 @@ class LogisticRegression @Since("1.2.0") (
LogisticRegressionModel = {
val w = if ($(weightCol).isEmpty) lit(1.0) else col($(weightCol))
val instances: RDD[Instance] =
- dataset.select(col($(labelCol)), w, col($(featuresCol))).rdd.map {
+ dataset.select(col($(labelCol)).cast(DoubleType), w, col($(featuresCol))).rdd.map {
case Row(label: Double, weight: Double, features: Vector) =>
Instance(label, weight, features)
}
@@ -361,7 +362,7 @@ class LogisticRegression @Since("1.2.0") (
if (optInitialModel.isDefined && optInitialModel.get.coefficients.size != numFeatures) {
val vec = optInitialModel.get.coefficients
logWarning(
- s"Initial coefficients provided ${vec} did not match the expected size ${numFeatures}")
+ s"Initial coefficients provided $vec did not match the expected size $numFeatures")
}
if (optInitialModel.isDefined && optInitialModel.get.coefficients.size == numFeatures) {
@@ -522,7 +523,7 @@ class LogisticRegressionModel private[spark] (
(LogisticRegressionModel, String) = {
$(probabilityCol) match {
case "" =>
- val probabilityColName = "probability_" + java.util.UUID.randomUUID.toString()
+ val probabilityColName = "probability_" + java.util.UUID.randomUUID.toString
(copy(ParamMap.empty).setProbabilityCol(probabilityColName), probabilityColName)
case p => (this, p)
}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
index 98b99a3485..263d54ce4d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
@@ -295,10 +295,12 @@ final class OneVsRest @Since("1.4.0") (
@Since("1.4.0")
override def fit(dataset: DataFrame): OneVsRestModel = {
+ transformSchema(dataset.schema)
+
// determine number of classes either from metadata if provided, or via computation.
val labelSchema = dataset.schema($(labelCol))
val computeNumClasses: () => Int = () => {
- val Row(maxLabelIndex: Double) = dataset.agg(max($(labelCol))).head()
+ val Row(maxLabelIndex: Double) = dataset.agg(max(col($(labelCol)).cast(DoubleType))).head()
// classes are assumed to be numbered from 0,...,maxLabelIndex
maxLabelIndex.toInt + 1
}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
index ba5708ab8d..3278974954 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
@@ -103,7 +103,7 @@ private[regression] trait AFTSurvivalRegressionParams extends Params
SchemaUtils.checkColumnType(schema, $(featuresCol), new VectorUDT)
if (fitting) {
SchemaUtils.checkColumnType(schema, $(censorCol), DoubleType)
- SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)
+ SchemaUtils.checkNumericType(schema, $(labelCol))
}
if (hasQuantilesCol) {
SchemaUtils.appendColumn(schema, $(quantilesCol), new VectorUDT)
@@ -184,10 +184,11 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S
* and put it in an RDD with strong types.
*/
protected[ml] def extractAFTPoints(dataset: DataFrame): RDD[AFTPoint] = {
- dataset.select($(featuresCol), $(labelCol), $(censorCol)).rdd.map {
- case Row(features: Vector, label: Double, censor: Double) =>
- AFTPoint(features, label, censor)
- }
+ dataset.select(col($(featuresCol)), col($(labelCol)).cast(DoubleType), col($(censorCol)))
+ .rdd.map {
+ case Row(features: Vector, label: Double, censor: Double) =>
+ AFTPoint(features, label, censor)
+ }
}
@Since("1.6.0")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
index 0e71e8d8e1..a40d3731cb 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
@@ -33,7 +33,7 @@ import org.apache.spark.mllib.linalg.{BLAS, Vector}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions._
-import org.apache.spark.sql.types.{DataType, StructType}
+import org.apache.spark.sql.types.{DataType, DoubleType, StructType}
/**
* Params for Generalized Linear Regression.
@@ -47,6 +47,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam
* to be used in the model.
* Supported options: "gaussian", "binomial", "poisson" and "gamma".
* Default is "gaussian".
+ *
* @group param
*/
@Since("2.0.0")
@@ -63,6 +64,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam
* Param for the name of link function which provides the relationship
* between the linear predictor and the mean of the distribution function.
* Supported options: "identity", "log", "inverse", "logit", "probit", "cloglog" and "sqrt".
+ *
* @group param
*/
@Since("2.0.0")
@@ -210,9 +212,10 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
}
val w = if ($(weightCol).isEmpty) lit(1.0) else col($(weightCol))
- val instances: RDD[Instance] = dataset.select(col($(labelCol)), w, col($(featuresCol))).rdd
- .map { case Row(label: Double, weight: Double, features: Vector) =>
- Instance(label, weight, features)
+ val instances: RDD[Instance] =
+ dataset.select(col($(labelCol)).cast(DoubleType), w, col($(featuresCol))).rdd.map {
+ case Row(label: Double, weight: Double, features: Vector) =>
+ Instance(label, weight, features)
}
if (familyObj == Gaussian && linkObj == Identity) {
@@ -698,7 +701,7 @@ class GeneralizedLinearRegressionModel private[ml] (
: (GeneralizedLinearRegressionModel, String) = {
$(predictionCol) match {
case "" =>
- val predictionColName = "prediction_" + java.util.UUID.randomUUID.toString()
+ val predictionColName = "prediction_" + java.util.UUID.randomUUID.toString
(copy(ParamMap.empty).setPredictionCol(predictionColName), predictionColName)
case p => (this, p)
}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
index fb733f9a34..bd0b631d89 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
@@ -90,7 +90,7 @@ private[regression] trait IsotonicRegressionBase extends Params with HasFeatures
} else {
lit(1.0)
}
- dataset.select(col($(labelCol)), f, w).rdd.map {
+ dataset.select(col($(labelCol)).cast(DoubleType), f, w).rdd.map {
case Row(label: Double, feature: Double, weight: Double) =>
(label, feature, weight)
}
@@ -106,7 +106,7 @@ private[regression] trait IsotonicRegressionBase extends Params with HasFeatures
schema: StructType,
fitting: Boolean): StructType = {
if (fitting) {
- SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)
+ SchemaUtils.checkNumericType(schema, $(labelCol))
if (hasWeightCol) {
SchemaUtils.checkColumnType(schema, $(weightCol), DoubleType)
} else {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 5ec02135cc..ba5ad4c072 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -40,6 +40,7 @@ import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types.DoubleType
import org.apache.spark.storage.StorageLevel
/**
@@ -171,7 +172,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
// For low dimensional data, WeightedLeastSquares is more efficient since the
// training algorithm only requires one pass through the data. (SPARK-10668)
val instances: RDD[Instance] = dataset.select(
- col($(labelCol)), w, col($(featuresCol))).rdd.map {
+ col($(labelCol)).cast(DoubleType), w, col($(featuresCol))).rdd.map {
case Row(label: Double, weight: Double, features: Vector) =>
Instance(label, weight, features)
}
@@ -431,7 +432,7 @@ class LinearRegressionModel private[ml] (
private[regression] def findSummaryModelAndPredictionCol(): (LinearRegressionModel, String) = {
$(predictionCol) match {
case "" =>
- val predictionColName = "prediction_" + java.util.UUID.randomUUID.toString()
+ val predictionColName = "prediction_" + java.util.UUID.randomUUID.toString
(copy(ParamMap.empty).setPredictionCol(predictionColName), predictionColName)
case p => (this, p)
}
@@ -550,7 +551,7 @@ class LinearRegressionSummary private[regression] (
@transient private val metrics = new RegressionMetrics(
predictions
- .select(predictionCol, labelCol)
+ .select(col(predictionCol), col(labelCol).cast(DoubleType))
.rdd
.map { case Row(pred: Double, label: Double) => (pred, label) },
!model.getFitIntercept)
@@ -653,7 +654,7 @@ class LinearRegressionSummary private[regression] (
col(model.getWeightCol)).as("wse")).agg(sum(col("wse"))).first().getDouble(0)
}
val sigma2 = rss / degreesOfFreedom
- diagInvAtWA.map(_ * sigma2).map(math.sqrt(_))
+ diagInvAtWA.map(_ * sigma2).map(math.sqrt)
}
}
@@ -826,7 +827,7 @@ private class LeastSquaresAggregator(
instance match { case Instance(label, weight, features) =>
require(dim == features.size, s"Dimensions mismatch when adding new sample." +
s" Expecting $dim but got ${features.size}.")
- require(weight >= 0.0, s"instance weight, ${weight} has to be >= 0.0")
+ require(weight >= 0.0, s"instance weight, $weight has to be >= 0.0")
if (weight == 0.0) return this
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
index 76021ad8f4..334410c962 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/SchemaUtils.scala
@@ -17,7 +17,7 @@
package org.apache.spark.ml.util
-import org.apache.spark.sql.types.{DataType, StructField, StructType}
+import org.apache.spark.sql.types.{DataType, NumericType, StructField, StructType}
/**
@@ -44,10 +44,10 @@ private[spark] object SchemaUtils {
}
/**
- * Check whether the given schema contains a column of one of the require data types.
- * @param colName column name
- * @param dataTypes required column data types
- */
+ * Check whether the given schema contains a column of one of the required data types.
+ * @param colName column name
+ * @param dataTypes required column data types
+ */
def checkColumnTypes(
schema: StructType,
colName: String,
@@ -61,6 +61,20 @@ private[spark] object SchemaUtils {
}
/**
+ * Check whether the given schema contains a column of a numeric data type.
+ * @param colName column name
+ */
+ def checkNumericType(
+ schema: StructType,
+ colName: String,
+ msg: String = ""): Unit = {
+ val actualDataType = schema(colName).dataType
+ val message = if (msg != null && msg.trim.length > 0) " " + msg else ""
+ require(actualDataType.isInstanceOf[NumericType], s"Column $colName must be of type " +
+ s"NumericType but was actually of type $actualDataType.$message")
+ }
+
+ /**
* Appends a new column to the input schema. This fails if the given output column already exists.
* @param schema input schema
* @param colName new column name. If this column name is an empty string "", this method returns
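For reference, a small sketch of the new helper in use (callable only from Spark's own packages, since SchemaUtils is private[spark]; the schemas here are illustrative):

    import org.apache.spark.ml.util.SchemaUtils
    import org.apache.spark.sql.types._

    val ok = StructType(Seq(StructField("label", IntegerType)))
    SchemaUtils.checkNumericType(ok, "label") // passes: IntegerType is a NumericType

    val bad = StructType(Seq(StructField("label", StringType)))
    // Throws IllegalArgumentException: "Column label must be of type NumericType
    // but was actually of type StringType."
    SchemaUtils.checkNumericType(bad, "label")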
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
index 2b07524815..fe839e15e9 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
@@ -27,8 +27,7 @@ import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree, DecisionTreeSuite => OldDecisionTreeSuite}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.DataFrame
-import org.apache.spark.sql.Row
+import org.apache.spark.sql.{DataFrame, Row}
class DecisionTreeClassifierSuite
extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
@@ -176,7 +175,7 @@ class DecisionTreeClassifierSuite
}
test("Multiclass classification tree with 10-ary (ordered) categorical features," +
- " with just enough bins") {
+ " with just enough bins") {
val rdd = categoricalDataPointsForMulticlassForOrderedFeaturesRDD
val dt = new DecisionTreeClassifier()
.setImpurity("Gini")
@@ -273,7 +272,7 @@ class DecisionTreeClassifierSuite
))
val df = TreeTests.setMetadata(data, Map(0 -> 1), 2)
val dt = new DecisionTreeClassifier().setMaxDepth(3)
- val model = dt.fit(df)
+ dt.fit(df)
}
test("Use soft prediction for binary classification with ordered categorical features") {
@@ -335,6 +334,14 @@ class DecisionTreeClassifierSuite
assert(importances.toArray.forall(_ >= 0.0))
}
+ test("should support all NumericType labels and not support other types") {
+ val dt = new DecisionTreeClassifier().setMaxDepth(1)
+ MLTestingUtils.checkNumericTypes[DecisionTreeClassificationModel, DecisionTreeClassifier](
+ dt, isClassification = true, sqlContext) { (expected, actual) =>
+ TreeTests.checkEqual(expected, actual)
+ }
+ }
+
/////////////////////////////////////////////////////////////////////////////
// Tests of model save/load
/////////////////////////////////////////////////////////////////////////////
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
index bf7481e8a3..76d8c9372e 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala
@@ -31,7 +31,6 @@ import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.util.Utils
-
/**
* Test suite for [[GBTClassifier]].
*/
@@ -102,6 +101,14 @@ class GBTClassifierSuite extends SparkFunSuite with MLlibTestSparkContext {
Utils.deleteRecursively(tempDir)
}
+ test("should support all NumericType labels and not support other types") {
+ val gbt = new GBTClassifier().setMaxDepth(1)
+ MLTestingUtils.checkNumericTypes[GBTClassificationModel, GBTClassifier](
+ gbt, isClassification = true, sqlContext) { (expected, actual) =>
+ TreeTests.checkEqual(expected, actual)
+ }
+ }
+
// TODO: Reinstate test once runWithValidation is implemented SPARK-7132
/*
test("runWithValidation stops early and performs better on a validation dataset") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index afeeaf7fb5..7eefaf2346 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -103,7 +103,7 @@ class LogisticRegressionSuite
assert(model.hasSummary)
// Validate that we re-insert a probability column for evaluation
val fieldNames = model.summary.predictions.schema.fieldNames
- assert((dataset.schema.fieldNames.toSet).subsetOf(
+ assert(dataset.schema.fieldNames.toSet.subsetOf(
fieldNames.toSet))
assert(fieldNames.exists(s => s.startsWith("probability_")))
}
@@ -934,6 +934,15 @@ class LogisticRegressionSuite
testEstimatorAndModelReadWrite(lr, dataset, LogisticRegressionSuite.allParamSettings,
checkModelData)
}
+
+ test("should support all NumericType labels and not support other types") {
+ val lr = new LogisticRegression().setMaxIter(1)
+ MLTestingUtils.checkNumericTypes[LogisticRegressionModel, LogisticRegression](
+ lr, isClassification = true, sqlContext) { (expected, actual) =>
+ assert(expected.intercept === actual.intercept)
+ assert(expected.coefficients.toArray === actual.coefficients.toArray)
+ }
+ }
}
object LogisticRegressionSuite {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala
index 43781385db..06ff049b48 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala
@@ -19,6 +19,7 @@ package org.apache.spark.ml.classification
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
+import org.apache.spark.ml.util.MLTestingUtils
import org.apache.spark.mllib.classification.LogisticRegressionSuite._
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.evaluation.MulticlassMetrics
@@ -162,4 +163,15 @@ class MultilayerPerceptronClassifierSuite
assert(newMlpModel.layers === mlpModel.layers)
assert(newMlpModel.weights === mlpModel.weights)
}
+
+ test("should support all NumericType labels and not support other types") {
+ val layers = Array(3, 2)
+ val mpc = new MultilayerPerceptronClassifier().setLayers(layers).setMaxIter(1)
+ MLTestingUtils.checkNumericTypes[
+ MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier](
+ mpc, isClassification = true, sqlContext) { (expected, actual) =>
+ assert(expected.layers === actual.layers)
+ assert(expected.weights === actual.weights)
+ }
+ }
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
index 082a6bcd21..4727cd436f 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
@@ -21,7 +21,7 @@ import breeze.linalg.{Vector => BV}
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
-import org.apache.spark.ml.util.DefaultReadWriteTest
+import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.classification.NaiveBayes.{Bernoulli, Multinomial}
import org.apache.spark.mllib.classification.NaiveBayesSuite._
import org.apache.spark.mllib.linalg._
@@ -86,7 +86,7 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
model: NaiveBayesModel,
modelType: String): Unit = {
featureAndProbabilities.collect().foreach {
- case Row(features: Vector, probability: Vector) => {
+ case Row(features: Vector, probability: Vector) =>
assert(probability.toArray.sum ~== 1.0 relTol 1.0e-10)
val expected = modelType match {
case Multinomial =>
@@ -97,7 +97,6 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
throw new UnknownError(s"Invalid modelType: $modelType.")
}
assert(probability ~== expected relTol 1.0e-10)
- }
}
}
@@ -185,6 +184,15 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
val nb = new NaiveBayes()
testEstimatorAndModelReadWrite(nb, dataset, NaiveBayesSuite.allParamSettings, checkModelData)
}
+
+ test("should support all NumericType labels and not support other types") {
+ val nb = new NaiveBayes()
+ MLTestingUtils.checkNumericTypes[NaiveBayesModel, NaiveBayes](
+ nb, isClassification = true, sqlContext) { (expected, actual) =>
+ assert(expected.pi === actual.pi)
+ assert(expected.theta === actual.theta)
+ }
+ }
}
object NaiveBayesSuite {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
index 51c1baf682..4131396726 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
@@ -74,7 +74,7 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext with Defau
// copied model must have the same parent.
MLTestingUtils.checkCopy(ovaModel)
- assert(ovaModel.models.size === numClasses)
+ assert(ovaModel.models.length === numClasses)
val transformedDataset = ovaModel.transform(dataset)
// check for label metadata in prediction col
@@ -224,6 +224,20 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext with Defau
val newOvaModel = testDefaultReadWrite(ovaModel, testParams = false)
checkModelData(ovaModel, newOvaModel)
}
+
+ test("should support all NumericType labels and not support other types") {
+ val ovr = new OneVsRest().setClassifier(new LogisticRegression().setMaxIter(1))
+ MLTestingUtils.checkNumericTypes[OneVsRestModel, OneVsRest](
+ ovr, isClassification = true, sqlContext) { (expected, actual) =>
+ val expectedModels = expected.models.map(m => m.asInstanceOf[LogisticRegressionModel])
+ val actualModels = actual.models.map(m => m.asInstanceOf[LogisticRegressionModel])
+ assert(expectedModels.length === actualModels.length)
+ expectedModels.zip(actualModels).foreach { case (e, a) =>
+ assert(e.intercept === a.intercept)
+ assert(e.coefficients.toArray === a.coefficients.toArray)
+ }
+ }
+ }
}
private class MockLogisticRegression(uid: String) extends LogisticRegression(uid) {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
index b896099e31..052bc83c38 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala
@@ -178,6 +178,14 @@ class RandomForestClassifierSuite extends SparkFunSuite with MLlibTestSparkConte
assert(importances.toArray.forall(_ >= 0.0))
}
+ test("should support all NumericType labels and not support other types") {
+ val rf = new RandomForestClassifier().setMaxDepth(1)
+ MLTestingUtils.checkNumericTypes[RandomForestClassificationModel, RandomForestClassifier](
+ rf, isClassification = true, sqlContext) { (expected, actual) =>
+ TreeTests.checkEqual(expected, actual)
+ }
+ }
+
/////////////////////////////////////////////////////////////////////////////
// Tests of model save/load
/////////////////////////////////////////////////////////////////////////////
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala
index dbd752d2aa..f4844cc671 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala
@@ -347,6 +347,15 @@ class AFTSurvivalRegressionSuite
}
}
+ test("should support all NumericType labels") {
+ val aft = new AFTSurvivalRegression().setMaxIter(1)
+ MLTestingUtils.checkNumericTypes[AFTSurvivalRegressionModel, AFTSurvivalRegression](
+ aft, isClassification = false, sqlContext) { (expected, actual) =>
+ assert(expected.intercept === actual.intercept)
+ assert(expected.coefficients === actual.coefficients)
+ }
+ }
+
test("read/write") {
def checkModelData(
model: AFTSurvivalRegressionModel,
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala
index 662e3fc679..e9fb2677b2 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala
@@ -117,6 +117,14 @@ class DecisionTreeRegressorSuite
assert(importances.toArray.forall(_ >= 0.0))
}
+ test("should support all NumericType labels and not support other types") {
+ val dt = new DecisionTreeRegressor().setMaxDepth(1)
+ MLTestingUtils.checkNumericTypes[DecisionTreeRegressionModel, DecisionTreeRegressor](
+ dt, isClassification = false, sqlContext) { (expected, actual) =>
+ TreeTests.checkEqual(expected, actual)
+ }
+ }
+
/////////////////////////////////////////////////////////////////////////////
// Tests of model save/load
/////////////////////////////////////////////////////////////////////////////
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
index dfb8418086..914818f41f 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala
@@ -29,7 +29,6 @@ import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.util.Utils
-
/**
* Test suite for [[GBTRegressor]].
*/
@@ -110,7 +109,14 @@ class GBTRegressorSuite extends SparkFunSuite with MLlibTestSparkContext {
sc.checkpointDir = None
Utils.deleteRecursively(tempDir)
+ }
+ test("should support all NumericType labels and not support other types") {
+ val gbt = new GBTRegressor().setMaxDepth(1)
+ MLTestingUtils.checkNumericTypes[GBTRegressionModel, GBTRegressor](
+ gbt, isClassification = false, sqlContext) { (expected, actual) =>
+ TreeTests.checkEqual(expected, actual)
+ }
}
// TODO: Reinstate test once runWithValidation is implemented SPARK-7132
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
index 4ebdbf2213..2265464b51 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
@@ -982,6 +982,16 @@ class GeneralizedLinearRegressionSuite
testEstimatorAndModelReadWrite(glr, datasetPoissonLog,
GeneralizedLinearRegressionSuite.allParamSettings, checkModelData)
}
+
+ test("should support all NumericType labels and not support other types") {
+ val glr = new GeneralizedLinearRegression().setMaxIter(1)
+ MLTestingUtils.checkNumericTypes[
+ GeneralizedLinearRegressionModel, GeneralizedLinearRegression](
+ glr, isClassification = false, sqlContext) { (expected, actual) =>
+ assert(expected.intercept === actual.intercept)
+ assert(expected.coefficients === actual.coefficients)
+ }
+ }
}
object GeneralizedLinearRegressionSuite {
@@ -1023,7 +1033,7 @@ object GeneralizedLinearRegressionSuite {
generator.setSeed(seed)
(0 until nPoints).map { _ =>
- val features = Vectors.dense(coefficients.indices.map { rndElement(_) }.toArray)
+ val features = Vectors.dense(coefficients.indices.map(rndElement).toArray)
val eta = BLAS.dot(Vectors.dense(coefficients), features) + intercept
val mu = link match {
case "identity" => eta
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala
index b8874b4cd3..3a10ad7ed0 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/IsotonicRegressionSuite.scala
@@ -180,6 +180,15 @@ class IsotonicRegressionSuite
testEstimatorAndModelReadWrite(ir, dataset, IsotonicRegressionSuite.allParamSettings,
checkModelData)
}
+
+ test("should support all NumericType labels and not support other types") {
+ val ir = new IsotonicRegression()
+ MLTestingUtils.checkNumericTypes[IsotonicRegressionModel, IsotonicRegression](
+ ir, isClassification = false, sqlContext) { (expected, actual) =>
+ assert(expected.boundaries === actual.boundaries)
+ assert(expected.predictions === actual.predictions)
+ }
+ }
}
object IsotonicRegressionSuite {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index bd45d21e8d..cccb7f8d1b 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -61,9 +61,9 @@ class LinearRegressionSuite
val featureSize = 4100
datasetWithSparseFeature = sqlContext.createDataFrame(
sc.parallelize(LinearDataGenerator.generateLinearInput(
- intercept = 0.0, weights = Seq.fill(featureSize)(r.nextDouble).toArray,
- xMean = Seq.fill(featureSize)(r.nextDouble).toArray,
- xVariance = Seq.fill(featureSize)(r.nextDouble).toArray, nPoints = 200,
+ intercept = 0.0, weights = Seq.fill(featureSize)(r.nextDouble()).toArray,
+ xMean = Seq.fill(featureSize)(r.nextDouble()).toArray,
+ xVariance = Seq.fill(featureSize)(r.nextDouble()).toArray, nPoints = 200,
seed, eps = 0.1, sparsity = 0.7), 2))
/*
@@ -687,7 +687,7 @@ class LinearRegressionSuite
// Validate that we re-insert a prediction column for evaluation
val modelNoPredictionColFieldNames
= modelNoPredictionCol.summary.predictions.schema.fieldNames
- assert((datasetWithDenseFeature.schema.fieldNames.toSet).subsetOf(
+ assert(datasetWithDenseFeature.schema.fieldNames.toSet.subsetOf(
modelNoPredictionColFieldNames.toSet))
assert(modelNoPredictionColFieldNames.exists(s => s.startsWith("prediction_")))
@@ -1006,6 +1006,15 @@ class LinearRegressionSuite
testEstimatorAndModelReadWrite(lr, datasetWithWeight, LinearRegressionSuite.allParamSettings,
checkModelData)
}
+
+ test("should support all NumericType labels and not support other types") {
+ val lr = new LinearRegression().setMaxIter(1)
+ MLTestingUtils.checkNumericTypes[LinearRegressionModel, LinearRegression](
+ lr, isClassification = false, sqlContext) { (expected, actual) =>
+ assert(expected.intercept === actual.intercept)
+ assert(expected.coefficients === actual.coefficients)
+ }
+ }
}
object LinearRegressionSuite {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala
index 6be0c8bca0..2ab4f1b146 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala
@@ -94,6 +94,14 @@ class RandomForestRegressorSuite extends SparkFunSuite with MLlibTestSparkContex
assert(importances.toArray.forall(_ >= 0.0))
}
+ test("should support all NumericType labels and not support other types") {
+ val rf = new RandomForestRegressor().setMaxDepth(1)
+ MLTestingUtils.checkNumericTypes[RandomForestRegressionModel, RandomForestRegressor](
+ rf, isClassification = false, sqlContext) { (expected, actual) =>
+ TreeTests.checkEqual(expected, actual)
+ }
+ }
+
/////////////////////////////////////////////////////////////////////////////
// Tests of model save/load
/////////////////////////////////////////////////////////////////////////////
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala
index 12808b0305..bd5bd17147 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreeTests.scala
@@ -74,6 +74,24 @@ private[ml] object TreeTests extends SparkFunSuite {
}
/**
+ * Set label metadata (particularly the number of classes) on a DataFrame.
+ * @param data Dataset. Categorical features and labels must already have 0-based indices.
+ * This must be non-empty.
+ * @param numClasses Number of classes the label can take. If 0, mark as continuous.
+ * @param labelColName Name of the label column on which to set the metadata.
+ * @return DataFrame with metadata
+ */
+ def setMetadata(data: DataFrame, numClasses: Int, labelColName: String): DataFrame = {
+ val labelAttribute = if (numClasses == 0) {
+ NumericAttribute.defaultAttr.withName(labelColName)
+ } else {
+ NominalAttribute.defaultAttr.withName(labelColName).withNumValues(numClasses)
+ }
+ val labelMetadata = labelAttribute.toMetadata()
+ data.select(data("features"), data(labelColName).as(labelColName, labelMetadata))
+ }
+
+ /**
* Check if the two trees are exactly the same.
* Note: I hesitate to override Node.equals since it could cause problems if users
* make mistakes such as creating loops of Nodes.
diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala b/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala
index d290cc9b06..8108460518 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala
@@ -17,14 +17,96 @@
package org.apache.spark.ml.util
-import org.apache.spark.ml.Model
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.param.ParamMap
+import org.apache.spark.ml.tree.impl.TreeTests
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.sql.{DataFrame, SQLContext}
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types._
-object MLTestingUtils {
+object MLTestingUtils extends SparkFunSuite {
def checkCopy(model: Model[_]): Unit = {
val copied = model.copy(ParamMap.empty)
.asInstanceOf[Model[_]]
assert(copied.parent.uid == model.parent.uid)
assert(copied.parent == model.parent)
}
+
+ def checkNumericTypes[M <: Model[M], T <: Estimator[M]](
+ estimator: T,
+ isClassification: Boolean,
+ sqlContext: SQLContext)(check: (M, M) => Unit): Unit = {
+ val dfs = if (isClassification) {
+ genClassifDFWithNumericLabelCol(sqlContext)
+ } else {
+ genRegressionDFWithNumericLabelCol(sqlContext)
+ }
+ val expected = estimator.fit(dfs(DoubleType))
+ val actuals = dfs.keys.filter(_ != DoubleType).map(t => estimator.fit(dfs(t)))
+ actuals.foreach(actual => check(expected, actual))
+
+ val dfWithStringLabels = generateDFWithStringLabelCol(sqlContext)
+ val thrown = intercept[IllegalArgumentException] {
+ estimator.fit(dfWithStringLabels)
+ }
+ assert(thrown.getMessage contains
+ "Column label must be of type NumericType but was actually of type StringType")
+ }
+
+ def genClassifDFWithNumericLabelCol(
+ sqlContext: SQLContext,
+ labelColName: String = "label",
+ featuresColName: String = "features"): Map[NumericType, DataFrame] = {
+ val df = sqlContext.createDataFrame(Seq(
+ (0, Vectors.dense(0, 2, 3)),
+ (1, Vectors.dense(0, 3, 1)),
+ (0, Vectors.dense(0, 2, 2)),
+ (1, Vectors.dense(0, 3, 9)),
+ (0, Vectors.dense(0, 2, 6))
+ )).toDF(labelColName, featuresColName)
+
+ val types =
+ Seq(ShortType, LongType, IntegerType, FloatType, ByteType, DoubleType, DecimalType(10, 0))
+ types.map(t => t -> df.select(col(labelColName).cast(t), col(featuresColName)))
+ .map { case (t, d) => t -> TreeTests.setMetadata(d, 2, labelColName) }
+ .toMap
+ }
+
+ def genRegressionDFWithNumericLabelCol(
+ sqlContext: SQLContext,
+ labelColName: String = "label",
+ featuresColName: String = "features",
+ censorColName: String = "censor"): Map[NumericType, DataFrame] = {
+ val df = sqlContext.createDataFrame(Seq(
+ (0, Vectors.dense(0)),
+ (1, Vectors.dense(1)),
+ (2, Vectors.dense(2)),
+ (3, Vectors.dense(3)),
+ (4, Vectors.dense(4))
+ )).toDF(labelColName, featuresColName)
+
+ val types =
+ Seq(ShortType, LongType, IntegerType, FloatType, ByteType, DoubleType, DecimalType(10, 0))
+ types
+ .map(t => t -> df.select(col(labelColName).cast(t), col(featuresColName)))
+ .map { case (t, d) =>
+ t -> TreeTests.setMetadata(d, 0, labelColName).withColumn(censorColName, lit(0.0))
+ }
+ .toMap
+ }
+
+ def generateDFWithStringLabelCol(
+ sqlContext: SQLContext,
+ labelColName: String = "label",
+ featuresColName: String = "features",
+ censorColName: String = "censor"): DataFrame =
+ sqlContext.createDataFrame(Seq(
+ ("0", Vectors.dense(0, 2, 3), 0.0),
+ ("1", Vectors.dense(0, 3, 1), 1.0),
+ ("0", Vectors.dense(0, 2, 2), 0.0),
+ ("1", Vectors.dense(0, 3, 9), 1.0),
+ ("0", Vectors.dense(0, 2, 6), 0.0)
+ )).toDF(labelColName, featuresColName, censorColName)
}