author    Joseph K. Bradley <joseph@databricks.com>  2015-05-06 16:15:51 -0700
committer Xiangrui Meng <meng@databricks.com>  2015-05-06 16:15:51 -0700
commit    1ad04dae038673a448f529c39b17817b78d6acd0 (patch)
tree      25476831643512823fd6c9361fbeb040ee7760ec /mllib
parent    77409967008415b7ea57e9349d296350e6519687 (diff)
[SPARK-5995] [ML] Make Prediction dev API public
Changes:
* Update protected prediction methods, following design doc. **<--most interesting change**
* Changed abstract classes for Estimator and Model to be public. Added DeveloperApi tag. (I kept the traits for Estimator/Model Params private.)
* Changed ProbabilisticClassificationModel method names to use probability instead of probabilities.

CC: mengxr shivaram etrain

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #5913 from jkbradley/public-dev-api and squashes the following commits:

e9aa0ea [Joseph K. Bradley] moved findMax to DenseVector and renamed to argmax. fixed bug for vector of length 0
15b9957 [Joseph K. Bradley] renamed probabilities to probability in method names
5cda84d [Joseph K. Bradley] regenerated sharedParams
7d1877a [Joseph K. Bradley] Made spark.ml prediction abstractions public. Organized their prediction methods for efficient computation of multiple output columns.
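As a quick orientation before the diff: once Predictor and PredictionModel are public, a third-party learner only needs to implement train() and predict(), whose signatures appear in the Predictor.scala diff below. The following sketch is hypothetical (MeanRegressor and MeanModel are invented names) and elides Params boilerplate such as uid and copy; it is not a definitive recipe for any particular Spark version.

```scala
// Hypothetical sketch: a constant-prediction learner against the now-public
// Predictor/PredictionModel abstractions. Params boilerplate (uid, copy, setters)
// is elided; train() and predict() follow the signatures shown in the diff below.
import org.apache.spark.ml.{PredictionModel, Predictor}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.DataFrame

class MeanRegressor extends Predictor[Vector, MeanRegressor, MeanModel] {
  // Developers implement train() instead of fit(); the abstraction handles
  // schema validation and copying parameters into the returned model.
  override protected def train(dataset: DataFrame): MeanModel = {
    val mean = dataset.select(getLabelCol).rdd.map(_.getDouble(0)).mean()
    new MeanModel(mean)
  }
}

class MeanModel(val mean: Double) extends PredictionModel[Vector, MeanModel] {
  // predict() backs the default transform(), which outputs predictionCol.
  override protected def predict(features: Vector): Double = mean
}
```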
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/Predictor.scala (renamed from mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala) | 50
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala | 110
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala | 5
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala | 5
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala | 100
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala | 100
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala | 5
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala | 6
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala | 5
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala | 5
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala | 5
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala | 5
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala | 42
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala (renamed from mllib/src/main/scala/org/apache/spark/ml/impl/tree/treeParams.scala) | 4
-rw-r--r--  mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala | 22
16 files changed, 206 insertions, 267 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
index e8b3628140..0e53877de9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala
@@ -15,29 +15,23 @@
* limitations under the License.
*/
-package org.apache.spark.ml.impl.estimator
+package org.apache.spark.ml
-import org.apache.spark.annotation.{AlphaComponent, DeveloperApi}
-import org.apache.spark.ml.{Estimator, Model}
+import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.SchemaUtils
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DataType, DoubleType, StructType}
+import org.apache.spark.sql.{DataFrame, Row}
/**
- * :: DeveloperApi ::
- *
- * Trait for parameters for prediction (regression and classification).
- *
- * NOTE: This is currently private[spark] but will be made public later once it is stabilized.
+ * (private[ml]) Trait for parameters for prediction (regression and classification).
*/
-@DeveloperApi
-private[spark] trait PredictorParams extends Params
+private[ml] trait PredictorParams extends Params
with HasLabelCol with HasFeaturesCol with HasPredictionCol {
/**
@@ -63,7 +57,7 @@ private[spark] trait PredictorParams extends Params
}
/**
- * :: AlphaComponent ::
+ * :: DeveloperApi ::
*
* Abstraction for prediction problems (regression and classification).
*
@@ -73,11 +67,9 @@ private[spark] trait PredictorParams extends Params
* parameter to specify the concrete type.
* @tparam M Specialization of [[PredictionModel]]. If you subclass this type, use this type
* parameter to specify the concrete type for the corresponding model.
- *
- * NOTE: This is currently private[spark] but will be made public later once it is stabilized.
*/
-@AlphaComponent
-private[spark] abstract class Predictor[
+@DeveloperApi
+abstract class Predictor[
FeaturesType,
Learner <: Predictor[FeaturesType, Learner, M],
M <: PredictionModel[FeaturesType, M]]
@@ -104,8 +96,6 @@ private[spark] abstract class Predictor[
}
/**
- * :: DeveloperApi ::
- *
* Train a model using the given dataset and parameters.
* Developers can implement this instead of [[fit()]] to avoid dealing with schema validation
* and copying parameters into the model.
@@ -113,12 +103,9 @@ private[spark] abstract class Predictor[
* @param dataset Training dataset
* @return Fitted model
*/
- @DeveloperApi
protected def train(dataset: DataFrame): M
/**
- * :: DeveloperApi ::
- *
* Returns the SQL DataType corresponding to the FeaturesType type parameter.
*
* This is used by [[validateAndTransformSchema()]].
@@ -126,7 +113,6 @@ private[spark] abstract class Predictor[
*
* The default value is VectorUDT, but it may be overridden if FeaturesType is not Vector.
*/
- @DeveloperApi
protected def featuresDataType: DataType = new VectorUDT
override def transformSchema(schema: StructType): StructType = {
@@ -146,7 +132,7 @@ private[spark] abstract class Predictor[
}
/**
- * :: AlphaComponent ::
+ * :: DeveloperApi ::
*
* Abstraction for a model for prediction tasks (regression and classification).
*
@@ -154,11 +140,9 @@ private[spark] abstract class Predictor[
* E.g., [[org.apache.spark.mllib.linalg.VectorUDT]] for vector features.
* @tparam M Specialization of [[PredictionModel]]. If you subclass this type, use this type
* parameter to specify the concrete type for the corresponding model.
- *
- * NOTE: This is currently private[spark] but will be made public later once it is stabilized.
*/
-@AlphaComponent
-private[spark] abstract class PredictionModel[FeaturesType, M <: PredictionModel[FeaturesType, M]]
+@DeveloperApi
+abstract class PredictionModel[FeaturesType, M <: PredictionModel[FeaturesType, M]]
extends Model[M] with PredictorParams {
/** @group setParam */
@@ -168,8 +152,6 @@ private[spark] abstract class PredictionModel[FeaturesType, M <: PredictionModel
def setPredictionCol(value: String): M = set(predictionCol, value).asInstanceOf[M]
/**
- * :: DeveloperApi ::
- *
* Returns the SQL DataType corresponding to the FeaturesType type parameter.
*
* This is used by [[validateAndTransformSchema()]].
@@ -177,7 +159,6 @@ private[spark] abstract class PredictionModel[FeaturesType, M <: PredictionModel
*
* The default value is VectorUDT, but it may be overridden if FeaturesType is not Vector.
*/
- @DeveloperApi
protected def featuresDataType: DataType = new VectorUDT
override def transformSchema(schema: StructType): StructType = {
@@ -192,12 +173,8 @@ private[spark] abstract class PredictionModel[FeaturesType, M <: PredictionModel
* @return transformed dataset with [[predictionCol]] of type [[Double]]
*/
override def transform(dataset: DataFrame): DataFrame = {
- // This default implementation should be overridden as needed.
-
- // Check schema
transformSchema(dataset.schema, logging = true)
-
- if ($(predictionCol) != "") {
+ if ($(predictionCol).nonEmpty) {
dataset.withColumn($(predictionCol), callUDF(predict _, DoubleType, col($(featuresCol))))
} else {
this.logWarning(s"$uid: Predictor.transform() was called as NOOP" +
@@ -207,11 +184,8 @@ private[spark] abstract class PredictionModel[FeaturesType, M <: PredictionModel
}
/**
- * :: DeveloperApi ::
- *
* Predict label for the given features.
* This internal method is used to implement [[transform()]] and output [[predictionCol]].
*/
- @DeveloperApi
protected def predict(features: FeaturesType): Double
}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala
index d3361e2470..263d580fe2 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala
@@ -17,8 +17,8 @@
package org.apache.spark.ml.classification
-import org.apache.spark.annotation.{AlphaComponent, DeveloperApi}
-import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor, PredictorParams}
+import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.ml.{PredictionModel, PredictorParams, Predictor}
import org.apache.spark.ml.param.shared.HasRawPredictionCol
import org.apache.spark.ml.util.SchemaUtils
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
@@ -26,15 +26,12 @@ import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DataType, DoubleType, StructType}
+
/**
- * :: DeveloperApi ::
- * Params for classification.
- *
- * NOTE: This is currently private[spark] but will be made public later once it is stabilized.
+ * (private[spark]) Params for classification.
*/
-@DeveloperApi
-private[spark] trait ClassifierParams extends PredictorParams
- with HasRawPredictionCol {
+private[spark] trait ClassifierParams
+ extends PredictorParams with HasRawPredictionCol {
override protected def validateAndTransformSchema(
schema: StructType,
@@ -46,23 +43,21 @@ private[spark] trait ClassifierParams extends PredictorParams
}
/**
- * :: AlphaComponent ::
+ * :: DeveloperApi ::
+ *
* Single-label binary or multiclass classification.
* Classes are indexed {0, 1, ..., numClasses - 1}.
*
* @tparam FeaturesType Type of input features. E.g., [[Vector]]
* @tparam E Concrete Estimator type
* @tparam M Concrete Model type
- *
- * NOTE: This is currently private[spark] but will be made public later once it is stabilized.
*/
-@AlphaComponent
-private[spark] abstract class Classifier[
+@DeveloperApi
+abstract class Classifier[
FeaturesType,
E <: Classifier[FeaturesType, E, M],
M <: ClassificationModel[FeaturesType, M]]
- extends Predictor[FeaturesType, E, M]
- with ClassifierParams {
+ extends Predictor[FeaturesType, E, M] with ClassifierParams {
/** @group setParam */
def setRawPredictionCol(value: String): E = set(rawPredictionCol, value).asInstanceOf[E]
@@ -71,17 +66,15 @@ private[spark] abstract class Classifier[
}
/**
- * :: AlphaComponent ::
+ * :: DeveloperApi ::
+ *
* Model produced by a [[Classifier]].
* Classes are indexed {0, 1, ..., numClasses - 1}.
*
* @tparam FeaturesType Type of input features. E.g., [[Vector]]
* @tparam M Concrete Model type
- *
- * NOTE: This is currently private[spark] but will be made public later once it is stabilized.
*/
-@AlphaComponent
-private[spark]
+@DeveloperApi
abstract class ClassificationModel[FeaturesType, M <: ClassificationModel[FeaturesType, M]]
extends PredictionModel[FeaturesType, M] with ClassifierParams {
@@ -101,13 +94,27 @@ abstract class ClassificationModel[FeaturesType, M <: ClassificationModel[Featur
* @return transformed dataset
*/
override def transform(dataset: DataFrame): DataFrame = {
- // This default implementation should be overridden as needed.
-
- // Check schema
transformSchema(dataset.schema, logging = true)
- val (numColsOutput, outputData) =
- ClassificationModel.transformColumnsImpl[FeaturesType](dataset, this)
+ // Output selected columns only.
+ // This is a bit complicated since it tries to avoid repeated computation.
+ var outputData = dataset
+ var numColsOutput = 0
+ if (getRawPredictionCol != "") {
+ outputData = outputData.withColumn(getRawPredictionCol,
+ callUDF(predictRaw _, new VectorUDT, col(getFeaturesCol)))
+ numColsOutput += 1
+ }
+ if (getPredictionCol != "") {
+ val predUDF = if (getRawPredictionCol != "") {
+ callUDF(raw2prediction _, DoubleType, col(getRawPredictionCol))
+ } else {
+ callUDF(predict _, DoubleType, col(getFeaturesCol))
+ }
+ outputData = outputData.withColumn(getPredictionCol, predUDF)
+ numColsOutput += 1
+ }
+
if (numColsOutput == 0) {
logWarning(s"$uid: ClassificationModel.transform() was called as NOOP" +
" since no output columns were set.")
@@ -116,22 +123,17 @@ abstract class ClassificationModel[FeaturesType, M <: ClassificationModel[Featur
}
/**
- * :: DeveloperApi ::
- *
* Predict label for the given features.
* This internal method is used to implement [[transform()]] and output [[predictionCol]].
*
* This default implementation for classification predicts the index of the maximum value
* from [[predictRaw()]].
*/
- @DeveloperApi
override protected def predict(features: FeaturesType): Double = {
- predictRaw(features).toArray.zipWithIndex.maxBy(_._1)._2
+ raw2prediction(predictRaw(features))
}
/**
- * :: DeveloperApi ::
- *
* Raw prediction for each possible label.
* The meaning of a "raw" prediction may vary between algorithms, but it intuitively gives
* a measure of confidence in each possible label (where larger = more confident).
@@ -141,48 +143,12 @@ abstract class ClassificationModel[FeaturesType, M <: ClassificationModel[Featur
* This raw prediction may be any real number, where a larger value indicates greater
* confidence for that label.
*/
- @DeveloperApi
protected def predictRaw(features: FeaturesType): Vector
-}
-
-private[ml] object ClassificationModel {
/**
- * Added prediction column(s). This is separated from [[ClassificationModel.transform()]]
- * since it is used by [[org.apache.spark.ml.classification.ProbabilisticClassificationModel]].
- * @param dataset Input dataset
- * @return (number of columns added, transformed dataset)
+ * Given a vector of raw predictions, select the predicted label.
+ * This may be overridden to support thresholds which favor particular labels.
+ * @return predicted label
*/
- def transformColumnsImpl[FeaturesType](
- dataset: DataFrame,
- model: ClassificationModel[FeaturesType, _]): (Int, DataFrame) = {
-
- // Output selected columns only.
- // This is a bit complicated since it tries to avoid repeated computation.
- var tmpData = dataset
- var numColsOutput = 0
- if (model.getRawPredictionCol != "") {
- // output raw prediction
- val features2raw: FeaturesType => Vector = model.predictRaw
- tmpData = tmpData.withColumn(model.getRawPredictionCol,
- callUDF(features2raw, new VectorUDT, col(model.getFeaturesCol)))
- numColsOutput += 1
- if (model.getPredictionCol != "") {
- val raw2pred: Vector => Double = (rawPred) => {
- rawPred.toArray.zipWithIndex.maxBy(_._1)._2
- }
- tmpData = tmpData.withColumn(model.getPredictionCol,
- callUDF(raw2pred, DoubleType, col(model.getRawPredictionCol)))
- numColsOutput += 1
- }
- } else if (model.getPredictionCol != "") {
- // output prediction
- val features2pred: FeaturesType => Double = model.predict
- tmpData = tmpData.withColumn(model.getPredictionCol,
- callUDF(features2pred, DoubleType, col(model.getFeaturesCol)))
- numColsOutput += 1
- }
- (numColsOutput, tmpData)
- }
-
+ protected def raw2prediction(rawPrediction: Vector): Double = rawPrediction.toDense.argmax
}
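The new raw2prediction default above reduces label selection to an argmax over the raw scores, which is exactly what the deleted zipWithIndex/maxBy logic computed. A standalone illustration (plain Scala, no Spark dependency) of that behavior, including the first-index tie-breaking that DenseVector.argmax preserves:

```scala
// The predicted label is the index of the largest raw score; on ties the
// first maximal index wins (both in maxBy and in DenseVector.argmax).
val raw = Array(0.1, 2.5, 2.5, -1.0)
val prediction = raw.zipWithIndex.maxBy(_._1)._2.toDouble
println(prediction) // 1.0 — labels 1 and 2 tie on raw score, the first wins
```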
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
index 419e5ba05d..dcebea1d4b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
@@ -18,10 +18,9 @@
package org.apache.spark.ml.classification
import org.apache.spark.annotation.AlphaComponent
-import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor}
-import org.apache.spark.ml.impl.tree._
+import org.apache.spark.ml.{PredictionModel, Predictor}
import org.apache.spark.ml.param.ParamMap
-import org.apache.spark.ml.tree.{DecisionTreeModel, Node}
+import org.apache.spark.ml.tree.{TreeClassifierParams, DecisionTreeParams, DecisionTreeModel, Node}
import org.apache.spark.ml.util.MetadataUtils
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
index 534ea95b1c..ae51b05a0c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala
@@ -21,11 +21,10 @@ import com.github.fommil.netlib.BLAS.{getInstance => blas}
import org.apache.spark.Logging
import org.apache.spark.annotation.AlphaComponent
-import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor}
-import org.apache.spark.ml.impl.tree._
+import org.apache.spark.ml.{PredictionModel, Predictor}
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.regression.DecisionTreeRegressionModel
-import org.apache.spark.ml.tree.{DecisionTreeModel, TreeEnsembleModel}
+import org.apache.spark.ml.tree.{GBTParams, TreeClassifierParams, DecisionTreeModel, TreeEnsembleModel}
import org.apache.spark.ml.util.MetadataUtils
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index b73be035e2..550369d18c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -21,9 +21,8 @@ import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
-import org.apache.spark.mllib.linalg.{BLAS, Vector, VectorUDT, Vectors}
+import org.apache.spark.mllib.linalg._
import org.apache.spark.sql.DataFrame
-import org.apache.spark.sql.functions._
import org.apache.spark.storage.StorageLevel
/**
@@ -99,76 +98,17 @@ class LogisticRegressionModel private[ml] (
/** @group setParam */
def setThreshold(value: Double): this.type = set(threshold, value)
+ /** Margin (rawPrediction) for class label 1. For binary classification only. */
private val margin: Vector => Double = (features) => {
BLAS.dot(features, weights) + intercept
}
+ /** Score (probability) for class label 1. For binary classification only. */
private val score: Vector => Double = (features) => {
val m = margin(features)
1.0 / (1.0 + math.exp(-m))
}
- override def transform(dataset: DataFrame): DataFrame = {
- // This is overridden (a) to be more efficient (avoiding re-computing values when creating
- // multiple output columns) and (b) to handle threshold, which the abstractions do not use.
- // TODO: We should abstract away the steps defined by UDFs below so that the abstractions
- // can call whichever UDFs are needed to create the output columns.
-
- // Check schema
- transformSchema(dataset.schema, logging = true)
-
- // Output selected columns only.
- // This is a bit complicated since it tries to avoid repeated computation.
- // rawPrediction (-margin, margin)
- // probability (1.0-score, score)
- // prediction (max margin)
- var tmpData = dataset
- var numColsOutput = 0
- if ($(rawPredictionCol) != "") {
- val features2raw: Vector => Vector = (features) => predictRaw(features)
- tmpData = tmpData.withColumn($(rawPredictionCol),
- callUDF(features2raw, new VectorUDT, col($(featuresCol))))
- numColsOutput += 1
- }
- if ($(probabilityCol) != "") {
- if ($(rawPredictionCol) != "") {
- val raw2prob = udf { (rawPreds: Vector) =>
- val prob1 = 1.0 / (1.0 + math.exp(-rawPreds(1)))
- Vectors.dense(1.0 - prob1, prob1): Vector
- }
- tmpData = tmpData.withColumn($(probabilityCol), raw2prob(col($(rawPredictionCol))))
- } else {
- val features2prob = udf { (features: Vector) => predictProbabilities(features) : Vector }
- tmpData = tmpData.withColumn($(probabilityCol), features2prob(col($(featuresCol))))
- }
- numColsOutput += 1
- }
- if ($(predictionCol) != "") {
- val t = $(threshold)
- if ($(probabilityCol) != "") {
- val predict = udf { probs: Vector =>
- if (probs(1) > t) 1.0 else 0.0
- }
- tmpData = tmpData.withColumn($(predictionCol), predict(col($(probabilityCol))))
- } else if ($(rawPredictionCol) != "") {
- val predict = udf { rawPreds: Vector =>
- val prob1 = 1.0 / (1.0 + math.exp(-rawPreds(1)))
- if (prob1 > t) 1.0 else 0.0
- }
- tmpData = tmpData.withColumn($(predictionCol), predict(col($(rawPredictionCol))))
- } else {
- val predict = udf { features: Vector => this.predict(features) }
- tmpData = tmpData.withColumn($(predictionCol), predict(col($(featuresCol))))
- }
- numColsOutput += 1
- }
- if (numColsOutput == 0) {
- this.logWarning(s"$uid: LogisticRegressionModel.transform() was called as NOOP" +
- " since no output columns were set.")
- }
- tmpData
- }
-
override val numClasses: Int = 2
/**
@@ -179,17 +119,43 @@ class LogisticRegressionModel private[ml] (
if (score(features) > getThreshold) 1 else 0
}
- override protected def predictProbabilities(features: Vector): Vector = {
- val s = score(features)
- Vectors.dense(1.0 - s, s)
+ override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = {
+ rawPrediction match {
+ case dv: DenseVector =>
+ var i = 0
+ while (i < dv.size) {
+ dv.values(i) = 1.0 / (1.0 + math.exp(-dv.values(i)))
+ i += 1
+ }
+ dv
+ case sv: SparseVector =>
+ throw new RuntimeException("Unexpected error in LogisticRegressionModel:" +
+ " raw2probabilitiesInPlace encountered SparseVector")
+ }
}
override protected def predictRaw(features: Vector): Vector = {
val m = margin(features)
- Vectors.dense(0.0, m)
+ Vectors.dense(-m, m)
}
override def copy(extra: ParamMap): LogisticRegressionModel = {
copyValues(new LogisticRegressionModel(parent, weights, intercept), extra)
}
+
+ override protected def raw2prediction(rawPrediction: Vector): Double = {
+ val t = getThreshold
+ val rawThreshold = if (t == 0.0) {
+ Double.NegativeInfinity
+ } else if (t == 1.0) {
+ Double.PositiveInfinity
+ } else {
+ Math.log(t / (1.0 - t))
+ }
+ if (rawPrediction(1) > rawThreshold) 1 else 0
+ }
+
+ override protected def probability2prediction(probability: Vector): Double = {
+ if (probability(1) > getThreshold) 1 else 0
+ }
}
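The rawThreshold computation above rests on a small piece of algebra: the class-1 probability is the sigmoid of the margin, and 1 / (1 + e^(-m)) > t is equivalent to m > log(t / (1 - t)) for t in (0, 1), so thresholding rawPrediction(1) directly avoids computing the probability at all. A standalone check of that equivalence:

```scala
// Thresholding the probability at t is equivalent to thresholding the
// margin (rawPrediction(1)) at log(t / (1 - t)); no Spark needed to verify.
val t = 0.7
val rawThreshold = math.log(t / (1.0 - t))
for (m <- Seq(-2.0, 0.0, 0.8, 0.9, 3.0)) {
  val prob1 = 1.0 / (1.0 + math.exp(-m)) // sigmoid of the margin
  assert((prob1 > t) == (m > rawThreshold)) // the two predicates agree
}
```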
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
index 8519841c5c..330ae2938f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
@@ -17,16 +17,16 @@
package org.apache.spark.ml.classification
-import org.apache.spark.annotation.{AlphaComponent, DeveloperApi}
+import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.SchemaUtils
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
-import org.apache.spark.sql.types.{DataType, StructType}
+import org.apache.spark.sql.types.{DoubleType, DataType, StructType}
/**
- * Params for probabilistic classification.
+ * (private[classification]) Params for probabilistic classification.
*/
private[classification] trait ProbabilisticClassifierParams
extends ClassifierParams with HasProbabilityCol {
@@ -42,17 +42,15 @@ private[classification] trait ProbabilisticClassifierParams
/**
- * :: AlphaComponent ::
+ * :: DeveloperApi ::
*
* Single-label binary or multiclass classifier which can output class conditional probabilities.
*
* @tparam FeaturesType Type of input features. E.g., [[Vector]]
* @tparam E Concrete Estimator type
* @tparam M Concrete Model type
- *
- * NOTE: This is currently private[spark] but will be made public later once it is stabilized.
*/
-@AlphaComponent
+@DeveloperApi
private[spark] abstract class ProbabilisticClassifier[
FeaturesType,
E <: ProbabilisticClassifier[FeaturesType, E, M],
@@ -65,17 +63,15 @@ private[spark] abstract class ProbabilisticClassifier[
/**
- * :: AlphaComponent ::
+ * :: DeveloperApi ::
*
* Model produced by a [[ProbabilisticClassifier]].
* Classes are indexed {0, 1, ..., numClasses - 1}.
*
* @tparam FeaturesType Type of input features. E.g., [[Vector]]
* @tparam M Concrete Model type
- *
- * NOTE: This is currently private[spark] but will be made public later once it is stabilized.
*/
-@AlphaComponent
+@DeveloperApi
private[spark] abstract class ProbabilisticClassificationModel[
FeaturesType,
M <: ProbabilisticClassificationModel[FeaturesType, M]]
@@ -95,39 +91,79 @@ private[spark] abstract class ProbabilisticClassificationModel[
* @return transformed dataset
*/
override def transform(dataset: DataFrame): DataFrame = {
- // This default implementation should be overridden as needed.
-
- // Check schema
transformSchema(dataset.schema, logging = true)
- val (numColsOutput, outputData) =
- ClassificationModel.transformColumnsImpl[FeaturesType](dataset, this)
-
// Output selected columns only.
- if ($(probabilityCol) != "") {
- // output probabilities
- outputData.withColumn($(probabilityCol),
- callUDF(predictProbabilities _, new VectorUDT, col($(featuresCol))))
- } else {
- if (numColsOutput == 0) {
- this.logWarning(s"$uid: ProbabilisticClassificationModel.transform() was called as NOOP" +
- " since no output columns were set.")
+ // This is a bit complicated since it tries to avoid repeated computation.
+ var outputData = dataset
+ var numColsOutput = 0
+ if ($(rawPredictionCol).nonEmpty) {
+ outputData = outputData.withColumn(getRawPredictionCol,
+ callUDF(predictRaw _, new VectorUDT, col(getFeaturesCol)))
+ numColsOutput += 1
+ }
+ if ($(probabilityCol).nonEmpty) {
+ val probUDF = if ($(rawPredictionCol).nonEmpty) {
+ callUDF(raw2probability _, new VectorUDT, col($(rawPredictionCol)))
+ } else {
+ callUDF(predictProbability _, new VectorUDT, col($(featuresCol)))
+ }
+ outputData = outputData.withColumn($(probabilityCol), probUDF)
+ numColsOutput += 1
+ }
+ if ($(predictionCol).nonEmpty) {
+ val predUDF = if ($(rawPredictionCol).nonEmpty) {
+ callUDF(raw2prediction _, DoubleType, col($(rawPredictionCol)))
+ } else if ($(probabilityCol).nonEmpty) {
+ callUDF(probability2prediction _, DoubleType, col($(probabilityCol)))
+ } else {
+ callUDF(predict _, DoubleType, col($(featuresCol)))
}
- outputData
+ outputData = outputData.withColumn($(predictionCol), predUDF)
+ numColsOutput += 1
+ }
+
+ if (numColsOutput == 0) {
+ this.logWarning(s"$uid: ProbabilisticClassificationModel.transform() was called as NOOP" +
+ " since no output columns were set.")
}
+ outputData
}
/**
- * :: DeveloperApi ::
+ * Estimate the probability of each class given the raw prediction,
+ * doing the computation in-place.
+ * These predictions are also called class conditional probabilities.
+ *
+ * This internal method is used to implement [[transform()]] and output [[probabilityCol]].
*
+ * @return Estimated class conditional probabilities (modified input vector)
+ */
+ protected def raw2probabilityInPlace(rawPrediction: Vector): Vector
+
+ /** Non-in-place version of [[raw2probabilityInPlace()]] */
+ protected def raw2probability(rawPrediction: Vector): Vector = {
+ val probs = rawPrediction.copy
+ raw2probabilityInPlace(probs)
+ }
+
+ /**
* Predict the probability of each class given the features.
* These predictions are also called class conditional probabilities.
*
- * WARNING: Not all models output well-calibrated probability estimates! These probabilities
- * should be treated as confidences, not precise probabilities.
- *
* This internal method is used to implement [[transform()]] and output [[probabilityCol]].
+ *
+ * @return Estimated class conditional probabilities
+ */
+ protected def predictProbability(features: FeaturesType): Vector = {
+ val rawPreds = predictRaw(features)
+ raw2probabilityInPlace(rawPreds)
+ }
+
+ /**
+ * Given a vector of class conditional probabilities, select the predicted label.
+ * This may be overridden to support thresholds which favor particular labels.
+ * @return predicted label
*/
- @DeveloperApi
- protected def predictProbabilities(features: FeaturesType): Vector
+ protected def probability2prediction(probability: Vector): Double = probability.toDense.argmax
}
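The split between raw2probabilityInPlace and raw2probability keeps a single mutating kernel while protecting callers from mutation of their rawPrediction vectors. A minimal sketch of that contract, using plain arrays in place of mllib Vectors and the binary logistic sigmoid from the previous file as the example transform:

```scala
// Sketch of the in-place vs. copying contract (arrays stand in for Vectors):
// the in-place variant may overwrite its argument, while the copying variant
// clones first so the caller's rawPrediction survives unmodified.
def raw2probabilityInPlace(raw: Array[Double]): Array[Double] = {
  var i = 0
  while (i < raw.length) {
    raw(i) = 1.0 / (1.0 + math.exp(-raw(i))) // logistic case; overwrites input
    i += 1
  }
  raw
}

def raw2probability(raw: Array[Double]): Array[Double] =
  raw2probabilityInPlace(raw.clone()) // copy, then transform in place
```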
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
index 17f59bb42e..9954893f14 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -20,10 +20,9 @@ package org.apache.spark.ml.classification
import scala.collection.mutable
import org.apache.spark.annotation.AlphaComponent
-import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor}
-import org.apache.spark.ml.impl.tree._
+import org.apache.spark.ml.{PredictionModel, Predictor}
import org.apache.spark.ml.param.ParamMap
-import org.apache.spark.ml.tree.{DecisionTreeModel, TreeEnsembleModel}
+import org.apache.spark.ml.tree.{RandomForestParams, TreeClassifierParams, DecisionTreeModel, TreeEnsembleModel}
import org.apache.spark.ml.util.MetadataUtils
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
index d379172e0b..0e1ff97a8b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
@@ -40,8 +40,10 @@ private[shared] object SharedParamsCodeGen {
ParamDesc[String]("predictionCol", "prediction column name", Some("\"prediction\"")),
ParamDesc[String]("rawPredictionCol", "raw prediction (a.k.a. confidence) column name",
Some("\"rawPrediction\"")),
- ParamDesc[String]("probabilityCol",
- "column name for predicted class conditional probabilities", Some("\"probability\"")),
+ ParamDesc[String]("probabilityCol", "Column name for predicted class conditional" +
+ " probabilities. Note: Not all models output well-calibrated probability estimates!" +
+ " These probabilities should be treated as confidences, not precise probabilities.",
+ Some("\"probability\"")),
ParamDesc[Double]("threshold",
"threshold in binary classification prediction, in range [0, 1]",
isValid = "ParamValidators.inRange(0, 1)"),
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
index fb1874ccfc..87f86807c3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
@@ -128,10 +128,10 @@ private[ml] trait HasRawPredictionCol extends Params {
private[ml] trait HasProbabilityCol extends Params {
/**
- * Param for column name for predicted class conditional probabilities.
+ * Param for Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities..
* @group param
*/
- final val probabilityCol: Param[String] = new Param[String](this, "probabilityCol", "column name for predicted class conditional probabilities")
+ final val probabilityCol: Param[String] = new Param[String](this, "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.")
setDefault(probabilityCol, "probability")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
index b07c26fe79..f8f0b161a4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
@@ -18,10 +18,9 @@
package org.apache.spark.ml.regression
import org.apache.spark.annotation.AlphaComponent
-import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor}
-import org.apache.spark.ml.impl.tree._
+import org.apache.spark.ml.{PredictionModel, Predictor}
import org.apache.spark.ml.param.ParamMap
-import org.apache.spark.ml.tree.{DecisionTreeModel, Node}
+import org.apache.spark.ml.tree.{TreeRegressorParams, DecisionTreeParams, DecisionTreeModel, Node}
import org.apache.spark.ml.util.MetadataUtils
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
index bc796958e4..461905c127 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala
@@ -21,10 +21,9 @@ import com.github.fommil.netlib.BLAS.{getInstance => blas}
import org.apache.spark.Logging
import org.apache.spark.annotation.AlphaComponent
-import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor}
-import org.apache.spark.ml.impl.tree._
+import org.apache.spark.ml.{PredictionModel, Predictor}
import org.apache.spark.ml.param.{Param, ParamMap}
-import org.apache.spark.ml.tree.{DecisionTreeModel, TreeEnsembleModel}
+import org.apache.spark.ml.tree.{GBTParams, TreeRegressorParams, DecisionTreeModel, TreeEnsembleModel}
import org.apache.spark.ml.util.MetadataUtils
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 66c475f2d9..e63c9a3eea 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -25,6 +25,7 @@ import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS,
import org.apache.spark.Logging
import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.ml.PredictorParams
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasElasticNetParam, HasMaxIter, HasRegParam, HasTol}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
@@ -39,7 +40,7 @@ import org.apache.spark.util.StatCounter
/**
* Params for linear regression.
*/
-private[regression] trait LinearRegressionParams extends RegressorParams
+private[regression] trait LinearRegressionParams extends PredictorParams
with HasRegParam with HasElasticNetParam with HasMaxIter with HasTol
/**
@@ -240,7 +241,7 @@ class LinearRegressionModel private[ml] (
* + \bar{y} / \hat{y}||^2
* = 1/2n ||\sum_i w_i^\prime x_i - y / \hat{y} + offset||^2 = 1/2n diff^2
* }}}
- * where w_i^\prime is the effective weights defined by w_i/\hat{x_i}, offset is
+ * where w_i^\prime^ is the effective weights defined by w_i/\hat{x_i}, offset is
* {{{
* - \sum_i (w_i/\hat{x_i})\bar{x_i} + \bar{y} / \hat{y}.
* }}}, and diff is
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
index 0468a1be1b..dbc6289274 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala
@@ -18,10 +18,9 @@
package org.apache.spark.ml.regression
import org.apache.spark.annotation.AlphaComponent
-import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor}
-import org.apache.spark.ml.impl.tree.{RandomForestParams, TreeRegressorParams}
+import org.apache.spark.ml.{PredictionModel, Predictor}
import org.apache.spark.ml.param.ParamMap
-import org.apache.spark.ml.tree.{DecisionTreeModel, TreeEnsembleModel}
+import org.apache.spark.ml.tree.{RandomForestParams, TreeRegressorParams, DecisionTreeModel, TreeEnsembleModel}
import org.apache.spark.ml.util.MetadataUtils
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala
index c6b3327db6..c72ef29680 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala
@@ -17,62 +17,40 @@
package org.apache.spark.ml.regression
-import org.apache.spark.annotation.{AlphaComponent, DeveloperApi}
-import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor, PredictorParams}
+import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.ml.{PredictionModel, PredictorParams, Predictor}
-/**
- * :: DeveloperApi ::
- * Params for regression.
- * Currently empty, but may add functionality later.
- *
- * NOTE: This is currently private[spark] but will be made public later once it is stabilized.
- */
-@DeveloperApi
-private[spark] trait RegressorParams extends PredictorParams
/**
- * :: AlphaComponent ::
+ * :: DeveloperApi ::
*
* Single-label regression
*
* @tparam FeaturesType Type of input features. E.g., [[org.apache.spark.mllib.linalg.Vector]]
* @tparam Learner Concrete Estimator type
* @tparam M Concrete Model type
- *
- * NOTE: This is currently private[spark] but will be made public later once it is stabilized.
*/
-@AlphaComponent
+@DeveloperApi
private[spark] abstract class Regressor[
FeaturesType,
Learner <: Regressor[FeaturesType, Learner, M],
M <: RegressionModel[FeaturesType, M]]
- extends Predictor[FeaturesType, Learner, M]
- with RegressorParams {
+ extends Predictor[FeaturesType, Learner, M] with PredictorParams {
// TODO: defaultEvaluator (follow-up PR)
}
/**
- * :: AlphaComponent ::
+ * :: DeveloperApi ::
*
* Model produced by a [[Regressor]].
*
* @tparam FeaturesType Type of input features. E.g., [[org.apache.spark.mllib.linalg.Vector]]
* @tparam M Concrete Model type.
- *
- * NOTE: This is currently private[spark] but will be made public later once it is stabilized.
*/
-@AlphaComponent
-private[spark] abstract class RegressionModel[FeaturesType, M <: RegressionModel[FeaturesType, M]]
- extends PredictionModel[FeaturesType, M] with RegressorParams {
-
- /**
- * :: DeveloperApi ::
- *
- * Predict real-valued label for the given features.
- * This internal method is used to implement [[transform()]] and output [[predictionCol]].
- */
- @DeveloperApi
- protected def predict(features: FeaturesType): Double
+@DeveloperApi
+abstract class RegressionModel[FeaturesType, M <: RegressionModel[FeaturesType, M]]
+ extends PredictionModel[FeaturesType, M] with PredictorParams {
+ // TODO: defaultEvaluator (follow-up PR)
}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/impl/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
index 0e225627d4..816fcedf2e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/impl/tree/treeParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
@@ -15,10 +15,10 @@
* limitations under the License.
*/
-package org.apache.spark.ml.impl.tree
+package org.apache.spark.ml.tree
import org.apache.spark.annotation.DeveloperApi
-import org.apache.spark.ml.impl.estimator.PredictorParams
+import org.apache.spark.ml.PredictorParams
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasMaxIter, HasSeed}
import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo, BoostingStrategy => OldBoostingStrategy, Strategy => OldStrategy}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index 188d1e542b..f6bcdf83cd 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -587,6 +587,28 @@ class DenseVector(val values: Array[Double]) extends Vector {
}
new SparseVector(size, ii, vv)
}
+
+ /**
+ * Find the index of a maximal element. Returns the first maximal element in case of a tie.
+ * Returns -1 if vector has length 0.
+ */
+ private[spark] def argmax: Int = {
+ if (size == 0) {
+ -1
+ } else {
+ var maxIdx = 0
+ var maxValue = values(0)
+ var i = 1
+ while (i < size) {
+ if (values(i) > maxValue) {
+ maxIdx = i
+ maxValue = values(i)
+ }
+ i += 1
+ }
+ maxIdx
+ }
+ }
}
object DenseVector {
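Since the new argmax is private[spark], it cannot be exercised from user code; the standalone mirror below reproduces its semantics to demonstrate the tie-breaking rule and the length-0 fix mentioned in commit e9aa0ea of the commit log:

```scala
// Standalone mirror of DenseVector.argmax semantics: the first maximal index
// wins on ties, and a length-0 input yields -1 instead of failing.
def argmax(values: Array[Double]): Int =
  if (values.isEmpty) -1
  else {
    var maxIdx = 0
    var i = 1
    while (i < values.length) {
      if (values(i) > values(maxIdx)) maxIdx = i
      i += 1
    }
    maxIdx
  }

assert(argmax(Array(1.0, 3.0, 3.0, 2.0)) == 1) // tie: first maximal element
assert(argmax(Array.empty[Double]) == -1)      // length-0 vector
```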