aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
author: sethah <seth.hendrickson16@gmail.com> 2016-11-21 05:36:49 -0800
committer: Yanbo Liang <ybliang8@gmail.com> 2016-11-21 05:36:49 -0800
commit: e811fbf9ed131bccbc46f3c5701c4ff317222fd9 (patch)
tree: 36026581dec2887f946fe15ad50e92ee69c69395 /mllib
parent: 658547974915ebcaae83e13e4c3bdf68d5426fda (diff)
download: spark-e811fbf9ed131bccbc46f3c5701c4ff317222fd9.tar.gz
spark-e811fbf9ed131bccbc46f3c5701c4ff317222fd9.tar.bz2
spark-e811fbf9ed131bccbc46f3c5701c4ff317222fd9.zip
[SPARK-18282][ML][PYSPARK] Add python clustering summaries for GMM and BKM
## What changes were proposed in this pull request?

Add model summary APIs for `GaussianMixtureModel` and `BisectingKMeansModel` in pyspark.

## How was this patch tested?

Unit tests.

Author: sethah <seth.hendrickson16@gmail.com>

Closes #15777 from sethah/pyspark_cluster_summaries.
Diffstat (limited to 'mllib')
-rw-r--r-- mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala | 11
-rw-r--r-- mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala | 9
-rw-r--r-- mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala | 9
-rw-r--r-- mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala | 9
-rw-r--r-- mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala | 11
-rw-r--r-- mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala | 14
-rw-r--r-- mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala | 2
-rw-r--r-- mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala | 3
-rw-r--r-- mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala | 3
-rw-r--r-- mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala | 3
-rw-r--r-- mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala | 2
-rw-r--r-- mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala | 2
12 files changed, 44 insertions, 34 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index f58efd36a1..d07b4adebb 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -648,7 +648,7 @@ class LogisticRegression @Since("1.2.0") (
$(labelCol),
$(featuresCol),
objectiveHistory)
- model.setSummary(logRegSummary)
+ model.setSummary(Some(logRegSummary))
} else {
model
}
@@ -790,9 +790,9 @@ class LogisticRegressionModel private[spark] (
}
}
- private[classification] def setSummary(
- summary: LogisticRegressionTrainingSummary): this.type = {
- this.trainingSummary = Some(summary)
+ private[classification]
+ def setSummary(summary: Option[LogisticRegressionTrainingSummary]): this.type = {
+ this.trainingSummary = summary
this
}
@@ -887,8 +887,7 @@ class LogisticRegressionModel private[spark] (
override def copy(extra: ParamMap): LogisticRegressionModel = {
val newModel = copyValues(new LogisticRegressionModel(uid, coefficientMatrix, interceptVector,
numClasses, isMultinomial), extra)
- if (trainingSummary.isDefined) newModel.setSummary(trainingSummary.get)
- newModel.setParent(parent)
+ newModel.setSummary(trainingSummary).setParent(parent)
}
override protected def raw2prediction(rawPrediction: Vector): Double = {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
index f8a606d60b..e6ca3aedff 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
@@ -95,8 +95,7 @@ class BisectingKMeansModel private[ml] (
@Since("2.0.0")
override def copy(extra: ParamMap): BisectingKMeansModel = {
val copied = copyValues(new BisectingKMeansModel(uid, parentModel), extra)
- if (trainingSummary.isDefined) copied.setSummary(trainingSummary.get)
- copied.setParent(this.parent)
+ copied.setSummary(trainingSummary).setParent(this.parent)
}
@Since("2.0.0")
@@ -132,8 +131,8 @@ class BisectingKMeansModel private[ml] (
private var trainingSummary: Option[BisectingKMeansSummary] = None
- private[clustering] def setSummary(summary: BisectingKMeansSummary): this.type = {
- this.trainingSummary = Some(summary)
+ private[clustering] def setSummary(summary: Option[BisectingKMeansSummary]): this.type = {
+ this.trainingSummary = summary
this
}
@@ -265,7 +264,7 @@ class BisectingKMeans @Since("2.0.0") (
val model = copyValues(new BisectingKMeansModel(uid, parentModel).setParent(this))
val summary = new BisectingKMeansSummary(
model.transform(dataset), $(predictionCol), $(featuresCol), $(k))
- model.setSummary(summary)
+ model.setSummary(Some(summary))
instr.logSuccess(model)
model
}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index c6035cc4c9..92d0b7d085 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -90,8 +90,7 @@ class GaussianMixtureModel private[ml] (
@Since("2.0.0")
override def copy(extra: ParamMap): GaussianMixtureModel = {
val copied = copyValues(new GaussianMixtureModel(uid, weights, gaussians), extra)
- if (trainingSummary.isDefined) copied.setSummary(trainingSummary.get)
- copied.setParent(this.parent)
+ copied.setSummary(trainingSummary).setParent(this.parent)
}
@Since("2.0.0")
@@ -150,8 +149,8 @@ class GaussianMixtureModel private[ml] (
private var trainingSummary: Option[GaussianMixtureSummary] = None
- private[clustering] def setSummary(summary: GaussianMixtureSummary): this.type = {
- this.trainingSummary = Some(summary)
+ private[clustering] def setSummary(summary: Option[GaussianMixtureSummary]): this.type = {
+ this.trainingSummary = summary
this
}
@@ -340,7 +339,7 @@ class GaussianMixture @Since("2.0.0") (
.setParent(this)
val summary = new GaussianMixtureSummary(model.transform(dataset),
$(predictionCol), $(probabilityCol), $(featuresCol), $(k))
- model.setSummary(summary)
+ model.setSummary(Some(summary))
instr.logNumFeatures(model.gaussians.head.mean.size)
instr.logSuccess(model)
model
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
index 26505b4cc1..152bd13b7a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -110,8 +110,7 @@ class KMeansModel private[ml] (
@Since("1.5.0")
override def copy(extra: ParamMap): KMeansModel = {
val copied = copyValues(new KMeansModel(uid, parentModel), extra)
- if (trainingSummary.isDefined) copied.setSummary(trainingSummary.get)
- copied.setParent(this.parent)
+ copied.setSummary(trainingSummary).setParent(this.parent)
}
/** @group setParam */
@@ -165,8 +164,8 @@ class KMeansModel private[ml] (
private var trainingSummary: Option[KMeansSummary] = None
- private[clustering] def setSummary(summary: KMeansSummary): this.type = {
- this.trainingSummary = Some(summary)
+ private[clustering] def setSummary(summary: Option[KMeansSummary]): this.type = {
+ this.trainingSummary = summary
this
}
@@ -325,7 +324,7 @@ class KMeans @Since("1.5.0") (
val model = copyValues(new KMeansModel(uid, parentModel).setParent(this))
val summary = new KMeansSummary(
model.transform(dataset), $(predictionCol), $(featuresCol), $(k))
- model.setSummary(summary)
+ model.setSummary(Some(summary))
instr.logSuccess(model)
model
}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
index 736fd3b9e0..3f9de1fe74 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
@@ -270,7 +270,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
.setParent(this))
val trainingSummary = new GeneralizedLinearRegressionTrainingSummary(dataset, model,
wlsModel.diagInvAtWA.toArray, 1, getSolver)
- return model.setSummary(trainingSummary)
+ return model.setSummary(Some(trainingSummary))
}
// Fit Generalized Linear Model by iteratively reweighted least squares (IRLS).
@@ -284,7 +284,7 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
.setParent(this))
val trainingSummary = new GeneralizedLinearRegressionTrainingSummary(dataset, model,
irlsModel.diagInvAtWA.toArray, irlsModel.numIterations, getSolver)
- model.setSummary(trainingSummary)
+ model.setSummary(Some(trainingSummary))
}
@Since("2.0.0")
@@ -761,8 +761,8 @@ class GeneralizedLinearRegressionModel private[ml] (
def hasSummary: Boolean = trainingSummary.nonEmpty
private[regression]
- def setSummary(summary: GeneralizedLinearRegressionTrainingSummary): this.type = {
- this.trainingSummary = Some(summary)
+ def setSummary(summary: Option[GeneralizedLinearRegressionTrainingSummary]): this.type = {
+ this.trainingSummary = summary
this
}
@@ -778,8 +778,7 @@ class GeneralizedLinearRegressionModel private[ml] (
override def copy(extra: ParamMap): GeneralizedLinearRegressionModel = {
val copied = copyValues(new GeneralizedLinearRegressionModel(uid, coefficients, intercept),
extra)
- if (trainingSummary.isDefined) copied.setSummary(trainingSummary.get)
- copied.setParent(parent)
+ copied.setSummary(trainingSummary).setParent(parent)
}
/**
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index da7ce6b46f..8ea5e1e6c4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -225,7 +225,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
model.diagInvAtWA.toArray,
model.objectiveHistory)
- return lrModel.setSummary(trainingSummary)
+ return lrModel.setSummary(Some(trainingSummary))
}
val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE
@@ -278,7 +278,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
model,
Array(0D),
Array(0D))
- return model.setSummary(trainingSummary)
+ return model.setSummary(Some(trainingSummary))
} else {
require($(regParam) == 0.0, "The standard deviation of the label is zero. " +
"Model cannot be regularized.")
@@ -400,7 +400,7 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
model,
Array(0D),
objectiveHistory)
- model.setSummary(trainingSummary)
+ model.setSummary(Some(trainingSummary))
}
@Since("1.4.0")
@@ -446,8 +446,9 @@ class LinearRegressionModel private[ml] (
throw new SparkException("No training summary available for this LinearRegressionModel")
}
- private[regression] def setSummary(summary: LinearRegressionTrainingSummary): this.type = {
- this.trainingSummary = Some(summary)
+ private[regression]
+ def setSummary(summary: Option[LinearRegressionTrainingSummary]): this.type = {
+ this.trainingSummary = summary
this
}
@@ -490,8 +491,7 @@ class LinearRegressionModel private[ml] (
@Since("1.4.0")
override def copy(extra: ParamMap): LinearRegressionModel = {
val newModel = copyValues(new LinearRegressionModel(uid, coefficients, intercept), extra)
- if (trainingSummary.isDefined) newModel.setSummary(trainingSummary.get)
- newModel.setParent(parent)
+ newModel.setSummary(trainingSummary).setParent(parent)
}
/**
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index 2877285eb4..e360542eae 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -147,6 +147,8 @@ class LogisticRegressionSuite
assert(model.hasSummary)
val copiedModel = model.copy(ParamMap.empty)
assert(copiedModel.hasSummary)
+ model.setSummary(None)
+ assert(!model.hasSummary)
}
test("empty probabilityCol") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
index 49797d938d..fc491cd616 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
@@ -109,6 +109,9 @@ class BisectingKMeansSuite
assert(clusterSizes.length === k)
assert(clusterSizes.sum === numRows)
assert(clusterSizes.forall(_ >= 0))
+
+ model.setSummary(None)
+ assert(!model.hasSummary)
}
test("read/write") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index 7165b63ed3..07299123f8 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -111,6 +111,9 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
assert(clusterSizes.length === k)
assert(clusterSizes.sum === numRows)
assert(clusterSizes.forall(_ >= 0))
+
+ model.setSummary(None)
+ assert(!model.hasSummary)
}
test("read/write") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
index 73972557d2..c1b7242e11 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
@@ -123,6 +123,9 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultR
assert(clusterSizes.length === k)
assert(clusterSizes.sum === numRows)
assert(clusterSizes.forall(_ >= 0))
+
+ model.setSummary(None)
+ assert(!model.hasSummary)
}
test("KMeansModel transform with non-default feature and prediction cols") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
index 6a4ac1735b..9b0fa67630 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
@@ -197,6 +197,8 @@ class GeneralizedLinearRegressionSuite
assert(model.hasSummary)
val copiedModel = model.copy(ParamMap.empty)
assert(copiedModel.hasSummary)
+ model.setSummary(None)
+ assert(!model.hasSummary)
assert(model.getFeaturesCol === "features")
assert(model.getPredictionCol === "prediction")
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index df97d0b2ae..0be82742a3 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -146,6 +146,8 @@ class LinearRegressionSuite
assert(model.hasSummary)
val copiedModel = model.copy(ParamMap.empty)
assert(copiedModel.hasSummary)
+ model.setSummary(None)
+ assert(!model.hasSummary)
model.transform(datasetWithDenseFeature)
.select("label", "prediction")