author     Bryan Cutler <cutlerb@gmail.com>  2017-04-03 10:56:54 +0200
committer  Nick Pentreath <nickp@za.ibm.com>  2017-04-03 10:56:54 +0200
commit     2a903a1eec46e3bd58af0fcbc57e76752d9c18b3 (patch)
tree       9080b7736edf130303043d56700a0f08337e6d64 /mllib
parent     93dbfe705f3e7410a7267e406332ffb3c3077829 (diff)
download   spark-2a903a1eec46e3bd58af0fcbc57e76752d9c18b3.tar.gz
           spark-2a903a1eec46e3bd58af0fcbc57e76752d9c18b3.tar.bz2
           spark-2a903a1eec46e3bd58af0fcbc57e76752d9c18b3.zip
[SPARK-19985][ML] Fixed copy method for some ML Models
## What changes were proposed in this pull request?

Some ML Models were using `defaultCopy`, which expects a default constructor, and others were not setting the parent estimator. This change fixes both issues by creating a new instance of the model and explicitly setting its values and parent.

## How was this patch tested?

Added `MLTestingUtils.checkCopy` to the tests of the offending models to verify that the copy is made and the parent is set.

Author: Bryan Cutler <cutlerb@gmail.com>

Closes #17326 from BryanCutler/ml-model-copy-error-SPARK-19985.
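The fix follows the same pattern in each model: instead of `defaultCopy`, which reflectively requires a no-arg constructor and never sets a parent, the `copy` override constructs the model explicitly, links it back to the estimator that produced it, and copies the params. A minimal sketch of the pattern, excerpted from the `MinHashLSHModel` change in the diff below:

```scala
// Build a new instance with the same uid and learned state, keep the link to
// the parent estimator, then copy any extra params onto the new instance.
@Since("2.1.0")
override def copy(extra: ParamMap): MinHashLSHModel = {
  val copied = new MinHashLSHModel(uid, randCoefficients).setParent(parent)
  copyValues(copied, extra)
}
```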
Diffstat (limited to 'mllib')
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala  3
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSH.scala  5
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala  5
-rw-r--r--  mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala  6
-rw-r--r--  mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala  1
-rw-r--r--  mllib/src/test/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSHSuite.scala  6
-rw-r--r--  mllib/src/test/scala/org/apache/spark/ml/feature/MinHashLSHSuite.scala  11
-rw-r--r--  mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala  1
8 files changed, 30 insertions, 8 deletions
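Each test suite below calls `MLTestingUtils.checkCopy` on a fitted model. As an illustration only (the real helper lives in Spark's ML test utilities, not shown in this diff), a check of this kind amounts to roughly the following sketch:

```scala
import org.apache.spark.ml.Model
import org.apache.spark.ml.param.ParamMap

// Hypothetical approximation of the copy check: copy() must return a model
// that still points at the parent estimator that produced the original.
// A model built via defaultCopy without a matching constructor, or one whose
// copy never calls setParent, fails a check like this.
def checkCopySketch(model: Model[_]): Unit = {
  val copied = model.copy(ParamMap.empty).asInstanceOf[Model[_]]
  assert(copied.parent == model.parent)
  assert(copied.parent.uid == model.parent.uid)
}
```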
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala
index 95c1337ed5..ec39f964e2 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala
@@ -329,7 +329,8 @@ class MultilayerPerceptronClassificationModel private[ml] (
@Since("1.5.0")
override def copy(extra: ParamMap): MultilayerPerceptronClassificationModel = {
- copyValues(new MultilayerPerceptronClassificationModel(uid, layers, weights), extra)
+ val copied = new MultilayerPerceptronClassificationModel(uid, layers, weights).setParent(parent)
+ copyValues(copied, extra)
}
@Since("2.0.0")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSH.scala
index cbac16345a..36a46ca6ff 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSH.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSH.scala
@@ -96,7 +96,10 @@ class BucketedRandomProjectionLSHModel private[ml](
}
@Since("2.1.0")
- override def copy(extra: ParamMap): this.type = defaultCopy(extra)
+ override def copy(extra: ParamMap): BucketedRandomProjectionLSHModel = {
+ val copied = new BucketedRandomProjectionLSHModel(uid, randUnitVectors).setParent(parent)
+ copyValues(copied, extra)
+ }
@Since("2.1.0")
override def write: MLWriter = {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala
index 620e1fbb09..145422a059 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala
@@ -86,7 +86,10 @@ class MinHashLSHModel private[ml](
}
@Since("2.1.0")
- override def copy(extra: ParamMap): this.type = defaultCopy(extra)
+ override def copy(extra: ParamMap): MinHashLSHModel = {
+ val copied = new MinHashLSHModel(uid, randCoefficients).setParent(parent)
+ copyValues(copied, extra)
+ }
@Since("2.1.0")
override def write: MLWriter = new MinHashLSHModel.MinHashLSHModelWriter(this)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
index 389898666e..5a3e2929f5 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -268,8 +268,10 @@ class RFormulaModel private[feature](
}
@Since("1.5.0")
- override def copy(extra: ParamMap): RFormulaModel = copyValues(
- new RFormulaModel(uid, resolvedFormula, pipelineModel))
+ override def copy(extra: ParamMap): RFormulaModel = {
+ val copied = new RFormulaModel(uid, resolvedFormula, pipelineModel).setParent(parent)
+ copyValues(copied, extra)
+ }
@Since("2.0.0")
override def toString: String = s"RFormulaModel($resolvedFormula) (uid=$uid)"
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala
index 41684d92be..7700099caa 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala
@@ -74,6 +74,7 @@ class MultilayerPerceptronClassifierSuite
.setMaxIter(100)
.setSolver("l-bfgs")
val model = trainer.fit(dataset)
+ MLTestingUtils.checkCopy(model)
val result = model.transform(dataset)
val predictionAndLabels = result.select("prediction", "label").collect()
predictionAndLabels.foreach { case Row(p: Double, l: Double) =>
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSHSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSHSuite.scala
index 91eac9e733..cc81da5c66 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSHSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketedRandomProjectionLSHSuite.scala
@@ -23,7 +23,7 @@ import breeze.numerics.constants.Pi
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamsSuite
-import org.apache.spark.ml.util.DefaultReadWriteTest
+import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Dataset
@@ -89,10 +89,12 @@ class BucketedRandomProjectionLSHSuite
.setOutputCol("values")
.setBucketLength(1.0)
.setSeed(12345)
- val unitVectors = brp.fit(dataset).randUnitVectors
+ val brpModel = brp.fit(dataset)
+ val unitVectors = brpModel.randUnitVectors
unitVectors.foreach { v: Vector =>
assert(Vectors.norm(v, 2.0) ~== 1.0 absTol 1e-14)
}
+ MLTestingUtils.checkCopy(brpModel)
}
test("BucketedRandomProjectionLSH: test of LSH property") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashLSHSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashLSHSuite.scala
index a2f009310f..0ddf097a6e 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashLSHSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/MinHashLSHSuite.scala
@@ -20,7 +20,7 @@ package org.apache.spark.ml.feature
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamsSuite
-import org.apache.spark.ml.util.DefaultReadWriteTest
+import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Dataset
@@ -57,6 +57,15 @@ class MinHashLSHSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
testEstimatorAndModelReadWrite(mh, dataset, settings, settings, checkModelData)
}
+ test("Model copy and uid checks") {
+ val mh = new MinHashLSH()
+ .setInputCol("keys")
+ .setOutputCol("values")
+ val model = mh.fit(dataset)
+ assert(mh.uid === model.uid)
+ MLTestingUtils.checkCopy(model)
+ }
+
test("hashFunction") {
val model = new MinHashLSHModel("mh", randCoefficients = Array((0, 1), (1, 2), (3, 0)))
val res = model.hashFunction(Vectors.sparse(10, Seq((2, 1.0), (3, 1.0), (5, 1.0), (7, 1.0))))
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
index c664460d7d..5cfd59e6b8 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
@@ -37,6 +37,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
val formula = new RFormula().setFormula("id ~ v1 + v2")
val original = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")
val model = formula.fit(original)
+ MLTestingUtils.checkCopy(model)
val result = model.transform(original)
val resultSchema = model.transformSchema(original.schema)
val expected = Seq(