aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/ml/tests.py
diff options
context:
space:
mode:
authorTakuya Kuwahara <taakuu19@gmail.com>2016-05-18 08:29:47 +0200
committerNick Pentreath <nickp@za.ibm.com>2016-05-18 08:29:47 +0200
commit411c04adb596c514f2634efd5f5d126e12b05df7 (patch)
tree80c10aa79cebc920cb9481f5b4bdd3c866e23313 /python/pyspark/ml/tests.py
parent2a5db9c140b9d60a5ec91018be19bec7b80850ee (diff)
downloadspark-411c04adb596c514f2634efd5f5d126e12b05df7.tar.gz
spark-411c04adb596c514f2634efd5f5d126e12b05df7.tar.bz2
spark-411c04adb596c514f2634efd5f5d126e12b05df7.zip
[SPARK-14978][PYSPARK] PySpark TrainValidationSplitModel should support validationMetrics
## What changes were proposed in this pull request? This pull request adds support for `validationMetrics` to `TrainValidationSplitModel` in PySpark, along with tests for it. ## How was this patch tested? Tested with the new and updated unit tests in `python/pyspark/ml/tests.py`. Author: Takuya Kuwahara <taakuu19@gmail.com> Closes #12767 from taku-k/spark-14978.
Diffstat (limited to 'python/pyspark/ml/tests.py')
-rwxr-xr-xpython/pyspark/ml/tests.py50
1 file changed, 44 insertions, 6 deletions
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index e3511120bd..a7c93ac802 100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -610,17 +610,21 @@ class TrainValidationSplitTests(SparkSessionTestCase):
iee = InducedErrorEstimator()
evaluator = RegressionEvaluator(metricName="rmse")
- grid = (ParamGridBuilder()
- .addGrid(iee.inducedError, [100.0, 0.0, 10000.0])
- .build())
+ grid = ParamGridBuilder() \
+ .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \
+ .build()
tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
tvsModel = tvs.fit(dataset)
bestModel = tvsModel.bestModel
bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))
+ validationMetrics = tvsModel.validationMetrics
self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
"Best model should have zero induced error")
self.assertEqual(0.0, bestModelMetric, "Best model has RMSE of 0")
+ self.assertEqual(len(grid), len(validationMetrics),
+ "validationMetrics has the same size of grid parameter")
+ self.assertEqual(0.0, min(validationMetrics))
def test_fit_maximize_metric(self):
dataset = self.spark.createDataFrame([
@@ -633,17 +637,21 @@ class TrainValidationSplitTests(SparkSessionTestCase):
iee = InducedErrorEstimator()
evaluator = RegressionEvaluator(metricName="r2")
- grid = (ParamGridBuilder()
- .addGrid(iee.inducedError, [100.0, 0.0, 10000.0])
- .build())
+ grid = ParamGridBuilder() \
+ .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \
+ .build()
tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
tvsModel = tvs.fit(dataset)
bestModel = tvsModel.bestModel
bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))
+ validationMetrics = tvsModel.validationMetrics
self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
"Best model should have zero induced error")
self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1")
+ self.assertEqual(len(grid), len(validationMetrics),
+ "validationMetrics has the same size of grid parameter")
+ self.assertEqual(1.0, max(validationMetrics))
def test_save_load(self):
# This tests saving and loading the trained model only.
@@ -669,6 +677,36 @@ class TrainValidationSplitTests(SparkSessionTestCase):
self.assertEqual(loadedLrModel.uid, lrModel.uid)
self.assertEqual(loadedLrModel.intercept, lrModel.intercept)
+ def test_copy(self):
+ dataset = self.spark.createDataFrame([
+ (10, 10.0),
+ (50, 50.0),
+ (100, 100.0),
+ (500, 500.0)] * 10,
+ ["feature", "label"])
+
+ iee = InducedErrorEstimator()
+ evaluator = RegressionEvaluator(metricName="r2")
+
+ grid = ParamGridBuilder() \
+ .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \
+ .build()
+ tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
+ tvsModel = tvs.fit(dataset)
+ tvsCopied = tvs.copy()
+ tvsModelCopied = tvsModel.copy()
+
+ self.assertEqual(tvs.getEstimator().uid, tvsCopied.getEstimator().uid,
+ "Copied TrainValidationSplit has the same uid of Estimator")
+
+ self.assertEqual(tvsModel.bestModel.uid, tvsModelCopied.bestModel.uid)
+ self.assertEqual(len(tvsModel.validationMetrics),
+ len(tvsModelCopied.validationMetrics),
+ "Copied validationMetrics has the same size of the original")
+ for index in range(len(tvsModel.validationMetrics)):
+ self.assertEqual(tvsModel.validationMetrics[index],
+ tvsModelCopied.validationMetrics[index])
+
class PersistenceTest(SparkSessionTestCase):