diff options
author | Joseph K. Bradley <joseph@databricks.com> | 2016-04-29 20:51:24 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2016-04-29 20:51:24 -0700 |
commit | 09da43d514dc4487af88056404953a1f8fd8bee1 (patch) | |
tree | ad58fa93d6113089c5aaf772c0d37d0414715b0a /python/pyspark/ml/tests.py | |
parent | 66773eb8a55bfe6437dd4096c2c55685aca29dcd (diff) | |
download | spark-09da43d514dc4487af88056404953a1f8fd8bee1.tar.gz spark-09da43d514dc4487af88056404953a1f8fd8bee1.tar.bz2 spark-09da43d514dc4487af88056404953a1f8fd8bee1.zip |
[SPARK-13786][ML][PYTHON] Removed save/load for python tuning
## What changes were proposed in this pull request?
Per discussion on [https://github.com/apache/spark/pull/12604], this removes ML persistence for Python tuning (TrainValidationSplit, CrossValidator, and their Models) since they do not handle nesting easily. This support should be re-designed and added in the next release.
## How was this patch tested?
Removed unit test elements saving and loading the tuning algorithms, but kept tests to save and load their bestModel fields.
Author: Joseph K. Bradley <joseph@databricks.com>
Closes #12782 from jkbradley/remove-python-tuning-saveload.
Diffstat (limited to 'python/pyspark/ml/tests.py')
-rw-r--r-- | python/pyspark/ml/tests.py | 39 |
1 file changed, 17 insertions, 22 deletions
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index e7d4c0af45..faca148218 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -44,8 +44,7 @@ import numpy as np from pyspark import keyword_only from pyspark.ml import Estimator, Model, Pipeline, PipelineModel, Transformer -from pyspark.ml.classification import ( - LogisticRegression, DecisionTreeClassifier, OneVsRest, OneVsRestModel) +from pyspark.ml.classification import * from pyspark.ml.clustering import * from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator from pyspark.ml.feature import * @@ -540,6 +539,8 @@ class CrossValidatorTests(PySparkTestCase): self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1") def test_save_load(self): + # This tests saving and loading the trained model only. + # Save/load for CrossValidator will be added later: SPARK-13786 temp_path = tempfile.mkdtemp() sqlContext = SQLContext(self.sc) dataset = sqlContext.createDataFrame( @@ -554,18 +555,13 @@ class CrossValidatorTests(PySparkTestCase): evaluator = BinaryClassificationEvaluator() cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) cvModel = cv.fit(dataset) - cvPath = temp_path + "/cv" - cv.save(cvPath) - loadedCV = CrossValidator.load(cvPath) - self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid) - self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid) - self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps()) + lrModel = cvModel.bestModel + cvModelPath = temp_path + "/cvModel" - cvModel.save(cvModelPath) - loadedModel = CrossValidatorModel.load(cvModelPath) - self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid) - for index in range(len(loadedModel.avgMetrics)): - self.assertTrue(abs(loadedModel.avgMetrics[index] - cvModel.avgMetrics[index]) < 0.0001) + lrModel.save(cvModelPath) + loadedLrModel = LogisticRegressionModel.load(cvModelPath) + 
self.assertEqual(loadedLrModel.uid, lrModel.uid) + self.assertEqual(loadedLrModel.intercept, lrModel.intercept) class TrainValidationSplitTests(PySparkTestCase): @@ -619,6 +615,8 @@ class TrainValidationSplitTests(PySparkTestCase): self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1") def test_save_load(self): + # This tests saving and loading the trained model only. + # Save/load for TrainValidationSplit will be added later: SPARK-13786 temp_path = tempfile.mkdtemp() sqlContext = SQLContext(self.sc) dataset = sqlContext.createDataFrame( @@ -633,16 +631,13 @@ class TrainValidationSplitTests(PySparkTestCase): evaluator = BinaryClassificationEvaluator() tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) tvsModel = tvs.fit(dataset) - tvsPath = temp_path + "/tvs" - tvs.save(tvsPath) - loadedTvs = TrainValidationSplit.load(tvsPath) - self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid) - self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid) - self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps()) + lrModel = tvsModel.bestModel + tvsModelPath = temp_path + "/tvsModel" - tvsModel.save(tvsModelPath) - loadedModel = TrainValidationSplitModel.load(tvsModelPath) - self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid) + lrModel.save(tvsModelPath) + loadedLrModel = LogisticRegressionModel.load(tvsModelPath) + self.assertEqual(loadedLrModel.uid, lrModel.uid) + self.assertEqual(loadedLrModel.intercept, lrModel.intercept) class PersistenceTest(PySparkTestCase): |