[SPARK-13786][ML][PYTHON] Removed save/load for python tuning

## What changes were proposed in this pull request?

Per the discussion on https://github.com/apache/spark/pull/12604, this removes ML persistence for Python tuning (TrainValidationSplit, CrossValidator, and their Models), since they do not handle nesting easily. This support should be redesigned and added in the next release.

## How was this patch tested?

Removed the unit test elements that save and load the tuning algorithms, but kept the tests that save and load their bestModel fields.

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #12782 from jkbradley/remove-python-tuning-saveload.
author: Joseph K. Bradley <joseph@databricks.com> 2016-04-29 20:51:24 -0700
committer: Xiangrui Meng <meng@databricks.com> 2016-04-29 20:51:24 -0700
commit: 09da43d514dc4487af88056404953a1f8fd8bee1 (patch)
tree: ad58fa93d6113089c5aaf772c0d37d0414715b0a /python/pyspark/ml/tests.py
parent: 66773eb8a55bfe6437dd4096c2c55685aca29dcd (diff)
download: spark-09da43d514dc4487af88056404953a1f8fd8bee1.tar.gz
spark-09da43d514dc4487af88056404953a1f8fd8bee1.tar.bz2
spark-09da43d514dc4487af88056404953a1f8fd8bee1.zip
1 files changed, 17 insertions, 22 deletions
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index e7d4c0af45..faca148218 100644
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -44,8 +44,7 @@ import numpy as np
 
 from pyspark import keyword_only
 from pyspark.ml import Estimator, Model, Pipeline, PipelineModel, Transformer
-from pyspark.ml.classification import (
-    LogisticRegression, DecisionTreeClassifier, OneVsRest, OneVsRestModel)
+from pyspark.ml.classification import *
 from pyspark.ml.clustering import *
 from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
 from pyspark.ml.feature import *
@@ -540,6 +539,8 @@ class CrossValidatorTests(PySparkTestCase):
         self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1")
 
     def test_save_load(self):
+        # This tests saving and loading the trained model only.
+        # Save/load for CrossValidator will be added later: SPARK-13786
         temp_path = tempfile.mkdtemp()
         sqlContext = SQLContext(self.sc)
         dataset = sqlContext.createDataFrame(
@@ -554,18 +555,13 @@ class CrossValidatorTests(PySparkTestCase):
         evaluator = BinaryClassificationEvaluator()
         cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
         cvModel = cv.fit(dataset)
-        cvPath = temp_path + "/cv"
-        cv.save(cvPath)
-        loadedCV = CrossValidator.load(cvPath)
-        self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
-        self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
-        self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())
+        lrModel = cvModel.bestModel
+
         cvModelPath = temp_path + "/cvModel"
-        cvModel.save(cvModelPath)
-        loadedModel = CrossValidatorModel.load(cvModelPath)
-        self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
-        for index in range(len(loadedModel.avgMetrics)):
-            self.assertTrue(abs(loadedModel.avgMetrics[index] - cvModel.avgMetrics[index]) < 0.0001)
+        lrModel.save(cvModelPath)
+        loadedLrModel = LogisticRegressionModel.load(cvModelPath)
+        self.assertEqual(loadedLrModel.uid, lrModel.uid)
+        self.assertEqual(loadedLrModel.intercept, lrModel.intercept)
 
 
 class TrainValidationSplitTests(PySparkTestCase):
@@ -619,6 +615,8 @@ class TrainValidationSplitTests(PySparkTestCase):
         self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1")
 
     def test_save_load(self):
+        # This tests saving and loading the trained model only.
+        # Save/load for TrainValidationSplit will be added later: SPARK-13786
         temp_path = tempfile.mkdtemp()
         sqlContext = SQLContext(self.sc)
         dataset = sqlContext.createDataFrame(
@@ -633,16 +631,13 @@ class TrainValidationSplitTests(PySparkTestCase):
         evaluator = BinaryClassificationEvaluator()
         tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
         tvsModel = tvs.fit(dataset)
-        tvsPath = temp_path + "/tvs"
-        tvs.save(tvsPath)
-        loadedTvs = TrainValidationSplit.load(tvsPath)
-        self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
-        self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
-        self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())
+        lrModel = tvsModel.bestModel
+
         tvsModelPath = temp_path + "/tvsModel"
-        tvsModel.save(tvsModelPath)
-        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
-        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
+        lrModel.save(tvsModelPath)
+        loadedLrModel = LogisticRegressionModel.load(tvsModelPath)
+        self.assertEqual(loadedLrModel.uid, lrModel.uid)
+        self.assertEqual(loadedLrModel.intercept, lrModel.intercept)
 
 
 class PersistenceTest(PySparkTestCase):
author	Joseph K. Bradley <joseph@databricks.com>	2016-04-29 20:51:24 -0700
committer	Xiangrui Meng <meng@databricks.com>	2016-04-29 20:51:24 -0700
commit	09da43d514dc4487af88056404953a1f8fd8bee1 (patch)
tree	ad58fa93d6113089c5aaf772c0d37d0414715b0a /python/pyspark/ml/tests.py
parent	66773eb8a55bfe6437dd4096c2c55685aca29dcd (diff)
download	spark-09da43d514dc4487af88056404953a1f8fd8bee1.tar.gz spark-09da43d514dc4487af88056404953a1f8fd8bee1.tar.bz2 spark-09da43d514dc4487af88056404953a1f8fd8bee1.zip