From d584a2b8ac57eff3bf230c760e5bda205c6ea747 Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Thu, 28 Apr 2016 14:19:11 -0700 Subject: [SPARK-12810][PYSPARK] PySpark CrossValidatorModel should support avgMetrics ## What changes were proposed in this pull request? support avgMetrics in CrossValidatorModel with Python ## How was this patch tested? Doctest and `test_save_load` in `pyspark/ml/test.py` [JIRA](https://issues.apache.org/jira/browse/SPARK-12810) Author: Kai Jiang Closes #12464 from vectorijk/spark-12810. --- python/pyspark/ml/tests.py | 27 +++++++++++++++++++++++++++ python/pyspark/ml/tuning.py | 18 ++++++++++-------- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index ebef656632..36cecd4682 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -466,6 +466,31 @@ class InducedErrorEstimator(Estimator, HasInducedError): class CrossValidatorTests(PySparkTestCase): + def test_copy(self): + sqlContext = SQLContext(self.sc) + dataset = sqlContext.createDataFrame([ + (10, 10.0), + (50, 50.0), + (100, 100.0), + (500, 500.0)] * 10, + ["feature", "label"]) + + iee = InducedErrorEstimator() + evaluator = RegressionEvaluator(metricName="rmse") + + grid = (ParamGridBuilder() + .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) + .build()) + cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator) + cvCopied = cv.copy() + self.assertEqual(cv.getEstimator().uid, cvCopied.getEstimator().uid) + + cvModel = cv.fit(dataset) + cvModelCopied = cvModel.copy() + for index in range(len(cvModel.avgMetrics)): + self.assertTrue(abs(cvModel.avgMetrics[index] - cvModelCopied.avgMetrics[index]) + < 0.0001) + def test_fit_minimize_metric(self): sqlContext = SQLContext(self.sc) dataset = sqlContext.createDataFrame([ @@ -539,6 +564,8 @@ class CrossValidatorTests(PySparkTestCase): cvModel.save(cvModelPath) loadedModel = CrossValidatorModel.load(cvModelPath) self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid) + for index in range(len(loadedModel.avgMetrics)): + self.assertTrue(abs(loadedModel.avgMetrics[index] - cvModel.avgMetrics[index]) < 0.0001) class TrainValidationSplitTests(PySparkTestCase): diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index b16628bc70..22f9680cab 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -248,7 +248,7 @@ class CrossValidator(Estimator, ValidatorParams, MLReadable, MLWritable): h = 1.0 / nFolds randCol = self.uid + "_rand" df = dataset.select("*", rand(seed).alias(randCol)) - metrics = np.zeros(numModels) + metrics = [0.0] * numModels for i in range(nFolds): validateLB = i * h validateUB = (i + 1) * h @@ -266,7 +266,7 @@ class CrossValidator(Estimator, ValidatorParams, MLReadable, MLWritable): else: bestIndex = np.argmin(metrics) bestModel = est.fit(dataset, epm[bestIndex]) - return self._copyValues(CrossValidatorModel(bestModel)) + return self._copyValues(CrossValidatorModel(bestModel, metrics)) @since("1.4.0") def copy(self, extra=None): @@ -346,10 +346,11 @@ class CrossValidatorModel(Model, ValidatorParams, MLReadable, MLWritable): .. versionadded:: 1.4.0 """ - def __init__(self, bestModel): + def __init__(self, bestModel, avgMetrics=[]): super(CrossValidatorModel, self).__init__() #: best model from cross validation self.bestModel = bestModel + self.avgMetrics = avgMetrics def _transform(self, dataset): return self.bestModel.transform(dataset) @@ -367,7 +368,9 @@ class CrossValidatorModel(Model, ValidatorParams, MLReadable, MLWritable): """ if extra is None: extra = dict() - return CrossValidatorModel(self.bestModel.copy(extra)) + bestModel = self.bestModel.copy(extra) + avgMetrics = self.avgMetrics + return CrossValidatorModel(bestModel, avgMetrics) @since("2.0.0") def write(self): @@ -394,9 +397,10 @@ class CrossValidatorModel(Model, ValidatorParams, MLReadable, MLWritable): # Load information from java_stage to the instance. bestModel = JavaParams._from_java(java_stage.bestModel()) + avgMetrics = list(java_stage.avgMetrics()) estimator, epms, evaluator = super(CrossValidatorModel, cls)._from_java_impl(java_stage) # Create a new instance of this stage. - py_stage = cls(bestModel=bestModel)\ + py_stage = cls(bestModel=bestModel, avgMetrics=avgMetrics)\ .setEstimator(estimator).setEstimatorParamMaps(epms).setEvaluator(evaluator) py_stage._resetUid(java_stage.uid()) return py_stage @@ -408,12 +412,10 @@ class CrossValidatorModel(Model, ValidatorParams, MLReadable, MLWritable): :return: Java object equivalent to this instance. """ - sc = SparkContext._active_spark_context - _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.tuning.CrossValidatorModel", self.uid, self.bestModel._to_java(), - _py2java(sc, [])) + self.avgMetrics) estimator, epms, evaluator = super(CrossValidatorModel, self)._to_java_impl() _java_obj.set("evaluator", evaluator) -- cgit v1.2.3