diff options
author | JeremyNixon <jnixon2@gmail.com> | 2016-03-03 09:50:05 -0800 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2016-03-03 09:50:05 -0800 |
commit | 511d4929c87ebf364b96bd46890628f736eaa026 (patch) | |
tree | 64ef0fed4ed117f1135439be4cd2109de01c5245 /python/pyspark/ml/tests.py | |
parent | 9a48c656eec8e47a55e5e0ecc6a7901c5be31566 (diff) | |
download | spark-511d4929c87ebf364b96bd46890628f736eaa026.tar.gz spark-511d4929c87ebf364b96bd46890628f736eaa026.tar.bz2 spark-511d4929c87ebf364b96bd46890628f736eaa026.zip |
[SPARK-12877][ML] Add train-validation-split to pyspark
## What changes were proposed in this pull request?
The changes proposed were to add train-validation-split to pyspark.ml.tuning.
## How was the this patch tested?
This patch was tested through unit tests located in pyspark/ml/test.py.
This is my original work and I license it to Spark.
Author: JeremyNixon <jnixon2@gmail.com>
Closes #11335 from JeremyNixon/tvs_pyspark.
Diffstat (limited to 'python/pyspark/ml/tests.py')
-rw-r--r-- | python/pyspark/ml/tests.py | 53 |
1 files changed, 52 insertions, 1 deletions
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 5fcfa9e61f..8182fcfb4e 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -45,7 +45,7 @@ from pyspark.ml.feature import * from pyspark.ml.param import Param, Params from pyspark.ml.param.shared import HasMaxIter, HasInputCol, HasSeed from pyspark.ml.regression import LinearRegression -from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel +from pyspark.ml.tuning import * from pyspark.ml.util import keyword_only from pyspark.mllib.linalg import DenseVector from pyspark.sql import DataFrame, SQLContext, Row @@ -423,6 +423,57 @@ class CrossValidatorTests(PySparkTestCase): self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1") +class TrainValidationSplitTests(PySparkTestCase): + + def test_fit_minimize_metric(self): + sqlContext = SQLContext(self.sc) + dataset = sqlContext.createDataFrame([ + (10, 10.0), + (50, 50.0), + (100, 100.0), + (500, 500.0)] * 10, + ["feature", "label"]) + + iee = InducedErrorEstimator() + evaluator = RegressionEvaluator(metricName="rmse") + + grid = (ParamGridBuilder() + .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) + .build()) + tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator) + tvsModel = tvs.fit(dataset) + bestModel = tvsModel.bestModel + bestModelMetric = evaluator.evaluate(bestModel.transform(dataset)) + + self.assertEqual(0.0, bestModel.getOrDefault('inducedError'), + "Best model should have zero induced error") + self.assertEqual(0.0, bestModelMetric, "Best model has RMSE of 0") + + def test_fit_maximize_metric(self): + sqlContext = SQLContext(self.sc) + dataset = sqlContext.createDataFrame([ + (10, 10.0), + (50, 50.0), + (100, 100.0), + (500, 500.0)] * 10, + ["feature", "label"]) + + iee = InducedErrorEstimator() + evaluator = RegressionEvaluator(metricName="r2") + + grid = (ParamGridBuilder() + .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) + .build()) + tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator) + tvsModel = tvs.fit(dataset) + bestModel = tvsModel.bestModel + bestModelMetric = evaluator.evaluate(bestModel.transform(dataset)) + + self.assertEqual(0.0, bestModel.getOrDefault('inducedError'), + "Best model should have zero induced error") + self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1") + + class PersistenceTest(PySparkTestCase): def test_linear_regression(self): |