diff options
author | wm624@hotmail.com <wm624@hotmail.com> | 2016-04-08 10:47:05 -0700 |
---|---|---|
committer | Joseph K. Bradley <joseph@databricks.com> | 2016-04-08 10:47:05 -0700 |
commit | e0ad75f2b55772efc82a6f8ebb1b2d80fe27d9b5 (patch) | |
tree | 600d2505875b418ec19f461416c42585693a92d6 | |
parent | e5d8d6e09cad304e353c96f9408fb9f799348827 (diff) | |
download | spark-e0ad75f2b55772efc82a6f8ebb1b2d80fe27d9b5.tar.gz spark-e0ad75f2b55772efc82a6f8ebb1b2d80fe27d9b5.tar.bz2 spark-e0ad75f2b55772efc82a6f8ebb1b2d80fe27d9b5.zip |
[SPARK-12569][PYSPARK][ML] DecisionTreeRegressor: provide variance of prediction: Python API
## What changes were proposed in this pull request?
A new column VarianceCol has been added to DecisionTreeRegressor in ML scala code.
This patch adds the corresponding Python API, HasVarianceCol, to class DecisionTreeRegressor.
## How was this patch tested?
./dev/lint-python
PEP8 checks passed.
rm -rf _build/*
pydoc checks passed.
./python/run-tests --python-executables=python2.7 --modules=pyspark-ml
Running PySpark tests. Output is in /Users/mwang/spark_ws_0904/python/unit-tests.log
Will test against the following Python executables: ['python2.7']
Will test the following Python modules: ['pyspark-ml']
Finished test(python2.7): pyspark.ml.evaluation (12s)
Finished test(python2.7): pyspark.ml.clustering (18s)
Finished test(python2.7): pyspark.ml.classification (30s)
Finished test(python2.7): pyspark.ml.recommendation (28s)
Finished test(python2.7): pyspark.ml.feature (43s)
Finished test(python2.7): pyspark.ml.regression (31s)
Finished test(python2.7): pyspark.ml.tuning (19s)
Finished test(python2.7): pyspark.ml.tests (34s)
(If this patch involves UI changes, please attach a screenshot; otherwise, remove this)
Author: wm624@hotmail.com <wm624@hotmail.com>
Closes #12116 from wangmiao1981/fix_api.
-rw-r--r-- | python/pyspark/ml/param/_shared_params_code_gen.py | 4 | ||||
-rw-r--r-- | python/pyspark/ml/param/shared.py | 24 | ||||
-rw-r--r-- | python/pyspark/ml/regression.py | 14 |
3 files changed, 35 insertions, 7 deletions
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 715fa9e9f8..a7615c43be 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -146,7 +146,9 @@ if __name__ == "__main__": ("weightCol", "weight column name. If this is not set or empty, we treat " + "all instance weights as 1.0.", None, "TypeConverters.toString"), ("solver", "the solver algorithm for optimization. If this is not set or empty, " + - "default value is 'auto'.", "'auto'", "TypeConverters.toString")] + "default value is 'auto'.", "'auto'", "TypeConverters.toString"), + ("varianceCol", "column name for the biased sample variance of prediction.", + None, "TypeConverters.toString")] code = [] for name, doc, defaultValueStr, typeConverter in shared: diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index d79d55e463..c9e975525c 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -559,6 +559,30 @@ class HasSolver(Params): return self.getOrDefault(self.solver) +class HasVarianceCol(Params): + """ + Mixin for param varianceCol: column name for the biased sample variance of prediction. + """ + + varianceCol = Param(Params._dummy(), "varianceCol", "column name for the biased sample variance of prediction.", typeConverter=TypeConverters.toString) + + def __init__(self): + super(HasVarianceCol, self).__init__() + + def setVarianceCol(self, value): + """ + Sets the value of :py:attr:`varianceCol`. + """ + self._set(varianceCol=value) + return self + + def getVarianceCol(self): + """ + Gets the value of varianceCol or its default value. + """ + return self.getOrDefault(self.varianceCol) + + class DecisionTreeParams(Params): """ Mixin for Decision Tree parameters. diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 00a6a0de90..f6c5d130dd 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -630,7 +630,7 @@ class GBTParams(TreeEnsembleParams): @inherit_doc class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, DecisionTreeParams, TreeRegressorParams, HasCheckpointInterval, - HasSeed, JavaMLWritable, JavaMLReadable): + HasSeed, JavaMLWritable, JavaMLReadable, HasVarianceCol): """ `http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree` learning algorithm for regression. @@ -640,7 +640,7 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi >>> df = sqlContext.createDataFrame([ ... (1.0, Vectors.dense(1.0)), ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) - >>> dt = DecisionTreeRegressor(maxDepth=2) + >>> dt = DecisionTreeRegressor(maxDepth=2, varianceCol="variance") >>> model = dt.fit(df) >>> model.depth 1 @@ -666,6 +666,8 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi True >>> model.depth == model2.depth True + >>> model.transform(test1).head().variance + 0.0 .. versionadded:: 1.4.0 """ @@ -674,12 +676,12 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance", - seed=None): + seed=None, varianceCol=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ - impurity="variance", seed=None) + impurity="variance", seed=None, varianceCol=None) """ super(DecisionTreeRegressor, self).__init__() self._java_obj = self._new_java_obj( @@ -695,12 +697,12 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - impurity="variance", seed=None): + impurity="variance", seed=None, varianceCol=None): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ - impurity="variance", seed=None) + impurity="variance", seed=None, varianceCol=None) Sets params for the DecisionTreeRegressor. """ kwargs = self.setParams._input_kwargs |