From e0ad75f2b55772efc82a6f8ebb1b2d80fe27d9b5 Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Fri, 8 Apr 2016 10:47:05 -0700 Subject: [SPARK-12569][PYSPARK][ML] DecisionTreeRegressor: provide variance of prediction: Python API ## What changes were proposed in this pull request? A new column VarianceCol has been added to DecisionTreeRegressor in ML scala code. This patch adds the corresponding Python API, HasVarianceCol, to class DecisionTreeRegressor. ## How was this patch tested? ./dev/lint-python PEP8 checks passed. rm -rf _build/* pydoc checks passed. ./python/run-tests --python-executables=python2.7 --modules=pyspark-ml Running PySpark tests. Output is in /Users/mwang/spark_ws_0904/python/unit-tests.log Will test against the following Python executables: ['python2.7'] Will test the following Python modules: ['pyspark-ml'] Finished test(python2.7): pyspark.ml.evaluation (12s) Finished test(python2.7): pyspark.ml.clustering (18s) Finished test(python2.7): pyspark.ml.classification (30s) Finished test(python2.7): pyspark.ml.recommendation (28s) Finished test(python2.7): pyspark.ml.feature (43s) Finished test(python2.7): pyspark.ml.regression (31s) Finished test(python2.7): pyspark.ml.tuning (19s) Finished test(python2.7): pyspark.ml.tests (34s) (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Author: wm624@hotmail.com Closes #12116 from wangmiao1981/fix_api. --- python/pyspark/ml/param/_shared_params_code_gen.py | 4 +++- python/pyspark/ml/param/shared.py | 24 ++++++++++++++++++++++ python/pyspark/ml/regression.py | 14 +++++++------ 3 files changed, 35 insertions(+), 7 deletions(-) (limited to 'python/pyspark/ml') diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 715fa9e9f8..a7615c43be 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -146,7 +146,9 @@ if __name__ == "__main__": ("weightCol", "weight column name. If this is not set or empty, we treat " + "all instance weights as 1.0.", None, "TypeConverters.toString"), ("solver", "the solver algorithm for optimization. If this is not set or empty, " + - "default value is 'auto'.", "'auto'", "TypeConverters.toString")] + "default value is 'auto'.", "'auto'", "TypeConverters.toString"), + ("varianceCol", "column name for the biased sample variance of prediction.", + None, "TypeConverters.toString")] code = [] for name, doc, defaultValueStr, typeConverter in shared: diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index d79d55e463..c9e975525c 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -559,6 +559,30 @@ class HasSolver(Params): return self.getOrDefault(self.solver) +class HasVarianceCol(Params): + """ + Mixin for param varianceCol: column name for the biased sample variance of prediction. + """ + + varianceCol = Param(Params._dummy(), "varianceCol", "column name for the biased sample variance of prediction.", typeConverter=TypeConverters.toString) + + def __init__(self): + super(HasVarianceCol, self).__init__() + + def setVarianceCol(self, value): + """ + Sets the value of :py:attr:`varianceCol`. + """ + self._set(varianceCol=value) + return self + + def getVarianceCol(self): + """ + Gets the value of varianceCol or its default value. + """ + return self.getOrDefault(self.varianceCol) + + class DecisionTreeParams(Params): """ Mixin for Decision Tree parameters. diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 00a6a0de90..f6c5d130dd 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -630,7 +630,7 @@ class GBTParams(TreeEnsembleParams): @inherit_doc class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, DecisionTreeParams, TreeRegressorParams, HasCheckpointInterval, - HasSeed, JavaMLWritable, JavaMLReadable): + HasSeed, JavaMLWritable, JavaMLReadable, HasVarianceCol): """ `http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree` learning algorithm for regression. @@ -640,7 +640,7 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi >>> df = sqlContext.createDataFrame([ ... (1.0, Vectors.dense(1.0)), ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) - >>> dt = DecisionTreeRegressor(maxDepth=2) + >>> dt = DecisionTreeRegressor(maxDepth=2, varianceCol="variance") >>> model = dt.fit(df) >>> model.depth 1 @@ -666,6 +666,8 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi True >>> model.depth == model2.depth True + >>> model.transform(test1).head().variance + 0.0 .. versionadded:: 1.4.0 """ @@ -674,12 +676,12 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance", - seed=None): + seed=None, varianceCol=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ - impurity="variance", seed=None) + impurity="variance", seed=None, varianceCol=None) """ super(DecisionTreeRegressor, self).__init__() self._java_obj = self._new_java_obj( @@ -695,12 +697,12 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - impurity="variance", seed=None): + impurity="variance", seed=None, varianceCol=None): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ - impurity="variance", seed=None) + impurity="variance", seed=None, varianceCol=None) Sets params for the DecisionTreeRegressor. """ kwargs = self.setParams._input_kwargs -- cgit v1.2.3