aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
author: wm624@hotmail.com <wm624@hotmail.com> 2016-04-08 10:47:05 -0700
committer: Joseph K. Bradley <joseph@databricks.com> 2016-04-08 10:47:05 -0700
commit: e0ad75f2b55772efc82a6f8ebb1b2d80fe27d9b5 (patch)
tree: 600d2505875b418ec19f461416c42585693a92d6 /python
parent: e5d8d6e09cad304e353c96f9408fb9f799348827 (diff)
download: spark-e0ad75f2b55772efc82a6f8ebb1b2d80fe27d9b5.tar.gz
spark-e0ad75f2b55772efc82a6f8ebb1b2d80fe27d9b5.tar.bz2
spark-e0ad75f2b55772efc82a6f8ebb1b2d80fe27d9b5.zip
[SPARK-12569][PYSPARK][ML] DecisionTreeRegressor: provide variance of prediction: Python API
## What changes were proposed in this pull request?

A new column, VarianceCol, has been added to DecisionTreeRegressor in the ML Scala code. This patch adds the corresponding Python API, HasVarianceCol, to the class DecisionTreeRegressor.

## How was this patch tested?

./dev/lint-python
PEP8 checks passed.
rm -rf _build/*
pydoc checks passed.

./python/run-tests --python-executables=python2.7 --modules=pyspark-ml
Running PySpark tests. Output is in /Users/mwang/spark_ws_0904/python/unit-tests.log
Will test against the following Python executables: ['python2.7']
Will test the following Python modules: ['pyspark-ml']
Finished test(python2.7): pyspark.ml.evaluation (12s)
Finished test(python2.7): pyspark.ml.clustering (18s)
Finished test(python2.7): pyspark.ml.classification (30s)
Finished test(python2.7): pyspark.ml.recommendation (28s)
Finished test(python2.7): pyspark.ml.feature (43s)
Finished test(python2.7): pyspark.ml.regression (31s)
Finished test(python2.7): pyspark.ml.tuning (19s)
Finished test(python2.7): pyspark.ml.tests (34s)

(If this patch involves UI changes, please attach a screenshot; otherwise, remove this)

Author: wm624@hotmail.com <wm624@hotmail.com>

Closes #12116 from wangmiao1981/fix_api.
Diffstat (limited to 'python')
-rw-r--r-- python/pyspark/ml/param/_shared_params_code_gen.py 4
-rw-r--r-- python/pyspark/ml/param/shared.py 24
-rw-r--r-- python/pyspark/ml/regression.py 14
3 files changed, 35 insertions, 7 deletions
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py
index 715fa9e9f8..a7615c43be 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -146,7 +146,9 @@ if __name__ == "__main__":
("weightCol", "weight column name. If this is not set or empty, we treat " +
"all instance weights as 1.0.", None, "TypeConverters.toString"),
("solver", "the solver algorithm for optimization. If this is not set or empty, " +
- "default value is 'auto'.", "'auto'", "TypeConverters.toString")]
+ "default value is 'auto'.", "'auto'", "TypeConverters.toString"),
+ ("varianceCol", "column name for the biased sample variance of prediction.",
+ None, "TypeConverters.toString")]
code = []
for name, doc, defaultValueStr, typeConverter in shared:
diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py
index d79d55e463..c9e975525c 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -559,6 +559,30 @@ class HasSolver(Params):
return self.getOrDefault(self.solver)
+class HasVarianceCol(Params):
+ """
+ Mixin for param varianceCol: column name for the biased sample variance of prediction.
+ """
+
+ varianceCol = Param(Params._dummy(), "varianceCol", "column name for the biased sample variance of prediction.", typeConverter=TypeConverters.toString)
+
+ def __init__(self):
+ super(HasVarianceCol, self).__init__()
+
+ def setVarianceCol(self, value):
+ """
+ Sets the value of :py:attr:`varianceCol`.
+ """
+ self._set(varianceCol=value)
+ return self
+
+ def getVarianceCol(self):
+ """
+ Gets the value of varianceCol or its default value.
+ """
+ return self.getOrDefault(self.varianceCol)
+
+
class DecisionTreeParams(Params):
"""
Mixin for Decision Tree parameters.
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 00a6a0de90..f6c5d130dd 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -630,7 +630,7 @@ class GBTParams(TreeEnsembleParams):
@inherit_doc
class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
DecisionTreeParams, TreeRegressorParams, HasCheckpointInterval,
- HasSeed, JavaMLWritable, JavaMLReadable):
+ HasSeed, JavaMLWritable, JavaMLReadable, HasVarianceCol):
"""
`http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree`
learning algorithm for regression.
@@ -640,7 +640,7 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
>>> df = sqlContext.createDataFrame([
... (1.0, Vectors.dense(1.0)),
... (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
- >>> dt = DecisionTreeRegressor(maxDepth=2)
+ >>> dt = DecisionTreeRegressor(maxDepth=2, varianceCol="variance")
>>> model = dt.fit(df)
>>> model.depth
1
@@ -666,6 +666,8 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
True
>>> model.depth == model2.depth
True
+ >>> model.transform(test1).head().variance
+ 0.0
.. versionadded:: 1.4.0
"""
@@ -674,12 +676,12 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance",
- seed=None):
+ seed=None, varianceCol=None):
"""
__init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \
- impurity="variance", seed=None)
+ impurity="variance", seed=None, varianceCol=None)
"""
super(DecisionTreeRegressor, self).__init__()
self._java_obj = self._new_java_obj(
@@ -695,12 +697,12 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
- impurity="variance", seed=None):
+ impurity="variance", seed=None, varianceCol=None):
"""
setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \
- impurity="variance", seed=None)
+ impurity="variance", seed=None, varianceCol=None)
Sets params for the DecisionTreeRegressor.
"""
kwargs = self.setParams._input_kwargs