aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorvectorijk <jiangkai@gmail.com>2015-10-27 13:55:03 -0700
committerXiangrui Meng <meng@databricks.com>2015-10-27 13:55:03 -0700
commit9dba5fb2b59174cefde5b62a5c892fe5925bea38 (patch)
treec94ad0820a14a5b120354d76d71ed4cb0cf0fa49 /python
parent5a5f65905a202e59bc85170b01c57a883718ddf6 (diff)
downloadspark-9dba5fb2b59174cefde5b62a5c892fe5925bea38.tar.gz
spark-9dba5fb2b59174cefde5b62a5c892fe5925bea38.tar.bz2
spark-9dba5fb2b59174cefde5b62a5c892fe5925bea38.zip
[SPARK-10024][PYSPARK] Python API RF and GBT related params clear up
implement {RandomForest, GBT, TreeEnsemble, TreeClassifier, TreeRegressor}Params for Python API in pyspark/ml/{classification, regression}.py Author: vectorijk <jiangkai@gmail.com> Closes #9233 from vectorijk/spark-10024.
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/ml/classification.py182
-rw-r--r--python/pyspark/ml/regression.py324
2 files changed, 168 insertions, 338 deletions
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 88815e561f..4cbe7fbd48 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -19,7 +19,7 @@ from pyspark.ml.util import keyword_only
from pyspark.ml.wrapper import JavaEstimator, JavaModel
from pyspark.ml.param.shared import *
from pyspark.ml.regression import (
- RandomForestParams, DecisionTreeModel, TreeEnsembleModels)
+ RandomForestParams, TreeEnsembleParams, DecisionTreeModel, TreeEnsembleModels)
from pyspark.mllib.common import inherit_doc
@@ -205,8 +205,34 @@ class TreeClassifierParams(object):
"""
supportedImpurities = ["entropy", "gini"]
+ # a placeholder to make it appear in the generated doc
+ impurity = Param(Params._dummy(), "impurity",
+ "Criterion used for information gain calculation (case-insensitive). " +
+ "Supported options: " +
+ ", ".join(supportedImpurities))
+
+ def __init__(self):
+ super(TreeClassifierParams, self).__init__()
+ #: param for Criterion used for information gain calculation (case-insensitive).
+ self.impurity = Param(self, "impurity", "Criterion used for information " +
+ "gain calculation (case-insensitive). Supported options: " +
+ ", ".join(self.supportedImpurities))
+
+ def setImpurity(self, value):
+ """
+ Sets the value of :py:attr:`impurity`.
+ """
+ self._paramMap[self.impurity] = value
+ return self
-class GBTParams(object):
+ def getImpurity(self):
+ """
+ Gets the value of impurity or its default value.
+ """
+ return self.getOrDefault(self.impurity)
+
+
+class GBTParams(TreeEnsembleParams):
"""
Private class to track supported GBT params.
"""
@@ -216,7 +242,7 @@ class GBTParams(object):
@inherit_doc
class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
HasProbabilityCol, HasRawPredictionCol, DecisionTreeParams,
- HasCheckpointInterval):
+ TreeClassifierParams, HasCheckpointInterval):
"""
`http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree`
learning algorithm for classification.
@@ -250,11 +276,6 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
1.0
"""
- # a placeholder to make it appear in the generated doc
- impurity = Param(Params._dummy(), "impurity",
- "Criterion used for information gain calculation (case-insensitive). " +
- "Supported options: " + ", ".join(TreeClassifierParams.supportedImpurities))
-
@keyword_only
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
probabilityCol="probability", rawPredictionCol="rawPrediction",
@@ -269,11 +290,6 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
super(DecisionTreeClassifier, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.DecisionTreeClassifier", self.uid)
- #: param for Criterion used for information gain calculation (case-insensitive).
- self.impurity = \
- Param(self, "impurity",
- "Criterion used for information gain calculation (case-insensitive). " +
- "Supported options: " + ", ".join(TreeClassifierParams.supportedImpurities))
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
impurity="gini")
@@ -299,19 +315,6 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
def _create_model(self, java_model):
return DecisionTreeClassificationModel(java_model)
- def setImpurity(self, value):
- """
- Sets the value of :py:attr:`impurity`.
- """
- self._paramMap[self.impurity] = value
- return self
-
- def getImpurity(self):
- """
- Gets the value of impurity or its default value.
- """
- return self.getOrDefault(self.impurity)
-
@inherit_doc
class DecisionTreeClassificationModel(DecisionTreeModel):
@@ -323,7 +326,7 @@ class DecisionTreeClassificationModel(DecisionTreeModel):
@inherit_doc
class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed,
HasRawPredictionCol, HasProbabilityCol,
- DecisionTreeParams, HasCheckpointInterval):
+ RandomForestParams, TreeClassifierParams, HasCheckpointInterval):
"""
`http://en.wikipedia.org/wiki/Random_forest Random Forest`
learning algorithm for classification.
@@ -357,19 +360,6 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
1.0
"""
- # a placeholder to make it appear in the generated doc
- impurity = Param(Params._dummy(), "impurity",
- "Criterion used for information gain calculation (case-insensitive). " +
- "Supported options: " + ", ".join(TreeClassifierParams.supportedImpurities))
- subsamplingRate = Param(Params._dummy(), "subsamplingRate",
- "Fraction of the training data used for learning each decision tree, " +
- "in range (0, 1].")
- numTrees = Param(Params._dummy(), "numTrees", "Number of trees to train (>= 1)")
- featureSubsetStrategy = \
- Param(Params._dummy(), "featureSubsetStrategy",
- "The number of features to consider for splits at each tree node. Supported " +
- "options: " + ", ".join(RandomForestParams.supportedFeatureSubsetStrategies))
-
@keyword_only
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
probabilityCol="probability", rawPredictionCol="rawPrediction",
@@ -386,23 +376,6 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
super(RandomForestClassifier, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.RandomForestClassifier", self.uid)
- #: param for Criterion used for information gain calculation (case-insensitive).
- self.impurity = \
- Param(self, "impurity",
- "Criterion used for information gain calculation (case-insensitive). " +
- "Supported options: " + ", ".join(TreeClassifierParams.supportedImpurities))
- #: param for Fraction of the training data used for learning each decision tree,
- # in range (0, 1]
- self.subsamplingRate = Param(self, "subsamplingRate",
- "Fraction of the training data used for learning each " +
- "decision tree, in range (0, 1].")
- #: param for Number of trees to train (>= 1)
- self.numTrees = Param(self, "numTrees", "Number of trees to train (>= 1)")
- #: param for The number of features to consider for splits at each tree node
- self.featureSubsetStrategy = \
- Param(self, "featureSubsetStrategy",
- "The number of features to consider for splits at each tree node. Supported " +
- "options: " + ", ".join(RandomForestParams.supportedFeatureSubsetStrategies))
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None,
impurity="gini", numTrees=20, featureSubsetStrategy="auto")
@@ -429,58 +402,6 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
def _create_model(self, java_model):
return RandomForestClassificationModel(java_model)
- def setImpurity(self, value):
- """
- Sets the value of :py:attr:`impurity`.
- """
- self._paramMap[self.impurity] = value
- return self
-
- def getImpurity(self):
- """
- Gets the value of impurity or its default value.
- """
- return self.getOrDefault(self.impurity)
-
- def setSubsamplingRate(self, value):
- """
- Sets the value of :py:attr:`subsamplingRate`.
- """
- self._paramMap[self.subsamplingRate] = value
- return self
-
- def getSubsamplingRate(self):
- """
- Gets the value of subsamplingRate or its default value.
- """
- return self.getOrDefault(self.subsamplingRate)
-
- def setNumTrees(self, value):
- """
- Sets the value of :py:attr:`numTrees`.
- """
- self._paramMap[self.numTrees] = value
- return self
-
- def getNumTrees(self):
- """
- Gets the value of numTrees or its default value.
- """
- return self.getOrDefault(self.numTrees)
-
- def setFeatureSubsetStrategy(self, value):
- """
- Sets the value of :py:attr:`featureSubsetStrategy`.
- """
- self._paramMap[self.featureSubsetStrategy] = value
- return self
-
- def getFeatureSubsetStrategy(self):
- """
- Gets the value of featureSubsetStrategy or its default value.
- """
- return self.getOrDefault(self.featureSubsetStrategy)
-
class RandomForestClassificationModel(TreeEnsembleModels):
"""
@@ -490,7 +411,7 @@ class RandomForestClassificationModel(TreeEnsembleModels):
@inherit_doc
class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter,
- DecisionTreeParams, HasCheckpointInterval):
+ GBTParams, HasCheckpointInterval, HasStepSize, HasSeed):
"""
`http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)`
learning algorithm for classification.
@@ -522,12 +443,6 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
lossType = Param(Params._dummy(), "lossType",
"Loss function which GBT tries to minimize (case-insensitive). " +
"Supported options: " + ", ".join(GBTParams.supportedLossTypes))
- subsamplingRate = Param(Params._dummy(), "subsamplingRate",
- "Fraction of the training data used for learning each decision tree, " +
- "in range (0, 1].")
- stepSize = Param(Params._dummy(), "stepSize",
- "Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the " +
- "contribution of each estimator")
@keyword_only
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
@@ -547,15 +462,6 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
self.lossType = Param(self, "lossType",
"Loss function which GBT tries to minimize (case-insensitive). " +
"Supported options: " + ", ".join(GBTParams.supportedLossTypes))
- #: Fraction of the training data used for learning each decision tree, in range (0, 1].
- self.subsamplingRate = Param(self, "subsamplingRate",
- "Fraction of the training data used for learning each " +
- "decision tree, in range (0, 1].")
- #: Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of
- # each estimator
- self.stepSize = Param(self, "stepSize",
- "Step size (a.k.a. learning rate) in interval (0, 1] for shrinking " +
- "the contribution of each estimator")
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
lossType="logistic", maxIter=20, stepSize=0.1)
@@ -593,32 +499,6 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
"""
return self.getOrDefault(self.lossType)
- def setSubsamplingRate(self, value):
- """
- Sets the value of :py:attr:`subsamplingRate`.
- """
- self._paramMap[self.subsamplingRate] = value
- return self
-
- def getSubsamplingRate(self):
- """
- Gets the value of subsamplingRate or its default value.
- """
- return self.getOrDefault(self.subsamplingRate)
-
- def setStepSize(self, value):
- """
- Sets the value of :py:attr:`stepSize`.
- """
- self._paramMap[self.stepSize] = value
- return self
-
- def getStepSize(self):
- """
- Gets the value of stepSize or its default value.
- """
- return self.getOrDefault(self.stepSize)
-
class GBTClassificationModel(TreeEnsembleModels):
"""
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index eb5f4bd6d7..eeb18b3e9d 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -260,21 +260,127 @@ class IsotonicRegressionModel(JavaModel):
return self._call_java("predictions")
-class TreeRegressorParams(object):
+class TreeEnsembleParams(DecisionTreeParams):
+ """
+ Mixin for Decision Tree-based ensemble algorithms parameters.
+ """
+
+ # a placeholder to make it appear in the generated doc
+ subsamplingRate = Param(Params._dummy(), "subsamplingRate", "Fraction of the training data " +
+ "used for learning each decision tree, in range (0, 1].")
+
+ def __init__(self):
+ super(TreeEnsembleParams, self).__init__()
+ #: param for Fraction of the training data, in range (0, 1].
+ self.subsamplingRate = Param(self, "subsamplingRate", "Fraction of the training data " +
+ "used for learning each decision tree, in range (0, 1].")
+
+ @since("1.4.0")
+ def setSubsamplingRate(self, value):
+ """
+ Sets the value of :py:attr:`subsamplingRate`.
+ """
+ self._paramMap[self.subsamplingRate] = value
+ return self
+
+ @since("1.4.0")
+ def getSubsamplingRate(self):
+ """
+ Gets the value of subsamplingRate or its default value.
+ """
+ return self.getOrDefault(self.subsamplingRate)
+
+
+class TreeRegressorParams(Params):
"""
Private class to track supported impurity measures.
"""
+
supportedImpurities = ["variance"]
+ # a placeholder to make it appear in the generated doc
+ impurity = Param(Params._dummy(), "impurity",
+ "Criterion used for information gain calculation (case-insensitive). " +
+ "Supported options: " +
+ ", ".join(supportedImpurities))
+ def __init__(self):
+ super(TreeRegressorParams, self).__init__()
+ #: param for Criterion used for information gain calculation (case-insensitive).
+ self.impurity = Param(self, "impurity", "Criterion used for information " +
+ "gain calculation (case-insensitive). Supported options: " +
+ ", ".join(self.supportedImpurities))
-class RandomForestParams(object):
+ @since("1.4.0")
+ def setImpurity(self, value):
+ """
+ Sets the value of :py:attr:`impurity`.
+ """
+ self._paramMap[self.impurity] = value
+ return self
+
+ @since("1.4.0")
+ def getImpurity(self):
+ """
+ Gets the value of impurity or its default value.
+ """
+ return self.getOrDefault(self.impurity)
+
+
+class RandomForestParams(TreeEnsembleParams):
"""
Private class to track supported random forest parameters.
"""
+
supportedFeatureSubsetStrategies = ["auto", "all", "onethird", "sqrt", "log2"]
+ # a placeholder to make it appear in the generated doc
+ numTrees = Param(Params._dummy(), "numTrees", "Number of trees to train (>= 1).")
+ featureSubsetStrategy = \
+ Param(Params._dummy(), "featureSubsetStrategy",
+ "The number of features to consider for splits at each tree node. Supported " +
+ "options: " + ", ".join(supportedFeatureSubsetStrategies))
+
+ def __init__(self):
+ super(RandomForestParams, self).__init__()
+ #: param for Number of trees to train (>= 1).
+ self.numTrees = Param(self, "numTrees", "Number of trees to train (>= 1).")
+ #: param for The number of features to consider for splits at each tree node.
+ self.featureSubsetStrategy = \
+ Param(self, "featureSubsetStrategy",
+ "The number of features to consider for splits at each tree node. Supported " +
+ "options: " + ", ".join(self.supportedFeatureSubsetStrategies))
+
+ @since("1.4.0")
+ def setNumTrees(self, value):
+ """
+ Sets the value of :py:attr:`numTrees`.
+ """
+ self._paramMap[self.numTrees] = value
+ return self
+
+ @since("1.4.0")
+ def getNumTrees(self):
+ """
+ Gets the value of numTrees or its default value.
+ """
+ return self.getOrDefault(self.numTrees)
+ @since("1.4.0")
+ def setFeatureSubsetStrategy(self, value):
+ """
+ Sets the value of :py:attr:`featureSubsetStrategy`.
+ """
+ self._paramMap[self.featureSubsetStrategy] = value
+ return self
-class GBTParams(object):
+ @since("1.4.0")
+ def getFeatureSubsetStrategy(self):
+ """
+ Gets the value of featureSubsetStrategy or its default value.
+ """
+ return self.getOrDefault(self.featureSubsetStrategy)
+
+
+class GBTParams(TreeEnsembleParams):
"""
Private class to track supported GBT params.
"""
@@ -283,7 +389,7 @@ class GBTParams(object):
@inherit_doc
class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
- DecisionTreeParams, HasCheckpointInterval):
+ DecisionTreeParams, TreeRegressorParams, HasCheckpointInterval):
"""
`http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree`
learning algorithm for regression.
@@ -309,11 +415,6 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
- impurity = Param(Params._dummy(), "impurity",
- "Criterion used for information gain calculation (case-insensitive). " +
- "Supported options: " + ", ".join(TreeRegressorParams.supportedImpurities))
-
@keyword_only
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
@@ -326,11 +427,6 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
super(DecisionTreeRegressor, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.regression.DecisionTreeRegressor", self.uid)
- #: param for Criterion used for information gain calculation (case-insensitive).
- self.impurity = \
- Param(self, "impurity",
- "Criterion used for information gain calculation (case-insensitive). " +
- "Supported options: " + ", ".join(TreeRegressorParams.supportedImpurities))
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
impurity="variance")
@@ -355,21 +451,6 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
def _create_model(self, java_model):
return DecisionTreeRegressionModel(java_model)
- @since("1.4.0")
- def setImpurity(self, value):
- """
- Sets the value of :py:attr:`impurity`.
- """
- self._paramMap[self.impurity] = value
- return self
-
- @since("1.4.0")
- def getImpurity(self):
- """
- Gets the value of impurity or its default value.
- """
- return self.getOrDefault(self.impurity)
-
@inherit_doc
class DecisionTreeModel(JavaModel):
@@ -422,7 +503,7 @@ class DecisionTreeRegressionModel(DecisionTreeModel):
@inherit_doc
class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed,
- DecisionTreeParams, HasCheckpointInterval):
+ RandomForestParams, TreeRegressorParams, HasCheckpointInterval):
"""
`http://en.wikipedia.org/wiki/Random_forest Random Forest`
learning algorithm for regression.
@@ -447,54 +528,26 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
.. versionadded:: 1.4.0
"""
- # a placeholder to make it appear in the generated doc
- impurity = Param(Params._dummy(), "impurity",
- "Criterion used for information gain calculation (case-insensitive). " +
- "Supported options: " + ", ".join(TreeRegressorParams.supportedImpurities))
- subsamplingRate = Param(Params._dummy(), "subsamplingRate",
- "Fraction of the training data used for learning each decision tree, " +
- "in range (0, 1].")
- numTrees = Param(Params._dummy(), "numTrees", "Number of trees to train (>= 1)")
- featureSubsetStrategy = \
- Param(Params._dummy(), "featureSubsetStrategy",
- "The number of features to consider for splits at each tree node. Supported " +
- "options: " + ", ".join(RandomForestParams.supportedFeatureSubsetStrategies))
-
@keyword_only
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
- maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance",
- numTrees=20, featureSubsetStrategy="auto", seed=None):
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
+ impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20,
+ featureSubsetStrategy="auto"):
"""
__init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \
- impurity="variance", numTrees=20, \
- featureSubsetStrategy="auto", seed=None)
+ impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, \
+ featureSubsetStrategy="auto")
"""
super(RandomForestRegressor, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.regression.RandomForestRegressor", self.uid)
- #: param for Criterion used for information gain calculation (case-insensitive).
- self.impurity = \
- Param(self, "impurity",
- "Criterion used for information gain calculation (case-insensitive). " +
- "Supported options: " + ", ".join(TreeRegressorParams.supportedImpurities))
- #: param for Fraction of the training data used for learning each decision tree,
- # in range (0, 1]
- self.subsamplingRate = Param(self, "subsamplingRate",
- "Fraction of the training data used for learning each " +
- "decision tree, in range (0, 1].")
- #: param for Number of trees to train (>= 1)
- self.numTrees = Param(self, "numTrees", "Number of trees to train (>= 1)")
- #: param for The number of features to consider for splits at each tree node
- self.featureSubsetStrategy = \
- Param(self, "featureSubsetStrategy",
- "The number of features to consider for splits at each tree node. Supported " +
- "options: " + ", ".join(RandomForestParams.supportedFeatureSubsetStrategies))
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
- maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None,
- impurity="variance", numTrees=20, featureSubsetStrategy="auto")
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
+ impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20,
+ featureSubsetStrategy="auto")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -502,13 +555,15 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
@since("1.4.0")
def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
- maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None,
- impurity="variance", numTrees=20, featureSubsetStrategy="auto"):
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
+ impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20,
+ featureSubsetStrategy="auto"):
"""
setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
- maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, \
- impurity="variance", numTrees=20, featureSubsetStrategy="auto")
+ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \
+ impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, \
+ featureSubsetStrategy="auto")
Sets params for random forest regression.
"""
kwargs = self.setParams._input_kwargs
@@ -517,66 +572,6 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
def _create_model(self, java_model):
return RandomForestRegressionModel(java_model)
- @since("1.4.0")
- def setImpurity(self, value):
- """
- Sets the value of :py:attr:`impurity`.
- """
- self._paramMap[self.impurity] = value
- return self
-
- @since("1.4.0")
- def getImpurity(self):
- """
- Gets the value of impurity or its default value.
- """
- return self.getOrDefault(self.impurity)
-
- @since("1.4.0")
- def setSubsamplingRate(self, value):
- """
- Sets the value of :py:attr:`subsamplingRate`.
- """
- self._paramMap[self.subsamplingRate] = value
- return self
-
- @since("1.4.0")
- def getSubsamplingRate(self):
- """
- Gets the value of subsamplingRate or its default value.
- """
- return self.getOrDefault(self.subsamplingRate)
-
- @since("1.4.0")
- def setNumTrees(self, value):
- """
- Sets the value of :py:attr:`numTrees`.
- """
- self._paramMap[self.numTrees] = value
- return self
-
- @since("1.4.0")
- def getNumTrees(self):
- """
- Gets the value of numTrees or its default value.
- """
- return self.getOrDefault(self.numTrees)
-
- @since("1.4.0")
- def setFeatureSubsetStrategy(self, value):
- """
- Sets the value of :py:attr:`featureSubsetStrategy`.
- """
- self._paramMap[self.featureSubsetStrategy] = value
- return self
-
- @since("1.4.0")
- def getFeatureSubsetStrategy(self):
- """
- Gets the value of featureSubsetStrategy or its default value.
- """
- return self.getOrDefault(self.featureSubsetStrategy)
-
class RandomForestRegressionModel(TreeEnsembleModels):
"""
@@ -588,7 +583,7 @@ class RandomForestRegressionModel(TreeEnsembleModels):
@inherit_doc
class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter,
- DecisionTreeParams, HasCheckpointInterval):
+ GBTParams, HasCheckpointInterval, HasStepSize, HasSeed):
"""
`http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)`
learning algorithm for regression.
@@ -617,23 +612,17 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
lossType = Param(Params._dummy(), "lossType",
"Loss function which GBT tries to minimize (case-insensitive). " +
"Supported options: " + ", ".join(GBTParams.supportedLossTypes))
- subsamplingRate = Param(Params._dummy(), "subsamplingRate",
- "Fraction of the training data used for learning each decision tree, " +
- "in range (0, 1].")
- stepSize = Param(Params._dummy(), "stepSize",
- "Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the " +
- "contribution of each estimator")
@keyword_only
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
- maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="squared",
- maxIter=20, stepSize=0.1):
+ maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0,
+ checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1):
"""
__init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
- maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \
- lossType="squared", maxIter=20, stepSize=0.1)
+ maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, \
+ checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1)
"""
super(GBTRegressor, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.regression.GBTRegressor", self.uid)
@@ -641,18 +630,9 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
self.lossType = Param(self, "lossType",
"Loss function which GBT tries to minimize (case-insensitive). " +
"Supported options: " + ", ".join(GBTParams.supportedLossTypes))
- #: Fraction of the training data used for learning each decision tree, in range (0, 1].
- self.subsamplingRate = Param(self, "subsamplingRate",
- "Fraction of the training data used for learning each " +
- "decision tree, in range (0, 1].")
- #: Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of
- # each estimator
- self.stepSize = Param(self, "stepSize",
- "Step size (a.k.a. learning rate) in interval (0, 1] for shrinking " +
- "the contribution of each estimator")
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
- maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
- lossType="squared", maxIter=20, stepSize=0.1)
+ maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0,
+ checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -660,13 +640,13 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
@since("1.4.0")
def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
- maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
- lossType="squared", maxIter=20, stepSize=0.1):
+ maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0,
+ checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1):
"""
setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
- maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \
- lossType="squared", maxIter=20, stepSize=0.1)
+ maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, \
+ checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1)
Sets params for Gradient Boosted Tree Regression.
"""
kwargs = self.setParams._input_kwargs
@@ -690,36 +670,6 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
"""
return self.getOrDefault(self.lossType)
- @since("1.4.0")
- def setSubsamplingRate(self, value):
- """
- Sets the value of :py:attr:`subsamplingRate`.
- """
- self._paramMap[self.subsamplingRate] = value
- return self
-
- @since("1.4.0")
- def getSubsamplingRate(self):
- """
- Gets the value of subsamplingRate or its default value.
- """
- return self.getOrDefault(self.subsamplingRate)
-
- @since("1.4.0")
- def setStepSize(self, value):
- """
- Sets the value of :py:attr:`stepSize`.
- """
- self._paramMap[self.stepSize] = value
- return self
-
- @since("1.4.0")
- def getStepSize(self):
- """
- Gets the value of stepSize or its default value.
- """
- return self.getOrDefault(self.stepSize)
-
class GBTRegressionModel(TreeEnsembleModels):
"""
@@ -783,7 +733,7 @@ class AFTSurvivalRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
__init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
fitIntercept=True, maxIter=100, tol=1E-6, censorCol="censor", \
quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99], \
- quantilesCol=None):
+ quantilesCol=None)
"""
super(AFTSurvivalRegression, self).__init__()
self._java_obj = self._new_java_obj(