aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFelix Cheung <felixcheung_m@hotmail.com>2016-10-30 16:21:37 -0700
committerFelix Cheung <felixcheung@apache.org>2016-10-30 16:21:37 -0700
commit7c3786929205b962b430cf7fc292602c2993c193 (patch)
tree5805b48f8f027a92f9dd3e99aca042eee99b4cef
parentb6879b8b3518c71c23262554fcb0fdad60287011 (diff)
downloadspark-7c3786929205b962b430cf7fc292602c2993c193.tar.gz
spark-7c3786929205b962b430cf7fc292602c2993c193.tar.bz2
spark-7c3786929205b962b430cf7fc292602c2993c193.zip
[SPARK-18110][PYTHON][ML] add missing parameter in Python for RandomForest regression and classification
## What changes were proposed in this pull request? Add subsamplingRate to RandomForestClassifier and add varianceCol to RandomForestRegressor in Python. ## How was this patch tested? Manual tests. Author: Felix Cheung <felixcheung_m@hotmail.com> Closes #15638 from felixcheung/pyrandomforest.
-rw-r--r--python/pyspark/ml/classification.py11
-rw-r--r--python/pyspark/ml/regression.py12
2 files changed, 12 insertions, 11 deletions
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 3f763a10d4..d9ff356b94 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -758,20 +758,21 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
probabilityCol="probability", rawPredictionCol="rawPrediction",
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini",
- numTrees=20, featureSubsetStrategy="auto", seed=None):
+ numTrees=20, featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0):
"""
__init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
probabilityCol="probability", rawPredictionCol="rawPrediction", \
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \
- numTrees=20, featureSubsetStrategy="auto", seed=None)
+ numTrees=20, featureSubsetStrategy="auto", seed=None, subsamplingRate=1.0)
"""
super(RandomForestClassifier, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.RandomForestClassifier", self.uid)
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
- impurity="gini", numTrees=20, featureSubsetStrategy="auto")
+ impurity="gini", numTrees=20, featureSubsetStrategy="auto",
+ subsamplingRate=1.0)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -781,13 +782,13 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
probabilityCol="probability", rawPredictionCol="rawPrediction",
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None,
- impurity="gini", numTrees=20, featureSubsetStrategy="auto"):
+ impurity="gini", numTrees=20, featureSubsetStrategy="auto", subsamplingRate=1.0):
"""
setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
probabilityCol="probability", rawPredictionCol="rawPrediction", \
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, \
- impurity="gini", numTrees=20, featureSubsetStrategy="auto")
+ impurity="gini", numTrees=20, featureSubsetStrategy="auto", subsamplingRate=1.0)
Sets params for linear classification.
"""
kwargs = self.setParams._input_kwargs
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 55d38033ef..9233d2e7e1 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -594,7 +594,7 @@ class RandomForestParams(TreeEnsembleParams):
featureSubsetStrategy = \
Param(Params._dummy(), "featureSubsetStrategy",
"The number of features to consider for splits at each tree node. Supported " +
- "options: " + ", ".join(supportedFeatureSubsetStrategies) + " (0.0-1.0], [1-n].",
+ "options: " + ", ".join(supportedFeatureSubsetStrategies) + ", (0.0-1.0], [1-n].",
typeConverter=TypeConverters.toString)
def __init__(self):
@@ -828,7 +828,7 @@ class DecisionTreeRegressionModel(DecisionTreeModel, JavaMLWritable, JavaMLReada
@inherit_doc
class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed,
RandomForestParams, TreeRegressorParams, HasCheckpointInterval,
- JavaMLWritable, JavaMLReadable):
+ JavaMLWritable, JavaMLReadable, HasVarianceCol):
"""
`Random Forest <http://en.wikipedia.org/wiki/Random_forest>`_
learning algorithm for regression.
@@ -876,13 +876,13 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20,
- featureSubsetStrategy="auto"):
+ featureSubsetStrategy="auto", varianceCol=None):
"""
__init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \
impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, \
- featureSubsetStrategy="auto")
+ featureSubsetStrategy="auto", varianceCol=None)
"""
super(RandomForestRegressor, self).__init__()
self._java_obj = self._new_java_obj(
@@ -900,13 +900,13 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20,
- featureSubsetStrategy="auto"):
+ featureSubsetStrategy="auto", varianceCol=None):
"""
setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \
impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, \
- featureSubsetStrategy="auto")
+ featureSubsetStrategy="auto", varianceCol=None)
Sets params for linear regression.
"""
kwargs = self.setParams._input_kwargs