author     Bryan Cutler <cutlerb@gmail.com>      2016-02-26 08:30:32 -0800
committer  Xiangrui Meng <meng@databricks.com>   2016-02-26 08:30:32 -0800
commit     b33261f91387904c5aaccae40f86922c92a4e09a (patch)
tree       abae986f0bd829276d4b320f8242275a22609212 /python/pyspark/mllib
parent     99dfcedbfd4c83c7b6a343456f03e8c6e29968c5 (diff)
[SPARK-12634][PYSPARK][DOC] PySpark tree parameter desc to consistent format
Part of task for [SPARK-11219](https://issues.apache.org/jira/browse/SPARK-11219) to make PySpark MLlib parameter description formatting consistent. This is for the tree module.

closes #10601

Author: Bryan Cutler <cutlerb@gmail.com>
Author: vijaykiran <mail@vijaykiran.com>

Closes #11353 from BryanCutler/param-desc-consistent-tree-SPARK-12634.
Diffstat (limited to 'python/pyspark/mllib')
-rw-r--r--  python/pyspark/mllib/tree.py  |  339
1 file changed, 197 insertions(+), 142 deletions(-)
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index 0001b60093..f7ea466b43 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -60,8 +60,7 @@ class TreeEnsembleModel(JavaModelWrapper, JavaSaveable):
@since("1.3.0")
def totalNumNodes(self):
"""
- Get total number of nodes, summed over all trees in the
- ensemble.
+ Get total number of nodes, summed over all trees in the ensemble.
"""
return self.call("totalNumNodes")
@@ -92,8 +91,9 @@ class DecisionTreeModel(JavaModelWrapper, JavaSaveable, JavaLoader):
transformation or action.
Call predict directly on the RDD instead.
- :param x: Data point (feature vector),
- or an RDD of data points (feature vectors).
+ :param x:
+ Data point (feature vector), or an RDD of data points (feature
+ vectors).
"""
if isinstance(x, RDD):
return self.call("predict", x.map(_convert_to_vector))
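The caveat above is easy to trip over, so here is a minimal sketch (not part of this patch; toy data invented for illustration, and a live SparkContext bound to sc assumed, as in the module's doctests):

>>> from pyspark.mllib.regression import LabeledPoint
>>> from pyspark.mllib.tree import DecisionTree
>>> points = sc.parallelize([LabeledPoint(0.0, [0.0]), LabeledPoint(1.0, [1.0])])
>>> model = DecisionTree.trainClassifier(points, numClasses=2, categoricalFeaturesInfo={})
>>> model.predict([1.0])                               # single feature vector: fine
>>> model.predict(points.map(lambda lp: lp.features))  # RDD of vectors: fine
>>> # points.map(lambda lp: model.predict(lp.features))  # fails: predict inside a transformation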
@@ -108,8 +108,9 @@ class DecisionTreeModel(JavaModelWrapper, JavaSaveable, JavaLoader):
@since("1.1.0")
def depth(self):
- """Get depth of tree.
- E.g.: Depth 0 means 1 leaf node. Depth 1 means 1 internal node and 2 leaf nodes.
+ """
+ Get depth of tree (e.g. depth 0 means 1 leaf node, depth 1
+ means 1 internal node + 2 leaf nodes).
"""
return self._java_model.depth()
@@ -152,24 +153,37 @@ class DecisionTree(object):
impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1,
minInfoGain=0.0):
"""
- Train a DecisionTreeModel for classification.
-
- :param data: Training data: RDD of LabeledPoint.
- Labels are integers {0,1,...,numClasses}.
- :param numClasses: Number of classes for classification.
- :param categoricalFeaturesInfo: Map from categorical feature index
- to number of categories.
- Any feature not in this map
- is treated as continuous.
- :param impurity: Supported values: "entropy" or "gini"
- :param maxDepth: Max depth of tree.
- E.g., depth 0 means 1 leaf node.
- Depth 1 means 1 internal node + 2 leaf nodes.
- :param maxBins: Number of bins used for finding splits at each node.
- :param minInstancesPerNode: Min number of instances required at child
- nodes to create the parent split
- :param minInfoGain: Min info gain required to create a split
- :return: DecisionTreeModel
+ Train a decision tree model for classification.
+
+ :param data:
+ Training data: RDD of LabeledPoint. Labels should take values
+ {0, 1, ..., numClasses-1}.
+ :param numClasses:
+ Number of classes for classification.
+ :param categoricalFeaturesInfo:
+ Map storing arity of categorical features. An entry (n -> k)
+ indicates that feature n is categorical with k categories
+ indexed from 0: {0, 1, ..., k-1}.
+ :param impurity:
+ Criterion used for information gain calculation.
+ Supported values: "gini" or "entropy".
+ (default: "gini")
+ :param maxDepth:
+ Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
+ means 1 internal node + 2 leaf nodes).
+ (default: 5)
+ :param maxBins:
+ Number of bins used for finding splits at each node.
+ (default: 32)
+ :param minInstancesPerNode:
+ Minimum number of instances required at child nodes to create
+ the parent split.
+ (default: 1)
+ :param minInfoGain:
+ Minimum info gain required to create a split.
+ (default: 0.0)
+ :return:
+ DecisionTreeModel.
Example usage:
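The doctest body falls outside this hunk. Purely for illustration, a call exercising the parameters documented above might look like this (invented toy data; sc assumed):

>>> from pyspark.mllib.regression import LabeledPoint
>>> from pyspark.mllib.tree import DecisionTree
>>> data = sc.parallelize([LabeledPoint(0.0, [0.0]), LabeledPoint(0.0, [1.0]),
...                        LabeledPoint(1.0, [2.0]), LabeledPoint(1.0, [3.0])])
>>> model = DecisionTree.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={},
...                                      impurity="gini", maxDepth=5, maxBins=32)
>>> model.predict([2.0])  # features above 1.5 were labeled 1.0 in the toy data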
@@ -211,23 +225,34 @@ class DecisionTree(object):
impurity="variance", maxDepth=5, maxBins=32, minInstancesPerNode=1,
minInfoGain=0.0):
"""
- Train a DecisionTreeModel for regression.
-
- :param data: Training data: RDD of LabeledPoint.
- Labels are real numbers.
- :param categoricalFeaturesInfo: Map from categorical feature
- index to number of categories.
- Any feature not in this map is treated as continuous.
- :param impurity: Supported values: "variance"
- :param maxDepth: Max depth of tree.
- E.g., depth 0 means 1 leaf node.
- Depth 1 means 1 internal node + 2 leaf nodes.
- :param maxBins: Number of bins used for finding splits at each
- node.
- :param minInstancesPerNode: Min number of instances required at
- child nodes to create the parent split
- :param minInfoGain: Min info gain required to create a split
- :return: DecisionTreeModel
+ Train a decision tree model for regression.
+
+ :param data:
+ Training data: RDD of LabeledPoint. Labels are real numbers.
+ :param categoricalFeaturesInfo:
+ Map storing arity of categorical features. An entry (n -> k)
+ indicates that feature n is categorical with k categories
+ indexed from 0: {0, 1, ..., k-1}.
+ :param impurity:
+ Criterion used for information gain calculation.
+ The only supported value for regression is "variance".
+ (default: "variance")
+ :param maxDepth:
+ Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
+ means 1 internal node + 2 leaf nodes).
+ (default: 5)
+ :param maxBins:
+ Number of bins used for finding splits at each node.
+ (default: 32)
+ :param minInstancesPerNode:
+ Minimum number of instances required at child nodes to create
+ the parent split.
+ (default: 1)
+ :param minInfoGain:
+ Minimum info gain required to create a split.
+ (default: 0.0)
+ :return:
+ DecisionTreeModel.
Example usage:
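Again the doctests are truncated here; a hedged regression sketch against the signature above (toy data; sc assumed):

>>> from pyspark.mllib.regression import LabeledPoint
>>> from pyspark.mllib.tree import DecisionTree
>>> data = sc.parallelize([LabeledPoint(float(x), [float(x)]) for x in range(4)])
>>> model = DecisionTree.trainRegressor(data, categoricalFeaturesInfo={},
...                                     impurity="variance", maxDepth=5)
>>> model.predict([2.0])  # labels are real numbers, so the prediction is too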
@@ -302,34 +327,44 @@ class RandomForest(object):
featureSubsetStrategy="auto", impurity="gini", maxDepth=4, maxBins=32,
seed=None):
"""
- Method to train a decision tree model for binary or multiclass
+ Train a random forest model for binary or multiclass
classification.
- :param data: Training dataset: RDD of LabeledPoint. Labels
- should take values {0, 1, ..., numClasses-1}.
- :param numClasses: number of classes for classification.
- :param categoricalFeaturesInfo: Map storing arity of categorical
- features. E.g., an entry (n -> k) indicates that
- feature n is categorical with k categories indexed
- from 0: {0, 1, ..., k-1}.
- :param numTrees: Number of trees in the random forest.
- :param featureSubsetStrategy: Number of features to consider for
- splits at each node.
- Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
- If "auto" is set, this parameter is set based on numTrees:
- if numTrees == 1, set to "all";
- if numTrees > 1 (forest) set to "sqrt".
- :param impurity: Criterion used for information gain calculation.
- Supported values: "gini" (recommended) or "entropy".
- :param maxDepth: Maximum depth of the tree.
- E.g., depth 0 means 1 leaf node; depth 1 means
- 1 internal node + 2 leaf nodes. (default: 4)
- :param maxBins: maximum number of bins used for splitting
- features
- (default: 32)
- :param seed: Random seed for bootstrapping and choosing feature
- subsets.
- :return: RandomForestModel that can be used for prediction
+ :param data:
+ Training dataset: RDD of LabeledPoint. Labels should take values
+ {0, 1, ..., numClasses-1}.
+ :param numClasses:
+ Number of classes for classification.
+ :param categoricalFeaturesInfo:
+ Map storing arity of categorical features. An entry (n -> k)
+ indicates that feature n is categorical with k categories
+ indexed from 0: {0, 1, ..., k-1}.
+ :param numTrees:
+ Number of trees in the random forest.
+ :param featureSubsetStrategy:
+ Number of features to consider for splits at each node.
+ Supported values: "auto", "all", "sqrt", "log2", "onethird".
+ If "auto" is set, this parameter is set based on numTrees:
+ if numTrees == 1, set to "all";
+ if numTrees > 1 (forest), set to "sqrt".
+ (default: "auto")
+ :param impurity:
+ Criterion used for information gain calculation.
+ Supported values: "gini" or "entropy".
+ (default: "gini")
+ :param maxDepth:
+ Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
+ means 1 internal node + 2 leaf nodes).
+ (default: 4)
+ :param maxBins:
+ Maximum number of bins used for splitting features.
+ (default: 32)
+ :param seed:
+ Random seed for bootstrapping and choosing feature subsets.
+ Set as None to generate seed based on system time.
+ (default: None)
+ :return:
+ RandomForestModel that can be used for prediction.
Example usage:
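For illustration only (not the file's doctest): a sketch of the classifier call, with the seed pinned so the bootstrap is reproducible (toy data; sc assumed):

>>> from pyspark.mllib.regression import LabeledPoint
>>> from pyspark.mllib.tree import RandomForest
>>> data = sc.parallelize([LabeledPoint(0.0, [0.0]), LabeledPoint(0.0, [1.0]),
...                        LabeledPoint(1.0, [2.0]), LabeledPoint(1.0, [3.0])])
>>> model = RandomForest.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={},
...                                      numTrees=3, featureSubsetStrategy="auto", seed=42)
>>> model.numTrees()
3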
@@ -383,32 +418,40 @@ class RandomForest(object):
def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetStrategy="auto",
impurity="variance", maxDepth=4, maxBins=32, seed=None):
"""
- Method to train a decision tree model for regression.
-
- :param data: Training dataset: RDD of LabeledPoint. Labels are
- real numbers.
- :param categoricalFeaturesInfo: Map storing arity of categorical
- features. E.g., an entry (n -> k) indicates that feature
- n is categorical with k categories indexed from 0:
- {0, 1, ..., k-1}.
- :param numTrees: Number of trees in the random forest.
- :param featureSubsetStrategy: Number of features to consider for
- splits at each node.
- Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
- If "auto" is set, this parameter is set based on numTrees:
- if numTrees == 1, set to "all";
- if numTrees > 1 (forest) set to "onethird" for regression.
- :param impurity: Criterion used for information gain
- calculation.
- Supported values: "variance".
- :param maxDepth: Maximum depth of the tree. E.g., depth 0 means
- 1 leaf node; depth 1 means 1 internal node + 2 leaf
- nodes. (default: 4)
- :param maxBins: maximum number of bins used for splitting
- features (default: 32)
- :param seed: Random seed for bootstrapping and choosing feature
- subsets.
- :return: RandomForestModel that can be used for prediction
+ Train a random forest model for regression.
+
+ :param data:
+ Training dataset: RDD of LabeledPoint. Labels are real numbers.
+ :param categoricalFeaturesInfo:
+ Map storing arity of categorical features. An entry (n -> k)
+ indicates that feature n is categorical with k categories
+ indexed from 0: {0, 1, ..., k-1}.
+ :param numTrees:
+ Number of trees in the random forest.
+ :param featureSubsetStrategy:
+ Number of features to consider for splits at each node.
+ Supported values: "auto", "all", "sqrt", "log2", "onethird".
+ If "auto" is set, this parameter is set based on numTrees:
+ if numTrees == 1, set to "all";
+ if numTrees > 1 (forest), set to "onethird" for regression.
+ (default: "auto")
+ :param impurity:
+ Criterion used for information gain calculation.
+ The only supported value for regression is "variance".
+ (default: "variance")
+ :param maxDepth:
+ Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
+ means 1 internal node + 2 leaf nodes).
+ (default: 4)
+ :param maxBins:
+ Maximum number of bins used for splitting features.
+ (default: 32)
+ :param seed:
+ Random seed for bootstrapping and choosing feature subsets.
+ Set as None to generate seed based on system time.
+ (default: None)
+ :return:
+ RandomForestModel that can be used for prediction.
Example usage:
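A matching regression sketch (invented data; sc assumed). With numTrees > 1, "auto" resolves to "onethird" per the note above:

>>> from pyspark.mllib.regression import LabeledPoint
>>> from pyspark.mllib.tree import RandomForest
>>> data = sc.parallelize([LabeledPoint(float(x), [float(x)]) for x in range(4)])
>>> model = RandomForest.trainRegressor(data, categoricalFeaturesInfo={},
...                                     numTrees=3, featureSubsetStrategy="auto", seed=42)
>>> model.predict([2.0])  # average of the three trees' predictions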
@@ -480,31 +523,37 @@ class GradientBoostedTrees(object):
loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3,
maxBins=32):
"""
- Method to train a gradient-boosted trees model for
- classification.
-
- :param data: Training dataset: RDD of LabeledPoint.
- Labels should take values {0, 1}.
- :param categoricalFeaturesInfo: Map storing arity of categorical
- features. E.g., an entry (n -> k) indicates that feature
- n is categorical with k categories indexed from 0:
- {0, 1, ..., k-1}.
- :param loss: Loss function used for minimization during gradient
- boosting. Supported: {"logLoss" (default),
- "leastSquaresError", "leastAbsoluteError"}.
- :param numIterations: Number of iterations of boosting.
- (default: 100)
- :param learningRate: Learning rate for shrinking the
- contribution of each estimator. The learning rate
- should be between in the interval (0, 1].
- (default: 0.1)
- :param maxDepth: Maximum depth of the tree. E.g., depth 0 means
- 1 leaf node; depth 1 means 1 internal node + 2 leaf
- nodes. (default: 3)
- :param maxBins: maximum number of bins used for splitting
- features (default: 32) DecisionTree requires maxBins >= max categories
- :return: GradientBoostedTreesModel that can be used for
- prediction
+ Train a gradient-boosted trees model for classification.
+
+ :param data:
+ Training dataset: RDD of LabeledPoint. Labels should take values
+ {0, 1}.
+ :param categoricalFeaturesInfo:
+ Map storing arity of categorical features. An entry (n -> k)
+ indicates that feature n is categorical with k categories
+ indexed from 0: {0, 1, ..., k-1}.
+ :param loss:
+ Loss function used for minimization during gradient boosting.
+ Supported values: "logLoss", "leastSquaresError",
+ "leastAbsoluteError".
+ (default: "logLoss")
+ :param numIterations:
+ Number of iterations of boosting.
+ (default: 100)
+ :param learningRate:
+ Learning rate for shrinking the contribution of each estimator.
+ The learning rate should be in the interval (0, 1].
+ (default: 0.1)
+ :param maxDepth:
+ Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
+ means 1 internal node + 2 leaf nodes).
+ (default: 3)
+ :param maxBins:
+ Maximum number of bins used for splitting features. DecisionTree
+ requires maxBins >= max categories.
+ (default: 32)
+ :return:
+ GradientBoostedTreesModel that can be used for prediction.
Example usage:
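Sketch only (not the committed doctest); the 100-iteration default is overkill for toy data, so it is lowered here (sc assumed):

>>> from pyspark.mllib.regression import LabeledPoint
>>> from pyspark.mllib.tree import GradientBoostedTrees
>>> data = sc.parallelize([LabeledPoint(0.0, [0.0]), LabeledPoint(0.0, [1.0]),
...                        LabeledPoint(1.0, [2.0]), LabeledPoint(1.0, [3.0])])
>>> model = GradientBoostedTrees.trainClassifier(data, categoricalFeaturesInfo={},
...                                              loss="logLoss", numIterations=10)
>>> model.predict([2.0])  # labels must be in {0, 1} for this loss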
@@ -543,30 +592,36 @@ class GradientBoostedTrees(object):
loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3,
maxBins=32):
"""
- Method to train a gradient-boosted trees model for regression.
-
- :param data: Training dataset: RDD of LabeledPoint. Labels are
- real numbers.
- :param categoricalFeaturesInfo: Map storing arity of categorical
- features. E.g., an entry (n -> k) indicates that feature
- n is categorical with k categories indexed from 0:
- {0, 1, ..., k-1}.
- :param loss: Loss function used for minimization during gradient
- boosting. Supported: {"logLoss" (default),
- "leastSquaresError", "leastAbsoluteError"}.
- :param numIterations: Number of iterations of boosting.
- (default: 100)
- :param learningRate: Learning rate for shrinking the
- contribution of each estimator. The learning rate
- should be between in the interval (0, 1].
- (default: 0.1)
- :param maxBins: maximum number of bins used for splitting
- features (default: 32) DecisionTree requires maxBins >= max categories
- :param maxDepth: Maximum depth of the tree. E.g., depth 0 means
- 1 leaf node; depth 1 means 1 internal node + 2 leaf
- nodes. (default: 3)
- :return: GradientBoostedTreesModel that can be used for
- prediction
+ Train a gradient-boosted trees model for regression.
+
+ :param data:
+ Training dataset: RDD of LabeledPoint. Labels are real numbers.
+ :param categoricalFeaturesInfo:
+ Map storing arity of categorical features. An entry (n -> k)
+ indicates that feature n is categorical with k categories
+ indexed from 0: {0, 1, ..., k-1}.
+ :param loss:
+ Loss function used for minimization during gradient boosting.
+ Supported values: "logLoss", "leastSquaresError",
+ "leastAbsoluteError".
+ (default: "leastSquaresError")
+ :param numIterations:
+ Number of iterations of boosting.
+ (default: 100)
+ :param learningRate:
+ Learning rate for shrinking the contribution of each estimator.
+ The learning rate should be in the interval (0, 1].
+ (default: 0.1)
+ :param maxDepth:
+ Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
+ means 1 internal node + 2 leaf nodes).
+ (default: 3)
+ :param maxBins:
+ Maximum number of bins used for splitting features. DecisionTree
+ requires maxBins >= max categories.
+ (default: 32)
+ :return:
+ GradientBoostedTreesModel that can be used for prediction.
Example usage:
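And the regression counterpart, again a sketch with invented data and sc assumed:

>>> from pyspark.mllib.regression import LabeledPoint
>>> from pyspark.mllib.tree import GradientBoostedTrees
>>> data = sc.parallelize([LabeledPoint(float(x), [float(x)]) for x in range(4)])
>>> model = GradientBoostedTrees.trainRegressor(data, categoricalFeaturesInfo={},
...                                             loss="leastSquaresError",
...                                             numIterations=10, learningRate=0.1)
>>> model.predict([2.0])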