author     Bryan Cutler <cutlerb@gmail.com>  2016-02-26 08:30:32 -0800
committer  Xiangrui Meng <meng@databricks.com>  2016-02-26 08:30:32 -0800
commit     b33261f91387904c5aaccae40f86922c92a4e09a (patch)
tree       abae986f0bd829276d4b320f8242275a22609212 /python/pyspark
parent     99dfcedbfd4c83c7b6a343456f03e8c6e29968c5 (diff)
[SPARK-12634][PYSPARK][DOC] PySpark tree parameter desc to consistent format
Part of task for [SPARK-11219](https://issues.apache.org/jira/browse/SPARK-11219) to make PySpark MLlib parameter description formatting consistent. This is for the tree module.
closes #10601
Author: Bryan Cutler <cutlerb@gmail.com>
Author: vijaykiran <mail@vijaykiran.com>
Closes #11353 from BryanCutler/param-desc-consistent-tree-SPARK-12634.
Diffstat (limited to 'python/pyspark')
-rw-r--r--  python/pyspark/mllib/tree.py | 339
1 file changed, 197 insertions(+), 142 deletions(-)
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index 0001b60093..f7ea466b43 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -60,8 +60,7 @@ class TreeEnsembleModel(JavaModelWrapper, JavaSaveable):
     @since("1.3.0")
     def totalNumNodes(self):
         """
-        Get total number of nodes, summed over all trees in the
-        ensemble.
+        Get total number of nodes, summed over all trees in the ensemble.
         """
         return self.call("totalNumNodes")
 
@@ -92,8 +91,9 @@ class DecisionTreeModel(JavaModelWrapper, JavaSaveable, JavaLoader):
         transformation or action.
         Call predict directly on the RDD instead.
 
-        :param x: Data point (feature vector),
-          or an RDD of data points (feature vectors).
+        :param x:
+          Data point (feature vector), or an RDD of data points (feature
+          vectors).
         """
         if isinstance(x, RDD):
             return self.call("predict", x.map(_convert_to_vector))
@@ -108,8 +108,9 @@ class DecisionTreeModel(JavaModelWrapper, JavaSaveable, JavaLoader):
     @since("1.1.0")
     def depth(self):
-        """Get depth of tree.
-        E.g.: Depth 0 means 1 leaf node. Depth 1 means 1 internal node and 2 leaf nodes.
+        """
+        Get depth of tree (e.g. depth 0 means 1 leaf node, depth 1
+        means 1 internal node + 2 leaf nodes).
         """
         return self._java_model.depth()
@@ -152,24 +153,37 @@ class DecisionTree(object):
                         impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1,
                         minInfoGain=0.0):
         """
-        Train a DecisionTreeModel for classification.
-
-        :param data: Training data: RDD of LabeledPoint.
-                     Labels are integers {0,1,...,numClasses}.
-        :param numClasses: Number of classes for classification.
-        :param categoricalFeaturesInfo: Map from categorical feature index
-                                        to number of categories.
-                                        Any feature not in this map
-                                        is treated as continuous.
-        :param impurity: Supported values: "entropy" or "gini"
-        :param maxDepth: Max depth of tree.
-                         E.g., depth 0 means 1 leaf node.
-                         Depth 1 means 1 internal node + 2 leaf nodes.
-        :param maxBins: Number of bins used for finding splits at each node.
-        :param minInstancesPerNode: Min number of instances required at child
-                                    nodes to create the parent split
-        :param minInfoGain: Min info gain required to create a split
-        :return: DecisionTreeModel
+        Train a decision tree model for classification.
+
+        :param data:
+          Training data: RDD of LabeledPoint. Labels should take values
+          {0, 1, ..., numClasses-1}.
+        :param numClasses:
+          Number of classes for classification.
+        :param categoricalFeaturesInfo:
+          Map storing arity of categorical features. An entry (n -> k)
+          indicates that feature n is categorical with k categories
+          indexed from 0: {0, 1, ..., k-1}.
+        :param impurity:
+          Criterion used for information gain calculation.
+          Supported values: "gini" or "entropy".
+          (default: "gini")
+        :param maxDepth:
+          Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
+          means 1 internal node + 2 leaf nodes).
+          (default: 5)
+        :param maxBins:
+          Number of bins used for finding splits at each node.
+          (default: 32)
+        :param minInstancesPerNode:
+          Minimum number of instances required at child nodes to create
+          the parent split.
+          (default: 1)
+        :param minInfoGain:
+          Minimum info gain required to create a split.
+          (default: 0.0)
+        :return:
+          DecisionTreeModel.
 
         Example usage:
@@ -211,23 +225,34 @@ class DecisionTree(object):
                        impurity="variance", maxDepth=5, maxBins=32, minInstancesPerNode=1,
                        minInfoGain=0.0):
         """
-        Train a DecisionTreeModel for regression.
-
-        :param data: Training data: RDD of LabeledPoint.
-                     Labels are real numbers.
-        :param categoricalFeaturesInfo: Map from categorical feature
-                                        index to number of categories.
-                                        Any feature not in this map is treated as continuous.
-        :param impurity: Supported values: "variance"
-        :param maxDepth: Max depth of tree.
-                         E.g., depth 0 means 1 leaf node.
-                         Depth 1 means 1 internal node + 2 leaf nodes.
-        :param maxBins: Number of bins used for finding splits at each
-                        node.
-        :param minInstancesPerNode: Min number of instances required at
-                                    child nodes to create the parent split
-        :param minInfoGain: Min info gain required to create a split
-        :return: DecisionTreeModel
+        Train a decision tree model for regression.
+
+        :param data:
+          Training data: RDD of LabeledPoint. Labels are real numbers.
+        :param categoricalFeaturesInfo:
+          Map storing arity of categorical features. An entry (n -> k)
+          indicates that feature n is categorical with k categories
+          indexed from 0: {0, 1, ..., k-1}.
+        :param impurity:
+          Criterion used for information gain calculation.
+          The only supported value for regression is "variance".
+          (default: "variance")
+        :param maxDepth:
+          Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
+          means 1 internal node + 2 leaf nodes).
+          (default: 5)
+        :param maxBins:
+          Number of bins used for finding splits at each node.
+          (default: 32)
+        :param minInstancesPerNode:
+          Minimum number of instances required at child nodes to create
+          the parent split.
+          (default: 1)
+        :param minInfoGain:
+          Minimum info gain required to create a split.
+          (default: 0.0)
+        :return:
+          DecisionTreeModel.
 
         Example usage:
@@ -302,34 +327,44 @@ class RandomForest(object):
                         featureSubsetStrategy="auto", impurity="gini", maxDepth=4, maxBins=32,
                         seed=None):
         """
-        Method to train a decision tree model for binary or multiclass
+        Train a random forest model for binary or multiclass
         classification.
 
-        :param data: Training dataset: RDD of LabeledPoint. Labels
-               should take values {0, 1, ..., numClasses-1}.
-        :param numClasses: number of classes for classification.
-        :param categoricalFeaturesInfo: Map storing arity of categorical
-               features. E.g., an entry (n -> k) indicates that
-               feature n is categorical with k categories indexed
-               from 0: {0, 1, ..., k-1}.
-        :param numTrees: Number of trees in the random forest.
-        :param featureSubsetStrategy: Number of features to consider for
-               splits at each node.
-               Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
-               If "auto" is set, this parameter is set based on numTrees:
-                 if numTrees == 1, set to "all";
-                 if numTrees > 1 (forest) set to "sqrt".
-        :param impurity: Criterion used for information gain calculation.
-               Supported values: "gini" (recommended) or "entropy".
-        :param maxDepth: Maximum depth of the tree.
-               E.g., depth 0 means 1 leaf node; depth 1 means
-               1 internal node + 2 leaf nodes. (default: 4)
-        :param maxBins: maximum number of bins used for splitting
-               features
-               (default: 32)
-        :param seed: Random seed for bootstrapping and choosing feature
-               subsets.
-        :return: RandomForestModel that can be used for prediction
+        :param data:
+          Training dataset: RDD of LabeledPoint. Labels should take values
+          {0, 1, ..., numClasses-1}.
+        :param numClasses:
+          Number of classes for classification.
+        :param categoricalFeaturesInfo:
+          Map storing arity of categorical features. An entry (n -> k)
+          indicates that feature n is categorical with k categories
+          indexed from 0: {0, 1, ..., k-1}.
+        :param numTrees:
+          Number of trees in the random forest.
+        :param featureSubsetStrategy:
+          Number of features to consider for splits at each node.
+          Supported values: "auto", "all", "sqrt", "log2", "onethird".
+          If "auto" is set, this parameter is set based on numTrees:
+          if numTrees == 1, set to "all";
+          if numTrees > 1 (forest) set to "sqrt".
+          (default: "auto")
+        :param impurity:
+          Criterion used for information gain calculation.
+          Supported values: "gini" or "entropy".
+          (default: "gini")
+        :param maxDepth:
+          Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
+          means 1 internal node + 2 leaf nodes).
+          (default: 4)
+        :param maxBins:
+          Maximum number of bins used for splitting features.
+          (default: 32)
+        :param seed:
+          Random seed for bootstrapping and choosing feature subsets.
+          Set as None to generate seed based on system time.
+          (default: None)
+        :return:
+          RandomForestModel that can be used for prediction.
 
         Example usage:
@@ -383,32 +418,40 @@ class RandomForest(object):
     def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetStrategy="auto",
                        impurity="variance", maxDepth=4, maxBins=32, seed=None):
         """
-        Method to train a decision tree model for regression.
-
-        :param data: Training dataset: RDD of LabeledPoint. Labels are
-               real numbers.
-        :param categoricalFeaturesInfo: Map storing arity of categorical
-               features. E.g., an entry (n -> k) indicates that feature
-               n is categorical with k categories indexed from 0:
-               {0, 1, ..., k-1}.
-        :param numTrees: Number of trees in the random forest.
-        :param featureSubsetStrategy: Number of features to consider for
-               splits at each node.
-               Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
-               If "auto" is set, this parameter is set based on numTrees:
-                 if numTrees == 1, set to "all";
-                 if numTrees > 1 (forest) set to "onethird" for regression.
-        :param impurity: Criterion used for information gain
-               calculation.
-               Supported values: "variance".
-        :param maxDepth: Maximum depth of the tree. E.g., depth 0 means
-               1 leaf node; depth 1 means 1 internal node + 2 leaf
-               nodes. (default: 4)
-        :param maxBins: maximum number of bins used for splitting
-               features (default: 32)
-        :param seed: Random seed for bootstrapping and choosing feature
-               subsets.
-        :return: RandomForestModel that can be used for prediction
+        Train a random forest model for regression.
+
+        :param data:
+          Training dataset: RDD of LabeledPoint. Labels are real numbers.
+        :param categoricalFeaturesInfo:
+          Map storing arity of categorical features. An entry (n -> k)
+          indicates that feature n is categorical with k categories
+          indexed from 0: {0, 1, ..., k-1}.
+        :param numTrees:
+          Number of trees in the random forest.
+        :param featureSubsetStrategy:
+          Number of features to consider for splits at each node.
+          Supported values: "auto", "all", "sqrt", "log2", "onethird".
+          If "auto" is set, this parameter is set based on numTrees:
+          if numTrees == 1, set to "all";
+          if numTrees > 1 (forest) set to "onethird" for regression.
+          (default: "auto")
+        :param impurity:
+          Criterion used for information gain calculation.
+          The only supported value for regression is "variance".
+          (default: "variance")
+        :param maxDepth:
+          Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
+          means 1 internal node + 2 leaf nodes).
+          (default: 4)
+        :param maxBins:
+          Maximum number of bins used for splitting features.
+          (default: 32)
+        :param seed:
+          Random seed for bootstrapping and choosing feature subsets.
+          Set as None to generate seed based on system time.
+          (default: None)
+        :return:
+          RandomForestModel that can be used for prediction.
 
         Example usage:
@@ -480,31 +523,37 @@ class GradientBoostedTrees(object):
                         loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3,
                         maxBins=32):
         """
-        Method to train a gradient-boosted trees model for
-        classification.
-
-        :param data: Training dataset: RDD of LabeledPoint.
-               Labels should take values {0, 1}.
-        :param categoricalFeaturesInfo: Map storing arity of categorical
-               features. E.g., an entry (n -> k) indicates that feature
-               n is categorical with k categories indexed from 0:
-               {0, 1, ..., k-1}.
-        :param loss: Loss function used for minimization during gradient
-               boosting. Supported: {"logLoss" (default),
-               "leastSquaresError", "leastAbsoluteError"}.
-        :param numIterations: Number of iterations of boosting.
-               (default: 100)
-        :param learningRate: Learning rate for shrinking the
-               contribution of each estimator. The learning rate
-               should be between in the interval (0, 1].
-               (default: 0.1)
-        :param maxDepth: Maximum depth of the tree. E.g., depth 0 means
-               1 leaf node; depth 1 means 1 internal node + 2 leaf
-               nodes. (default: 3)
-        :param maxBins: maximum number of bins used for splitting
-               features (default: 32) DecisionTree requires maxBins >= max categories
-        :return: GradientBoostedTreesModel that can be used for
-               prediction
+        Train a gradient-boosted trees model for classification.
+
+        :param data:
+          Training dataset: RDD of LabeledPoint. Labels should take values
+          {0, 1}.
+        :param categoricalFeaturesInfo:
+          Map storing arity of categorical features. An entry (n -> k)
+          indicates that feature n is categorical with k categories
+          indexed from 0: {0, 1, ..., k-1}.
+        :param loss:
+          Loss function used for minimization during gradient boosting.
+          Supported values: "logLoss", "leastSquaresError",
+          "leastAbsoluteError".
+          (default: "logLoss")
+        :param numIterations:
+          Number of iterations of boosting.
+          (default: 100)
+        :param learningRate:
+          Learning rate for shrinking the contribution of each estimator.
+          The learning rate should be between in the interval (0, 1].
+          (default: 0.1)
+        :param maxDepth:
+          Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
+          means 1 internal node + 2 leaf nodes).
+          (default: 3)
+        :param maxBins:
+          Maximum number of bins used for splitting features. DecisionTree
+          requires maxBins >= max categories.
+          (default: 32)
+        :return:
+          GradientBoostedTreesModel that can be used for prediction.
 
         Example usage:
@@ -543,30 +592,36 @@ class GradientBoostedTrees(object):
                        loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3,
                        maxBins=32):
         """
-        Method to train a gradient-boosted trees model for regression.
-
-        :param data: Training dataset: RDD of LabeledPoint. Labels are
-               real numbers.
-        :param categoricalFeaturesInfo: Map storing arity of categorical
-               features. E.g., an entry (n -> k) indicates that feature
-               n is categorical with k categories indexed from 0:
-               {0, 1, ..., k-1}.
-        :param loss: Loss function used for minimization during gradient
-               boosting. Supported: {"logLoss" (default),
-               "leastSquaresError", "leastAbsoluteError"}.
-        :param numIterations: Number of iterations of boosting.
-               (default: 100)
-        :param learningRate: Learning rate for shrinking the
-               contribution of each estimator. The learning rate
-               should be between in the interval (0, 1].
-               (default: 0.1)
-        :param maxBins: maximum number of bins used for splitting
-               features (default: 32) DecisionTree requires maxBins >= max categories
-        :param maxDepth: Maximum depth of the tree. E.g., depth 0 means
-               1 leaf node; depth 1 means 1 internal node + 2 leaf
-               nodes. (default: 3)
-        :return: GradientBoostedTreesModel that can be used for
-               prediction
+        Train a gradient-boosted trees model for regression.
+
+        :param data:
+          Training dataset: RDD of LabeledPoint. Labels are real numbers.
+        :param categoricalFeaturesInfo:
+          Map storing arity of categorical features. An entry (n -> k)
+          indicates that feature n is categorical with k categories
+          indexed from 0: {0, 1, ..., k-1}.
+        :param loss:
+          Loss function used for minimization during gradient boosting.
+          Supported values: "logLoss", "leastSquaresError",
+          "leastAbsoluteError".
+          (default: "leastSquaresError")
+        :param numIterations:
+          Number of iterations of boosting.
+          (default: 100)
+        :param learningRate:
+          Learning rate for shrinking the contribution of each estimator.
+          The learning rate should be between in the interval (0, 1].
+          (default: 0.1)
+        :param maxDepth:
+          Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
+          means 1 internal node + 2 leaf nodes).
+          (default: 3)
+        :param maxBins:
+          Maximum number of bins used for splitting features. DecisionTree
+          requires maxBins >= max categories.
+          (default: 32)
+        :return:
+          GradientBoostedTreesModel that can be used for prediction.
 
         Example usage:
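Two conventions recur in every docstring this patch rewrites: maxDepth's "depth 0 means 1 leaf node, depth 1 means 1 internal node + 2 leaf nodes", and categoricalFeaturesInfo's "(n -> k) means feature n is categorical with categories {0, 1, ..., k-1}". A minimal pure-Python sketch of what those conventions mean (not code from the patch, and no Spark required; the function names here are illustrative, not PySpark API):

```python
def full_tree_node_counts(max_depth):
    """Node counts for a *full* binary tree of the given depth, the upper
    bound a trained tree can reach: depth 0 means 1 leaf node, depth 1
    means 1 internal node + 2 leaf nodes, and so on."""
    leaves = 2 ** max_depth
    internal = 2 ** max_depth - 1
    return internal, leaves

# categoricalFeaturesInfo maps feature index -> arity. An entry (n -> k)
# means feature n is categorical with categories {0, 1, ..., k-1};
# features absent from the map are treated as continuous.
categorical_features_info = {0: 2, 3: 4}  # hypothetical arities

def categories_for(feature_index, info):
    """Return the category values for a feature, or None if continuous."""
    k = info.get(feature_index)
    return None if k is None else list(range(k))

print(full_tree_node_counts(0))  # (0, 1): a single leaf
print(full_tree_node_counts(1))  # (1, 2): 1 internal node + 2 leaves
print(categories_for(3, categorical_features_info))  # [0, 1, 2, 3]
print(categories_for(1, categorical_features_info))  # None (continuous)
```

This also makes the maxBins constraint noted for GradientBoostedTrees concrete: with `categorical_features_info = {0: 2, 3: 4}`, maxBins must be at least 4, the largest arity.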