author     Davies Liu <davies@databricks.com>      2014-11-20 15:31:28 -0800
committer  Xiangrui Meng <meng@databricks.com>     2014-11-20 15:31:28 -0800
commit     1c53a5db993193122bfa79574d2540149fe2cc08 (patch)
tree       68efb0022ee91fc29450473eede8ac1ce9bccb39 /python
parent     b8e6886fb8ff8f667fb7e600cd727d8649cad1d1 (diff)
[SPARK-4439] [MLlib] add python api for random forest
```
class RandomForestModel
 |  A model trained by RandomForest
 |
 |  numTrees(self)
 |      Get number of trees in forest.
 |
 |  predict(self, x)
 |      Predict values for a single data point or an RDD of points using the trained model.
 |
 |  toDebugString(self)
 |      Full model.
 |
 |  totalNumNodes(self)
 |      Get total number of nodes, summed over all trees in the forest.

class RandomForest
 |  trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesInfo, numTrees, featureSubsetStrategy='auto', impurity='gini', maxDepth=4, maxBins=32, seed=None):
 |      Method to train a random forest model for binary or multiclass classification.
 |
 |      :param data: Training dataset: RDD of LabeledPoint.
 |             Labels should take values {0, 1, ..., numClasses-1}.
 |      :param numClassesForClassification: Number of classes for classification.
 |      :param categoricalFeaturesInfo: Map storing arity of categorical features.
 |             E.g., an entry (n -> k) indicates that feature n is categorical
 |             with k categories indexed from 0: {0, 1, ..., k-1}.
 |      :param numTrees: Number of trees in the random forest.
 |      :param featureSubsetStrategy: Number of features to consider for splits at each node.
 |             Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
 |             If "auto" is set, this parameter is set based on numTrees:
 |             if numTrees == 1, set to "all";
 |             if numTrees > 1 (forest), set to "sqrt".
 |      :param impurity: Criterion used for information gain calculation.
 |             Supported values: "gini" (recommended) or "entropy".
 |      :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node;
 |             depth 1 means 1 internal node + 2 leaf nodes. (default: 4)
 |      :param maxBins: Maximum number of bins used for splitting features (default: 32)
 |      :param seed: Random seed for bootstrapping and choosing feature subsets.
 |      :return: RandomForestModel that can be used for prediction
 |
 |  trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetStrategy='auto', impurity='variance', maxDepth=4, maxBins=32, seed=None):
 |      Method to train a random forest model for regression.
 |
 |      :param data: Training dataset: RDD of LabeledPoint.
 |             Labels are real numbers.
 |      :param categoricalFeaturesInfo: Map storing arity of categorical features.
 |             E.g., an entry (n -> k) indicates that feature n is categorical
 |             with k categories indexed from 0: {0, 1, ..., k-1}.
 |      :param numTrees: Number of trees in the random forest.
 |      :param featureSubsetStrategy: Number of features to consider for splits at each node.
 |             Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
 |             If "auto" is set, this parameter is set based on numTrees:
 |             if numTrees == 1, set to "all";
 |             if numTrees > 1 (forest), set to "onethird".
 |      :param impurity: Criterion used for information gain calculation.
 |             Supported values: "variance".
 |      :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node;
 |             depth 1 means 1 internal node + 2 leaf nodes. (default: 4)
 |      :param maxBins: Maximum number of bins used for splitting features (default: 32)
 |      :param seed: Random seed for bootstrapping and choosing feature subsets.
 |      :return: RandomForestModel that can be used for prediction
```

Author: Davies Liu <davies@databricks.com>

Closes #3320 from davies/forest and squashes the following commits:

8003dfc [Davies Liu] reorder
53cf510 [Davies Liu] fix docs
4ca593d [Davies Liu] fix docs
e0df852 [Davies Liu] fix docs
0431746 [Davies Liu] rebased
2b6f239 [Davies Liu] Merge branch 'master' of github.com:apache/spark into forest
885abee [Davies Liu] address comments
dae7fc0 [Davies Liu] address comments
89a000f [Davies Liu] fix docs
565d476 [Davies Liu] add python api for random forest
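For context, the new API can be exercised end to end much as the doctests in this patch do. The sketch below restates those doctests as a plain script; it assumes a live SparkContext named `sc` (as provided by the PySpark shell) and uses Python 2 print syntax to match the surrounding code.

```python
# Usage sketch distilled from the doctests in this patch; assumes a
# running SparkContext `sc`, e.g. from the PySpark shell.
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest

data = [
    LabeledPoint(0.0, [0.0]),
    LabeledPoint(0.0, [1.0]),
    LabeledPoint(1.0, [2.0]),
    LabeledPoint(1.0, [3.0]),
]
# Train a 3-tree forest for 2-class classification with no categorical
# features; a fixed seed makes the bootstrap sampling reproducible.
model = RandomForest.trainClassifier(sc.parallelize(data), 2, {}, 3, seed=42)

print model.numTrees()                                         # 3
print model.predict([2.0])                                     # single point -> 1.0
print model.predict(sc.parallelize([[3.0], [1.0]])).collect()  # RDD -> [1.0, 0.0]
```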
Diffstat (limited to 'python')
-rw-r--r--  python/docs/epytext.py        |   2
-rw-r--r--  python/pyspark/mllib/tree.py  | 242
2 files changed, 221 insertions, 23 deletions
diff --git a/python/docs/epytext.py b/python/docs/epytext.py
index 19fefbfc05..e884d5e6b1 100644
--- a/python/docs/epytext.py
+++ b/python/docs/epytext.py
@@ -1,7 +1,7 @@
import re
RULES = (
- (r"<[\w.]+>", r""),
+ (r"<(!BLANKLINE)[\w.]+>", r""),
(r"L{([\w.()]+)}", r":class:`\1`"),
(r"[LC]{(\w+\.\w+)\(\)}", r":func:`\1`"),
(r"C{([\w.()]+)}", r":class:`\1`"),
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index ef0d556fac..46e253991a 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -15,12 +15,16 @@
# limitations under the License.
#
+from __future__ import absolute_import
+
+import random
+
from pyspark import SparkContext, RDD
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
from pyspark.mllib.linalg import _convert_to_vector
from pyspark.mllib.regression import LabeledPoint
-__all__ = ['DecisionTreeModel', 'DecisionTree']
+__all__ = ['DecisionTreeModel', 'DecisionTree', 'RandomForestModel', 'RandomForest']
class DecisionTreeModel(JavaModelWrapper):
@@ -51,27 +55,25 @@ class DecisionTreeModel(JavaModelWrapper):
return self._java_model.depth()
def __repr__(self):
- """ Print summary of model. """
+ """ summary of model. """
return self._java_model.toString()
def toDebugString(self):
- """ Print full model. """
+ """ full model. """
return self._java_model.toDebugString()
class DecisionTree(object):
"""
- Learning algorithm for a decision tree model
- for classification or regression.
+ Learning algorithm for a decision tree model for classification or regression.
EXPERIMENTAL: This is an experimental API.
- It will probably be modified for Spark v1.2.
-
+ It will probably be modified in the future.
"""
- @staticmethod
- def _train(data, type, numClasses, features, impurity="gini", maxDepth=5, maxBins=32,
+ @classmethod
+ def _train(cls, data, type, numClasses, features, impurity="gini", maxDepth=5, maxBins=32,
minInstancesPerNode=1, minInfoGain=0.0):
first = data.first()
assert isinstance(first, LabeledPoint), "the data should be an RDD of LabeledPoint"
@@ -79,8 +81,8 @@ class DecisionTree(object):
impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
return DecisionTreeModel(model)
- @staticmethod
- def trainClassifier(data, numClasses, categoricalFeaturesInfo,
+ @classmethod
+ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo,
impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1,
minInfoGain=0.0):
"""
@@ -98,8 +100,8 @@ class DecisionTree(object):
E.g., depth 0 means 1 leaf node.
Depth 1 means 1 internal node + 2 leaf nodes.
:param maxBins: Number of bins used for finding splits at each node.
- :param minInstancesPerNode: Min number of instances required at child nodes to create
- the parent split
+ :param minInstancesPerNode: Min number of instances required at child
+ nodes to create the parent split
:param minInfoGain: Min info gain required to create a split
:return: DecisionTreeModel
@@ -132,11 +134,11 @@ class DecisionTree(object):
>>> model.predict(rdd).collect()
[1.0, 0.0]
"""
- return DecisionTree._train(data, "classification", numClasses, categoricalFeaturesInfo,
- impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
+ return cls._train(data, "classification", numClasses, categoricalFeaturesInfo,
+ impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
- @staticmethod
- def trainRegressor(data, categoricalFeaturesInfo,
+ @classmethod
+ def trainRegressor(cls, data, categoricalFeaturesInfo,
impurity="variance", maxDepth=5, maxBins=32, minInstancesPerNode=1,
minInfoGain=0.0):
"""
@@ -153,14 +155,13 @@ class DecisionTree(object):
E.g., depth 0 means 1 leaf node.
Depth 1 means 1 internal node + 2 leaf nodes.
:param maxBins: Number of bins used for finding splits at each node.
- :param minInstancesPerNode: Min number of instances required at child nodes to create
- the parent split
+ :param minInstancesPerNode: Min number of instances required at child
+ nodes to create the parent split
:param minInfoGain: Min info gain required to create a split
:return: DecisionTreeModel
Example usage:
- >>> from numpy import array
>>> from pyspark.mllib.regression import LabeledPoint
>>> from pyspark.mllib.tree import DecisionTree
>>> from pyspark.mllib.linalg import SparseVector
@@ -181,8 +182,205 @@ class DecisionTree(object):
>>> model.predict(rdd).collect()
[1.0, 0.0]
"""
- return DecisionTree._train(data, "regression", 0, categoricalFeaturesInfo,
- impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
+ return cls._train(data, "regression", 0, categoricalFeaturesInfo,
+ impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
+
+
+class RandomForestModel(JavaModelWrapper):
+ """
+ Represents a random forest model.
+
+ EXPERIMENTAL: This is an experimental API.
+ It will probably be modified in the future.
+ """
+ def predict(self, x):
+ """
+ Predict values for a single data point or an RDD of points using
+ the trained model.
+ """
+ if isinstance(x, RDD):
+ return self.call("predict", x.map(_convert_to_vector))
+ else:
+ return self.call("predict", _convert_to_vector(x))
+
+ def numTrees(self):
+ """
+ Get number of trees in forest.
+ """
+ return self.call("numTrees")
+
+ def totalNumNodes(self):
+ """
+ Get total number of nodes, summed over all trees in the forest.
+ """
+ return self.call("totalNumNodes")
+
+ def __repr__(self):
+ """ Summary of model """
+ return self._java_model.toString()
+
+ def toDebugString(self):
+ """ Full model """
+ return self._java_model.toDebugString()
+
+
+class RandomForest(object):
+ """
+ Learning algorithm for a random forest model for classification or regression.
+
+ EXPERIMENTAL: This is an experimental API.
+ It will probably be modified in the future.
+ """
+
+ supportedFeatureSubsetStrategies = ("auto", "all", "sqrt", "log2", "onethird")
+
+ @classmethod
+ def _train(cls, data, algo, numClasses, categoricalFeaturesInfo, numTrees,
+ featureSubsetStrategy, impurity, maxDepth, maxBins, seed):
+ first = data.first()
+ assert isinstance(first, LabeledPoint), "the data should be an RDD of LabeledPoint"
+ if featureSubsetStrategy not in cls.supportedFeatureSubsetStrategies:
+ raise ValueError("unsupported featureSubsetStrategy: %s" % featureSubsetStrategy)
+ if seed is None:
+ seed = random.randint(0, 1 << 30)
+ model = callMLlibFunc("trainRandomForestModel", data, algo, numClasses,
+ categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity,
+ maxDepth, maxBins, seed)
+ return RandomForestModel(model)
+
+ @classmethod
+ def trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesInfo, numTrees,
+ featureSubsetStrategy="auto", impurity="gini", maxDepth=4, maxBins=32,
+ seed=None):
+ """
+ Method to train a random forest model for binary or multiclass
+ classification.
+
+ :param data: Training dataset: RDD of LabeledPoint. Labels should take
+ values {0, 1, ..., numClasses-1}.
+ :param numClassesForClassification: Number of classes for classification.
+ :param categoricalFeaturesInfo: Map storing arity of categorical features.
+ E.g., an entry (n -> k) indicates that feature n is categorical
+ with k categories indexed from 0: {0, 1, ..., k-1}.
+ :param numTrees: Number of trees in the random forest.
+ :param featureSubsetStrategy: Number of features to consider for splits at
+ each node.
+ Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
+ If "auto" is set, this parameter is set based on numTrees:
+ if numTrees == 1, set to "all";
+ if numTrees > 1 (forest), set to "sqrt".
+ :param impurity: Criterion used for information gain calculation.
+ Supported values: "gini" (recommended) or "entropy".
+ :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node;
+ depth 1 means 1 internal node + 2 leaf nodes. (default: 4)
+ :param maxBins: Maximum number of bins used for splitting features
+ (default: 32)
+ :param seed: Random seed for bootstrapping and choosing feature subsets.
+ :return: RandomForestModel that can be used for prediction
+
+ Example usage:
+
+ >>> from pyspark.mllib.regression import LabeledPoint
+ >>> from pyspark.mllib.tree import RandomForest
+ >>>
+ >>> data = [
+ ... LabeledPoint(0.0, [0.0]),
+ ... LabeledPoint(0.0, [1.0]),
+ ... LabeledPoint(1.0, [2.0]),
+ ... LabeledPoint(1.0, [3.0])
+ ... ]
+ >>> model = RandomForest.trainClassifier(sc.parallelize(data), 2, {}, 3, seed=42)
+ >>> model.numTrees()
+ 3
+ >>> model.totalNumNodes()
+ 7
+ >>> print model,
+ TreeEnsembleModel classifier with 3 trees
+ >>> print model.toDebugString(),
+ TreeEnsembleModel classifier with 3 trees
+ <BLANKLINE>
+ Tree 0:
+ Predict: 1.0
+ Tree 1:
+ If (feature 0 <= 1.0)
+ Predict: 0.0
+ Else (feature 0 > 1.0)
+ Predict: 1.0
+ Tree 2:
+ If (feature 0 <= 1.0)
+ Predict: 0.0
+ Else (feature 0 > 1.0)
+ Predict: 1.0
+ >>> model.predict([2.0])
+ 1.0
+ >>> model.predict([0.0])
+ 0.0
+ >>> rdd = sc.parallelize([[3.0], [1.0]])
+ >>> model.predict(rdd).collect()
+ [1.0, 0.0]
+ """
+ return cls._train(data, "classification", numClassesForClassification,
+ categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity,
+ maxDepth, maxBins, seed)
+
+ @classmethod
+ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetStrategy="auto",
+ impurity="variance", maxDepth=4, maxBins=32, seed=None):
+ """
+ Method to train a random forest model for regression.
+
+ :param data: Training dataset: RDD of LabeledPoint. Labels are
+ real numbers.
+ :param categoricalFeaturesInfo: Map storing arity of categorical
+ features. E.g., an entry (n -> k) indicates that feature
+ n is categorical with k categories indexed from 0:
+ {0, 1, ..., k-1}.
+ :param numTrees: Number of trees in the random forest.
+ :param featureSubsetStrategy: Number of features to consider for
+ splits at each node.
+ Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
+ If "auto" is set, this parameter is set based on numTrees:
+ if numTrees == 1, set to "all";
+ if numTrees > 1 (forest), set to "onethird" for regression.
+ :param impurity: Criterion used for information gain calculation.
+ Supported values: "variance".
+ :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1
+ leaf node; depth 1 means 1 internal node + 2 leaf nodes.
+ (default: 4)
+ :param maxBins: Maximum number of bins used for splitting features
+ (default: 32)
+ :param seed: Random seed for bootstrapping and choosing feature subsets.
+ :return: RandomForestModel that can be used for prediction
+
+ Example usage:
+
+ >>> from pyspark.mllib.regression import LabeledPoint
+ >>> from pyspark.mllib.tree import RandomForest
+ >>> from pyspark.mllib.linalg import SparseVector
+ >>>
+ >>> sparse_data = [
+ ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
+ ... LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
+ ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
+ ... LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
+ ... ]
+ >>>
+ >>> model = RandomForest.trainRegressor(sc.parallelize(sparse_data), {}, 2, seed=42)
+ >>> model.numTrees()
+ 2
+ >>> model.totalNumNodes()
+ 4
+ >>> model.predict(SparseVector(2, {1: 1.0}))
+ 1.0
+ >>> model.predict(SparseVector(2, {0: 1.0}))
+ 0.5
+ >>> rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]])
+ >>> model.predict(rdd).collect()
+ [1.0, 0.5]
+ """
+ return cls._train(data, "regression", 0, categoricalFeaturesInfo, numTrees,
+ featureSubsetStrategy, impurity, maxDepth, maxBins, seed)
def _test():
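To make the documented "auto" featureSubsetStrategy concrete, here is a hypothetical helper that mirrors what the docstrings above promise. This is illustration only, not part of the patch: the actual resolution happens inside the Scala implementation that callMLlibFunc dispatches to.

```python
def resolve_feature_subset_strategy(strategy, algo, num_trees):
    """Mirror the documented "auto" resolution; illustration only."""
    supported = ("auto", "all", "sqrt", "log2", "onethird")
    if strategy not in supported:
        raise ValueError("unsupported featureSubsetStrategy: %s" % strategy)
    if strategy != "auto":
        return strategy
    if num_trees == 1:
        return "all"  # a single tree considers every feature
    # Forests default to "sqrt" for classification, "onethird" for regression.
    return "sqrt" if algo == "classification" else "onethird"

assert resolve_feature_subset_strategy("auto", "classification", 1) == "all"
assert resolve_feature_subset_strategy("auto", "classification", 3) == "sqrt"
assert resolve_feature_subset_strategy("auto", "regression", 3) == "onethird"
```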