Diffstat (limited to 'python/pyspark/mllib/tree.py')
-rw-r--r--  python/pyspark/mllib/tree.py | 242
1 file changed, 220 insertions(+), 22 deletions(-)
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index ef0d556fac..46e253991a 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -15,12 +15,16 @@
# limitations under the License.
#
+from __future__ import absolute_import
+
+import random
+
from pyspark import SparkContext, RDD
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
from pyspark.mllib.linalg import _convert_to_vector
from pyspark.mllib.regression import LabeledPoint
-__all__ = ['DecisionTreeModel', 'DecisionTree']
+__all__ = ['DecisionTreeModel', 'DecisionTree', 'RandomForestModel', 'RandomForest']
class DecisionTreeModel(JavaModelWrapper):
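Worth noting: pyspark.mllib ships its own random module in this era, so without the new `from __future__ import absolute_import`, Python 2's implicit relative imports would resolve `import random` to that sibling module rather than the standard library. A minimal sketch of what the import guarantees (the seed range matches RandomForest._train further below):

>>> from __future__ import absolute_import  # stdlib random, not pyspark.mllib.random
>>> import random
>>> 0 <= random.randint(0, 1 << 30) <= (1 << 30)
True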
@@ -51,27 +55,25 @@ class DecisionTreeModel(JavaModelWrapper):
return self._java_model.depth()
def __repr__(self):
- """ Print summary of model. """
+ """ summary of model. """
return self._java_model.toString()
def toDebugString(self):
- """ Print full model. """
+ """ full model. """
return self._java_model.toDebugString()
class DecisionTree(object):
"""
- Learning algorithm for a decision tree model
- for classification or regression.
+ Learning algorithm for a decision tree model for classification or regression.
EXPERIMENTAL: This is an experimental API.
- It will probably be modified for Spark v1.2.
-
+ It will probably be modified in the future.
"""
- @staticmethod
- def _train(data, type, numClasses, features, impurity="gini", maxDepth=5, maxBins=32,
+ @classmethod
+ def _train(cls, data, type, numClasses, features, impurity="gini", maxDepth=5, maxBins=32,
minInstancesPerNode=1, minInfoGain=0.0):
first = data.first()
assert isinstance(first, LabeledPoint), "the data should be an RDD of LabeledPoint"
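The assert above only samples the first element, so the check is cheap but not exhaustive; the expected input is an RDD of LabeledPoint throughout. A minimal sketch of building such an RDD (assuming an active SparkContext `sc`, as the doctests below do):

>>> from pyspark.mllib.regression import LabeledPoint
>>> data = sc.parallelize([LabeledPoint(0.0, [0.0]), LabeledPoint(1.0, [1.0])])
>>> isinstance(data.first(), LabeledPoint)
True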
@@ -79,8 +81,8 @@ class DecisionTree(object):
impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
return DecisionTreeModel(model)
- @staticmethod
- def trainClassifier(data, numClasses, categoricalFeaturesInfo,
+ @classmethod
+ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo,
impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1,
minInfoGain=0.0):
"""
@@ -98,8 +100,8 @@ class DecisionTree(object):
E.g., depth 0 means 1 leaf node.
Depth 1 means 1 internal node + 2 leaf nodes.
:param maxBins: Number of bins used for finding splits at each node.
- :param minInstancesPerNode: Min number of instances required at child nodes to create
- the parent split
+ :param minInstancesPerNode: Min number of instances required at child
+ nodes to create the parent split
:param minInfoGain: Min info gain required to create a split
:return: DecisionTreeModel
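To make the two stopping criteria documented above concrete, a hedged sketch of passing them to trainClassifier (assuming an active SparkContext `sc`; the values are illustrative, not recommendations):

>>> from pyspark.mllib.regression import LabeledPoint
>>> from pyspark.mllib.tree import DecisionTree
>>> data = sc.parallelize([LabeledPoint(0.0, [0.0]), LabeledPoint(1.0, [1.0])] * 10)
>>> model = DecisionTree.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={},
...                                      minInstancesPerNode=2, minInfoGain=0.01)
>>> model.depth() <= 5  # the default maxDepth still applies
True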
@@ -132,11 +134,11 @@ class DecisionTree(object):
>>> model.predict(rdd).collect()
[1.0, 0.0]
"""
- return DecisionTree._train(data, "classification", numClasses, categoricalFeaturesInfo,
- impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
+ return cls._train(data, "classification", numClasses, categoricalFeaturesInfo,
+ impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
- @staticmethod
- def trainRegressor(data, categoricalFeaturesInfo,
+ @classmethod
+ def trainRegressor(cls, data, categoricalFeaturesInfo,
impurity="variance", maxDepth=5, maxBins=32, minInstancesPerNode=1,
minInfoGain=0.0):
"""
@@ -153,14 +155,13 @@ class DecisionTree(object):
E.g., depth 0 means 1 leaf node.
Depth 1 means 1 internal node + 2 leaf nodes.
:param maxBins: Number of bins used for finding splits at each node.
- :param minInstancesPerNode: Min number of instances required at child nodes to create
- the parent split
+ :param minInstancesPerNode: Min number of instances required at child
+ nodes to create the parent split
:param minInfoGain: Min info gain required to create a split
:return: DecisionTreeModel
Example usage:
- >>> from numpy import array
>>> from pyspark.mllib.regression import LabeledPoint
>>> from pyspark.mllib.tree import DecisionTree
>>> from pyspark.mllib.linalg import SparseVector
@@ -181,8 +182,205 @@ class DecisionTree(object):
>>> model.predict(rdd).collect()
[1.0, 0.0]
"""
- return DecisionTree._train(data, "regression", 0, categoricalFeaturesInfo,
- impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
+ return cls._train(data, "regression", 0, categoricalFeaturesInfo,
+ impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
+
+
+class RandomForestModel(JavaModelWrapper):
+ """
+ Represents a random forest model.
+
+ EXPERIMENTAL: This is an experimental API.
+ It will probably be modified in the future.
+ """
+ def predict(self, x):
+ """
+ Predict values for a single data point or an RDD of points using
+ the trained model.
+ """
+ if isinstance(x, RDD):
+ return self.call("predict", x.map(_convert_to_vector))
+
+ else:
+ return self.call("predict", _convert_to_vector(x))
+
+ def numTrees(self):
+ """
+ Get number of trees in forest.
+ """
+ return self.call("numTrees")
+
+ def totalNumNodes(self):
+ """
+ Get total number of nodes, summed over all trees in the forest.
+ """
+ return self.call("totalNumNodes")
+
+ def __repr__(self):
+ """ Summary of model """
+ return self._java_model.toString()
+
+ def toDebugString(self):
+ """ Full model """
+ return self._java_model.toDebugString()
+
+
+class RandomForest(object):
+ """
+ Learning algorithm for a random forest model for classification or regression.
+
+ EXPERIMENTAL: This is an experimental API.
+ It will probably be modified in the future.
+ """
+
+ supportedFeatureSubsetStrategies = ("auto", "all", "sqrt", "log2", "onethird")
+
+ @classmethod
+ def _train(cls, data, algo, numClasses, categoricalFeaturesInfo, numTrees,
+ featureSubsetStrategy, impurity, maxDepth, maxBins, seed):
+ first = data.first()
+ assert isinstance(first, LabeledPoint), "the data should be an RDD of LabeledPoint"
+ if featureSubsetStrategy not in cls.supportedFeatureSubsetStrategies:
+ raise ValueError("unsupported featureSubsetStrategy: %s" % featureSubsetStrategy)
+ if seed is None:
+ seed = random.randint(0, 1 << 30)
+ model = callMLlibFunc("trainRandomForestModel", data, algo, numClasses,
+ categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity,
+ maxDepth, maxBins, seed)
+ return RandomForestModel(model)
+
+ @classmethod
+ def trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesInfo, numTrees,
+ featureSubsetStrategy="auto", impurity="gini", maxDepth=4, maxBins=32,
+ seed=None):
+ """
+ Method to train a random forest model for binary or multiclass
+ classification.
+
+ :param data: Training dataset: RDD of LabeledPoint. Labels should take
+ values {0, 1, ..., numClasses-1}.
+ :param numClassesForClassification: Number of classes for classification.
+ :param categoricalFeaturesInfo: Map storing arity of categorical features.
+ E.g., an entry (n -> k) indicates that feature n is categorical
+ with k categories indexed from 0: {0, 1, ..., k-1}.
+ :param numTrees: Number of trees in the random forest.
+ :param featureSubsetStrategy: Number of features to consider for splits at
+ each node.
+ Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
+ If "auto" is set, this parameter is set based on numTrees:
+ if numTrees == 1, set to "all";
+ if numTrees > 1 (forest), set to "sqrt".
+ :param impurity: Criterion used for information gain calculation.
+ Supported values: "gini" (recommended) or "entropy".
+ :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node;
+ depth 1 means 1 internal node + 2 leaf nodes. (default: 4)
+ :param maxBins: Maximum number of bins used for splitting features
+ (default: 32)
+ :param seed: Random seed for bootstrapping and choosing feature subsets.
+ :return: RandomForestModel that can be used for prediction
+
+ Example usage:
+
+ >>> from pyspark.mllib.regression import LabeledPoint
+ >>> from pyspark.mllib.tree import RandomForest
+ >>>
+ >>> data = [
+ ... LabeledPoint(0.0, [0.0]),
+ ... LabeledPoint(0.0, [1.0]),
+ ... LabeledPoint(1.0, [2.0]),
+ ... LabeledPoint(1.0, [3.0])
+ ... ]
+ >>> model = RandomForest.trainClassifier(sc.parallelize(data), 2, {}, 3, seed=42)
+ >>> model.numTrees()
+ 3
+ >>> model.totalNumNodes()
+ 7
+ >>> print model,
+ TreeEnsembleModel classifier with 3 trees
+ >>> print model.toDebugString(),
+ TreeEnsembleModel classifier with 3 trees
+ <BLANKLINE>
+ Tree 0:
+ Predict: 1.0
+ Tree 1:
+ If (feature 0 <= 1.0)
+ Predict: 0.0
+ Else (feature 0 > 1.0)
+ Predict: 1.0
+ Tree 2:
+ If (feature 0 <= 1.0)
+ Predict: 0.0
+ Else (feature 0 > 1.0)
+ Predict: 1.0
+ >>> model.predict([2.0])
+ 1.0
+ >>> model.predict([0.0])
+ 0.0
+ >>> rdd = sc.parallelize([[3.0], [1.0]])
+ >>> model.predict(rdd).collect()
+ [1.0, 0.0]
+ """
+ return cls._train(data, "classification", numClassesForClassification,
+ categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity,
+ maxDepth, maxBins, seed)
+
+ @classmethod
+ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetStrategy="auto",
+ impurity="variance", maxDepth=4, maxBins=32, seed=None):
+ """
+ Method to train a random forest model for regression.
+
+ :param data: Training dataset: RDD of LabeledPoint. Labels are
+ real numbers.
+ :param categoricalFeaturesInfo: Map storing arity of categorical
+ features. E.g., an entry (n -> k) indicates that feature
+ n is categorical with k categories indexed from 0:
+ {0, 1, ..., k-1}.
+ :param numTrees: Number of trees in the random forest.
+ :param featureSubsetStrategy: Number of features to consider for
+ splits at each node.
+ Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
+ If "auto" is set, this parameter is set based on numTrees:
+ if numTrees == 1, set to "all";
+ if numTrees > 1 (forest), set to "onethird" for regression.
+ :param impurity: Criterion used for information gain calculation.
+ Supported values: "variance".
+ :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1
+ leaf node; depth 1 means 1 internal node + 2 leaf nodes.
+ (default: 4)
+ :param maxBins: Maximum number of bins used for splitting features
+ (default: 32)
+ :param seed: Random seed for bootstrapping and choosing feature subsets.
+ :return: RandomForestModel that can be used for prediction
+
+ Example usage:
+
+ >>> from pyspark.mllib.regression import LabeledPoint
+ >>> from pyspark.mllib.tree import RandomForest
+ >>> from pyspark.mllib.linalg import SparseVector
+ >>>
+ >>> sparse_data = [
+ ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
+ ... LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
+ ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
+ ... LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
+ ... ]
+ >>>
+ >>> model = RandomForest.trainRegressor(sc.parallelize(sparse_data), {}, 2, seed=42)
+ >>> model.numTrees()
+ 2
+ >>> model.totalNumNodes()
+ 4
+ >>> model.predict(SparseVector(2, {1: 1.0}))
+ 1.0
+ >>> model.predict(SparseVector(2, {0: 1.0}))
+ 0.5
+ >>> rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]])
+ >>> model.predict(rdd).collect()
+ [1.0, 0.5]
+ """
+ return cls._train(data, "regression", 0, categoricalFeaturesInfo, numTrees,
+ featureSubsetStrategy, impurity, maxDepth, maxBins, seed)
def _test():
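To round out the new RandomForest API, a hedged usage sketch (assuming an active SparkContext `sc`; the values are illustrative only). It also exercises the featureSubsetStrategy validation in _train, which rejects anything outside supportedFeatureSubsetStrategies before any JVM call; when seed is omitted, _train picks one via random.randint(0, 1 << 30):

>>> from pyspark.mllib.regression import LabeledPoint
>>> from pyspark.mllib.tree import RandomForest
>>> data = sc.parallelize([LabeledPoint(0.0, [0.0]), LabeledPoint(1.0, [1.0])] * 10)
>>> model = RandomForest.trainClassifier(data, 2, {}, numTrees=10,
...                                      featureSubsetStrategy="sqrt", seed=42)
>>> model.numTrees()
10
>>> RandomForest.trainClassifier(data, 2, {}, 3, featureSubsetStrategy="bogus")
Traceback (most recent call last):
    ...
ValueError: unsupported featureSubsetStrategy: bogus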