From 1c53a5db993193122bfa79574d2540149fe2cc08 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Thu, 20 Nov 2014 15:31:28 -0800 Subject: [SPARK-4439] [MLlib] add python api for random forest ``` class RandomForestModel | A model trained by RandomForest | | numTrees(self) | Get number of trees in forest. | | predict(self, x) | Predict values for a single data point or an RDD of points using the model trained. | | toDebugString(self) | Full model | | totalNumNodes(self) | Get total number of nodes, summed over all trees in the forest. | class RandomForest | trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesInfo, numTrees, featureSubsetStrategy='auto', impurity='gini', maxDepth=4, maxBins=32, seed=None): | Method to train a decision tree model for binary or multiclass classification. | | :param data: Training dataset: RDD of LabeledPoint. | Labels should take values {0, 1, ..., numClasses-1}. | :param numClassesForClassification: number of classes for classification. | :param categoricalFeaturesInfo: Map storing arity of categorical features. | E.g., an entry (n -> k) indicates that feature n is categorical | with k categories indexed from 0: {0, 1, ..., k-1}. | :param numTrees: Number of trees in the random forest. | :param featureSubsetStrategy: Number of features to consider for splits at each node. | Supported: "auto" (default), "all", "sqrt", "log2", "onethird". | If "auto" is set, this parameter is set based on numTrees: | if numTrees == 1, set to "all"; | if numTrees > 1 (forest) set to "sqrt". | :param impurity: Criterion used for information gain calculation. | Supported values: "gini" (recommended) or "entropy". | :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means | 1 internal node + 2 leaf nodes. (default: 4) | :param maxBins: maximum number of bins used for splitting features (default: 100) | :param seed: Random seed for bootstrapping and choosing feature subsets. | :return: RandomForestModel that can be used for prediction | | trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetStrategy='auto', impurity='variance', maxDepth=4, maxBins=32, seed=None): | Method to train a decision tree model for regression. | | :param data: Training dataset: RDD of LabeledPoint. | Labels are real numbers. | :param categoricalFeaturesInfo: Map storing arity of categorical features. | E.g., an entry (n -> k) indicates that feature n is categorical | with k categories indexed from 0: {0, 1, ..., k-1}. | :param numTrees: Number of trees in the random forest. | :param featureSubsetStrategy: Number of features to consider for splits at each node. | Supported: "auto" (default), "all", "sqrt", "log2", "onethird". | If "auto" is set, this parameter is set based on numTrees: | if numTrees == 1, set to "all"; | if numTrees > 1 (forest) set to "onethird". | :param impurity: Criterion used for information gain calculation. | Supported values: "variance". | :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means | 1 internal node + 2 leaf nodes.(default: 4) | :param maxBins: maximum number of bins used for splitting features (default: 100) | :param seed: Random seed for bootstrapping and choosing feature subsets. | :return: RandomForestModel that can be used for prediction | ``` Author: Davies Liu Closes #3320 from davies/forest and squashes the following commits: 8003dfc [Davies Liu] reorder 53cf510 [Davies Liu] fix docs 4ca593d [Davies Liu] fix docs e0df852 [Davies Liu] fix docs 0431746 [Davies Liu] rebased 2b6f239 [Davies Liu] Merge branch 'master' of github.com:apache/spark into forest 885abee [Davies Liu] address comments dae7fc0 [Davies Liu] address comments 89a000f [Davies Liu] fix docs 565d476 [Davies Liu] add python api for random forest --- .../spark/mllib/api/python/PythonMLLibAPI.scala | 38 +++- .../org/apache/spark/mllib/tree/RandomForest.scala | 12 +- python/docs/epytext.py | 2 +- python/pyspark/mllib/tree.py | 242 +++++++++++++++++++-- 4 files changed, 261 insertions(+), 33 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 6f94b7f483..b6f7618171 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -40,10 +40,10 @@ import org.apache.spark.mllib.regression._ import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} import org.apache.spark.mllib.stat.correlation.CorrelationNames import org.apache.spark.mllib.stat.test.ChiSqTestResult -import org.apache.spark.mllib.tree.DecisionTree +import org.apache.spark.mllib.tree.{RandomForest, DecisionTree} import org.apache.spark.mllib.tree.configuration.{Algo, Strategy} import org.apache.spark.mllib.tree.impurity._ -import org.apache.spark.mllib.tree.model.DecisionTreeModel +import org.apache.spark.mllib.tree.model.{RandomForestModel, DecisionTreeModel} import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel @@ -499,6 +499,40 @@ class PythonMLLibAPI extends Serializable { DecisionTree.train(data.rdd, strategy) } + /** + * Java stub for Python mllib RandomForest.train(). + * This stub returns a handle to the Java object instead of the content of the Java object. + * Extra care needs to be taken in the Python code to ensure it gets freed on exit; + * see the Py4J documentation. + */ + def trainRandomForestModel( + data: JavaRDD[LabeledPoint], + algoStr: String, + numClasses: Int, + categoricalFeaturesInfo: JMap[Int, Int], + numTrees: Int, + featureSubsetStrategy: String, + impurityStr: String, + maxDepth: Int, + maxBins: Int, + seed: Int): RandomForestModel = { + + val algo = Algo.fromString(algoStr) + val impurity = Impurities.fromString(impurityStr) + val strategy = new Strategy( + algo = algo, + impurity = impurity, + maxDepth = maxDepth, + numClassesForClassification = numClasses, + maxBins = maxBins, + categoricalFeaturesInfo = categoricalFeaturesInfo.asScala.toMap) + if (algo == Algo.Classification) { + RandomForest.trainClassifier(data.rdd, strategy, numTrees, featureSubsetStrategy, seed) + } else { + RandomForest.trainRegressor(data.rdd, strategy, numTrees, featureSubsetStrategy, seed) + } + } + /** * Java stub for mllib Statistics.colStats(X: RDD[Vector]). * TODO figure out return type. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala index ca0b6eea9a..3ae6fa2a0e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala @@ -230,8 +230,7 @@ object RandomForest extends Serializable with Logging { * Supported: "auto" (default), "all", "sqrt", "log2", "onethird". * If "auto" is set, this parameter is set based on numTrees: * if numTrees == 1, set to "all"; - * if numTrees > 1 (forest) set to "sqrt" for classification and - * to "onethird" for regression. + * if numTrees > 1 (forest) set to "sqrt". * @param seed Random seed for bootstrapping and choosing feature subsets. * @return a random forest model that can be used for prediction */ @@ -261,8 +260,7 @@ object RandomForest extends Serializable with Logging { * Supported: "auto" (default), "all", "sqrt", "log2", "onethird". * If "auto" is set, this parameter is set based on numTrees: * if numTrees == 1, set to "all"; - * if numTrees > 1 (forest) set to "sqrt" for classification and - * to "onethird" for regression. + * if numTrees > 1 (forest) set to "sqrt". * @param impurity Criterion used for information gain calculation. * Supported values: "gini" (recommended) or "entropy". * @param maxDepth Maximum depth of the tree. @@ -318,8 +316,7 @@ object RandomForest extends Serializable with Logging { * Supported: "auto" (default), "all", "sqrt", "log2", "onethird". * If "auto" is set, this parameter is set based on numTrees: * if numTrees == 1, set to "all"; - * if numTrees > 1 (forest) set to "sqrt" for classification and - * to "onethird" for regression. + * if numTrees > 1 (forest) set to "onethird". * @param seed Random seed for bootstrapping and choosing feature subsets. * @return a random forest model that can be used for prediction */ @@ -348,8 +345,7 @@ object RandomForest extends Serializable with Logging { * Supported: "auto" (default), "all", "sqrt", "log2", "onethird". * If "auto" is set, this parameter is set based on numTrees: * if numTrees == 1, set to "all"; - * if numTrees > 1 (forest) set to "sqrt" for classification and - * to "onethird" for regression. + * if numTrees > 1 (forest) set to "onethird". * @param impurity Criterion used for information gain calculation. * Supported values: "variance". * @param maxDepth Maximum depth of the tree. diff --git a/python/docs/epytext.py b/python/docs/epytext.py index 19fefbfc05..e884d5e6b1 100644 --- a/python/docs/epytext.py +++ b/python/docs/epytext.py @@ -1,7 +1,7 @@ import re RULES = ( - (r"<[\w.]+>", r""), + (r"<(!BLANKLINE)[\w.]+>", r""), (r"L{([\w.()]+)}", r":class:`\1`"), (r"[LC]{(\w+\.\w+)\(\)}", r":func:`\1`"), (r"C{([\w.()]+)}", r":class:`\1`"), diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index ef0d556fac..46e253991a 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -15,12 +15,16 @@ # limitations under the License. # +from __future__ import absolute_import + +import random + from pyspark import SparkContext, RDD from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper from pyspark.mllib.linalg import _convert_to_vector from pyspark.mllib.regression import LabeledPoint -__all__ = ['DecisionTreeModel', 'DecisionTree'] +__all__ = ['DecisionTreeModel', 'DecisionTree', 'RandomForestModel', 'RandomForest'] class DecisionTreeModel(JavaModelWrapper): @@ -51,27 +55,25 @@ class DecisionTreeModel(JavaModelWrapper): return self._java_model.depth() def __repr__(self): - """ Print summary of model. """ + """ summary of model. """ return self._java_model.toString() def toDebugString(self): - """ Print full model. """ + """ full model. """ return self._java_model.toDebugString() class DecisionTree(object): """ - Learning algorithm for a decision tree model - for classification or regression. + Learning algorithm for a decision tree model for classification or regression. EXPERIMENTAL: This is an experimental API. - It will probably be modified for Spark v1.2. - + It will probably be modified in future. """ - @staticmethod - def _train(data, type, numClasses, features, impurity="gini", maxDepth=5, maxBins=32, + @classmethod + def _train(cls, data, type, numClasses, features, impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0): first = data.first() assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint" @@ -79,8 +81,8 @@ class DecisionTree(object): impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) return DecisionTreeModel(model) - @staticmethod - def trainClassifier(data, numClasses, categoricalFeaturesInfo, + @classmethod + def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0): """ @@ -98,8 +100,8 @@ class DecisionTree(object): E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 leaf nodes. :param maxBins: Number of bins used for finding splits at each node. - :param minInstancesPerNode: Min number of instances required at child nodes to create - the parent split + :param minInstancesPerNode: Min number of instances required at child + nodes to create the parent split :param minInfoGain: Min info gain required to create a split :return: DecisionTreeModel @@ -132,11 +134,11 @@ class DecisionTree(object): >>> model.predict(rdd).collect() [1.0, 0.0] """ - return DecisionTree._train(data, "classification", numClasses, categoricalFeaturesInfo, - impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) + return cls._train(data, "classification", numClasses, categoricalFeaturesInfo, + impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) - @staticmethod - def trainRegressor(data, categoricalFeaturesInfo, + @classmethod + def trainRegressor(cls, data, categoricalFeaturesInfo, impurity="variance", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0): """ @@ -153,14 +155,13 @@ class DecisionTree(object): E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 leaf nodes. :param maxBins: Number of bins used for finding splits at each node. - :param minInstancesPerNode: Min number of instances required at child nodes to create - the parent split + :param minInstancesPerNode: Min number of instances required at child + nodes to create the parent split :param minInfoGain: Min info gain required to create a split :return: DecisionTreeModel Example usage: - >>> from numpy import array >>> from pyspark.mllib.regression import LabeledPoint >>> from pyspark.mllib.tree import DecisionTree >>> from pyspark.mllib.linalg import SparseVector @@ -181,8 +182,205 @@ class DecisionTree(object): >>> model.predict(rdd).collect() [1.0, 0.0] """ - return DecisionTree._train(data, "regression", 0, categoricalFeaturesInfo, - impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) + return cls._train(data, "regression", 0, categoricalFeaturesInfo, + impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) + + +class RandomForestModel(JavaModelWrapper): + """ + Represents a random forest model. + + EXPERIMENTAL: This is an experimental API. + It will probably be modified in future. + """ + def predict(self, x): + """ + Predict values for a single data point or an RDD of points using + the model trained. + """ + if isinstance(x, RDD): + return self.call("predict", x.map(_convert_to_vector)) + + else: + return self.call("predict", _convert_to_vector(x)) + + def numTrees(self): + """ + Get number of trees in forest. + """ + return self.call("numTrees") + + def totalNumNodes(self): + """ + Get total number of nodes, summed over all trees in the forest. + """ + return self.call("totalNumNodes") + + def __repr__(self): + """ Summary of model """ + return self._java_model.toString() + + def toDebugString(self): + """ Full model """ + return self._java_model.toDebugString() + + +class RandomForest(object): + """ + Learning algorithm for a random forest model for classification or regression. + + EXPERIMENTAL: This is an experimental API. + It will probably be modified in future. + """ + + supportedFeatureSubsetStrategies = ("auto", "all", "sqrt", "log2", "onethird") + + @classmethod + def _train(cls, data, algo, numClasses, categoricalFeaturesInfo, numTrees, + featureSubsetStrategy, impurity, maxDepth, maxBins, seed): + first = data.first() + assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint" + if featureSubsetStrategy not in cls.supportedFeatureSubsetStrategies: + raise ValueError("unsupported featureSubsetStrategy: %s" % featureSubsetStrategy) + if seed is None: + seed = random.randint(0, 1 << 30) + model = callMLlibFunc("trainRandomForestModel", data, algo, numClasses, + categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, + maxDepth, maxBins, seed) + return RandomForestModel(model) + + @classmethod + def trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesInfo, numTrees, + featureSubsetStrategy="auto", impurity="gini", maxDepth=4, maxBins=32, + seed=None): + """ + Method to train a decision tree model for binary or multiclass + classification. + + :param data: Training dataset: RDD of LabeledPoint. Labels should take + values {0, 1, ..., numClasses-1}. + :param numClassesForClassification: number of classes for classification. + :param categoricalFeaturesInfo: Map storing arity of categorical features. + E.g., an entry (n -> k) indicates that feature n is categorical + with k categories indexed from 0: {0, 1, ..., k-1}. + :param numTrees: Number of trees in the random forest. + :param featureSubsetStrategy: Number of features to consider for splits at + each node. + Supported: "auto" (default), "all", "sqrt", "log2", "onethird". + If "auto" is set, this parameter is set based on numTrees: + if numTrees == 1, set to "all"; + if numTrees > 1 (forest) set to "sqrt". + :param impurity: Criterion used for information gain calculation. + Supported values: "gini" (recommended) or "entropy". + :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node; + depth 1 means 1 internal node + 2 leaf nodes. (default: 4) + :param maxBins: maximum number of bins used for splitting features + (default: 100) + :param seed: Random seed for bootstrapping and choosing feature subsets. + :return: RandomForestModel that can be used for prediction + + Example usage: + + >>> from pyspark.mllib.regression import LabeledPoint + >>> from pyspark.mllib.tree import RandomForest + >>> + >>> data = [ + ... LabeledPoint(0.0, [0.0]), + ... LabeledPoint(0.0, [1.0]), + ... LabeledPoint(1.0, [2.0]), + ... LabeledPoint(1.0, [3.0]) + ... ] + >>> model = RandomForest.trainClassifier(sc.parallelize(data), 2, {}, 3, seed=42) + >>> model.numTrees() + 3 + >>> model.totalNumNodes() + 7 + >>> print model, + TreeEnsembleModel classifier with 3 trees + >>> print model.toDebugString(), + TreeEnsembleModel classifier with 3 trees + + Tree 0: + Predict: 1.0 + Tree 1: + If (feature 0 <= 1.0) + Predict: 0.0 + Else (feature 0 > 1.0) + Predict: 1.0 + Tree 2: + If (feature 0 <= 1.0) + Predict: 0.0 + Else (feature 0 > 1.0) + Predict: 1.0 + >>> model.predict([2.0]) + 1.0 + >>> model.predict([0.0]) + 0.0 + >>> rdd = sc.parallelize([[3.0], [1.0]]) + >>> model.predict(rdd).collect() + [1.0, 0.0] + """ + return cls._train(data, "classification", numClassesForClassification, + categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, + maxDepth, maxBins, seed) + + @classmethod + def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetStrategy="auto", + impurity="variance", maxDepth=4, maxBins=32, seed=None): + """ + Method to train a decision tree model for regression. + + :param data: Training dataset: RDD of LabeledPoint. Labels are + real numbers. + :param categoricalFeaturesInfo: Map storing arity of categorical + features. E.g., an entry (n -> k) indicates that feature + n is categorical with k categories indexed from 0: + {0, 1, ..., k-1}. + :param numTrees: Number of trees in the random forest. + :param featureSubsetStrategy: Number of features to consider for + splits at each node. + Supported: "auto" (default), "all", "sqrt", "log2", "onethird". + If "auto" is set, this parameter is set based on numTrees: + if numTrees == 1, set to "all"; + if numTrees > 1 (forest) set to "onethird" for regression. + :param impurity: Criterion used for information gain calculation. + Supported values: "variance". + :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 + leaf node; depth 1 means 1 internal node + 2 leaf nodes. + (default: 4) + :param maxBins: maximum number of bins used for splitting features + (default: 100) + :param seed: Random seed for bootstrapping and choosing feature subsets. + :return: RandomForestModel that can be used for prediction + + Example usage: + + >>> from pyspark.mllib.regression import LabeledPoint + >>> from pyspark.mllib.tree import RandomForest + >>> from pyspark.mllib.linalg import SparseVector + >>> + >>> sparse_data = [ + ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})), + ... LabeledPoint(1.0, SparseVector(2, {1: 1.0})), + ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})), + ... LabeledPoint(1.0, SparseVector(2, {1: 2.0})) + ... ] + >>> + >>> model = RandomForest.trainRegressor(sc.parallelize(sparse_data), {}, 2, seed=42) + >>> model.numTrees() + 2 + >>> model.totalNumNodes() + 4 + >>> model.predict(SparseVector(2, {1: 1.0})) + 1.0 + >>> model.predict(SparseVector(2, {0: 1.0})) + 0.5 + >>> rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]]) + >>> model.predict(rdd).collect() + [1.0, 0.5] + """ + return cls._train(data, "regression", 0, categoricalFeaturesInfo, numTrees, + featureSubsetStrategy, impurity, maxDepth, maxBins, seed) def _test(): -- cgit v1.2.3