From 657a88835d8bf22488b53d50f75281d7dc32442e Mon Sep 17 00:00:00 2001
From: "Joseph K. Bradley"
Date: Thu, 4 Dec 2014 09:57:50 +0800
Subject: [SPARK-4580] [SPARK-4610] [mllib] [docs] Documentation for tree ensembles + DecisionTree API fix

Major changes:
* Added programming guide sections for tree ensembles
* Added examples for tree ensembles
* Updated DecisionTree programming guide with more info on parameters
* **API change**: Standardized the tree parameter for the number of classes (for classification)

Minor changes:
* Updated decision tree documentation
* Updated existing tree and tree ensemble examples
  * Use train/test split, and compute test error instead of training error.
  * Fixed decision_tree_runner.py to actually use the number of classes it computes from data. (small bug fix)

Note: I know this is a lot of lines, but most is covered by:
* Programming guide sections for gradient boosting and random forests. (The changes are probably best viewed by generating the docs locally.)
* New examples (which were copied from the programming guide)
* The "numClasses" renaming

I have run all examples and relevant unit tests.

CC: mengxr manishamde codedeft

Author: Joseph K. Bradley
Author: Joseph K. Bradley

Closes #3461 from jkbradley/ensemble-docs and squashes the following commits:

70a75f3 [Joseph K. Bradley] updated forest vs boosting comparison
d1de753 [Joseph K. Bradley] Added note about toString and toDebugString for DecisionTree to migration guide
8e87f8f [Joseph K. Bradley] Combined GBT and RandomForest guides into one ensembles guide
6fab846 [Joseph K. Bradley] small fixes based on review
b9f8576 [Joseph K. Bradley] updated decision tree doc
375204c [Joseph K. Bradley] fixed python style
2b60b6e [Joseph K. Bradley] merged Java RandomForest examples into 1 file. added header. Fixed small bug in same example in the programming guide.
706d332 [Joseph K. Bradley] updated python DT runner to print full model if it is small
c76c823 [Joseph K. Bradley] added migration guide for mllib
abe5ed7 [Joseph K. Bradley] added examples for random forest in Java and Python to examples folder
07fc11d [Joseph K. Bradley] Renamed numClassesForClassification to numClasses everywhere in trees and ensembles. This is a breaking API change, but it was necessary to correct an API inconsistency in Spark 1.1 (where Python DecisionTree used numClasses but Scala used numClassesForClassification).
cdfdfbc [Joseph K. Bradley] added examples for GBT
6372a2b [Joseph K. Bradley] updated decision tree examples to use random split. tested all of them.
ad3e695 [Joseph K. Bradley] added gbt and random forest to programming guide. still need to update their examples
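The "numClasses" renaming above is the one breaking change for readers upgrading from Spark 1.1. A minimal before/after sketch in Scala, assuming an existing tree Strategy instance named strategy (the variable name is illustrative, not from this patch):

    // Spark 1.1:
    //   strategy.numClassesForClassification = 2
    // After this patch:
    strategy.numClasses = 2

The Java-friendly setter is renamed the same way, from setNumClassesForClassification(...) to setNumClasses(...), as the first hunk below shows.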
---
 .../mllib/JavaGradientBoostedTreesRunner.java       |   2 +-
 .../examples/mllib/JavaRandomForestExample.java     | 139 +++++++++++++++++++++
 .../src/main/python/mllib/decision_tree_runner.py   |  17 +-
 .../src/main/python/mllib/random_forest_example.py  |  89 +++++++++++++
 .../spark/examples/mllib/DecisionTreeRunner.scala   |   2 +-
 .../mllib/GradientBoostedTreesRunner.scala          |   2 +-
 6 files changed, 241 insertions(+), 10 deletions(-)
 create mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomForestExample.java
 create mode 100755 examples/src/main/python/mllib/random_forest_example.py

diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaGradientBoostedTreesRunner.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaGradientBoostedTreesRunner.java
index 4a5ac404ea..a1844d5d07 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaGradientBoostedTreesRunner.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaGradientBoostedTreesRunner.java
@@ -73,7 +73,7 @@ public final class JavaGradientBoostedTreesRunner {
         return p.label();
       }
     }).countByValue().size();
-    boostingStrategy.treeStrategy().setNumClassesForClassification(numClasses);
+    boostingStrategy.treeStrategy().setNumClasses(numClasses);
 
     // Train a GradientBoosting model for classification.
     final GradientBoostedTreesModel model = GradientBoostedTrees.train(data, boostingStrategy);
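The hunk above derives numClasses from the distinct labels in the data before calling the renamed setter. The Scala equivalent is a one-liner; a sketch assuming data: RDD[LabeledPoint] and an existing boostingStrategy:

    val numClasses = data.map(_.label).countByValue().size
    boostingStrategy.treeStrategy.numClasses = numClasses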
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomForestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomForestExample.java
new file mode 100644
index 0000000000..89a4e092a5
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomForestExample.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import scala.Tuple2;
+
+import java.util.HashMap;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.api.java.function.Function2;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.tree.RandomForest;
+import org.apache.spark.mllib.tree.model.RandomForestModel;
+import org.apache.spark.mllib.util.MLUtils;
+
+public final class JavaRandomForestExample {
+
+  /**
+   * Note: This example illustrates binary classification.
+   *       For information on multiclass classification, please refer to the JavaDecisionTree.java
+   *       example.
+   */
+  private static void testClassification(JavaRDD<LabeledPoint> trainingData,
+      JavaRDD<LabeledPoint> testData) {
+    // Train a RandomForest model.
+    // Empty categoricalFeaturesInfo indicates all features are continuous.
+    Integer numClasses = 2;
+    HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
+    Integer numTrees = 3; // Use more in practice.
+    String featureSubsetStrategy = "auto"; // Let the algorithm choose.
+    String impurity = "gini";
+    Integer maxDepth = 4;
+    Integer maxBins = 32;
+    Integer seed = 12345;
+
+    final RandomForestModel model = RandomForest.trainClassifier(trainingData, numClasses,
+      categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,
+      seed);
+
+    // Evaluate model on test instances and compute test error
+    JavaPairRDD<Double, Double> predictionAndLabel =
+      testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
+        @Override
+        public Tuple2<Double, Double> call(LabeledPoint p) {
+          return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
+        }
+      });
+    Double testErr =
+      1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
+        @Override
+        public Boolean call(Tuple2<Double, Double> pl) {
+          return !pl._1().equals(pl._2());
+        }
+      }).count() / testData.count();
+    System.out.println("Test Error: " + testErr);
+    System.out.println("Learned classification forest model:\n" + model.toDebugString());
+  }
+
+  private static void testRegression(JavaRDD<LabeledPoint> trainingData,
+      JavaRDD<LabeledPoint> testData) {
+    // Train a RandomForest model.
+    // Empty categoricalFeaturesInfo indicates all features are continuous.
+    HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
+    Integer numTrees = 3; // Use more in practice.
+    String featureSubsetStrategy = "auto"; // Let the algorithm choose.
+    String impurity = "variance";
+    Integer maxDepth = 4;
+    Integer maxBins = 32;
+    Integer seed = 12345;
+
+    final RandomForestModel model = RandomForest.trainRegressor(trainingData,
+      categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,
+      seed);
+
+    // Evaluate model on test instances and compute test error
+    JavaPairRDD<Double, Double> predictionAndLabel =
+      testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
+        @Override
+        public Tuple2<Double, Double> call(LabeledPoint p) {
+          return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
+        }
+      });
+    Double testMSE =
+      predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
+        @Override
+        public Double call(Tuple2<Double, Double> pl) {
+          Double diff = pl._1() - pl._2();
+          return diff * diff;
+        }
+      }).reduce(new Function2<Double, Double, Double>() {
+        @Override
+        public Double call(Double a, Double b) {
+          return a + b;
+        }
+      }) / testData.count();
+    System.out.println("Test Mean Squared Error: " + testMSE);
+    System.out.println("Learned regression forest model:\n" + model.toDebugString());
+  }
+
+  public static void main(String[] args) {
+    SparkConf sparkConf = new SparkConf().setAppName("JavaRandomForestExample");
+    JavaSparkContext sc = new JavaSparkContext(sparkConf);
+
+    // Load and parse the data file.
+    String datapath = "data/mllib/sample_libsvm_data.txt";
+    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD();
+    // Split the data into training and test sets (30% held out for testing)
+    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
+    JavaRDD<LabeledPoint> trainingData = splits[0];
+    JavaRDD<LabeledPoint> testData = splits[1];
+
+    System.out.println("\nRunning example of classification using RandomForest\n");
+    testClassification(trainingData, testData);
+
+    System.out.println("\nRunning example of regression using RandomForest\n");
+    testRegression(trainingData, testData);
+    sc.stop();
+  }
+}
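For readers following along in Scala rather than Java, a compact sketch of the classification half of the example above, under the same parameters (assumes an existing SparkContext named sc; this sketch is not part of the patch):

    import org.apache.spark.mllib.tree.RandomForest
    import org.apache.spark.mllib.util.MLUtils

    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))

    // numClasses = 2, empty categoricalFeaturesInfo (all features continuous),
    // numTrees = 3, featureSubsetStrategy = "auto", impurity = "gini",
    // maxDepth = 4, maxBins = 32, seed = 12345
    val model = RandomForest.trainClassifier(trainingData, 2, Map[Int, Int](),
      3, "auto", "gini", 4, 32, 12345)

    // Test error = fraction of test points whose prediction differs from the label.
    val testErr = testData.map { point =>
      if (model.predict(point.features) != point.label) 1.0 else 0.0
    }.mean()
    println("Test Error: " + testErr)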
diff --git a/examples/src/main/python/mllib/decision_tree_runner.py b/examples/src/main/python/mllib/decision_tree_runner.py
index 61ea4e06ec..fccabd841b 100755
--- a/examples/src/main/python/mllib/decision_tree_runner.py
+++ b/examples/src/main/python/mllib/decision_tree_runner.py
@@ -106,8 +106,7 @@ def reindexClassLabels(data):
 
 def usage():
     print >> sys.stderr, \
-        "Usage: decision_tree_runner [libsvm format data filepath]\n" + \
-        "       Note: This only supports binary classification."
+        "Usage: decision_tree_runner [libsvm format data filepath]"
     exit(1)
 
 
@@ -127,16 +126,20 @@ if __name__ == "__main__":
     # Re-index class labels if needed.
     (reindexedData, origToNewLabels) = reindexClassLabels(points)
+    numClasses = len(origToNewLabels)
 
     # Train a classifier.
     categoricalFeaturesInfo = {}  # no categorical features
-    model = DecisionTree.trainClassifier(reindexedData, numClasses=2,
+    model = DecisionTree.trainClassifier(reindexedData, numClasses=numClasses,
                                          categoricalFeaturesInfo=categoricalFeaturesInfo)
 
     # Print learned tree and stats.
     print "Trained DecisionTree for classification:"
-    print "  Model numNodes: %d\n" % model.numNodes()
-    print "  Model depth: %d\n" % model.depth()
-    print "  Training accuracy: %g\n" % getAccuracy(model, reindexedData)
-    print model
+    print "  Model numNodes: %d" % model.numNodes()
+    print "  Model depth: %d" % model.depth()
+    print "  Training accuracy: %g" % getAccuracy(model, reindexedData)
+    if model.numNodes() < 20:
+        print model.toDebugString()
+    else:
+        print model
 
     sc.stop()
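The new conditional print relies on the model's two string forms: toString gives a short summary while toDebugString gives the full tree, which the migration-guide commit above also documents. A rough Scala analogue for any trained DecisionTreeModel (sketch only):

    if (model.numNodes < 20) {
      println(model.toDebugString) // full tree structure
    } else {
      println(model) // summary: number of nodes and depth
    }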
+""" + +import sys + +from pyspark.context import SparkContext +from pyspark.mllib.tree import RandomForest +from pyspark.mllib.util import MLUtils + + +def testClassification(trainingData, testData): + # Train a RandomForest model. + # Empty categoricalFeaturesInfo indicates all features are continuous. + # Note: Use larger numTrees in practice. + # Setting featureSubsetStrategy="auto" lets the algorithm choose. + model = RandomForest.trainClassifier(trainingData, numClasses=2, + categoricalFeaturesInfo={}, + numTrees=3, featureSubsetStrategy="auto", + impurity='gini', maxDepth=4, maxBins=32) + + # Evaluate model on test instances and compute test error + predictions = model.predict(testData.map(lambda x: x.features)) + labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) + testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count()\ + / float(testData.count()) + print('Test Error = ' + str(testErr)) + print('Learned classification forest model:') + print(model.toDebugString()) + + +def testRegression(trainingData, testData): + # Train a RandomForest model. + # Empty categoricalFeaturesInfo indicates all features are continuous. + # Note: Use larger numTrees in practice. + # Setting featureSubsetStrategy="auto" lets the algorithm choose. + model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={}, + numTrees=3, featureSubsetStrategy="auto", + impurity='variance', maxDepth=4, maxBins=32) + + # Evaluate model on test instances and compute test error + predictions = model.predict(testData.map(lambda x: x.features)) + labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) + testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum()\ + / float(testData.count()) + print('Test Mean Squared Error = ' + str(testMSE)) + print('Learned regression forest model:') + print(model.toDebugString()) + + +if __name__ == "__main__": + if len(sys.argv) > 1: + print >> sys.stderr, "Usage: random_forest_example" + exit(1) + sc = SparkContext(appName="PythonRandomForestExample") + + # Load and parse the data file into an RDD of LabeledPoint. 
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala
index 54953adb5f..205d80dd02 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala
@@ -278,7 +278,7 @@ object DecisionTreeRunner {
           impurity = impurityCalculator,
           maxDepth = params.maxDepth,
           maxBins = params.maxBins,
-          numClassesForClassification = numClasses,
+          numClasses = numClasses,
           minInstancesPerNode = params.minInstancesPerNode,
           minInfoGain = params.minInfoGain,
           useNodeIdCache = params.useNodeIdCache,
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostedTreesRunner.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostedTreesRunner.scala
index 1def8b45a2..431ead8c0c 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostedTreesRunner.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostedTreesRunner.scala
@@ -103,7 +103,7 @@ object GradientBoostedTreesRunner {
       params.dataFormat, params.testInput, Algo.withName(params.algo), params.fracTest)
 
     val boostingStrategy = BoostingStrategy.defaultParams(params.algo)
-    boostingStrategy.treeStrategy.numClassesForClassification = numClasses
+    boostingStrategy.treeStrategy.numClasses = numClasses
     boostingStrategy.numIterations = params.numIterations
     boostingStrategy.treeStrategy.maxDepth = params.maxDepth
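For completeness, a minimal Scala sketch of the BoostingStrategy setup the runner above exercises, using the renamed field (assumes trainingData: RDD[LabeledPoint]; the parameter values are illustrative, not from this patch):

    import org.apache.spark.mllib.tree.GradientBoostedTrees
    import org.apache.spark.mllib.tree.configuration.BoostingStrategy

    val boostingStrategy = BoostingStrategy.defaultParams("Classification")
    boostingStrategy.numIterations = 3 // Use more in practice.
    boostingStrategy.treeStrategy.numClasses = 2 // was numClassesForClassification
    boostingStrategy.treeStrategy.maxDepth = 5

    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)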