From 657a88835d8bf22488b53d50f75281d7dc32442e Mon Sep 17 00:00:00 2001
From: "Joseph K. Bradley"
Date: Thu, 4 Dec 2014 09:57:50 +0800
Subject: [SPARK-4580] [SPARK-4610] [mllib] [docs] Documentation for tree ensembles + DecisionTree API fix

Major changes:
* Added programming guide sections for tree ensembles
* Added examples for tree ensembles
* Updated DecisionTree programming guide with more info on parameters
* **API change**: Standardized the tree parameter for the number of classes (for classification)

Minor changes:
* Updated decision tree documentation
* Updated existing tree and tree ensemble examples
  * Use train/test split, and compute test error instead of training error.
  * Fixed decision_tree_runner.py to actually use the number of classes it computes from data. (small bug fix)

Note: I know this is a lot of lines, but most is covered by:
* Programming guide sections for gradient boosting and random forests. (The changes are probably best viewed by generating the docs locally.)
* New examples (which were copied from the programming guide)
* The "numClasses" renaming

I have run all examples and relevant unit tests.

CC: mengxr manishamde codedeft

Author: Joseph K. Bradley
Author: Joseph K. Bradley

Closes #3461 from jkbradley/ensemble-docs and squashes the following commits:

70a75f3 [Joseph K. Bradley] updated forest vs boosting comparison
d1de753 [Joseph K. Bradley] Added note about toString and toDebugString for DecisionTree to migration guide
8e87f8f [Joseph K. Bradley] Combined GBT and RandomForest guides into one ensembles guide
6fab846 [Joseph K. Bradley] small fixes based on review
b9f8576 [Joseph K. Bradley] updated decision tree doc
375204c [Joseph K. Bradley] fixed python style
2b60b6e [Joseph K. Bradley] merged Java RandomForest examples into 1 file. added header. Fixed small bug in same example in the programming guide.
706d332 [Joseph K. Bradley] updated python DT runner to print full model if it is small
c76c823 [Joseph K. Bradley] added migration guide for mllib
abe5ed7 [Joseph K. Bradley] added examples for random forest in Java and Python to examples folder
07fc11d [Joseph K. Bradley] Renamed numClassesForClassification to numClasses everywhere in trees and ensembles. This is a breaking API change, but it was necessary to correct an API inconsistency in Spark 1.1 (where Python DecisionTree used numClasses but Scala used numClassesForClassification).
cdfdfbc [Joseph K. Bradley] added examples for GBT
6372a2b [Joseph K. Bradley] updated decision tree examples to use random split. tested all of them.
ad3e695 [Joseph K. Bradley] added gbt and random forest to programming guide. still need to update their examples
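The "numClasses" renaming above is the one breaking change for readers upgrading from Spark 1.1. A minimal before/after sketch in Scala, assuming an existing tree Strategy instance named strategy (the variable name is illustrative, not from this patch):

    // Spark 1.1:
    //   strategy.numClassesForClassification = 2
    // After this patch:
    strategy.numClasses = 2

The Java-friendly setter is renamed the same way, from setNumClassesForClassification(...) to setNumClasses(...), as the first hunk below shows.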
---
 .../mllib/JavaGradientBoostedTreesRunner.java       |   2 +-
 .../examples/mllib/JavaRandomForestExample.java     | 139 +++++++++++++++++++++
 .../src/main/python/mllib/decision_tree_runner.py   |  17 +-
 .../src/main/python/mllib/random_forest_example.py  |  89 +++++++++++++
 .../spark/examples/mllib/DecisionTreeRunner.scala   |   2 +-
 .../mllib/GradientBoostedTreesRunner.scala          |   2 +-
 6 files changed, 241 insertions(+), 10 deletions(-)
 create mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomForestExample.java
 create mode 100755 examples/src/main/python/mllib/random_forest_example.py

diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaGradientBoostedTreesRunner.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaGradientBoostedTreesRunner.java
index 4a5ac404ea..a1844d5d07 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaGradientBoostedTreesRunner.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaGradientBoostedTreesRunner.java
@@ -73,7 +73,7 @@ public final class JavaGradientBoostedTreesRunner {
         return p.label();
       }
     }).countByValue().size();
-    boostingStrategy.treeStrategy().setNumClassesForClassification(numClasses);
+    boostingStrategy.treeStrategy().setNumClasses(numClasses);
 
     // Train a GradientBoosting model for classification.
     final GradientBoostedTreesModel model = GradientBoostedTrees.train(data, boostingStrategy);
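The hunk above derives numClasses from the distinct labels in the data before calling the renamed setter. The Scala equivalent is a one-liner; a sketch assuming data: RDD[LabeledPoint] and an existing boostingStrategy:

    val numClasses = data.map(_.label).countByValue().size
    boostingStrategy.treeStrategy.numClasses = numClasses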
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomForestExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomForestExample.java
new file mode 100644
index 0000000000..89a4e092a5
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRandomForestExample.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import scala.Tuple2;
+
+import java.util.HashMap;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.api.java.function.Function2;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.tree.RandomForest;
+import org.apache.spark.mllib.tree.model.RandomForestModel;
+import org.apache.spark.mllib.util.MLUtils;
+
+public final class JavaRandomForestExample {
+
+  /**
+   * Note: This example illustrates binary classification.
+   *       For information on multiclass classification, please refer to the JavaDecisionTree.java
+   *       example.
+   */
+  private static void testClassification(JavaRDD<LabeledPoint> trainingData,
+      JavaRDD<LabeledPoint> testData) {
+    // Train a RandomForest model.
+    // Empty categoricalFeaturesInfo indicates all features are continuous.
+    Integer numClasses = 2;
+    HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
+    Integer numTrees = 3; // Use more in practice.
+    String featureSubsetStrategy = "auto"; // Let the algorithm choose.
+    String impurity = "gini";
+    Integer maxDepth = 4;
+    Integer maxBins = 32;
+    Integer seed = 12345;
+
+    final RandomForestModel model = RandomForest.trainClassifier(trainingData, numClasses,
+      categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,
+      seed);
+
+    // Evaluate model on test instances and compute test error
+    JavaPairRDD<Double, Double> predictionAndLabel =
+      testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
+        @Override
+        public Tuple2<Double, Double> call(LabeledPoint p) {
+          return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
+        }
+      });
+    Double testErr =
+      1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
+        @Override
+        public Boolean call(Tuple2<Double, Double> pl) {
+          return !pl._1().equals(pl._2());
+        }
+      }).count() / testData.count();
+    System.out.println("Test Error: " + testErr);
+    System.out.println("Learned classification forest model:\n" + model.toDebugString());
+  }
+
+  private static void testRegression(JavaRDD<LabeledPoint> trainingData,
+      JavaRDD<LabeledPoint> testData) {
+    // Train a RandomForest model.
+    // Empty categoricalFeaturesInfo indicates all features are continuous.
+    HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
+    Integer numTrees = 3; // Use more in practice.
+    String featureSubsetStrategy = "auto"; // Let the algorithm choose.
+    String impurity = "variance";
+    Integer maxDepth = 4;
+    Integer maxBins = 32;
+    Integer seed = 12345;
+
+    final RandomForestModel model = RandomForest.trainRegressor(trainingData,
+      categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,
+      seed);
+
+    // Evaluate model on test instances and compute test error
+    JavaPairRDD<Double, Double> predictionAndLabel =
+      testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
+        @Override
+        public Tuple2<Double, Double> call(LabeledPoint p) {
+          return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
+        }
+      });
+    Double testMSE =
+      predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
+        @Override
+        public Double call(Tuple2<Double, Double> pl) {
+          Double diff = pl._1() - pl._2();
+          return diff * diff;
+        }
+      }).reduce(new Function2<Double, Double, Double>() {
+        @Override
+        public Double call(Double a, Double b) {
+          return a + b;
+        }
+      }) / testData.count();
+    System.out.println("Test Mean Squared Error: " + testMSE);
+    System.out.println("Learned regression forest model:\n" + model.toDebugString());
+  }
+
+  public static void main(String[] args) {
+    SparkConf sparkConf = new SparkConf().setAppName("JavaRandomForestExample");
+    JavaSparkContext sc = new JavaSparkContext(sparkConf);
+
+    // Load and parse the data file.
+    String datapath = "data/mllib/sample_libsvm_data.txt";
+    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD();
+    // Split the data into training and test sets (30% held out for testing)
+    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
+    JavaRDD<LabeledPoint> trainingData = splits[0];
+    JavaRDD<LabeledPoint> testData = splits[1];
+
+    System.out.println("\nRunning example of classification using RandomForest\n");
+    testClassification(trainingData, testData);
+
+    System.out.println("\nRunning example of regression using RandomForest\n");
+    testRegression(trainingData, testData);
+    sc.stop();
+  }
+}
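For readers following along in Scala rather than Java, a compact sketch of the classification half of the example above, under the same parameters (assumes an existing SparkContext named sc; this sketch is not part of the patch):

    import org.apache.spark.mllib.tree.RandomForest
    import org.apache.spark.mllib.util.MLUtils

    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))

    // numClasses = 2, empty categoricalFeaturesInfo (all features continuous),
    // numTrees = 3, featureSubsetStrategy = "auto", impurity = "gini",
    // maxDepth = 4, maxBins = 32, seed = 12345
    val model = RandomForest.trainClassifier(trainingData, 2, Map[Int, Int](),
      3, "auto", "gini", 4, 32, 12345)

    // Test error = fraction of test points whose prediction differs from the label.
    val testErr = testData.map { point =>
      if (model.predict(point.features) != point.label) 1.0 else 0.0
    }.mean()
    println("Test Error: " + testErr)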
diff --git a/examples/src/main/python/mllib/decision_tree_runner.py b/examples/src/main/python/mllib/decision_tree_runner.py
index 61ea4e06ec..fccabd841b 100755
--- a/examples/src/main/python/mllib/decision_tree_runner.py
+++ b/examples/src/main/python/mllib/decision_tree_runner.py
@@ -106,8 +106,7 @@ def reindexClassLabels(data):
 
 def usage():
     print >> sys.stderr, \
-        "Usage: decision_tree_runner [libsvm format data filepath]\n" + \
-        "       Note: This only supports binary classification."
+        "Usage: decision_tree_runner [libsvm format data filepath]"
     exit(1)
 
 
@@ -127,16 +126,20 @@ if __name__ == "__main__":
     # Re-index class labels if needed.
     (reindexedData, origToNewLabels) = reindexClassLabels(points)
+    numClasses = len(origToNewLabels)
 
     # Train a classifier.
     categoricalFeaturesInfo = {}  # no categorical features
-    model = DecisionTree.trainClassifier(reindexedData, numClasses=2,
+    model = DecisionTree.trainClassifier(reindexedData, numClasses=numClasses,
                                          categoricalFeaturesInfo=categoricalFeaturesInfo)
 
     # Print learned tree and stats.
     print "Trained DecisionTree for classification:"
-    print "  Model numNodes: %d\n" % model.numNodes()
-    print "  Model depth: %d\n" % model.depth()
-    print "  Training accuracy: %g\n" % getAccuracy(model, reindexedData)
-    print model
+    print "  Model numNodes: %d" % model.numNodes()
+    print "  Model depth: %d" % model.depth()
+    print "  Training accuracy: %g" % getAccuracy(model, reindexedData)
+    if model.numNodes() < 20:
+        print model.toDebugString()
+    else:
+        print model
 
     sc.stop()
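The new conditional print relies on the model's two string forms: toString gives a short summary while toDebugString gives the full tree, which the migration-guide commit above also documents. A rough Scala analogue for any trained DecisionTreeModel (sketch only):

    if (model.numNodes < 20) {
      println(model.toDebugString) // full tree structure
    } else {
      println(model) // summary: number of nodes and depth
    }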
+""" + +import sys + +from pyspark.context import SparkContext +from pyspark.mllib.tree import RandomForest +from pyspark.mllib.util import MLUtils + + +def testClassification(trainingData, testData): + # Train a RandomForest model. + # Empty categoricalFeaturesInfo indicates all features are continuous. + # Note: Use larger numTrees in practice. + # Setting featureSubsetStrategy="auto" lets the algorithm choose. + model = RandomForest.trainClassifier(trainingData, numClasses=2, + categoricalFeaturesInfo={}, + numTrees=3, featureSubsetStrategy="auto", + impurity='gini', maxDepth=4, maxBins=32) + + # Evaluate model on test instances and compute test error + predictions = model.predict(testData.map(lambda x: x.features)) + labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) + testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count()\ + / float(testData.count()) + print('Test Error = ' + str(testErr)) + print('Learned classification forest model:') + print(model.toDebugString()) + + +def testRegression(trainingData, testData): + # Train a RandomForest model. + # Empty categoricalFeaturesInfo indicates all features are continuous. + # Note: Use larger numTrees in practice. + # Setting featureSubsetStrategy="auto" lets the algorithm choose. + model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={}, + numTrees=3, featureSubsetStrategy="auto", + impurity='variance', maxDepth=4, maxBins=32) + + # Evaluate model on test instances and compute test error + predictions = model.predict(testData.map(lambda x: x.features)) + labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) + testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum()\ + / float(testData.count()) + print('Test Mean Squared Error = ' + str(testMSE)) + print('Learned regression forest model:') + print(model.toDebugString()) + + +if __name__ == "__main__": + if len(sys.argv) > 1: + print >> sys.stderr, "Usage: random_forest_example" + exit(1) + sc = SparkContext(appName="PythonRandomForestExample") + + # Load and parse the data file into an RDD of LabeledPoint. 
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala
index 54953adb5f..205d80dd02 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala
@@ -278,7 +278,7 @@ object DecisionTreeRunner {
           impurity = impurityCalculator,
           maxDepth = params.maxDepth,
           maxBins = params.maxBins,
-          numClassesForClassification = numClasses,
+          numClasses = numClasses,
           minInstancesPerNode = params.minInstancesPerNode,
           minInfoGain = params.minInfoGain,
           useNodeIdCache = params.useNodeIdCache,
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostedTreesRunner.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostedTreesRunner.scala
index 1def8b45a2..431ead8c0c 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostedTreesRunner.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostedTreesRunner.scala
@@ -103,7 +103,7 @@ object GradientBoostedTreesRunner {
       params.dataFormat, params.testInput, Algo.withName(params.algo), params.fracTest)
 
     val boostingStrategy = BoostingStrategy.defaultParams(params.algo)
-    boostingStrategy.treeStrategy.numClassesForClassification = numClasses
+    boostingStrategy.treeStrategy.numClasses = numClasses
     boostingStrategy.numIterations = params.numIterations
     boostingStrategy.treeStrategy.maxDepth = params.maxDepth
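For completeness, a minimal Scala sketch of the BoostingStrategy setup the runner above exercises, using the renamed field (assumes trainingData: RDD[LabeledPoint]; the parameter values are illustrative, not from this patch):

    import org.apache.spark.mllib.tree.GradientBoostedTrees
    import org.apache.spark.mllib.tree.configuration.BoostingStrategy

    val boostingStrategy = BoostingStrategy.defaultParams("Classification")
    boostingStrategy.numIterations = 3 // Use more in practice.
    boostingStrategy.treeStrategy.numClasses = 2 // was numClassesForClassification
    boostingStrategy.treeStrategy.maxDepth = 5

    val model = GradientBoostedTrees.train(trainingData, boostingStrategy)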