path: root/docs/ml-decision-tree.md
author    Joseph K. Bradley <joseph@databricks.com>  2015-08-24 15:38:54 -0700
committer Xiangrui Meng <meng@databricks.com>        2015-08-24 15:38:54 -0700
commit  13db11cb08eb90eb0ea3402c9fe0270aa282f247 (patch)
tree    6b428c218b3c1585294284c452def849b20d204a /docs/ml-decision-tree.md
parent  cb2d2e15844d7ae34b5dd7028b55e11586ed93fa (diff)
[SPARK-10061] [DOC] ML ensemble docs
User guide for spark.ml GBTs and Random Forests. The examples are copied from the
decision tree guide and modified to run. I caught some issues I had somehow missed
in the tree guide as well. I have run all examples, including Java ones. (Of course,
I thought I had previously as well...)

CC: mengxr manishamde yanboliang

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #8369 from jkbradley/ml-ensemble-docs.
Diffstat (limited to 'docs/ml-decision-tree.md')
-rw-r--r--  docs/ml-decision-tree.md  75
1 file changed, 29 insertions(+), 46 deletions(-)
diff --git a/docs/ml-decision-tree.md b/docs/ml-decision-tree.md
index 958c6f5e47..542819e93e 100644
--- a/docs/ml-decision-tree.md
+++ b/docs/ml-decision-tree.md
@@ -30,7 +30,7 @@ The Pipelines API for Decision Trees offers a bit more functionality than the or
Ensembles of trees (Random Forests and Gradient-Boosted Trees) are described in the [Ensembles guide](ml-ensembles.html).
-# Inputs and Outputs (Predictions)
+# Inputs and Outputs
We list the input and output (prediction) column types here.
All output columns are optional; to exclude an output column, set its corresponding Param to an empty string.
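As a minimal sketch of what this paragraph means in practice (not part of this patch; the classifier and column names below are illustrative), an optional output column such as the probability column can be suppressed like so:

{% highlight scala %}
import org.apache.spark.ml.classification.DecisionTreeClassifier

val dt = new DecisionTreeClassifier()
  .setLabelCol("indexedLabel")
  .setFeaturesCol("indexedFeatures")
  .setProbabilityCol("") // empty string: do not produce this output column
{% endhighlight %}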
@@ -234,7 +234,7 @@ IndexToString labelConverter = new IndexToString()
// Chain indexers and tree in a Pipeline
Pipeline pipeline = new Pipeline()
- .setStages(new PipelineStage[]{labelIndexer, featureIndexer, dt, labelConverter});
+ .setStages(new PipelineStage[] {labelIndexer, featureIndexer, dt, labelConverter});
// Train model. This also runs the indexers.
PipelineModel model = pipeline.fit(trainingData);
@@ -315,10 +315,13 @@ print treeModel # summary only
## Regression
+The following examples load a dataset in LibSVM format, split it into training and test sets, train on the first dataset, and then evaluate on the held-out test set.
+We use a feature transformer to index categorical features, adding metadata to the `DataFrame` which the Decision Tree algorithm can recognize.
+
<div class="codetabs">
<div data-lang="scala" markdown="1">
-More details on parameters can be found in the [Scala API documentation](api/scala/index.html#org.apache.spark.ml.classification.DecisionTreeClassifier).
+More details on parameters can be found in the [Scala API documentation](api/scala/index.html#org.apache.spark.ml.regression.DecisionTreeRegressor).
{% highlight scala %}
import org.apache.spark.ml.Pipeline
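Condensing the workflow the new intro paragraph describes into one place (a sketch, not part of the patch; it assumes a SQLContext with implicits in scope so that `.toDF()` is available):

{% highlight scala %}
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.mllib.util.MLUtils

// Load the LibSVM file as a DataFrame.
val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()

// Index categorical features, adding metadata the tree algorithm can read;
// features with > 4 distinct values are treated as continuous.
val featureIndexer = new VectorIndexer()
  .setInputCol("features")
  .setOutputCol("indexedFeatures")
  .setMaxCategories(4)
  .fit(data)

// Hold out 30% of the data for testing.
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
{% endhighlight %}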
@@ -347,7 +350,7 @@ val dt = new DecisionTreeRegressor()
.setLabelCol("label")
.setFeaturesCol("indexedFeatures")
-// Chain indexers and tree in a Pipeline
+// Chain indexer and tree in a Pipeline
val pipeline = new Pipeline()
.setStages(Array(featureIndexer, dt))
@@ -365,9 +368,7 @@ val evaluator = new RegressionEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("rmse")
-// We negate the RMSE value since RegressionEvalutor returns negated RMSE
-// (since evaluation metrics are meant to be maximized by CrossValidator).
-val rmse = - evaluator.evaluate(predictions)
+val rmse = evaluator.evaluate(predictions)
println("Root Mean Squared Error (RMSE) on test data = " + rmse)
val treeModel = model.stages(1).asInstanceOf[DecisionTreeRegressionModel]
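The deleted negation reflects an evaluator behavior change: `RegressionEvaluator.evaluate` now returns the raw metric value, and model-selection tools instead consult the evaluator's `isLargerBetter` flag to decide which direction to optimize. A hedged sketch (reusing `predictions` from the example above; the flag belongs to the Evaluator API of this release, not to this patch):

{% highlight scala %}
import org.apache.spark.ml.evaluation.RegressionEvaluator

val evaluator = new RegressionEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("rmse")

// Raw RMSE; no manual sign flip is needed anymore.
val rmse = evaluator.evaluate(predictions)

// false for "rmse": CrossValidator will minimize rather than maximize.
println(evaluator.isLargerBetter)
{% endhighlight %}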
@@ -377,14 +378,15 @@ println("Learned regression tree model:\n" + treeModel.toDebugString)
<div data-lang="java" markdown="1">
-More details on parameters can be found in the [Java API documentation](api/java/org/apache/spark/ml/classification/DecisionTreeClassifier.html).
+More details on parameters can be found in the [Java API documentation](api/java/org/apache/spark/ml/regression/DecisionTreeRegressor.html).
{% highlight java %}
import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.evaluation.RegressionEvaluator;
-import org.apache.spark.ml.feature.*;
+import org.apache.spark.ml.feature.VectorIndexer;
+import org.apache.spark.ml.feature.VectorIndexerModel;
import org.apache.spark.ml.regression.DecisionTreeRegressionModel;
import org.apache.spark.ml.regression.DecisionTreeRegressor;
import org.apache.spark.mllib.regression.LabeledPoint;
@@ -396,17 +398,12 @@ import org.apache.spark.sql.DataFrame;
RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt");
DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class);
-// Index labels, adding metadata to the label column.
-// Fit on whole dataset to include all labels in index.
-StringIndexerModel labelIndexer = new StringIndexer()
- .setInputCol("label")
- .setOutputCol("indexedLabel")
- .fit(data);
// Automatically identify categorical features, and index them.
+// Set maxCategories so features with > 4 distinct values are treated as continuous.
VectorIndexerModel featureIndexer = new VectorIndexer()
.setInputCol("features")
.setOutputCol("indexedFeatures")
- .setMaxCategories(4) // features with > 4 distinct values are treated as continuous
+ .setMaxCategories(4)
.fit(data);
// Split the data into training and test sets (30% held out for testing)
@@ -416,61 +413,49 @@ DataFrame testData = splits[1];
// Train a DecisionTree model.
DecisionTreeRegressor dt = new DecisionTreeRegressor()
- .setLabelCol("indexedLabel")
.setFeaturesCol("indexedFeatures");
-// Convert indexed labels back to original labels.
-IndexToString labelConverter = new IndexToString()
- .setInputCol("prediction")
- .setOutputCol("predictedLabel")
- .setLabels(labelIndexer.labels());
-
-// Chain indexers and tree in a Pipeline
+// Chain indexer and tree in a Pipeline
Pipeline pipeline = new Pipeline()
- .setStages(new PipelineStage[]{labelIndexer, featureIndexer, dt, labelConverter});
+ .setStages(new PipelineStage[] {featureIndexer, dt});
-// Train model. This also runs the indexers.
+// Train model. This also runs the indexer.
PipelineModel model = pipeline.fit(trainingData);
// Make predictions.
DataFrame predictions = model.transform(testData);
// Select example rows to display.
-predictions.select("predictedLabel", "label", "features").show(5);
+predictions.select("label", "features").show(5);
// Select (prediction, true label) and compute test error
RegressionEvaluator evaluator = new RegressionEvaluator()
- .setLabelCol("indexedLabel")
+ .setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("rmse");
-// We negate the RMSE value since RegressionEvalutor returns negated RMSE
-// (since evaluation metrics are meant to be maximized by CrossValidator).
-double rmse = - evaluator.evaluate(predictions);
+double rmse = evaluator.evaluate(predictions);
System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse);
DecisionTreeRegressionModel treeModel =
- (DecisionTreeRegressionModel)(model.stages()[2]);
+ (DecisionTreeRegressionModel)(model.stages()[1]);
System.out.println("Learned regression tree model:\n" + treeModel.toDebugString());
{% endhighlight %}
</div>
<div data-lang="python" markdown="1">
-More details on parameters can be found in the [Python API documentation](api/python/pyspark.ml.html#pyspark.ml.classification.DecisionTreeClassifier).
+More details on parameters can be found in the [Python API documentation](api/python/pyspark.ml.html#pyspark.ml.regression.DecisionTreeRegressor).
{% highlight python %}
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
-from pyspark.ml.feature import StringIndexer, VectorIndexer
+from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.util import MLUtils
# Load and parse the data file, converting it to a DataFrame.
data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
-# Index labels, adding metadata to the label column.
-# Fit on whole dataset to include all labels in index.
-labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
@@ -480,26 +465,24 @@ featureIndexer =\
(trainingData, testData) = data.randomSplit([0.7, 0.3])
# Train a DecisionTree model.
-dt = DecisionTreeRegressor(labelCol="indexedLabel", featuresCol="indexedFeatures")
+dt = DecisionTreeRegressor(featuresCol="indexedFeatures")
-# Chain indexers and tree in a Pipeline
-pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
+# Chain indexer and tree in a Pipeline
+pipeline = Pipeline(stages=[featureIndexer, dt])
-# Train model. This also runs the indexers.
+# Train model. This also runs the indexer.
model = pipeline.fit(trainingData)
# Make predictions.
predictions = model.transform(testData)
# Select example rows to display.
-predictions.select("prediction", "indexedLabel", "features").show(5)
+predictions.select("prediction", "label", "features").show(5)
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
- labelCol="indexedLabel", predictionCol="prediction", metricName="rmse")
-# We negate the RMSE value since RegressionEvalutor returns negated RMSE
-# (since evaluation metrics are meant to be maximized by CrossValidator).
-rmse = -evaluator.evaluate(predictions)
+ labelCol="label", predictionCol="prediction", metricName="rmse")
+rmse = evaluator.evaluate(predictions)
print "Root Mean Squared Error (RMSE) on test data = %g" % rmse
treeModel = model.stages[1]