[SPARK-11728] Replace example code in ml-ensembles.md using include_example

JIRA issue https://issues.apache.org/jira/browse/SPARK-11728. The ml-ensembles.md file contains `OneVsRestExample`. Instead of writing new code files for the two `OneVsRestExample`s, I use two existing files in the examples directory: `OneVsRestExample.scala` and `JavaOneVsRestExample.java`. Author: Xusen Yin <yinxusen@gmail.com> Closes #9716 from yinxusen/SPARK-11728.
author: Xusen Yin <yinxusen@gmail.com> 2015-11-17 23:44:06 -0800
committer: Xiangrui Meng <meng@databricks.com> 2015-11-17 23:44:06 -0800
commit: 9154f89befb7a33d4853cea95efd7dc6b25d033b (patch)
tree: 8eb6da0ff09ba6c3b2fe34859077e5a55c5ed3df /docs/ml-ensembles.md
parent: 2f191c66b668fc97f82f44fd8336b6a4488c2f5d (diff)
download: spark-9154f89befb7a33d4853cea95efd7dc6b25d033b.tar.gz
spark-9154f89befb7a33d4853cea95efd7dc6b25d033b.tar.bz2
spark-9154f89befb7a33d4853cea95efd7dc6b25d033b.zip
1 files changed, 14 insertions, 740 deletions
diff --git a/docs/ml-ensembles.md b/docs/ml-ensembles.md
index ce15f5e646..f6c3c30d53 100644
--- a/docs/ml-ensembles.md
+++ b/docs/ml-ensembles.md
@@ -115,194 +115,21 @@ We use two feature transformers to prepare the data; these help index categories
 
 Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.classification.RandomForestClassifier) for more details.
 
-{% highlight scala %}
-import org.apache.spark.ml.Pipeline
-import org.apache.spark.ml.classification.RandomForestClassifier
-import org.apache.spark.ml.classification.RandomForestClassificationModel
-import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer}
-import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
-
-// Load and parse the data file, converting it to a DataFrame.
-val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
-
-// Index labels, adding metadata to the label column.
-// Fit on whole dataset to include all labels in index.
-val labelIndexer = new StringIndexer()
-  .setInputCol("label")
-  .setOutputCol("indexedLabel")
-  .fit(data)
-// Automatically identify categorical features, and index them.
-// Set maxCategories so features with > 4 distinct values are treated as continuous.
-val featureIndexer = new VectorIndexer()
-  .setInputCol("features")
-  .setOutputCol("indexedFeatures")
-  .setMaxCategories(4)
-  .fit(data)
-
-// Split the data into training and test sets (30% held out for testing)
-val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
-
-// Train a RandomForest model.
-val rf = new RandomForestClassifier()
-  .setLabelCol("indexedLabel")
-  .setFeaturesCol("indexedFeatures")
-  .setNumTrees(10)
-
-// Convert indexed labels back to original labels.
-val labelConverter = new IndexToString()
-  .setInputCol("prediction")
-  .setOutputCol("predictedLabel")
-  .setLabels(labelIndexer.labels)
-
-// Chain indexers and forest in a Pipeline
-val pipeline = new Pipeline()
-  .setStages(Array(labelIndexer, featureIndexer, rf, labelConverter))
-
-// Train model.  This also runs the indexers.
-val model = pipeline.fit(trainingData)
-
-// Make predictions.
-val predictions = model.transform(testData)
-
-// Select example rows to display.
-predictions.select("predictedLabel", "label", "features").show(5)
-
-// Select (prediction, true label) and compute test error
-val evaluator = new MulticlassClassificationEvaluator()
-  .setLabelCol("indexedLabel")
-  .setPredictionCol("prediction")
-  .setMetricName("precision")
-val accuracy = evaluator.evaluate(predictions)
-println("Test Error = " + (1.0 - accuracy))
-
-val rfModel = model.stages(2).asInstanceOf[RandomForestClassificationModel]
-println("Learned classification forest model:\n" + rfModel.toDebugString)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
 
 Refer to the [Java API docs](api/java/org/apache/spark/ml/classification/RandomForestClassifier.html) for more details.
 
-{% highlight java %}
-import org.apache.spark.ml.Pipeline;
-import org.apache.spark.ml.PipelineModel;
-import org.apache.spark.ml.PipelineStage;
-import org.apache.spark.ml.classification.RandomForestClassifier;
-import org.apache.spark.ml.classification.RandomForestClassificationModel;
-import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
-import org.apache.spark.ml.feature.*;
-import org.apache.spark.sql.DataFrame;
-
-// Load and parse the data file, converting it to a DataFrame.
-DataFrame data = sqlContext.read().format("libsvm")
-  .load("data/mllib/sample_libsvm_data.txt");
-
-// Index labels, adding metadata to the label column.
-// Fit on whole dataset to include all labels in index.
-StringIndexerModel labelIndexer = new StringIndexer()
-  .setInputCol("label")
-  .setOutputCol("indexedLabel")
-  .fit(data);
-// Automatically identify categorical features, and index them.
-// Set maxCategories so features with > 4 distinct values are treated as continuous.
-VectorIndexerModel featureIndexer = new VectorIndexer()
-  .setInputCol("features")
-  .setOutputCol("indexedFeatures")
-  .setMaxCategories(4)
-  .fit(data);
-
-// Split the data into training and test sets (30% held out for testing)
-DataFrame[] splits = data.randomSplit(new double[] {0.7, 0.3});
-DataFrame trainingData = splits[0];
-DataFrame testData = splits[1];
-
-// Train a RandomForest model.
-RandomForestClassifier rf = new RandomForestClassifier()
-  .setLabelCol("indexedLabel")
-  .setFeaturesCol("indexedFeatures");
-
-// Convert indexed labels back to original labels.
-IndexToString labelConverter = new IndexToString()
-  .setInputCol("prediction")
-  .setOutputCol("predictedLabel")
-  .setLabels(labelIndexer.labels());
-
-// Chain indexers and forest in a Pipeline
-Pipeline pipeline = new Pipeline()
-  .setStages(new PipelineStage[] {labelIndexer, featureIndexer, rf, labelConverter});
-
-// Train model.  This also runs the indexers.
-PipelineModel model = pipeline.fit(trainingData);
-
-// Make predictions.
-DataFrame predictions = model.transform(testData);
-
-// Select example rows to display.
-predictions.select("predictedLabel", "label", "features").show(5);
-
-// Select (prediction, true label) and compute test error
-MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
-  .setLabelCol("indexedLabel")
-  .setPredictionCol("prediction")
-  .setMetricName("precision");
-double accuracy = evaluator.evaluate(predictions);
-System.out.println("Test Error = " + (1.0 - accuracy));
-
-RandomForestClassificationModel rfModel =
-  (RandomForestClassificationModel)(model.stages()[2]);
-System.out.println("Learned classification forest model:\n" + rfModel.toDebugString());
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaRandomForestClassifierExample.java %}
 </div>
 
 <div data-lang="python" markdown="1">
 
 Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.classification.RandomForestClassifier) for more details.
 
-{% highlight python %}
-from pyspark.ml import Pipeline
-from pyspark.ml.classification import RandomForestClassifier
-from pyspark.ml.feature import StringIndexer, VectorIndexer
-from pyspark.ml.evaluation import MulticlassClassificationEvaluator
-
-# Load and parse the data file, converting it to a DataFrame.
-data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
-
-# Index labels, adding metadata to the label column.
-# Fit on whole dataset to include all labels in index.
-labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
-# Automatically identify categorical features, and index them.
-# Set maxCategories so features with > 4 distinct values are treated as continuous.
-featureIndexer =\
-    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)
-
-# Split the data into training and test sets (30% held out for testing)
-(trainingData, testData) = data.randomSplit([0.7, 0.3])
-
-# Train a RandomForest model.
-rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
-
-# Chain indexers and forest in a Pipeline
-pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])
-
-# Train model.  This also runs the indexers.
-model = pipeline.fit(trainingData)
-
-# Make predictions.
-predictions = model.transform(testData)
-
-# Select example rows to display.
-predictions.select("prediction", "indexedLabel", "features").show(5)
-
-# Select (prediction, true label) and compute test error
-evaluator = MulticlassClassificationEvaluator(
-    labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
-accuracy = evaluator.evaluate(predictions)
-print "Test Error = %g" % (1.0 - accuracy)
-
-rfModel = model.stages[2]
-print rfModel # summary only
-{% endhighlight %}
+{% include_example python/ml/random_forest_classifier_example.py %}
 </div>
 </div>
 
@@ -316,167 +143,21 @@ We use a feature transformer to index categorical features, adding metadata to t
 
 Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.regression.RandomForestRegressor) for more details.
 
-{% highlight scala %}
-import org.apache.spark.ml.Pipeline
-import org.apache.spark.ml.regression.RandomForestRegressor
-import org.apache.spark.ml.regression.RandomForestRegressionModel
-import org.apache.spark.ml.feature.VectorIndexer
-import org.apache.spark.ml.evaluation.RegressionEvaluator
-
-// Load and parse the data file, converting it to a DataFrame.
-val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
-
-// Automatically identify categorical features, and index them.
-// Set maxCategories so features with > 4 distinct values are treated as continuous.
-val featureIndexer = new VectorIndexer()
-  .setInputCol("features")
-  .setOutputCol("indexedFeatures")
-  .setMaxCategories(4)
-  .fit(data)
-
-// Split the data into training and test sets (30% held out for testing)
-val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
-
-// Train a RandomForest model.
-val rf = new RandomForestRegressor()
-  .setLabelCol("label")
-  .setFeaturesCol("indexedFeatures")
-
-// Chain indexer and forest in a Pipeline
-val pipeline = new Pipeline()
-  .setStages(Array(featureIndexer, rf))
-
-// Train model.  This also runs the indexer.
-val model = pipeline.fit(trainingData)
-
-// Make predictions.
-val predictions = model.transform(testData)
-
-// Select example rows to display.
-predictions.select("prediction", "label", "features").show(5)
-
-// Select (prediction, true label) and compute test error
-val evaluator = new RegressionEvaluator()
-  .setLabelCol("label")
-  .setPredictionCol("prediction")
-  .setMetricName("rmse")
-val rmse = evaluator.evaluate(predictions)
-println("Root Mean Squared Error (RMSE) on test data = " + rmse)
-
-val rfModel = model.stages(1).asInstanceOf[RandomForestRegressionModel]
-println("Learned regression forest model:\n" + rfModel.toDebugString)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
 
 Refer to the [Java API docs](api/java/org/apache/spark/ml/regression/RandomForestRegressor.html) for more details.
 
-{% highlight java %}
-import org.apache.spark.ml.Pipeline;
-import org.apache.spark.ml.PipelineModel;
-import org.apache.spark.ml.PipelineStage;
-import org.apache.spark.ml.evaluation.RegressionEvaluator;
-import org.apache.spark.ml.feature.VectorIndexer;
-import org.apache.spark.ml.feature.VectorIndexerModel;
-import org.apache.spark.ml.regression.RandomForestRegressionModel;
-import org.apache.spark.ml.regression.RandomForestRegressor;
-import org.apache.spark.sql.DataFrame;
-
-// Load and parse the data file, converting it to a DataFrame.
-DataFrame data = sqlContext.read().format("libsvm")
-  .load("data/mllib/sample_libsvm_data.txt");
-
-// Automatically identify categorical features, and index them.
-// Set maxCategories so features with > 4 distinct values are treated as continuous.
-VectorIndexerModel featureIndexer = new VectorIndexer()
-  .setInputCol("features")
-  .setOutputCol("indexedFeatures")
-  .setMaxCategories(4)
-  .fit(data);
-
-// Split the data into training and test sets (30% held out for testing)
-DataFrame[] splits = data.randomSplit(new double[] {0.7, 0.3});
-DataFrame trainingData = splits[0];
-DataFrame testData = splits[1];
-
-// Train a RandomForest model.
-RandomForestRegressor rf = new RandomForestRegressor()
-  .setLabelCol("label")
-  .setFeaturesCol("indexedFeatures");
-
-// Chain indexer and forest in a Pipeline
-Pipeline pipeline = new Pipeline()
-  .setStages(new PipelineStage[] {featureIndexer, rf});
-
-// Train model.  This also runs the indexer.
-PipelineModel model = pipeline.fit(trainingData);
-
-// Make predictions.
-DataFrame predictions = model.transform(testData);
-
-// Select example rows to display.
-predictions.select("prediction", "label", "features").show(5);
-
-// Select (prediction, true label) and compute test error
-RegressionEvaluator evaluator = new RegressionEvaluator()
-  .setLabelCol("label")
-  .setPredictionCol("prediction")
-  .setMetricName("rmse");
-double rmse = evaluator.evaluate(predictions);
-System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse);
-
-RandomForestRegressionModel rfModel =
-  (RandomForestRegressionModel)(model.stages()[1]);
-System.out.println("Learned regression forest model:\n" + rfModel.toDebugString());
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaRandomForestRegressorExample.java %}
 </div>
 
 <div data-lang="python" markdown="1">
 
 Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.regression.RandomForestRegressor) for more details.
 
-{% highlight python %}
-from pyspark.ml import Pipeline
-from pyspark.ml.regression import RandomForestRegressor
-from pyspark.ml.feature import VectorIndexer
-from pyspark.ml.evaluation import RegressionEvaluator
-
-# Load and parse the data file, converting it to a DataFrame.
-data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
-
-# Automatically identify categorical features, and index them.
-# Set maxCategories so features with > 4 distinct values are treated as continuous.
-featureIndexer =\
-    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)
-
-# Split the data into training and test sets (30% held out for testing)
-(trainingData, testData) = data.randomSplit([0.7, 0.3])
-
-# Train a RandomForest model.
-rf = RandomForestRegressor(featuresCol="indexedFeatures")
-
-# Chain indexer and forest in a Pipeline
-pipeline = Pipeline(stages=[featureIndexer, rf])
-
-# Train model.  This also runs the indexer.
-model = pipeline.fit(trainingData)
-
-# Make predictions.
-predictions = model.transform(testData)
-
-# Select example rows to display.
-predictions.select("prediction", "label", "features").show(5)
-
-# Select (prediction, true label) and compute test error
-evaluator = RegressionEvaluator(
-    labelCol="label", predictionCol="prediction", metricName="rmse")
-rmse = evaluator.evaluate(predictions)
-print "Root Mean Squared Error (RMSE) on test data = %g" % rmse
-
-rfModel = model.stages[1]
-print rfModel # summary only
-{% endhighlight %}
+{% include_example python/ml/random_forest_regressor_example.py %}
 </div>
 </div>
 
@@ -560,194 +241,21 @@ We use two feature transformers to prepare the data; these help index categories
 
 Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.classification.GBTClassifier) for more details.
 
-{% highlight scala %}
-import org.apache.spark.ml.Pipeline
-import org.apache.spark.ml.classification.GBTClassifier
-import org.apache.spark.ml.classification.GBTClassificationModel
-import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer}
-import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
-
-// Load and parse the data file, converting it to a DataFrame.
-val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
-
-// Index labels, adding metadata to the label column.
-// Fit on whole dataset to include all labels in index.
-val labelIndexer = new StringIndexer()
-  .setInputCol("label")
-  .setOutputCol("indexedLabel")
-  .fit(data)
-// Automatically identify categorical features, and index them.
-// Set maxCategories so features with > 4 distinct values are treated as continuous.
-val featureIndexer = new VectorIndexer()
-  .setInputCol("features")
-  .setOutputCol("indexedFeatures")
-  .setMaxCategories(4)
-  .fit(data)
-
-// Split the data into training and test sets (30% held out for testing)
-val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
-
-// Train a GBT model.
-val gbt = new GBTClassifier()
-  .setLabelCol("indexedLabel")
-  .setFeaturesCol("indexedFeatures")
-  .setMaxIter(10)
-
-// Convert indexed labels back to original labels.
-val labelConverter = new IndexToString()
-  .setInputCol("prediction")
-  .setOutputCol("predictedLabel")
-  .setLabels(labelIndexer.labels)
-
-// Chain indexers and GBT in a Pipeline
-val pipeline = new Pipeline()
-  .setStages(Array(labelIndexer, featureIndexer, gbt, labelConverter))
-
-// Train model.  This also runs the indexers.
-val model = pipeline.fit(trainingData)
-
-// Make predictions.
-val predictions = model.transform(testData)
-
-// Select example rows to display.
-predictions.select("predictedLabel", "label", "features").show(5)
-
-// Select (prediction, true label) and compute test error
-val evaluator = new MulticlassClassificationEvaluator()
-  .setLabelCol("indexedLabel")
-  .setPredictionCol("prediction")
-  .setMetricName("precision")
-val accuracy = evaluator.evaluate(predictions)
-println("Test Error = " + (1.0 - accuracy))
-
-val gbtModel = model.stages(2).asInstanceOf[GBTClassificationModel]
-println("Learned classification GBT model:\n" + gbtModel.toDebugString)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
 
 Refer to the [Java API docs](api/java/org/apache/spark/ml/classification/GBTClassifier.html) for more details.
 
-{% highlight java %}
-import org.apache.spark.ml.Pipeline;
-import org.apache.spark.ml.PipelineModel;
-import org.apache.spark.ml.PipelineStage;
-import org.apache.spark.ml.classification.GBTClassifier;
-import org.apache.spark.ml.classification.GBTClassificationModel;
-import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
-import org.apache.spark.ml.feature.*;
-import org.apache.spark.sql.DataFrame;
-
-// Load and parse the data file, converting it to a DataFrame.
-DataFrame data sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
-
-// Index labels, adding metadata to the label column.
-// Fit on whole dataset to include all labels in index.
-StringIndexerModel labelIndexer = new StringIndexer()
-  .setInputCol("label")
-  .setOutputCol("indexedLabel")
-  .fit(data);
-// Automatically identify categorical features, and index them.
-// Set maxCategories so features with > 4 distinct values are treated as continuous.
-VectorIndexerModel featureIndexer = new VectorIndexer()
-  .setInputCol("features")
-  .setOutputCol("indexedFeatures")
-  .setMaxCategories(4)
-  .fit(data);
-
-// Split the data into training and test sets (30% held out for testing)
-DataFrame[] splits = data.randomSplit(new double[] {0.7, 0.3});
-DataFrame trainingData = splits[0];
-DataFrame testData = splits[1];
-
-// Train a GBT model.
-GBTClassifier gbt = new GBTClassifier()
-  .setLabelCol("indexedLabel")
-  .setFeaturesCol("indexedFeatures")
-  .setMaxIter(10);
-
-// Convert indexed labels back to original labels.
-IndexToString labelConverter = new IndexToString()
-  .setInputCol("prediction")
-  .setOutputCol("predictedLabel")
-  .setLabels(labelIndexer.labels());
-
-// Chain indexers and GBT in a Pipeline
-Pipeline pipeline = new Pipeline()
-  .setStages(new PipelineStage[] {labelIndexer, featureIndexer, gbt, labelConverter});
-
-// Train model.  This also runs the indexers.
-PipelineModel model = pipeline.fit(trainingData);
-
-// Make predictions.
-DataFrame predictions = model.transform(testData);
-
-// Select example rows to display.
-predictions.select("predictedLabel", "label", "features").show(5);
-
-// Select (prediction, true label) and compute test error
-MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
-  .setLabelCol("indexedLabel")
-  .setPredictionCol("prediction")
-  .setMetricName("precision");
-double accuracy = evaluator.evaluate(predictions);
-System.out.println("Test Error = " + (1.0 - accuracy));
-
-GBTClassificationModel gbtModel =
-  (GBTClassificationModel)(model.stages()[2]);
-System.out.println("Learned classification GBT model:\n" + gbtModel.toDebugString());
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaGradientBoostedTreeClassifierExample.java %}
 </div>
 
 <div data-lang="python" markdown="1">
 
 Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.classification.GBTClassifier) for more details.
 
-{% highlight python %}
-from pyspark.ml import Pipeline
-from pyspark.ml.classification import GBTClassifier
-from pyspark.ml.feature import StringIndexer, VectorIndexer
-from pyspark.ml.evaluation import MulticlassClassificationEvaluator
-
-# Load and parse the data file, converting it to a DataFrame.
-data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
-
-# Index labels, adding metadata to the label column.
-# Fit on whole dataset to include all labels in index.
-labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
-# Automatically identify categorical features, and index them.
-# Set maxCategories so features with > 4 distinct values are treated as continuous.
-featureIndexer =\
-    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)
-
-# Split the data into training and test sets (30% held out for testing)
-(trainingData, testData) = data.randomSplit([0.7, 0.3])
-
-# Train a GBT model.
-gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)
-
-# Chain indexers and GBT in a Pipeline
-pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt])
-
-# Train model.  This also runs the indexers.
-model = pipeline.fit(trainingData)
-
-# Make predictions.
-predictions = model.transform(testData)
-
-# Select example rows to display.
-predictions.select("prediction", "indexedLabel", "features").show(5)
-
-# Select (prediction, true label) and compute test error
-evaluator = MulticlassClassificationEvaluator(
-    labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
-accuracy = evaluator.evaluate(predictions)
-print "Test Error = %g" % (1.0 - accuracy)
-
-gbtModel = model.stages[2]
-print gbtModel # summary only
-{% endhighlight %}
+{% include_example python/ml/gradient_boosted_tree_classifier_example.py %}
 </div>
 </div>
 
@@ -761,168 +269,21 @@ be true in general.
 
 Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.regression.GBTRegressor) for more details.
 
-{% highlight scala %}
-import org.apache.spark.ml.Pipeline
-import org.apache.spark.ml.regression.GBTRegressor
-import org.apache.spark.ml.regression.GBTRegressionModel
-import org.apache.spark.ml.feature.VectorIndexer
-import org.apache.spark.ml.evaluation.RegressionEvaluator
-
-// Load and parse the data file, converting it to a DataFrame.
-val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
-
-// Automatically identify categorical features, and index them.
-// Set maxCategories so features with > 4 distinct values are treated as continuous.
-val featureIndexer = new VectorIndexer()
-  .setInputCol("features")
-  .setOutputCol("indexedFeatures")
-  .setMaxCategories(4)
-  .fit(data)
-
-// Split the data into training and test sets (30% held out for testing)
-val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
-
-// Train a GBT model.
-val gbt = new GBTRegressor()
-  .setLabelCol("label")
-  .setFeaturesCol("indexedFeatures")
-  .setMaxIter(10)
-
-// Chain indexer and GBT in a Pipeline
-val pipeline = new Pipeline()
-  .setStages(Array(featureIndexer, gbt))
-
-// Train model.  This also runs the indexer.
-val model = pipeline.fit(trainingData)
-
-// Make predictions.
-val predictions = model.transform(testData)
-
-// Select example rows to display.
-predictions.select("prediction", "label", "features").show(5)
-
-// Select (prediction, true label) and compute test error
-val evaluator = new RegressionEvaluator()
-  .setLabelCol("label")
-  .setPredictionCol("prediction")
-  .setMetricName("rmse")
-val rmse = evaluator.evaluate(predictions)
-println("Root Mean Squared Error (RMSE) on test data = " + rmse)
-
-val gbtModel = model.stages(1).asInstanceOf[GBTRegressionModel]
-println("Learned regression GBT model:\n" + gbtModel.toDebugString)
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
 
 Refer to the [Java API docs](api/java/org/apache/spark/ml/regression/GBTRegressor.html) for more details.
 
-{% highlight java %}
-import org.apache.spark.ml.Pipeline;
-import org.apache.spark.ml.PipelineModel;
-import org.apache.spark.ml.PipelineStage;
-import org.apache.spark.ml.evaluation.RegressionEvaluator;
-import org.apache.spark.ml.feature.VectorIndexer;
-import org.apache.spark.ml.feature.VectorIndexerModel;
-import org.apache.spark.ml.regression.GBTRegressionModel;
-import org.apache.spark.ml.regression.GBTRegressor;
-import org.apache.spark.sql.DataFrame;
-
-// Load and parse the data file, converting it to a DataFrame.
-DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
-
-// Automatically identify categorical features, and index them.
-// Set maxCategories so features with > 4 distinct values are treated as continuous.
-VectorIndexerModel featureIndexer = new VectorIndexer()
-  .setInputCol("features")
-  .setOutputCol("indexedFeatures")
-  .setMaxCategories(4)
-  .fit(data);
-
-// Split the data into training and test sets (30% held out for testing)
-DataFrame[] splits = data.randomSplit(new double[] {0.7, 0.3});
-DataFrame trainingData = splits[0];
-DataFrame testData = splits[1];
-
-// Train a GBT model.
-GBTRegressor gbt = new GBTRegressor()
-  .setLabelCol("label")
-  .setFeaturesCol("indexedFeatures")
-  .setMaxIter(10);
-
-// Chain indexer and GBT in a Pipeline
-Pipeline pipeline = new Pipeline()
-  .setStages(new PipelineStage[] {featureIndexer, gbt});
-
-// Train model.  This also runs the indexer.
-PipelineModel model = pipeline.fit(trainingData);
-
-// Make predictions.
-DataFrame predictions = model.transform(testData);
-
-// Select example rows to display.
-predictions.select("prediction", "label", "features").show(5);
-
-// Select (prediction, true label) and compute test error
-RegressionEvaluator evaluator = new RegressionEvaluator()
-  .setLabelCol("label")
-  .setPredictionCol("prediction")
-  .setMetricName("rmse");
-double rmse = evaluator.evaluate(predictions);
-System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse);
-
-GBTRegressionModel gbtModel =
-  (GBTRegressionModel)(model.stages()[1]);
-System.out.println("Learned regression GBT model:\n" + gbtModel.toDebugString());
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaGradientBoostedTreeRegressorExample.java %}
 </div>
 
 <div data-lang="python" markdown="1">
 
 Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.regression.GBTRegressor) for more details.
 
-{% highlight python %}
-from pyspark.ml import Pipeline
-from pyspark.ml.regression import GBTRegressor
-from pyspark.ml.feature import VectorIndexer
-from pyspark.ml.evaluation import RegressionEvaluator
-
-# Load and parse the data file, converting it to a DataFrame.
-data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
-
-# Automatically identify categorical features, and index them.
-# Set maxCategories so features with > 4 distinct values are treated as continuous.
-featureIndexer =\
-    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)
-
-# Split the data into training and test sets (30% held out for testing)
-(trainingData, testData) = data.randomSplit([0.7, 0.3])
-
-# Train a GBT model.
-gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=10)
-
-# Chain indexer and GBT in a Pipeline
-pipeline = Pipeline(stages=[featureIndexer, gbt])
-
-# Train model.  This also runs the indexer.
-model = pipeline.fit(trainingData)
-
-# Make predictions.
-predictions = model.transform(testData)
-
-# Select example rows to display.
-predictions.select("prediction", "label", "features").show(5)
-
-# Select (prediction, true label) and compute test error
-evaluator = RegressionEvaluator(
-    labelCol="label", predictionCol="prediction", metricName="rmse")
-rmse = evaluator.evaluate(predictions)
-print "Root Mean Squared Error (RMSE) on test data = %g" % rmse
-
-gbtModel = model.stages[1]
-print gbtModel # summary only
-{% endhighlight %}
+{% include_example python/ml/gradient_boosted_tree_regressor_example.py %}
 </div>
 </div>
 
@@ -945,100 +306,13 @@ The example below demonstrates how to load the
 
 Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.classifier.OneVsRest) for more details.
 
-{% highlight scala %}
-import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest}
-import org.apache.spark.mllib.evaluation.MulticlassMetrics
-import org.apache.spark.sql.{Row, SQLContext}
-
-val sqlContext = new SQLContext(sc)
-
-// parse data into dataframe
-val data = sqlContext.read.format("libsvm")
-  .load("data/mllib/sample_multiclass_classification_data.txt")
-val Array(train, test) = data.randomSplit(Array(0.7, 0.3))
-
-// instantiate multiclass learner and train
-val ovr = new OneVsRest().setClassifier(new LogisticRegression)
-
-val ovrModel = ovr.fit(train)
-
-// score model on test data
-val predictions = ovrModel.transform(test).select("prediction", "label")
-val predictionsAndLabels = predictions.map {case Row(p: Double, l: Double) => (p, l)}
-
-// compute confusion matrix
-val metrics = new MulticlassMetrics(predictionsAndLabels)
-println(metrics.confusionMatrix)
-
-// the Iris DataSet has three classes
-val numClasses = 3
-
-println("label\tfpr\n")
-(0 until numClasses).foreach { index =>
-  val label = index.toDouble
-  println(label + "\t" + metrics.falsePositiveRate(label))
-}
-{% endhighlight %}
+{% include_example scala/org/apache/spark/examples/ml/OneVsRestExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
 
 Refer to the [Java API docs](api/java/org/apache/spark/ml/classification/OneVsRest.html) for more details.
 
-{% highlight java %}
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.ml.classification.LogisticRegression;
-import org.apache.spark.ml.classification.OneVsRest;
-import org.apache.spark.ml.classification.OneVsRestModel;
-import org.apache.spark.mllib.evaluation.MulticlassMetrics;
-import org.apache.spark.mllib.linalg.Matrix;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.SQLContext;
-
-SparkConf conf = new SparkConf().setAppName("JavaOneVsRestExample");
-JavaSparkContext jsc = new JavaSparkContext(conf);
-SQLContext jsql = new SQLContext(jsc);
-
-DataFrame dataFrame = sqlContext.read().format("libsvm")
-  .load("data/mllib/sample_multiclass_classification_data.txt");
-
-DataFrame[] splits = dataFrame.randomSplit(new double[] {0.7, 0.3}, 12345);
-DataFrame train = splits[0];
-DataFrame test = splits[1];
-
-// instantiate the One Vs Rest Classifier
-OneVsRest ovr = new OneVsRest().setClassifier(new LogisticRegression());
-
-// train the multiclass model
-OneVsRestModel ovrModel = ovr.fit(train.cache());
-
-// score the model on test data
-DataFrame predictions = ovrModel
-  .transform(test)
-  .select("prediction", "label");
-
-// obtain metrics
-MulticlassMetrics metrics = new MulticlassMetrics(predictions);
-Matrix confusionMatrix = metrics.confusionMatrix();
-
-// output the Confusion Matrix
-System.out.println("Confusion Matrix");
-System.out.println(confusionMatrix);
-
-// compute the false positive rate per label
-System.out.println();
-System.out.println("label\tfpr\n");
-
-// the Iris DataSet has three classes
-int numClasses = 3;
-for (int index = 0; index < numClasses; index++) {
-  double label = (double) index;
-  System.out.print(label);
-  System.out.print("\t");
-  System.out.print(metrics.falsePositiveRate(label));
-  System.out.println();
-}
-{% endhighlight %}
+{% include_example java/org/apache/spark/examples/ml/JavaOneVsRestExample.java %}
 </div>
 </div>
author	Xusen Yin <yinxusen@gmail.com>	2015-11-17 23:44:06 -0800
committer	Xiangrui Meng <meng@databricks.com>	2015-11-17 23:44:06 -0800
commit	9154f89befb7a33d4853cea95efd7dc6b25d033b (patch)
tree	8eb6da0ff09ba6c3b2fe34859077e5a55c5ed3df /docs/ml-ensembles.md
parent	2f191c66b668fc97f82f44fd8336b6a4488c2f5d (diff)
download	spark-9154f89befb7a33d4853cea95efd7dc6b25d033b.tar.gz spark-9154f89befb7a33d4853cea95efd7dc6b25d033b.tar.bz2 spark-9154f89befb7a33d4853cea95efd7dc6b25d033b.zip