aboutsummaryrefslogtreecommitdiff
path: root/docs/mllib-ensembles.md
diff options
context:
space:
mode:
authorJoseph K. Bradley <joseph@databricks.com>2015-02-25 16:13:17 -0800
committerXiangrui Meng <meng@databricks.com>2015-02-25 16:13:17 -0800
commitd20559b157743981b9c09e286f2aaff8cbefab59 (patch)
tree6d92015c1ae6b05c725860685351f86b8c4ed6af /docs/mllib-ensembles.md
parent46a044a36a2aff1306f7f677e952ce253ddbefac (diff)
downloadspark-d20559b157743981b9c09e286f2aaff8cbefab59.tar.gz
spark-d20559b157743981b9c09e286f2aaff8cbefab59.tar.bz2
spark-d20559b157743981b9c09e286f2aaff8cbefab59.zip
[SPARK-5974] [SPARK-5980] [mllib] [python] [docs] Update ML guide with save/load, Python GBT
* Add GradientBoostedTrees Python examples to ML guide * I ran these in the pyspark shell, and they worked. * Add save/load to examples in ML guide * Added note to python docs about predict,transform not working within RDD actions,transformations in some cases (See SPARK-5981) CC: mengxr Author: Joseph K. Bradley <joseph@databricks.com> Closes #4750 from jkbradley/SPARK-5974 and squashes the following commits: c410e38 [Joseph K. Bradley] Added note to LabeledPoint about attributes bcae18b [Joseph K. Bradley] Added import of models for save/load examples in ml guide. Fixed line length for tree.py, feature.py (but not other ML Pyspark files yet). 6d81c3e [Joseph K. Bradley] completed python GBT examples 9903309 [Joseph K. Bradley] Added note to python docs about predict,transform not working within RDD actions,transformations in some cases c7dfad8 [Joseph K. Bradley] Added model save/load to ML guide. Added GBT examples to ML guide
Diffstat (limited to 'docs/mllib-ensembles.md')
-rw-r--r--docs/mllib-ensembles.md94
1 files changed, 92 insertions, 2 deletions
diff --git a/docs/mllib-ensembles.md b/docs/mllib-ensembles.md
index 00040e6073..ddae84165f 100644
--- a/docs/mllib-ensembles.md
+++ b/docs/mllib-ensembles.md
@@ -98,6 +98,7 @@ The test error is calculated to measure the algorithm accuracy.
<div data-lang="scala">
{% highlight scala %}
import org.apache.spark.mllib.tree.RandomForest
+import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils
// Load and parse the data file.
@@ -127,6 +128,9 @@ val labelAndPreds = testData.map { point =>
val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
println("Test Error = " + testErr)
println("Learned classification forest model:\n" + model.toDebugString)
+
+model.save("myModelPath")
+val sameModel = RandomForestModel.load("myModelPath")
{% endhighlight %}
</div>
@@ -188,10 +192,16 @@ Double testErr =
}).count() / testData.count();
System.out.println("Test Error: " + testErr);
System.out.println("Learned classification forest model:\n" + model.toDebugString());
+
+model.save("myModelPath");
+RandomForestModel sameModel = RandomForestModel.load("myModelPath");
{% endhighlight %}
</div>
<div data-lang="python">
+
+Note that the Python API does not yet support model save/load but will in the future.
+
{% highlight python %}
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.util import MLUtils
@@ -235,6 +245,7 @@ The Mean Squared Error (MSE) is computed at the end to evaluate
<div data-lang="scala">
{% highlight scala %}
import org.apache.spark.mllib.tree.RandomForest
+import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils
// Load and parse the data file.
@@ -264,6 +275,9 @@ val labelsAndPredictions = testData.map { point =>
val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean()
println("Test Mean Squared Error = " + testMSE)
println("Learned regression forest model:\n" + model.toDebugString)
+
+model.save("myModelPath")
+val sameModel = RandomForestModel.load("myModelPath")
{% endhighlight %}
</div>
@@ -328,10 +342,16 @@ Double testMSE =
}) / testData.count();
System.out.println("Test Mean Squared Error: " + testMSE);
System.out.println("Learned regression forest model:\n" + model.toDebugString());
+
+model.save("myModelPath");
+RandomForestModel sameModel = RandomForestModel.load("myModelPath");
{% endhighlight %}
</div>
<div data-lang="python">
+
+Note that the Python API does not yet support model save/load but will in the future.
+
{% highlight python %}
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.util import MLUtils
@@ -441,8 +461,6 @@ iterations.
### Examples
-GBTs currently have APIs in Scala and Java. Examples in both languages are shown below.
-
#### Classification
The example below demonstrates how to load a
@@ -457,6 +475,7 @@ The test error is calculated to measure the algorithm accuracy.
{% highlight scala %}
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
+import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.mllib.util.MLUtils
// Load and parse the data file.
@@ -484,6 +503,9 @@ val labelAndPreds = testData.map { point =>
val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
println("Test Error = " + testErr)
println("Learned classification GBT model:\n" + model.toDebugString)
+
+model.save("myModelPath")
+val sameModel = GradientBoostedTreesModel.load("myModelPath")
{% endhighlight %}
</div>
@@ -545,6 +567,38 @@ Double testErr =
}).count() / testData.count();
System.out.println("Test Error: " + testErr);
System.out.println("Learned classification GBT model:\n" + model.toDebugString());
+
+model.save("myModelPath");
+GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load("myModelPath");
+{% endhighlight %}
+</div>
+
+<div data-lang="python">
+
+Note that the Python API does not yet support model save/load but will in the future.
+
+{% highlight python %}
+from pyspark.mllib.tree import GradientBoostedTrees
+from pyspark.mllib.util import MLUtils
+
+# Load and parse the data file.
+data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
+# Split the data into training and test sets (30% held out for testing)
+(trainingData, testData) = data.randomSplit([0.7, 0.3])
+
+# Train a GradientBoostedTrees model.
+# Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
+# (b) Use more iterations in practice.
+model = GradientBoostedTrees.trainClassifier(trainingData,
+ categoricalFeaturesInfo={}, numIterations=3)
+
+# Evaluate model on test instances and compute test error
+predictions = model.predict(testData.map(lambda x: x.features))
+labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
+testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
+print('Test Error = ' + str(testErr))
+print('Learned classification GBT model:')
+print(model.toDebugString())
{% endhighlight %}
</div>
@@ -565,6 +619,7 @@ The Mean Squared Error (MSE) is computed at the end to evaluate
{% highlight scala %}
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
+import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel
import org.apache.spark.mllib.util.MLUtils
// Load and parse the data file.
@@ -591,6 +646,9 @@ val labelsAndPredictions = testData.map { point =>
val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean()
println("Test Mean Squared Error = " + testMSE)
println("Learned regression GBT model:\n" + model.toDebugString)
+
+model.save("myModelPath")
+val sameModel = GradientBoostedTreesModel.load("myModelPath")
{% endhighlight %}
</div>
@@ -658,6 +716,38 @@ Double testMSE =
}) / data.count();
System.out.println("Test Mean Squared Error: " + testMSE);
System.out.println("Learned regression GBT model:\n" + model.toDebugString());
+
+model.save("myModelPath");
+GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load("myModelPath");
+{% endhighlight %}
+</div>
+
+<div data-lang="python">
+
+Note that the Python API does not yet support model save/load but will in the future.
+
+{% highlight python %}
+from pyspark.mllib.tree import GradientBoostedTrees
+from pyspark.mllib.util import MLUtils
+
+# Load and parse the data file.
+data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
+# Split the data into training and test sets (30% held out for testing)
+(trainingData, testData) = data.randomSplit([0.7, 0.3])
+
+# Train a GradientBoostedTrees model.
+# Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
+# (b) Use more iterations in practice.
+model = GradientBoostedTrees.trainRegressor(trainingData,
+ categoricalFeaturesInfo={}, numIterations=3)
+
+# Evaluate model on test instances and compute test error
+predictions = model.predict(testData.map(lambda x: x.features))
+labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
+testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(testData.count())
+print('Test Mean Squared Error = ' + str(testMSE))
+print('Learned regression GBT model:')
+print(model.toDebugString())
{% endhighlight %}
</div>