SPARK-2293. Replace RDD.zip usage by map with predict inside.

This is the only occurrence of this pattern in the examples that needs to be replaced. It only addresses the example change.

Author: Sean Owen <sowen@cloudera.com>

Closes #1250 from srowen/SPARK-2293 and squashes the following commits:

6b1b28c [Sean Owen] Compute prediction-and-label RDD directly rather than by zipping, for efficiency
author: Sean Owen <sowen@cloudera.com> 2014-06-30 16:03:38 -0700
committer: Xiangrui Meng <meng@databricks.com> 2014-06-30 16:03:38 -0700
commit: 04fa1223ee69760f0d23b40e56f4b036aa301879 (patch)
tree: abadb11aa51c6f635d38d41a8497e54790285ce2 /docs/mllib-naive-bayes.md
parent: 5fccb567b37a94445512c7ec20b830b5e062089f (diff)
download: spark-04fa1223ee69760f0d23b40e56f4b036aa301879.tar.gz
spark-04fa1223ee69760f0d23b40e56f4b036aa301879.tar.bz2
spark-04fa1223ee69760f0d23b40e56f4b036aa301879.zip
1 file changed, 6 insertions, 12 deletions
diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md
index 4b3a7cab32..1d1d7dcf6f 100644
--- a/docs/mllib-naive-bayes.md
+++ b/docs/mllib-naive-bayes.md
@@ -51,9 +51,8 @@ val training = splits(0)
 val test = splits(1)
 
 val model = NaiveBayes.train(training, lambda = 1.0)
-val prediction = model.predict(test.map(_.features))
 
-val predictionAndLabel = prediction.zip(test.map(_.label))
+val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
 val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
 {% endhighlight %}
 </div>
@@ -71,6 +70,7 @@ can be used for evaluation and prediction.
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.function.Function;
+import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.mllib.classification.NaiveBayes;
 import org.apache.spark.mllib.classification.NaiveBayesModel;
 import org.apache.spark.mllib.regression.LabeledPoint;
@@ -81,18 +81,12 @@ JavaRDD<LabeledPoint> test = ... // test set
 
 final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
 
-JavaRDD<Double> prediction =
-  test.map(new Function<LabeledPoint, Double>() {
-    @Override public Double call(LabeledPoint p) {
-      return model.predict(p.features());
-    }
-  });
 JavaPairRDD<Double, Double> predictionAndLabel = 
-  prediction.zip(test.map(new Function<LabeledPoint, Double>() {
-    @Override public Double call(LabeledPoint p) {
-      return p.label();
+  test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
+    @Override public Tuple2<Double, Double> call(LabeledPoint p) {
+      return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
     }
-  }));
+  });
 double accuracy = 1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
     @Override public Boolean call(Tuple2<Double, Double> pl) {
       return pl._1() == pl._2();
author	Sean Owen <sowen@cloudera.com>	2014-06-30 16:03:38 -0700
committer	Xiangrui Meng <meng@databricks.com>	2014-06-30 16:03:38 -0700
commit	04fa1223ee69760f0d23b40e56f4b036aa301879 (patch)
tree	abadb11aa51c6f635d38d41a8497e54790285ce2 /docs/mllib-naive-bayes.md
parent	5fccb567b37a94445512c7ec20b830b5e062089f (diff)
download	spark-04fa1223ee69760f0d23b40e56f4b036aa301879.tar.gz spark-04fa1223ee69760f0d23b40e56f4b036aa301879.tar.bz2 spark-04fa1223ee69760f0d23b40e56f4b036aa301879.zip