aboutsummaryrefslogtreecommitdiff
path: root/docs/mllib-naive-bayes.md
diff options
context:
space:
mode:
Diffstat (limited to 'docs/mllib-naive-bayes.md')
-rw-r--r--docs/mllib-naive-bayes.md48
1 files changed, 32 insertions, 16 deletions
diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md
index 6160fe5b2f..c47508b7da 100644
--- a/docs/mllib-naive-bayes.md
+++ b/docs/mllib-naive-bayes.md
@@ -7,13 +7,13 @@ Naive Bayes is a simple multiclass classification algorithm with the assumption
between every pair of features. Naive Bayes can be trained very efficiently. Within a single pass to
the training data, it computes the conditional probability distribution of each feature given label,
and then it applies Bayes' theorem to compute the conditional probability distribution of label
-given an observation and use it for prediction. For more details, please visit the wikipedia page
+given an observation and use it for prediction. For more details, please visit the Wikipedia page
[Naive Bayes classifier](http://en.wikipedia.org/wiki/Naive_Bayes_classifier).
In MLlib, we implemented multinomial naive Bayes, which is typically used for document
classification. Within that context, each observation is a document, each feature represents a term,
-whose value is the frequency of the term. For its formulation, please visit the wikipedia page
-[Multinomial naive Bayes](http://en.wikipedia.org/wiki/Naive_Bayes_classifier#Multinomial_naive_Bayes)
+whose value is the frequency of the term. For its formulation, please visit the Wikipedia page
+[Multinomial Naive Bayes](http://en.wikipedia.org/wiki/Naive_Bayes_classifier#Multinomial_naive_Bayes)
or the section
[Naive Bayes text classification](http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html)
from the book Introduction to Information
@@ -36,9 +36,18 @@ can be used for evaluation and prediction.
{% highlight scala %}
import org.apache.spark.mllib.classification.NaiveBayes
-
-val training: RDD[LabeledPoint] = ... // training set
-val test: RDD[LabeledPoint] = ... // test set
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.regression.LabeledPoint
+
+val data = sc.textFile("mllib/data/sample_naive_bayes_data.txt")
+val parsedData = data.map { line =>
+ val parts = line.split(',')
+ LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
+}
+// Split data into training (60%) and test (40%).
+val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L)
+val training = splits(0)
+val test = splits(1)
val model = NaiveBayes.train(training, lambda = 1.0)
val prediction = model.predict(test.map(_.features))
@@ -58,29 +67,36 @@ optionally smoothing parameter `lambda` as input, and output a
can be used for evaluation and prediction.
{% highlight java %}
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.classification.NaiveBayes;
+import org.apache.spark.mllib.classification.NaiveBayesModel;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import scala.Tuple2;
JavaRDD<LabeledPoint> training = ... // training set
JavaRDD<LabeledPoint> test = ... // test set
-NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
+final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
-JavaRDD<Double> prediction = model.predict(test.map(new Function<LabeledPoint, Vector>() {
- public Vector call(LabeledPoint p) {
- return p.features();
+JavaRDD<Double> prediction =
+ test.map(new Function<LabeledPoint, Double>() {
+ @Override public Double call(LabeledPoint p) {
+ return model.predict(p.features());
}
- })
+ });
JavaPairRDD<Double, Double> predictionAndLabel =
prediction.zip(test.map(new Function<LabeledPoint, Double>() {
- public Double call(LabeledPoint p) {
+ @Override public Double call(LabeledPoint p) {
return p.label();
}
- })
+ }));
double accuracy = 1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
- public Boolean call(Tuple2<Double, Double> pl) {
+ @Override public Boolean call(Tuple2<Double, Double> pl) {
return pl._1() == pl._2();
}
- }).count() / test.count()
+ }).count() / test.count();
{% endhighlight %}
</div>
@@ -93,7 +109,7 @@ smoothing parameter `lambda` as input, and output a
[NaiveBayesModel](api/pyspark/pyspark.mllib.classification.NaiveBayesModel-class.html), which can be
used for evaluation and prediction.
-<!--- TODO: Make Python's example consistent with Scala's and Java's. --->
+<!-- TODO: Make Python's example consistent with Scala's and Java's. -->
{% highlight python %}
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import NaiveBayes