diff options
Diffstat (limited to 'docs/mllib-naive-bayes.md')
-rw-r--r-- | docs/mllib-naive-bayes.md | 89 |
1 file changed, 3 insertions, 86 deletions
diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md index f4f6a10c82..60ac6c7e5b 100644 --- a/docs/mllib-naive-bayes.md +++ b/docs/mllib-naive-bayes.md @@ -40,32 +40,8 @@ can be used for evaluation and prediction. Refer to the [`NaiveBayes` Scala docs](api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayes) and [`NaiveBayesModel` Scala docs](api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayesModel) for details on the API. -{% highlight scala %} -import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel} -import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.regression.LabeledPoint - -val data = sc.textFile("data/mllib/sample_naive_bayes_data.txt") -val parsedData = data.map { line => - val parts = line.split(',') - LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) -} -// Split data into training (60%) and test (40%). -val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L) -val training = splits(0) -val test = splits(1) - -val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial") - -val predictionAndLabel = test.map(p => (model.predict(p.features), p.label)) -val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count() - -// Save and load model -model.save(sc, "myModelPath") -val sameModel = NaiveBayesModel.load(sc, "myModelPath") -{% endhighlight %} +{% include_example scala/org/apache/spark/examples/mllib/NaiveBayesExample.scala %} </div> - <div data-lang="java" markdown="1"> [NaiveBayes](api/java/org/apache/spark/mllib/classification/NaiveBayes.html) implements @@ -77,40 +53,8 @@ can be used for evaluation and prediction. Refer to the [`NaiveBayes` Java docs](api/java/org/apache/spark/mllib/classification/NaiveBayes.html) and [`NaiveBayesModel` Java docs](api/java/org/apache/spark/mllib/classification/NaiveBayesModel.html) for details on the API. 
-{% highlight java %} -import scala.Tuple2; - -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.mllib.classification.NaiveBayes; -import org.apache.spark.mllib.classification.NaiveBayesModel; -import org.apache.spark.mllib.regression.LabeledPoint; - -JavaRDD<LabeledPoint> training = ... // training set -JavaRDD<LabeledPoint> test = ... // test set - -final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0); - -JavaPairRDD<Double, Double> predictionAndLabel = - test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() { - @Override public Tuple2<Double, Double> call(LabeledPoint p) { - return new Tuple2<Double, Double>(model.predict(p.features()), p.label()); - } - }); -double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() { - @Override public Boolean call(Tuple2<Double, Double> pl) { - return pl._1().equals(pl._2()); - } - }).count() / (double) test.count(); - -// Save and load model -model.save(sc.sc(), "myModelPath"); -NaiveBayesModel sameModel = NaiveBayesModel.load(sc.sc(), "myModelPath"); -{% endhighlight %} +{% include_example java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java %} </div> - <div data-lang="python" markdown="1"> [NaiveBayes](api/python/pyspark.mllib.html#pyspark.mllib.classification.NaiveBayes) implements multinomial @@ -124,33 +68,6 @@ Note that the Python API does not yet support model save/load but will in the fu Refer to the [`NaiveBayes` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.classification.NaiveBayes) and [`NaiveBayesModel` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.classification.NaiveBayesModel) for more details on the API. 
-{% highlight python %} -from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel -from pyspark.mllib.linalg import Vectors -from pyspark.mllib.regression import LabeledPoint - -def parseLine(line): - parts = line.split(',') - label = float(parts[0]) - features = Vectors.dense([float(x) for x in parts[1].split(' ')]) - return LabeledPoint(label, features) - -data = sc.textFile('data/mllib/sample_naive_bayes_data.txt').map(parseLine) - -# Split data aproximately into training (60%) and test (40%) -training, test = data.randomSplit([0.6, 0.4], seed = 0) - -# Train a naive Bayes model. -model = NaiveBayes.train(training, 1.0) - -# Make prediction and test accuracy. -predictionAndLabel = test.map(lambda p : (model.predict(p.features), p.label)) -accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count() - -# Save and load model -model.save(sc, "myModelPath") -sameModel = NaiveBayesModel.load(sc, "myModelPath") -{% endhighlight %} - +{% include_example python/mllib/naive_bayes_example.py %} </div> </div> |