[SPARK-4894][mllib] Added Bernoulli option to NaiveBayes model in mllib

Added an optional model type parameter for NaiveBayes training; it can be either Multinomial or Bernoulli. When Bernoulli is given, Bernoulli smoothing is used for fitting and for prediction, as per http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html. The default model type is Multinomial, which keeps the original fit and predict behavior. Added additional tests for the Bernoulli and Multinomial models. Author: leahmcguire <lmcguire@salesforce.com> Author: Joseph K. Bradley <joseph@databricks.com> Author: Leah McGuire <lmcguire@salesforce.com> Closes #4087 from leahmcguire/master and squashes the following commits: f3c8994 [leahmcguire] changed checks on model type to requires acb69af [leahmcguire] removed enum type and replaces all modelType parameters with strings 2224b15 [Leah McGuire] Merge pull request #2 from jkbradley/leahmcguire-master 9ad89ca [Joseph K. Bradley] removed old code 6a8f383 [Joseph K. Bradley] Added new model save/load format 2.0 for NaiveBayesModel after modelType parameter was added. Updated tests. Also updated ModelType enum-like type. 
852a727 [leahmcguire] merged with upstream master a22d670 [leahmcguire] changed NaiveBayesModel modelType parameter back to NaiveBayes.ModelType, made NaiveBayes.ModelType serializable, fixed getter method in NaiveBayes 18f3219 [leahmcguire] removed private from naive bayes constructor for lambda only bea62af [leahmcguire] put back in constructor for NaiveBayes 01baad7 [leahmcguire] made fixes from code review fb0a5c7 [leahmcguire] removed typo e2d925e [leahmcguire] fixed nonserializable error that was causing naivebayes test failures 2d0c1ba [leahmcguire] fixed typo in NaiveBayes c298e78 [leahmcguire] fixed scala style errors b85b0c9 [leahmcguire] Merge remote-tracking branch 'upstream/master' 900b586 [leahmcguire] fixed model call so that uses type argument ea09b28 [leahmcguire] Merge remote-tracking branch 'upstream/master' e016569 [leahmcguire] updated test suite with model type fix 85f298f [leahmcguire] Merge remote-tracking branch 'upstream/master' dc65374 [leahmcguire] integrated model type fix 7622b0c [leahmcguire] added comments and fixed style as per rb b93aaf6 [Leah McGuire] Merge pull request #1 from jkbradley/nb-model-type 3730572 [Joseph K. Bradley] modified NB model type to be more Java-friendly b61b5e2 [leahmcguire] added back compatible constructor to NaiveBayesModel to fix MIMA test failure 5a4a534 [leahmcguire] fixed scala style error in NaiveBayes 3891bf2 [leahmcguire] synced with apache spark and resolved merge conflict d9477ed [leahmcguire] removed old inaccurate comment from test suite for mllib naive bayes 76e5b0f [leahmcguire] removed unnecessary sort from test 0313c0c [leahmcguire] fixed style error in NaiveBayes.scala 4a3676d [leahmcguire] Updated changes re-comments. Got rid of verbose populateMatrix method. Public api now has string instead of enumeration. Docs are updated. ce73c63 [leahmcguire] added Bernoulli option to naive bayes model in mllib, added optional model type parameter for training. 
When Bernoulli is given, Bernoulli smoothing is used for fitting and for prediction, as per http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
author: leahmcguire <lmcguire@salesforce.com> 2015-03-31 11:16:55 -0700
committer: Joseph K. Bradley <joseph@databricks.com> 2015-03-31 11:16:55 -0700
commit: d01a6d8c33fc5c8325b0cc4b51395dba5eb3462c (patch)
tree: 14a3faac411b44804fc141d32b4b2001952a0125 /mllib/src/test
parent: a05835b89fe2086e460f0b80f7c22e284c0c32d0 (diff)
download: spark-d01a6d8c33fc5c8325b0cc4b51395dba5eb3462c.tar.gz
spark-d01a6d8c33fc5c8325b0cc4b51395dba5eb3462c.tar.bz2
spark-d01a6d8c33fc5c8325b0cc4b51395dba5eb3462c.zip
2 files changed, 134 insertions, 37 deletions
diff --git a/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java b/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java
index 1c90522a07..71fb7f13c3 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java
@@ -17,20 +17,22 @@
 
 package org.apache.spark.mllib.classification;
 
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.mllib.linalg.Vector;
 import org.apache.spark.mllib.linalg.Vectors;
 import org.apache.spark.mllib.regression.LabeledPoint;
-import org.junit.After;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
 
-import java.io.Serializable;
-import java.util.Arrays;
-import java.util.List;
 
 public class JavaNaiveBayesSuite implements Serializable {
   private transient JavaSparkContext sc;
@@ -102,4 +104,11 @@ public class JavaNaiveBayesSuite implements Serializable {
     // Should be able to get the first prediction.
     predictions.first();
   }
+
+  @Test
+  public void testModelTypeSetters() {
+    NaiveBayes nb = new NaiveBayes()
+        .setModelType("Bernoulli")
+        .setModelType("Multinomial");
+  }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
index 5a27c7d230..f9fe3e006c 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
@@ -19,6 +19,9 @@ package org.apache.spark.mllib.classification
 
 import scala.util.Random
 
+import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argmax => brzArgmax, sum => brzSum, Axis}
+import breeze.stats.distributions.{Multinomial => BrzMultinomial}
+
 import org.scalatest.FunSuite
 
 import org.apache.spark.SparkException
@@ -41,37 +44,48 @@ object NaiveBayesSuite {
 
   // Generate input of the form Y = (theta * x).argmax()
   def generateNaiveBayesInput(
-      pi: Array[Double],            // 1XC
-      theta: Array[Array[Double]],  // CXD
-      nPoints: Int,
-      seed: Int): Seq[LabeledPoint] = {
+    pi: Array[Double],            // 1XC
+    theta: Array[Array[Double]],  // CXD
+    nPoints: Int,
+    seed: Int,
+    modelType: String = "Multinomial",
+    sample: Int = 10): Seq[LabeledPoint] = {
     val D = theta(0).length
     val rnd = new Random(seed)
-
     val _pi = pi.map(math.pow(math.E, _))
     val _theta = theta.map(row => row.map(math.pow(math.E, _)))
 
     for (i <- 0 until nPoints) yield {
       val y = calcLabel(rnd.nextDouble(), _pi)
-      val xi = Array.tabulate[Double](D) { j =>
-        if (rnd.nextDouble() < _theta(y)(j)) 1 else 0
+      val xi = modelType match {
+        case "Bernoulli" => Array.tabulate[Double] (D) { j =>
+            if (rnd.nextDouble () < _theta(y)(j) ) 1 else 0
+        }
+        case "Multinomial" =>
+          val mult = BrzMultinomial(BDV(_theta(y)))
+          val emptyMap = (0 until D).map(x => (x, 0.0)).toMap
+          val counts = emptyMap ++ mult.sample(sample).groupBy(x => x).map {
+            case (index, reps) => (index, reps.size.toDouble)
+          }
+          counts.toArray.sortBy(_._1).map(_._2)
+        case _ =>
+          // This should never happen.
+          throw new UnknownError(s"NaiveBayesSuite found unknown ModelType: $modelType")
       }
 
       LabeledPoint(y, Vectors.dense(xi))
     }
   }
 
-  private val smallPi = Array(0.5, 0.3, 0.2).map(math.log)
+  /** Bernoulli NaiveBayes with binary labels, 3 features */
+  private val binaryBernoulliModel = new NaiveBayesModel(labels = Array(0.0, 1.0),
+    pi = Array(0.2, 0.8), theta = Array(Array(0.1, 0.3, 0.6), Array(0.2, 0.4, 0.4)),
+    "Bernoulli")
 
-  private val smallTheta = Array(
-    Array(0.91, 0.03, 0.03, 0.03), // label 0
-    Array(0.03, 0.91, 0.03, 0.03), // label 1
-    Array(0.03, 0.03, 0.91, 0.03)  // label 2
-  ).map(_.map(math.log))
-
-  /** Binary labels, 3 features */
-  private val binaryModel = new NaiveBayesModel(labels = Array(0.0, 1.0), pi = Array(0.2, 0.8),
-    theta = Array(Array(0.1, 0.3, 0.6), Array(0.2, 0.4, 0.4)))
+  /** Multinomial NaiveBayes with binary labels, 3 features */
+  private val binaryMultinomialModel = new NaiveBayesModel(labels = Array(0.0, 1.0),
+    pi = Array(0.2, 0.8), theta = Array(Array(0.1, 0.3, 0.6), Array(0.2, 0.4, 0.4)),
+    "Multinomial")
 }
 
 class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext {
@@ -85,6 +99,24 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext {
     assert(numOfPredictions < input.length / 5)
   }
 
+  def validateModelFit(
+      piData: Array[Double],
+      thetaData: Array[Array[Double]],
+      model: NaiveBayesModel) = {
+    def closeFit(d1: Double, d2: Double, precision: Double): Boolean = {
+      (d1 - d2).abs <= precision
+    }
+    val modelIndex = (0 until piData.length).zip(model.labels.map(_.toInt))
+    for (i <- modelIndex) {
+      assert(closeFit(math.exp(piData(i._2)), math.exp(model.pi(i._1)), 0.05))
+    }
+    for (i <- modelIndex) {
+      for (j <- 0 until thetaData(i._2).length) {
+        assert(closeFit(math.exp(thetaData(i._2)(j)), math.exp(model.theta(i._1)(j)), 0.05))
+      }
+    }
+  }
+
   test("get, set params") {
     val nb = new NaiveBayes()
     nb.setLambda(2.0)
@@ -93,19 +125,53 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext {
     assert(nb.getLambda === 3.0)
   }
 
-  test("Naive Bayes") {
-    val nPoints = 10000
+  test("Naive Bayes Multinomial") {
+    val nPoints = 1000
+    val pi = Array(0.5, 0.1, 0.4).map(math.log)
+    val theta = Array(
+      Array(0.70, 0.10, 0.10, 0.10), // label 0
+      Array(0.10, 0.70, 0.10, 0.10), // label 1
+      Array(0.10, 0.10, 0.70, 0.10)  // label 2
+    ).map(_.map(math.log))
+
+    val testData = NaiveBayesSuite.generateNaiveBayesInput(
+      pi, theta, nPoints, 42, "Multinomial")
+    val testRDD = sc.parallelize(testData, 2)
+    testRDD.cache()
+
+    val model = NaiveBayes.train(testRDD, 1.0, "Multinomial")
+    validateModelFit(pi, theta, model)
+
+    val validationData = NaiveBayesSuite.generateNaiveBayesInput(
+      pi, theta, nPoints, 17, "Multinomial")
+    val validationRDD = sc.parallelize(validationData, 2)
+
+    // Test prediction on RDD.
+    validatePrediction(model.predict(validationRDD.map(_.features)).collect(), validationData)
 
-    val pi = NaiveBayesSuite.smallPi
-    val theta = NaiveBayesSuite.smallTheta
+    // Test prediction on Array.
+    validatePrediction(validationData.map(row => model.predict(row.features)), validationData)
+  }
 
-    val testData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 42)
+  test("Naive Bayes Bernoulli") {
+    val nPoints = 10000
+    val pi = Array(0.5, 0.3, 0.2).map(math.log)
+    val theta = Array(
+      Array(0.50, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.40), // label 0
+      Array(0.02, 0.70, 0.10, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02), // label 1
+      Array(0.02, 0.02, 0.60, 0.02,  0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.30)  // label 2
+    ).map(_.map(math.log))
+
+    val testData = NaiveBayesSuite.generateNaiveBayesInput(
+      pi, theta, nPoints, 45, "Bernoulli")
     val testRDD = sc.parallelize(testData, 2)
     testRDD.cache()
 
-    val model = NaiveBayes.train(testRDD)
+    val model = NaiveBayes.train(testRDD, 1.0, "Bernoulli")
+    validateModelFit(pi, theta, model)
 
-    val validationData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 17)
+    val validationData = NaiveBayesSuite.generateNaiveBayesInput(
+      pi, theta, nPoints, 20, "Bernoulli")
     val validationRDD = sc.parallelize(validationData, 2)
 
     // Test prediction on RDD.
@@ -142,19 +208,41 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext {
     }
   }
 
-  test("model save/load") {
-    val model = NaiveBayesSuite.binaryModel
+  test("model save/load: 2.0 to 2.0") {
+    val tempDir = Utils.createTempDir()
+    val path = tempDir.toURI.toString
+
+    Seq(NaiveBayesSuite.binaryBernoulliModel, NaiveBayesSuite.binaryMultinomialModel).map {
+      model =>
+        // Save model, load it back, and compare.
+        try {
+          model.save(sc, path)
+          val sameModel = NaiveBayesModel.load(sc, path)
+          assert(model.labels === sameModel.labels)
+          assert(model.pi === sameModel.pi)
+          assert(model.theta === sameModel.theta)
+          assert(model.modelType === sameModel.modelType)
+        } finally {
+          Utils.deleteRecursively(tempDir)
+        }
+    }
+  }
+
+  test("model save/load: 1.0 to 2.0") {
+    val model = NaiveBayesSuite.binaryMultinomialModel
 
     val tempDir = Utils.createTempDir()
     val path = tempDir.toURI.toString
 
-    // Save model, load it back, and compare.
+    // Save model as version 1.0, load it back, and compare.
     try {
-      model.save(sc, path)
+      val data = NaiveBayesModel.SaveLoadV1_0.Data(model.labels, model.pi, model.theta)
+      NaiveBayesModel.SaveLoadV1_0.save(sc, path, data)
       val sameModel = NaiveBayesModel.load(sc, path)
       assert(model.labels === sameModel.labels)
       assert(model.pi === sameModel.pi)
       assert(model.theta === sameModel.theta)
+      assert(model.modelType === "Multinomial")
     } finally {
       Utils.deleteRecursively(tempDir)
     }
@@ -172,8 +260,8 @@ class NaiveBayesClusterSuite extends FunSuite with LocalClusterSparkContext {
         LabeledPoint(random.nextInt(2), Vectors.dense(Array.fill(n)(random.nextDouble())))
       }
     }
-    // If we serialize data directly in the task closure, the size of the serialized task would be
-    // greater than 1MB and hence Spark would throw an error.
+    // If we serialize data directly in the task closure, the size of the serialized task
+    // would be greater than 1MB and hence Spark would throw an error.
     val model = NaiveBayes.train(examples)
     val predictions = model.predict(examples.map(_.features))
   }
author	leahmcguire <lmcguire@salesforce.com>	2015-03-31 11:16:55 -0700
committer	Joseph K. Bradley <joseph@databricks.com>	2015-03-31 11:16:55 -0700
commit	d01a6d8c33fc5c8325b0cc4b51395dba5eb3462c (patch)
tree	14a3faac411b44804fc141d32b4b2001952a0125 /mllib/src/test
parent	a05835b89fe2086e460f0b80f7c22e284c0c32d0 (diff)
download	spark-d01a6d8c33fc5c8325b0cc4b51395dba5eb3462c.tar.gz spark-d01a6d8c33fc5c8325b0cc4b51395dba5eb3462c.tar.bz2 spark-d01a6d8c33fc5c8325b0cc4b51395dba5eb3462c.zip