diff options
author | Lian, Cheng <rhythm.mail@gmail.com> | 2013-12-30 22:46:32 +0800 |
---|---|---|
committer | Lian, Cheng <rhythm.mail@gmail.com> | 2013-12-30 22:46:32 +0800 |
commit | 6d0e2e86dfbca88abc847d3babac2d1f82d61aaf (patch) | |
tree | 982302a5b1b2485ad08b992d9468e2b7c9eb4cc9 /mllib/src/test | |
parent | f150b6e76c56ed6f604e6dbda7bce6b6278929fb (diff) | |
download | spark-6d0e2e86dfbca88abc847d3babac2d1f82d61aaf.tar.gz spark-6d0e2e86dfbca88abc847d3babac2d1f82d61aaf.tar.bz2 spark-6d0e2e86dfbca88abc847d3babac2d1f82d61aaf.zip |
Response to comments from Reynold, Ameet and Evan
* Arguments renamed according to Ameet's suggestion
* Using DoubleMatrix instead of Array[Double] in computation
* Removed arguments C (number of label classes) and D (dimension of feature vector) from NaiveBayes.train()
* Replaced reduceByKey with foldByKey to avoid modifying original input data
Diffstat (limited to 'mllib/src/test')
-rw-r--r-- | mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala | 32 |
1 file changed, 16 insertions, 16 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala index a2821347a7..18575f410c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala @@ -38,20 +38,20 @@ object NaiveBayesSuite { // Generate input of the form Y = (weightMatrix*x).argmax() def generateNaiveBayesInput( - weightPerLabel: Array[Double], // 1XC - weightsMatrix: Array[Array[Double]], // CXD + pi: Array[Double], // 1XC + theta: Array[Array[Double]], // CXD nPoints: Int, seed: Int): Seq[LabeledPoint] = { - val D = weightsMatrix(0).length + val D = theta(0).length val rnd = new Random(seed) - val _weightPerLabel = weightPerLabel.map(math.pow(math.E, _)) - val _weightMatrix = weightsMatrix.map(row => row.map(math.pow(math.E, _))) + val _pi = pi.map(math.pow(math.E, _)) + val _theta = theta.map(row => row.map(math.pow(math.E, _))) for (i <- 0 until nPoints) yield { - val y = calcLabel(rnd.nextDouble(), _weightPerLabel) + val y = calcLabel(rnd.nextDouble(), _pi) val xi = Array.tabulate[Double](D) { j => - if (rnd.nextDouble() < _weightMatrix(y)(j)) 1 else 0 + if (rnd.nextDouble() < _theta(y)(j)) 1 else 0 } LabeledPoint(y, xi) @@ -83,20 +83,20 @@ class NaiveBayesSuite extends FunSuite with BeforeAndAfterAll { test("Naive Bayes") { val nPoints = 10000 - val weightPerLabel = Array(math.log(0.5), math.log(0.3), math.log(0.2)) - val weightsMatrix = Array( - Array(math.log(0.91), math.log(0.03), math.log(0.03), math.log(0.03)), // label 0 - Array(math.log(0.03), math.log(0.91), math.log(0.03), math.log(0.03)), // label 1 - Array(math.log(0.03), math.log(0.03), math.log(0.91), math.log(0.03)) // label 2 - ) + val pi = Array(0.5, 0.3, 0.2).map(math.log) + val theta = Array( + Array(0.91, 0.03, 0.03, 0.03), // label 0 + Array(0.03, 0.91, 0.03, 0.03), // label 1 + 
Array(0.03, 0.03, 0.91, 0.03) // label 2 + ).map(_.map(math.log)) - val testData = NaiveBayesSuite.generateNaiveBayesInput(weightPerLabel, weightsMatrix, nPoints, 42) + val testData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 42) val testRDD = sc.parallelize(testData, 2) testRDD.cache() - val model = NaiveBayes.train(3, 4, testRDD) + val model = NaiveBayes.train(testRDD) - val validationData = NaiveBayesSuite.generateNaiveBayesInput(weightPerLabel, weightsMatrix, nPoints, 17) + val validationData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 17) val validationRDD = sc.parallelize(validationData, 2) // Test prediction on RDD. |