aboutsummaryrefslogtreecommitdiff
path: root/mllib/src/test
diff options
context:
space:
mode:
author: Lian, Cheng <rhythm.mail@gmail.com> 2013-12-30 22:46:32 +0800
committer: Lian, Cheng <rhythm.mail@gmail.com> 2013-12-30 22:46:32 +0800
commit: 6d0e2e86dfbca88abc847d3babac2d1f82d61aaf (patch)
tree: 982302a5b1b2485ad08b992d9468e2b7c9eb4cc9 /mllib/src/test
parent: f150b6e76c56ed6f604e6dbda7bce6b6278929fb (diff)
download: spark-6d0e2e86dfbca88abc847d3babac2d1f82d61aaf.tar.gz
spark-6d0e2e86dfbca88abc847d3babac2d1f82d61aaf.tar.bz2
spark-6d0e2e86dfbca88abc847d3babac2d1f82d61aaf.zip
Response to comments from Reynold, Ameet and Evan
* Arguments renamed according to Ameet's suggestion
* Using DoubleMatrix instead of Array[Double] in computation
* Removed arguments C (kinds of label) and D (dimension of feature vector) from NaiveBayes.train()
* Replaced reduceByKey with foldByKey to avoid modifying original input data
Diffstat (limited to 'mllib/src/test')
-rw-r--r-- mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala | 32
1 file changed, 16 insertions, 16 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
index a2821347a7..18575f410c 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
@@ -38,20 +38,20 @@ object NaiveBayesSuite {
// Generate input of the form Y = (weightMatrix*x).argmax()
def generateNaiveBayesInput(
- weightPerLabel: Array[Double], // 1XC
- weightsMatrix: Array[Array[Double]], // CXD
+ pi: Array[Double], // 1XC
+ theta: Array[Array[Double]], // CXD
nPoints: Int,
seed: Int): Seq[LabeledPoint] = {
- val D = weightsMatrix(0).length
+ val D = theta(0).length
val rnd = new Random(seed)
- val _weightPerLabel = weightPerLabel.map(math.pow(math.E, _))
- val _weightMatrix = weightsMatrix.map(row => row.map(math.pow(math.E, _)))
+ val _pi = pi.map(math.pow(math.E, _))
+ val _theta = theta.map(row => row.map(math.pow(math.E, _)))
for (i <- 0 until nPoints) yield {
- val y = calcLabel(rnd.nextDouble(), _weightPerLabel)
+ val y = calcLabel(rnd.nextDouble(), _pi)
val xi = Array.tabulate[Double](D) { j =>
- if (rnd.nextDouble() < _weightMatrix(y)(j)) 1 else 0
+ if (rnd.nextDouble() < _theta(y)(j)) 1 else 0
}
LabeledPoint(y, xi)
@@ -83,20 +83,20 @@ class NaiveBayesSuite extends FunSuite with BeforeAndAfterAll {
test("Naive Bayes") {
val nPoints = 10000
- val weightPerLabel = Array(math.log(0.5), math.log(0.3), math.log(0.2))
- val weightsMatrix = Array(
- Array(math.log(0.91), math.log(0.03), math.log(0.03), math.log(0.03)), // label 0
- Array(math.log(0.03), math.log(0.91), math.log(0.03), math.log(0.03)), // label 1
- Array(math.log(0.03), math.log(0.03), math.log(0.91), math.log(0.03)) // label 2
- )
+ val pi = Array(0.5, 0.3, 0.2).map(math.log)
+ val theta = Array(
+ Array(0.91, 0.03, 0.03, 0.03), // label 0
+ Array(0.03, 0.91, 0.03, 0.03), // label 1
+ Array(0.03, 0.03, 0.91, 0.03) // label 2
+ ).map(_.map(math.log))
- val testData = NaiveBayesSuite.generateNaiveBayesInput(weightPerLabel, weightsMatrix, nPoints, 42)
+ val testData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 42)
val testRDD = sc.parallelize(testData, 2)
testRDD.cache()
- val model = NaiveBayes.train(3, 4, testRDD)
+ val model = NaiveBayes.train(testRDD)
- val validationData = NaiveBayesSuite.generateNaiveBayesInput(weightPerLabel, weightsMatrix, nPoints, 17)
+ val validationData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 17)
val validationRDD = sc.parallelize(validationData, 2)
// Test prediction on RDD.