diff options
author | Xusen Yin <yinxusen@gmail.com> | 2014-04-22 11:06:18 -0700 |
---|---|---|
committer | Patrick Wendell <pwendell@gmail.com> | 2014-04-22 11:06:18 -0700 |
commit | c919798f0912dc03c8365b9a384d9ee6d5b25c51 (patch) | |
tree | 386eac712d26333b20a3950a551b6906a972824a | |
parent | 0f87e6ad4366a8c453a7415bc89399030003c264 (diff) | |
download | spark-c919798f0912dc03c8365b9a384d9ee6d5b25c51.tar.gz spark-c919798f0912dc03c8365b9a384d9ee6d5b25c51.tar.bz2 spark-c919798f0912dc03c8365b9a384d9ee6d5b25c51.zip |
fix bugs of dot in python
Without a `transpose()` call on `self.theta`, a
*ValueError: matrices are not aligned*
is raised. The previous test case, which used two-feature vectors, did not exercise this situation; it now uses three-feature vectors.
Author: Xusen Yin <yinxusen@gmail.com>
Closes #463 from yinxusen/python-naive-bayes and squashes the following commits:
fcbe3bc [Xusen Yin] fix bugs of dot in python
-rw-r--r-- | python/pyspark/mllib/classification.py | 2 | ||||
-rw-r--r-- | python/pyspark/mllib/tests.py | 8 |
2 files changed, 5 insertions, 5 deletions
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index 3a23e0801f..c5844597c9 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -154,7 +154,7 @@ class NaiveBayesModel(object): def predict(self, x): """Return the most likely class for a data vector x""" - return self.labels[numpy.argmax(self.pi + _dot(x, self.theta))] + return self.labels[numpy.argmax(self.pi + _dot(x, self.theta.transpose()))] class NaiveBayes(object): @classmethod diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index d4771d779f..1ee96bb4af 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -104,10 +104,10 @@ class ListTests(PySparkTestCase): def test_classification(self): from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes data = [ - LabeledPoint(0.0, [1, 0]), - LabeledPoint(1.0, [0, 1]), - LabeledPoint(0.0, [2, 0]), - LabeledPoint(1.0, [0, 2]) + LabeledPoint(0.0, [1, 0, 0]), + LabeledPoint(1.0, [0, 1, 1]), + LabeledPoint(0.0, [2, 0, 0]), + LabeledPoint(1.0, [0, 2, 1]) ] rdd = self.sc.parallelize(data) features = [p.features.tolist() for p in data] |