[SPARK-4324] [PySpark] [MLlib] support numpy.array for all MLlib API

This PR checks all of the existing Python MLlib APIs to make sure that numpy.array is supported as a Vector (and likewise an RDD of numpy.array). It also improves some docstrings and doctests. cc mateiz mengxr Author: Davies Liu <davies@databricks.com> Closes #3189 from davies/numpy and squashes the following commits: d5057c4 [Davies Liu] fix tests 6987611 [Davies Liu] support numpy.array for all MLlib API (cherry picked from commit 65083e93ddd552b7d3e4eb09f87c091ef2ae83a2) Signed-off-by: Xiangrui Meng <meng@databricks.com>
author: Davies Liu <davies@databricks.com> 2014-11-10 22:26:16 -0800
committer: Xiangrui Meng <meng@databricks.com> 2014-11-10 22:26:25 -0800
commit: df8242c9b6307c085d4c1a7ec446b1701a7e7cde (patch)
tree: 4d144b6278dcf3a0d96cbd9929227ec6efb2c3ff /python/pyspark/mllib/classification.py
parent: 4eeaf3395a885b0a9ef79c31b720969155b0b7af (diff)
download: spark-df8242c9b6307c085d4c1a7ec446b1701a7e7cde.tar.gz
spark-df8242c9b6307c085d4c1a7ec446b1701a7e7cde.tar.bz2
spark-df8242c9b6307c085d4c1a7ec446b1701a7e7cde.zip
1 files changed, 8 insertions, 5 deletions
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index 297a2bf37d..5d90dddb5d 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -62,6 +62,7 @@ class LogisticRegressionModel(LinearModel):
     """
 
     def predict(self, x):
+        x = _convert_to_vector(x)
         margin = self.weights.dot(x) + self._intercept
         if margin > 0:
             prob = 1 / (1 + exp(-margin))
@@ -79,7 +80,7 @@ class LogisticRegressionWithSGD(object):
         """
         Train a logistic regression model on the given data.
 
-        :param data:              The training data.
+        :param data:              The training data, an RDD of LabeledPoint.
         :param iterations:        The number of iterations (default: 100).
         :param step:              The step parameter used in SGD
                                   (default: 1.0).
@@ -136,6 +137,7 @@ class SVMModel(LinearModel):
     """
 
     def predict(self, x):
+        x = _convert_to_vector(x)
         margin = self.weights.dot(x) + self.intercept
         return 1 if margin >= 0 else 0
 
@@ -148,7 +150,7 @@ class SVMWithSGD(object):
         """
         Train a support vector machine on the given data.
 
-        :param data:              The training data.
+        :param data:              The training data, an RDD of LabeledPoint.
         :param iterations:        The number of iterations (default: 100).
         :param step:              The step parameter used in SGD
                                   (default: 1.0).
@@ -233,11 +235,12 @@ class NaiveBayes(object):
         classification.  By making every vector a 0-1 vector, it can also be
         used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}).
 
-        :param data: RDD of NumPy vectors, one per element, where the first
-               coordinate is the label and the rest is the feature vector
-               (e.g. a count vector).
+        :param data: RDD of LabeledPoint.
         :param lambda_: The smoothing parameter
         """
+        first = data.first()
+        if not isinstance(first, LabeledPoint):
+            raise ValueError("`data` should be an RDD of LabeledPoint")
         labels, pi, theta = callMLlibFunc("trainNaiveBayes", data, lambda_)
         return NaiveBayesModel(labels.toArray(), pi.toArray(), numpy.array(theta))
author	Davies Liu <davies@databricks.com>	2014-11-10 22:26:16 -0800
committer	Xiangrui Meng <meng@databricks.com>	2014-11-10 22:26:25 -0800
commit	df8242c9b6307c085d4c1a7ec446b1701a7e7cde (patch)
tree	4d144b6278dcf3a0d96cbd9929227ec6efb2c3ff /python/pyspark/mllib/classification.py
parent	4eeaf3395a885b0a9ef79c31b720969155b0b7af (diff)
download	spark-df8242c9b6307c085d4c1a7ec446b1701a7e7cde.tar.gz spark-df8242c9b6307c085d4c1a7ec446b1701a7e7cde.tar.bz2 spark-df8242c9b6307c085d4c1a7ec446b1701a7e7cde.zip