3 files changed, 23 insertions, 10 deletions
diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
index 9a239abfbb..f485a69db1 100644
--- a/python/pyspark/mllib/linalg.py
+++ b/python/pyspark/mllib/linalg.py
@@ -23,6 +23,7 @@ object from MLlib or pass SciPy C{scipy.sparse} column vectors if
 SciPy is available in their environment.
 """
 
+import numpy
 from numpy import array, array_equal, ndarray, float64, int32
 
 
@@ -160,6 +161,15 @@ class SparseVector(object):
                 j += 1
             return result
 
+    def toArray(self):
+        """
+        Returns a copy of this SparseVector as a 1-dimensional NumPy array
+        of length C{size}; entries not stored in C{indices} are 0.0.
+        """
+        arr = numpy.zeros(self.size)
+        arr[self.indices] = self.values
+        return arr
+
     def __str__(self):
         inds = "[" + ",".join([str(i) for i in self.indices]) + "]"
         vals = "[" + ",".join([str(v) for v in self.values]) + "]"
diff --git a/python/pyspark/mllib/stat.py b/python/pyspark/mllib/stat.py
index a73abc5ff9..feef0d16cd 100644
--- a/python/pyspark/mllib/stat.py
+++ b/python/pyspark/mllib/stat.py
@@ -118,16 +118,18 @@ class Statistics(object):
         >>> from linalg import Vectors
         >>> rdd = sc.parallelize([Vectors.dense([1, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]),
         ...                       Vectors.dense([6, 7, 0,  8]), Vectors.dense([9, 0, 0, 1])])
-        >>> Statistics.corr(rdd)
-        array([[ 1.        ,  0.05564149,         nan,  0.40047142],
-               [ 0.05564149,  1.        ,         nan,  0.91359586],
-               [        nan,         nan,  1.        ,         nan],
-               [ 0.40047142,  0.91359586,         nan,  1.        ]])
-        >>> Statistics.corr(rdd, method="spearman")
-        array([[ 1.        ,  0.10540926,         nan,  0.4       ],
-               [ 0.10540926,  1.        ,         nan,  0.9486833 ],
-               [        nan,         nan,  1.        ,         nan],
-               [ 0.4       ,  0.9486833 ,         nan,  1.        ]])
+        >>> pearsonCorr = Statistics.corr(rdd)
+        >>> print str(pearsonCorr).replace('nan', 'NaN')
+        [[ 1.          0.05564149         NaN  0.40047142]
+         [ 0.05564149  1.                 NaN  0.91359586]
+         [        NaN         NaN  1.                 NaN]
+         [ 0.40047142  0.91359586         NaN  1.        ]]
+        >>> spearmanCorr = Statistics.corr(rdd, method="spearman")
+        >>> print str(spearmanCorr).replace('nan', 'NaN')
+        [[ 1.          0.10540926         NaN  0.4       ]
+         [ 0.10540926  1.                 NaN  0.9486833 ]
+         [        NaN         NaN  1.                 NaN]
+         [ 0.4         0.9486833          NaN  1.        ]]
         >>> try:
         ...     Statistics.corr(rdd, "spearman")
         ...     print "Method name as second argument without 'method=' shouldn't be allowed."
diff --git a/python/run-tests b/python/run-tests
index a6271e0cf5..b506559a5e 100755
--- a/python/run-tests
+++ b/python/run-tests
@@ -78,6 +78,7 @@ run_test "pyspark/mllib/linalg.py"
 run_test "pyspark/mllib/random.py"
 run_test "pyspark/mllib/recommendation.py"
 run_test "pyspark/mllib/regression.py"
+run_test "pyspark/mllib/stat.py"
 run_test "pyspark/mllib/tests.py"
 run_test "pyspark/mllib/tree.py"
 run_test "pyspark/mllib/util.py"