about summary refs log tree commit diff
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/pyspark/mllib/linalg.py 10
-rw-r--r-- python/pyspark/mllib/stat.py 22
-rwxr-xr-x python/run-tests 1
3 files changed, 23 insertions, 10 deletions
diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
index 9a239abfbb..f485a69db1 100644
--- a/python/pyspark/mllib/linalg.py
+++ b/python/pyspark/mllib/linalg.py
@@ -23,6 +23,7 @@ object from MLlib or pass SciPy C{scipy.sparse} column vectors if
SciPy is available in their environment.
"""
+import numpy
from numpy import array, array_equal, ndarray, float64, int32
@@ -160,6 +161,15 @@ class SparseVector(object):
j += 1
return result
+ def toArray(self):
+ """
+ Returns a copy of this SparseVector as a 1-dimensional NumPy array.
+ """
+ arr = numpy.zeros(self.size)
+ for i in xrange(self.indices.size):
+ arr[self.indices[i]] = self.values[i]
+ return arr
+
def __str__(self):
inds = "[" + ",".join([str(i) for i in self.indices]) + "]"
vals = "[" + ",".join([str(v) for v in self.values]) + "]"
diff --git a/python/pyspark/mllib/stat.py b/python/pyspark/mllib/stat.py
index a73abc5ff9..feef0d16cd 100644
--- a/python/pyspark/mllib/stat.py
+++ b/python/pyspark/mllib/stat.py
@@ -118,16 +118,18 @@ class Statistics(object):
>>> from linalg import Vectors
>>> rdd = sc.parallelize([Vectors.dense([1, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]),
... Vectors.dense([6, 7, 0, 8]), Vectors.dense([9, 0, 0, 1])])
- >>> Statistics.corr(rdd)
- array([[ 1. , 0.05564149, nan, 0.40047142],
- [ 0.05564149, 1. , nan, 0.91359586],
- [ nan, nan, 1. , nan],
- [ 0.40047142, 0.91359586, nan, 1. ]])
- >>> Statistics.corr(rdd, method="spearman")
- array([[ 1. , 0.10540926, nan, 0.4 ],
- [ 0.10540926, 1. , nan, 0.9486833 ],
- [ nan, nan, 1. , nan],
- [ 0.4 , 0.9486833 , nan, 1. ]])
+ >>> pearsonCorr = Statistics.corr(rdd)
+ >>> print str(pearsonCorr).replace('nan', 'NaN')
+ [[ 1. 0.05564149 NaN 0.40047142]
+ [ 0.05564149 1. NaN 0.91359586]
+ [ NaN NaN 1. NaN]
+ [ 0.40047142 0.91359586 NaN 1. ]]
+ >>> spearmanCorr = Statistics.corr(rdd, method="spearman")
+ >>> print str(spearmanCorr).replace('nan', 'NaN')
+ [[ 1. 0.10540926 NaN 0.4 ]
+ [ 0.10540926 1. NaN 0.9486833 ]
+ [ NaN NaN 1. NaN]
+ [ 0.4 0.9486833 NaN 1. ]]
>>> try:
... Statistics.corr(rdd, "spearman")
... print "Method name as second argument without 'method=' shouldn't be allowed."
diff --git a/python/run-tests b/python/run-tests
index a6271e0cf5..b506559a5e 100755
--- a/python/run-tests
+++ b/python/run-tests
@@ -78,6 +78,7 @@ run_test "pyspark/mllib/linalg.py"
run_test "pyspark/mllib/random.py"
run_test "pyspark/mllib/recommendation.py"
run_test "pyspark/mllib/regression.py"
+run_test "pyspark/mllib/stat.py"
run_test "pyspark/mllib/tests.py"
run_test "pyspark/mllib/tree.py"
run_test "pyspark/mllib/util.py"