aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark
diff options
context:
space:
mode:
author: MechCoder <manojkumarsivaraj334@gmail.com> 2015-07-03 15:49:32 -0700
committer: Davies Liu <davies.liu@gmail.com> 2015-07-03 15:49:32 -0700
commit: f0fac2aa80da7c739b88043571e5d49ba40f9413 (patch)
tree: 220bc609a18f6fc830cbb00b9f388e9f76304b77 /python/pyspark
parent: ab535b9a1dab40ea7335ff9abb9b522fc2b5ed66 (diff)
download: spark-f0fac2aa80da7c739b88043571e5d49ba40f9413.tar.gz
download: spark-f0fac2aa80da7c739b88043571e5d49ba40f9413.tar.bz2
download: spark-f0fac2aa80da7c739b88043571e5d49ba40f9413.zip
[SPARK-7401] [MLLIB] [PYSPARK] Vectorize dot product and sq_dist between SparseVector and DenseVector
Currently, the dot product and squared distance between a SparseVector and a DenseVector (or NumPy array) are computed by iterating over the indices in a Python loop; both computations can be vectorized with NumPy.

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #5946 from MechCoder/spark-7203 and squashes the following commits:

034d086 [MechCoder] Vectorize dot calculation for numpy arrays for ndim=2
bce2b07 [MechCoder] fix doctest
fcad0a3 [MechCoder] Remove type checks for list, pyarray etc
0ee5dd4 [MechCoder] Add tests and other isinstance changes
e5f1de0 [MechCoder] [SPARK-7401] Vectorize dot product and sq_dist
Diffstat (limited to 'python/pyspark')
-rw-r--r-- python/pyspark/mllib/linalg.py | 44
-rw-r--r-- python/pyspark/mllib/tests.py | 8
2 files changed, 29 insertions, 23 deletions
diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
index e96c5ef87d..9959a01cce 100644
--- a/python/pyspark/mllib/linalg.py
+++ b/python/pyspark/mllib/linalg.py
@@ -577,22 +577,19 @@ class SparseVector(Vector):
...
AssertionError: dimension mismatch
"""
- if type(other) == np.ndarray:
- if other.ndim == 2:
- results = [self.dot(other[:, i]) for i in xrange(other.shape[1])]
- return np.array(results)
- elif other.ndim > 2:
+
+ if isinstance(other, np.ndarray):
+ if other.ndim not in [2, 1]:
raise ValueError("Cannot call dot with %d-dimensional array" % other.ndim)
+ assert len(self) == other.shape[0], "dimension mismatch"
+ return np.dot(self.values, other[self.indices])
assert len(self) == _vector_size(other), "dimension mismatch"
- if type(other) in (np.ndarray, array.array, DenseVector):
- result = 0.0
- for i in xrange(len(self.indices)):
- result += self.values[i] * other[self.indices[i]]
- return result
+ if isinstance(other, DenseVector):
+ return np.dot(other.array[self.indices], self.values)
- elif type(other) is SparseVector:
+ elif isinstance(other, SparseVector):
result = 0.0
i, j = 0, 0
while i < len(self.indices) and j < len(other.indices):
@@ -635,22 +632,23 @@ class SparseVector(Vector):
AssertionError: dimension mismatch
"""
assert len(self) == _vector_size(other), "dimension mismatch"
- if type(other) in (list, array.array, DenseVector, np.array, np.ndarray):
- if type(other) is np.array and other.ndim != 1:
+
+ if isinstance(other, np.ndarray) or isinstance(other, DenseVector):
+ if isinstance(other, np.ndarray) and other.ndim != 1:
raise Exception("Cannot call squared_distance with %d-dimensional array" %
other.ndim)
- result = 0.0
- j = 0 # index into our own array
- for i in xrange(len(other)):
- if j < len(self.indices) and self.indices[j] == i:
- diff = self.values[j] - other[i]
- result += diff * diff
- j += 1
- else:
- result += other[i] * other[i]
+ if isinstance(other, DenseVector):
+ other = other.array
+ sparse_ind = np.zeros(other.size, dtype=bool)
+ sparse_ind[self.indices] = True
+ dist = other[sparse_ind] - self.values
+ result = np.dot(dist, dist)
+
+ other_ind = other[~sparse_ind]
+ result += np.dot(other_ind, other_ind)
return result
- elif type(other) is SparseVector:
+ elif isinstance(other, SparseVector):
result = 0.0
i, j = 0, 0
while i < len(self.indices) and j < len(other.indices):
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 49ce125de7..d9f9874d50 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -129,17 +129,22 @@ class VectorTests(MLlibTestCase):
[1., 2., 3., 4.],
[1., 2., 3., 4.],
[1., 2., 3., 4.]])
+ arr = pyarray.array('d', [0, 1, 2, 3])
self.assertEquals(10.0, sv.dot(dv))
self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))
self.assertEquals(30.0, dv.dot(dv))
self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
self.assertEquals(30.0, lst.dot(dv))
self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
+ self.assertEquals(7.0, sv.dot(arr))
def test_squared_distance(self):
sv = SparseVector(4, {1: 1, 3: 2})
dv = DenseVector(array([1., 2., 3., 4.]))
lst = DenseVector([4, 3, 2, 1])
+ lst1 = [4, 3, 2, 1]
+ arr = pyarray.array('d', [0, 2, 1, 3])
+ narr = array([0, 2, 1, 3])
self.assertEquals(15.0, _squared_distance(sv, dv))
self.assertEquals(25.0, _squared_distance(sv, lst))
self.assertEquals(20.0, _squared_distance(dv, lst))
@@ -149,6 +154,9 @@ class VectorTests(MLlibTestCase):
self.assertEquals(0.0, _squared_distance(sv, sv))
self.assertEquals(0.0, _squared_distance(dv, dv))
self.assertEquals(0.0, _squared_distance(lst, lst))
+ self.assertEquals(25.0, _squared_distance(sv, lst1))
+ self.assertEquals(3.0, _squared_distance(sv, arr))
+ self.assertEquals(3.0, _squared_distance(sv, narr))
def test_conversion(self):
# numpy arrays should be automatically upcast to float64