From 738c10748b49eb8a475d1fd26c6a271ca36497cf Mon Sep 17 00:00:00 2001 From: MechCoder Date: Tue, 7 Jul 2015 08:59:52 -0700 Subject: [SPARK-8823] [MLLIB] [PYSPARK] Optimizations for SparseVector dot products Follow up for https://github.com/apache/spark/pull/5946 Currently we iterate over indices and values in SparseVector in a Python loop; this can be vectorized. Author: MechCoder Closes #7222 from MechCoder/sparse_optim and squashes the following commits: dcb51d3 [MechCoder] [SPARK-8823] [MLlib] [PySpark] Optimizations for SparseVector dot product --- python/pyspark/mllib/linalg.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'python/pyspark/mllib') diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py index 9959a01cce..12d8dbbb92 100644 --- a/python/pyspark/mllib/linalg.py +++ b/python/pyspark/mllib/linalg.py @@ -590,18 +590,14 @@ class SparseVector(Vector): return np.dot(other.array[self.indices], self.values) elif isinstance(other, SparseVector): - result = 0.0 - i, j = 0, 0 - while i < len(self.indices) and j < len(other.indices): - if self.indices[i] == other.indices[j]: - result += self.values[i] * other.values[j] - i += 1 - j += 1 - elif self.indices[i] < other.indices[j]: - i += 1 - else: - j += 1 - return result + # Find out common indices. + self_cmind = np.in1d(self.indices, other.indices, assume_unique=True) + self_values = self.values[self_cmind] + if self_values.size == 0: + return 0.0 + else: + other_cmind = np.in1d(other.indices, self.indices, assume_unique=True) + return np.dot(self_values, other.values[other_cmind]) else: return self.dot(_convert_to_vector(other)) -- cgit v1.2.3