From 738c10748b49eb8a475d1fd26c6a271ca36497cf Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Tue, 7 Jul 2015 08:59:52 -0700
Subject: [SPARK-8823] [MLLIB] [PYSPARK] Optimizations for SparseVector dot
 products

Follow up for https://github.com/apache/spark/pull/5946

Currently we iterate over indices and values in SparseVector and can be vectorized.

Author: MechCoder <manojkumarsivaraj334@gmail.com>

Closes #7222 from MechCoder/sparse_optim and squashes the following commits:

dcb51d3 [MechCoder] [SPARK-8823] [MLlib] [PySpark] Optimizations for SparseVector dot product
---
 python/pyspark/mllib/linalg.py | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

(limited to 'python/pyspark/mllib')

diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
index 9959a01cce..12d8dbbb92 100644
--- a/python/pyspark/mllib/linalg.py
+++ b/python/pyspark/mllib/linalg.py
@@ -590,18 +590,14 @@ class SparseVector(Vector):
             return np.dot(other.array[self.indices], self.values)
 
         elif isinstance(other, SparseVector):
-            result = 0.0
-            i, j = 0, 0
-            while i < len(self.indices) and j < len(other.indices):
-                if self.indices[i] == other.indices[j]:
-                    result += self.values[i] * other.values[j]
-                    i += 1
-                    j += 1
-                elif self.indices[i] < other.indices[j]:
-                    i += 1
-                else:
-                    j += 1
-            return result
+            # Find out common indices.
+            self_cmind = np.in1d(self.indices, other.indices, assume_unique=True)
+            self_values = self.values[self_cmind]
+            if self_values.size == 0:
+                return 0.0
+            else:
+                other_cmind = np.in1d(other.indices, self.indices, assume_unique=True)
+                return np.dot(self_values, other.values[other_cmind])
 
         else:
             return self.dot(_convert_to_vector(other))
-- 
cgit v1.2.3