1 files changed, 219 insertions, 37 deletions
diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
index e69051c104..0a5dcaac55 100644
--- a/python/pyspark/mllib/linalg.py
+++ b/python/pyspark/mllib/linalg.py
@@ -23,14 +23,148 @@ object from MLlib or pass SciPy C{scipy.sparse} column vectors if
 SciPy is available in their environment.
 """
 
-import numpy
-from numpy import array, array_equal, ndarray, float64, int32
+import sys
+import array
+import copy_reg
 
+import numpy as np
 
-__all__ = ['SparseVector', 'Vectors']
+__all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors']
 
 
-class SparseVector(object):
+if sys.version_info[:2] == (2, 7):
+    # speed up pickling array in Python 2.7
+    def fast_pickle_array(ar):
+        return array.array, (ar.typecode, ar.tostring())
+    copy_reg.pickle(array.array, fast_pickle_array)
+
+
+# Check whether we have SciPy. MLlib works without it too, but if we have it, some methods,
+# such as _dot and _serialize_double_vector, start to support scipy.sparse matrices.
+
+try:
+    import scipy.sparse
+    _have_scipy = True
+except:
+    # No SciPy in environment, but that's okay
+    _have_scipy = False
+
+
+def _convert_to_vector(l):
+    if isinstance(l, Vector):
+        return l
+    elif type(l) in (array.array, np.array, np.ndarray, list, tuple):
+        return DenseVector(l)
+    elif _have_scipy and scipy.sparse.issparse(l):
+        assert l.shape[1] == 1, "Expected column vector"
+        csc = l.tocsc()
+        return SparseVector(l.shape[0], csc.indices, csc.data)
+    else:
+        raise TypeError("Cannot convert type %s into Vector" % type(l))
+
+
+class Vector(object):
+    """
+    Abstract class for DenseVector and SparseVector
+    """
+    def toArray(self):
+        """
+        Convert the vector into an numpy.ndarray
+        :return: numpy.ndarray
+        """
+        raise NotImplementedError
+
+
+class DenseVector(Vector):
+    def __init__(self, ar):
+        if not isinstance(ar, array.array):
+            ar = array.array('d', ar)
+        self.array = ar
+
+    def __reduce__(self):
+        return DenseVector, (self.array,)
+
+    def dot(self, other):
+        """
+        Compute the dot product of two Vectors. We support
+        (Numpy array, list, SparseVector, or SciPy sparse)
+        and a target NumPy array that is either 1- or 2-dimensional.
+        Equivalent to calling numpy.dot of the two vectors.
+
+        >>> dense = DenseVector(array.array('d', [1., 2.]))
+        >>> dense.dot(dense)
+        5.0
+        >>> dense.dot(SparseVector(2, [0, 1], [2., 1.]))
+        4.0
+        >>> dense.dot(range(1, 3))
+        5.0
+        >>> dense.dot(np.array(range(1, 3)))
+        5.0
+        """
+        if isinstance(other, SparseVector):
+            return other.dot(self)
+        elif _have_scipy and scipy.sparse.issparse(other):
+            return other.transpose().dot(self.toArray())[0]
+        elif isinstance(other, Vector):
+            return np.dot(self.toArray(), other.toArray())
+        else:
+            return np.dot(self.toArray(), other)
+
+    def squared_distance(self, other):
+        """
+        Squared distance of two Vectors.
+
+        >>> dense1 = DenseVector(array.array('d', [1., 2.]))
+        >>> dense1.squared_distance(dense1)
+        0.0
+        >>> dense2 = np.array([2., 1.])
+        >>> dense1.squared_distance(dense2)
+        2.0
+        >>> dense3 = [2., 1.]
+        >>> dense1.squared_distance(dense3)
+        2.0
+        >>> sparse1 = SparseVector(2, [0, 1], [2., 1.])
+        >>> dense1.squared_distance(sparse1)
+        2.0
+        """
+        if isinstance(other, SparseVector):
+            return other.squared_distance(self)
+        elif _have_scipy and scipy.sparse.issparse(other):
+            return _convert_to_vector(other).squared_distance(self)
+
+        if isinstance(other, Vector):
+            other = other.toArray()
+        elif not isinstance(other, np.ndarray):
+            other = np.array(other)
+        diff = self.toArray() - other
+        return np.dot(diff, diff)
+
+    def toArray(self):
+        return np.array(self.array)
+
+    def __getitem__(self, item):
+        return self.array[item]
+
+    def __len__(self):
+        return len(self.array)
+
+    def __str__(self):
+        return "[" + ",".join([str(v) for v in self.array]) + "]"
+
+    def __repr__(self):
+        return "DenseVector(%r)" % self.array
+
+    def __eq__(self, other):
+        return isinstance(other, DenseVector) and self.array == other.array
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __getattr__(self, item):
+        return getattr(self.array, item)
+
+
+class SparseVector(Vector):
 
     """
     A simple sparse vector class for passing data to MLlib. Users may
@@ -61,16 +195,19 @@ class SparseVector(object):
             if type(pairs) == dict:
                 pairs = pairs.items()
             pairs = sorted(pairs)
-            self.indices = array([p[0] for p in pairs], dtype=int32)
-            self.values = array([p[1] for p in pairs], dtype=float64)
+            self.indices = array.array('i', [p[0] for p in pairs])
+            self.values = array.array('d', [p[1] for p in pairs])
         else:
             assert len(args[0]) == len(args[1]), "index and value arrays not same length"
-            self.indices = array(args[0], dtype=int32)
-            self.values = array(args[1], dtype=float64)
+            self.indices = array.array('i', args[0])
+            self.values = array.array('d', args[1])
             for i in xrange(len(self.indices) - 1):
                 if self.indices[i] >= self.indices[i + 1]:
                     raise TypeError("indices array must be sorted")
 
+    def __reduce__(self):
+        return (SparseVector, (self.size, self.indices, self.values))
+
     def dot(self, other):
         """
         Dot product with a SparseVector or 1- or 2-dimensional Numpy array.
@@ -78,15 +215,15 @@ class SparseVector(object):
         >>> a = SparseVector(4, [1, 3], [3.0, 4.0])
         >>> a.dot(a)
         25.0
-        >>> a.dot(array([1., 2., 3., 4.]))
+        >>> a.dot(array.array('d', [1., 2., 3., 4.]))
         22.0
         >>> b = SparseVector(4, [2, 4], [1.0, 2.0])
         >>> a.dot(b)
         0.0
-        >>> a.dot(array([[1, 1], [2, 2], [3, 3], [4, 4]]))
+        >>> a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]]))
         array([ 22.,  22.])
         """
-        if type(other) == ndarray:
+        if type(other) == np.ndarray:
             if other.ndim == 1:
                 result = 0.0
                 for i in xrange(len(self.indices)):
@@ -94,10 +231,17 @@ class SparseVector(object):
                 return result
             elif other.ndim == 2:
                 results = [self.dot(other[:, i]) for i in xrange(other.shape[1])]
-                return array(results)
+                return np.array(results)
             else:
                 raise Exception("Cannot call dot with %d-dimensional array" % other.ndim)
-        else:
+
+        elif type(other) in (array.array, DenseVector):
+            result = 0.0
+            for i in xrange(len(self.indices)):
+                result += self.values[i] * other[self.indices[i]]
+            return result
+
+        elif type(other) is SparseVector:
             result = 0.0
             i, j = 0, 0
             while i < len(self.indices) and j < len(other.indices):
@@ -110,6 +254,8 @@ class SparseVector(object):
                 else:
                     j += 1
             return result
+        else:
+            return self.dot(_convert_to_vector(other))
 
     def squared_distance(self, other):
         """
@@ -118,7 +264,9 @@ class SparseVector(object):
         >>> a = SparseVector(4, [1, 3], [3.0, 4.0])
         >>> a.squared_distance(a)
         0.0
-        >>> a.squared_distance(array([1., 2., 3., 4.]))
+        >>> a.squared_distance(array.array('d', [1., 2., 3., 4.]))
+        11.0
+        >>> a.squared_distance(np.array([1., 2., 3., 4.]))
         11.0
         >>> b = SparseVector(4, [2, 4], [1.0, 2.0])
         >>> a.squared_distance(b)
@@ -126,22 +274,22 @@ class SparseVector(object):
         >>> b.squared_distance(a)
         30.0
         """
-        if type(other) == ndarray:
-            if other.ndim == 1:
-                result = 0.0
-                j = 0   # index into our own array
-                for i in xrange(other.shape[0]):
-                    if j < len(self.indices) and self.indices[j] == i:
-                        diff = self.values[j] - other[i]
-                        result += diff * diff
-                        j += 1
-                    else:
-                        result += other[i] * other[i]
-                return result
-            else:
+        if type(other) in (list, array.array, DenseVector, np.array, np.ndarray):
+            if type(other) is np.array and other.ndim != 1:
                 raise Exception("Cannot call squared_distance with %d-dimensional array" %
                                 other.ndim)
-        else:
+            result = 0.0
+            j = 0   # index into our own array
+            for i in xrange(len(other)):
+                if j < len(self.indices) and self.indices[j] == i:
+                    diff = self.values[j] - other[i]
+                    result += diff * diff
+                    j += 1
+                else:
+                    result += other[i] * other[i]
+            return result
+
+        elif type(other) is SparseVector:
             result = 0.0
             i, j = 0, 0
             while i < len(self.indices) and j < len(other.indices):
@@ -163,16 +311,21 @@ class SparseVector(object):
                 result += other.values[j] * other.values[j]
                 j += 1
             return result
+        else:
+            return self.squared_distance(_convert_to_vector(other))
 
     def toArray(self):
         """
         Returns a copy of this SparseVector as a 1-dimensional NumPy array.
         """
-        arr = numpy.zeros(self.size)
+        arr = np.zeros((self.size,), dtype=np.float64)
         for i in xrange(self.indices.size):
             arr[self.indices[i]] = self.values[i]
         return arr
 
+    def __len__(self):
+        return self.size
+
     def __str__(self):
         inds = "[" + ",".join([str(i) for i in self.indices]) + "]"
         vals = "[" + ",".join([str(v) for v in self.values]) + "]"
@@ -198,8 +351,8 @@ class SparseVector(object):
 
         return (isinstance(other, self.__class__)
                 and other.size == self.size
-                and array_equal(other.indices, self.indices)
-                and array_equal(other.values, self.values))
+                and other.indices == self.indices
+                and other.values == self.values)
 
     def __ne__(self, other):
         return not self.__eq__(other)
@@ -242,9 +395,9 @@ class Vectors(object):
         returns a NumPy array.
 
         >>> Vectors.dense([1, 2, 3])
-        array([ 1.,  2.,  3.])
+        DenseVector(array('d', [1.0, 2.0, 3.0]))
         """
-        return array(elements, dtype=float64)
+        return DenseVector(elements)
 
     @staticmethod
     def stringify(vector):
@@ -257,10 +410,39 @@ class Vectors(object):
         >>> Vectors.stringify(Vectors.dense([0.0, 1.0]))
         '[0.0,1.0]'
         """
-        if type(vector) == SparseVector:
-            return str(vector)
-        else:
-            return "[" + ",".join([str(v) for v in vector]) + "]"
+        return str(vector)
+
+
+class Matrix(object):
+    """ the Matrix """
+    def __init__(self, nRow, nCol):
+        self.nRow = nRow
+        self.nCol = nCol
+
+    def toArray(self):
+        raise NotImplementedError
+
+
+class DenseMatrix(Matrix):
+    def __init__(self, nRow, nCol, values):
+        Matrix.__init__(self, nRow, nCol)
+        assert len(values) == nRow * nCol
+        self.values = values
+
+    def __reduce__(self):
+        return DenseMatrix, (self.nRow, self.nCol, self.values)
+
+    def toArray(self):
+        """
+        Return an numpy.ndarray
+
+        >>> arr = array.array('d', [float(i) for i in range(4)])
+        >>> m = DenseMatrix(2, 2, arr)
+        >>> m.toArray()
+        array([[ 0.,  1.],
+               [ 2.,  3.]])
+        """
+        return np.ndarray((self.nRow, self.nCol), np.float64, buffer=self.values.tostring())
 
 
 def _test():