aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/mllib/linalg.py73
-rw-r--r--python/pyspark/mllib/tests.py6
2 files changed, 53 insertions, 26 deletions
diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
index 537b176578..f7aa2b0cb0 100644
--- a/python/pyspark/mllib/linalg.py
+++ b/python/pyspark/mllib/linalg.py
@@ -30,7 +30,7 @@ import copy_reg
import numpy as np
from pyspark.sql import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \
- IntegerType, ByteType, Row
+ IntegerType, ByteType
__all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors', 'DenseMatrix', 'Matrices']
@@ -173,12 +173,16 @@ class DenseVector(Vector):
A dense vector represented by a value array.
"""
def __init__(self, ar):
- if not isinstance(ar, array.array):
- ar = array.array('d', ar)
+ if isinstance(ar, basestring):
+ ar = np.frombuffer(ar, dtype=np.float64)
+ elif not isinstance(ar, np.ndarray):
+ ar = np.array(ar, dtype=np.float64)
+ if ar.dtype != np.float64:
+ ar.astype(np.float64)
self.array = ar
def __reduce__(self):
- return DenseVector, (self.array,)
+ return DenseVector, (self.array.tostring(),)
def dot(self, other):
"""
@@ -207,9 +211,10 @@ class DenseVector(Vector):
...
AssertionError: dimension mismatch
"""
- if type(other) == np.ndarray and other.ndim > 1:
- assert len(self) == other.shape[0], "dimension mismatch"
- return np.dot(self.toArray(), other)
+ if type(other) == np.ndarray:
+ if other.ndim > 1:
+ assert len(self) == other.shape[0], "dimension mismatch"
+ return np.dot(self.array, other)
elif _have_scipy and scipy.sparse.issparse(other):
assert len(self) == other.shape[0], "dimension mismatch"
return other.transpose().dot(self.toArray())
@@ -261,7 +266,7 @@ class DenseVector(Vector):
return np.dot(diff, diff)
def toArray(self):
- return np.array(self.array)
+ return self.array
def __getitem__(self, item):
return self.array[item]
@@ -276,7 +281,7 @@ class DenseVector(Vector):
return "DenseVector([%s])" % (', '.join(_format_float(i) for i in self.array))
def __eq__(self, other):
- return isinstance(other, DenseVector) and self.array == other.array
+ return isinstance(other, DenseVector) and np.array_equal(self.array, other.array)
def __ne__(self, other):
return not self == other
@@ -314,18 +319,28 @@ class SparseVector(Vector):
if type(pairs) == dict:
pairs = pairs.items()
pairs = sorted(pairs)
- self.indices = array.array('i', [p[0] for p in pairs])
- self.values = array.array('d', [p[1] for p in pairs])
+ self.indices = np.array([p[0] for p in pairs], dtype=np.int32)
+ self.values = np.array([p[1] for p in pairs], dtype=np.float64)
else:
- assert len(args[0]) == len(args[1]), "index and value arrays not same length"
- self.indices = array.array('i', args[0])
- self.values = array.array('d', args[1])
+ if isinstance(args[0], basestring):
+ assert isinstance(args[1], str), "values should be string too"
+ if args[0]:
+ self.indices = np.frombuffer(args[0], np.int32)
+ self.values = np.frombuffer(args[1], np.float64)
+ else:
+ # np.frombuffer() doesn't work well with empty string in older version
+ self.indices = np.array([], dtype=np.int32)
+ self.values = np.array([], dtype=np.float64)
+ else:
+ self.indices = np.array(args[0], dtype=np.int32)
+ self.values = np.array(args[1], dtype=np.float64)
+ assert len(self.indices) == len(self.values), "index and value arrays not same length"
for i in xrange(len(self.indices) - 1):
if self.indices[i] >= self.indices[i + 1]:
raise TypeError("indices array must be sorted")
def __reduce__(self):
- return (SparseVector, (self.size, self.indices, self.values))
+ return (SparseVector, (self.size, self.indices.tostring(), self.values.tostring()))
def dot(self, other):
"""
@@ -461,8 +476,7 @@ class SparseVector(Vector):
Returns a copy of this SparseVector as a 1-dimensional NumPy array.
"""
arr = np.zeros((self.size,), dtype=np.float64)
- for i in xrange(len(self.indices)):
- arr[self.indices[i]] = self.values[i]
+ arr[self.indices] = self.values
return arr
def __len__(self):
@@ -493,8 +507,8 @@ class SparseVector(Vector):
"""
return (isinstance(other, self.__class__)
and other.size == self.size
- and other.indices == self.indices
- and other.values == self.values)
+ and np.array_equal(other.indices, self.indices)
+ and np.array_equal(other.values, self.values))
def __ne__(self, other):
return not self.__eq__(other)
@@ -577,25 +591,34 @@ class DenseMatrix(Matrix):
"""
def __init__(self, numRows, numCols, values):
Matrix.__init__(self, numRows, numCols)
+ if isinstance(values, basestring):
+ values = np.frombuffer(values, dtype=np.float64)
+ elif not isinstance(values, np.ndarray):
+ values = np.array(values, dtype=np.float64)
assert len(values) == numRows * numCols
- if not isinstance(values, array.array):
- values = array.array('d', values)
+ if values.dtype != np.float64:
+ values.astype(np.float64)
self.values = values
def __reduce__(self):
- return DenseMatrix, (self.numRows, self.numCols, self.values)
+ return DenseMatrix, (self.numRows, self.numCols, self.values.tostring())
def toArray(self):
"""
Return an numpy.ndarray
- >>> arr = array.array('d', [float(i) for i in range(4)])
- >>> m = DenseMatrix(2, 2, arr)
+ >>> m = DenseMatrix(2, 2, range(4))
>>> m.toArray()
array([[ 0., 2.],
[ 1., 3.]])
"""
- return np.reshape(self.values, (self.numRows, self.numCols), order='F')
+ return self.values.reshape((self.numRows, self.numCols), order='F')
+
+ def __eq__(self, other):
+ return (isinstance(other, DenseMatrix) and
+ self.numRows == other.numRows and
+ self.numCols == other.numCols and
+ all(self.values == other.values))
class Matrices(object):
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 9fa4d6f6a2..8332f8e061 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -33,7 +33,8 @@ if sys.version_info[:2] <= (2, 6):
else:
import unittest
-from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector
+from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector,\
+ DenseMatrix
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.random import RandomRDDs
from pyspark.mllib.stat import Statistics
@@ -62,6 +63,7 @@ def _squared_distance(a, b):
class VectorTests(PySparkTestCase):
def _test_serialize(self, v):
+ self.assertEqual(v, ser.loads(ser.dumps(v)))
jvec = self.sc._jvm.SerDe.loads(bytearray(ser.dumps(v)))
nv = ser.loads(str(self.sc._jvm.SerDe.dumps(jvec)))
self.assertEqual(v, nv)
@@ -75,6 +77,8 @@ class VectorTests(PySparkTestCase):
self._test_serialize(DenseVector(array([1., 2., 3., 4.])))
self._test_serialize(DenseVector(pyarray.array('d', range(10))))
self._test_serialize(SparseVector(4, {1: 1, 3: 2}))
+ self._test_serialize(SparseVector(3, {}))
+ self._test_serialize(DenseMatrix(2, 3, range(6)))
def test_dot(self):
sv = SparseVector(4, {1: 1, 3: 2})