From d75496b1898dace4da1cf95e53c38093f8f95221 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 30 Sep 2014 17:10:36 -0700 Subject: [SPARK-3701][MLLIB] update python linalg api and small fixes 1. doc updates 2. simple checks on vector dimensions 3. use column major for matrices davies jkbradley Author: Xiangrui Meng Closes #2548 from mengxr/mllib-py-clean and squashes the following commits: 6dce2df [Xiangrui Meng] address comments 116b5db [Xiangrui Meng] use np.dot instead of array.dot 75f2fcc [Xiangrui Meng] fix python style fefce00 [Xiangrui Meng] better check of vector size with more tests 067ef71 [Xiangrui Meng] majored -> major ef853f9 [Xiangrui Meng] update python linalg api and small fixes --- python/pyspark/mllib/linalg.py | 150 +++++++++++++++++++++++++++++++++-------- 1 file changed, 121 insertions(+), 29 deletions(-) (limited to 'python/pyspark') diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py index 0a5dcaac55..51014a8ceb 100644 --- a/python/pyspark/mllib/linalg.py +++ b/python/pyspark/mllib/linalg.py @@ -63,6 +63,41 @@ def _convert_to_vector(l): raise TypeError("Cannot convert type %s into Vector" % type(l)) +def _vector_size(v): + """ + Returns the size of the vector. + + >>> _vector_size([1., 2., 3.]) + 3 + >>> _vector_size((1., 2., 3.)) + 3 + >>> _vector_size(array.array('d', [1., 2., 3.])) + 3 + >>> _vector_size(np.zeros(3)) + 3 + >>> _vector_size(np.zeros((3, 1))) + 3 + >>> _vector_size(np.zeros((1, 3))) + Traceback (most recent call last): + ... + ValueError: Cannot treat an ndarray of shape (1, 3) as a vector + """ + if isinstance(v, Vector): + return len(v) + elif type(v) in (array.array, list, tuple): + return len(v) + elif type(v) == np.ndarray: + if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1): + return len(v) + else: + raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape)) + elif _have_scipy and scipy.sparse.issparse(v): + assert v.shape[1] == 1, "Expected column vector" + return v.shape[0] + else: + raise TypeError("Cannot treat type %s as a vector" % type(v)) + + class Vector(object): """ Abstract class for DenseVector and SparseVector @@ -76,6 +111,9 @@ class Vector(object): class DenseVector(Vector): + """ + A dense vector represented by a value array. + """ def __init__(self, ar): if not isinstance(ar, array.array): ar = array.array('d', ar) @@ -100,15 +138,31 @@ class DenseVector(Vector): 5.0 >>> dense.dot(np.array(range(1, 3))) 5.0 + >>> dense.dot([1.,]) + Traceback (most recent call last): + ... + AssertionError: dimension mismatch + >>> dense.dot(np.reshape([1., 2., 3., 4.], (2, 2), order='F')) + array([ 5., 11.]) + >>> dense.dot(np.reshape([1., 2., 3.], (3, 1), order='F')) + Traceback (most recent call last): + ... + AssertionError: dimension mismatch """ - if isinstance(other, SparseVector): - return other.dot(self) + if type(other) == np.ndarray and other.ndim > 1: + assert len(self) == other.shape[0], "dimension mismatch" + return np.dot(self.toArray(), other) elif _have_scipy and scipy.sparse.issparse(other): - return other.transpose().dot(self.toArray())[0] - elif isinstance(other, Vector): - return np.dot(self.toArray(), other.toArray()) + assert len(self) == other.shape[0], "dimension mismatch" + return other.transpose().dot(self.toArray()) else: - return np.dot(self.toArray(), other) + assert len(self) == _vector_size(other), "dimension mismatch" + if isinstance(other, SparseVector): + return other.dot(self) + elif isinstance(other, Vector): + return np.dot(self.toArray(), other.toArray()) + else: + return np.dot(self.toArray(), other) def squared_distance(self, other): """ @@ -126,7 +180,16 @@ class DenseVector(Vector): >>> sparse1 = SparseVector(2, [0, 1], [2., 1.]) >>> dense1.squared_distance(sparse1) 2.0 + >>> dense1.squared_distance([1.,]) + Traceback (most recent call last): + ... + AssertionError: dimension mismatch + >>> dense1.squared_distance(SparseVector(1, [0,], [1.,])) + Traceback (most recent call last): + ... + AssertionError: dimension mismatch """ + assert len(self) == _vector_size(other), "dimension mismatch" if isinstance(other, SparseVector): return other.squared_distance(self) elif _have_scipy and scipy.sparse.issparse(other): @@ -165,12 +228,10 @@ class DenseVector(Vector): class SparseVector(Vector): - """ A simple sparse vector class for passing data to MLlib. Users may alternatively pass SciPy's {scipy.sparse} data types. """ - def __init__(self, size, *args): """ Create a sparse vector, using either a dictionary, a list of @@ -222,20 +283,33 @@ class SparseVector(Vector): 0.0 >>> a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]])) array([ 22., 22.]) + >>> a.dot([1., 2., 3.]) + Traceback (most recent call last): + ... + AssertionError: dimension mismatch + >>> a.dot(np.array([1., 2.])) + Traceback (most recent call last): + ... + AssertionError: dimension mismatch + >>> a.dot(DenseVector([1., 2.])) + Traceback (most recent call last): + ... + AssertionError: dimension mismatch + >>> a.dot(np.zeros((3, 2))) + Traceback (most recent call last): + ... + AssertionError: dimension mismatch """ if type(other) == np.ndarray: - if other.ndim == 1: - result = 0.0 - for i in xrange(len(self.indices)): - result += self.values[i] * other[self.indices[i]] - return result - elif other.ndim == 2: + if other.ndim == 2: results = [self.dot(other[:, i]) for i in xrange(other.shape[1])] return np.array(results) - else: - raise Exception("Cannot call dot with %d-dimensional array" % other.ndim) + elif other.ndim > 2: + raise ValueError("Cannot call dot with %d-dimensional array" % other.ndim) + + assert len(self) == _vector_size(other), "dimension mismatch" - elif type(other) in (array.array, DenseVector): + if type(other) in (np.ndarray, array.array, DenseVector): result = 0.0 for i in xrange(len(self.indices)): result += self.values[i] * other[self.indices[i]] @@ -254,6 +328,7 @@ class SparseVector(Vector): else: j += 1 return result + else: return self.dot(_convert_to_vector(other)) @@ -273,7 +348,16 @@ class SparseVector(Vector): 30.0 >>> b.squared_distance(a) 30.0 + >>> b.squared_distance([1., 2.]) + Traceback (most recent call last): + ... + AssertionError: dimension mismatch + >>> b.squared_distance(SparseVector(3, [1,], [1.0,])) + Traceback (most recent call last): + ... + AssertionError: dimension mismatch """ + assert len(self) == _vector_size(other), "dimension mismatch" if type(other) in (list, array.array, DenseVector, np.array, np.ndarray): if type(other) is np.array and other.ndim != 1: raise Exception("Cannot call squared_distance with %d-dimensional array" % @@ -348,7 +432,6 @@ class SparseVector(Vector): >>> v1 != v2 False """ - return (isinstance(other, self.__class__) and other.size == self.size and other.indices == self.indices @@ -414,23 +497,32 @@ class Vectors(object): class Matrix(object): - """ the Matrix """ - def __init__(self, nRow, nCol): - self.nRow = nRow - self.nCol = nCol + """ + Represents a local matrix. + """ + + def __init__(self, numRows, numCols): + self.numRows = numRows + self.numCols = numCols def toArray(self): + """ + Returns its elements in a NumPy ndarray. + """ raise NotImplementedError class DenseMatrix(Matrix): - def __init__(self, nRow, nCol, values): - Matrix.__init__(self, nRow, nCol) - assert len(values) == nRow * nCol + """ + Column-major dense matrix. + """ + def __init__(self, numRows, numCols, values): + Matrix.__init__(self, numRows, numCols) + assert len(values) == numRows * numCols self.values = values def __reduce__(self): - return DenseMatrix, (self.nRow, self.nCol, self.values) + return DenseMatrix, (self.numRows, self.numCols, self.values) def toArray(self): """ @@ -439,10 +531,10 @@ class DenseMatrix(Matrix): >>> arr = array.array('d', [float(i) for i in range(4)]) >>> m = DenseMatrix(2, 2, arr) >>> m.toArray() - array([[ 0., 1.], - [ 2., 3.]]) + array([[ 0., 2.], + [ 1., 3.]]) """ - return np.ndarray((self.nRow, self.nCol), np.float64, buffer=self.values.tostring()) + return np.reshape(self.values, (self.numRows, self.numCols), order='F') def _test(): -- cgit v1.2.3