about | summary | refs | log | tree | commit | diff
path: root/python
diff options
context:
space:
mode:
author: MechCoder <manojkumarsivaraj334@gmail.com> 2015-04-21 14:36:50 -0700
committer: Xiangrui Meng <meng@databricks.com> 2015-04-21 14:36:50 -0700
commit: 45c47fa4176ea75886a58f5d73c44afcb29aa629 (patch)
tree: 68601b1683fe06ababd86884d5a92d406097c553 /python
parent: c25ca7c5a1f2a4f88f40b0c5cdbfa927c186cfa8 (diff)
download: spark-45c47fa4176ea75886a58f5d73c44afcb29aa629.tar.gz
spark-45c47fa4176ea75886a58f5d73c44afcb29aa629.tar.bz2
spark-45c47fa4176ea75886a58f5d73c44afcb29aa629.zip
[SPARK-6845] [MLlib] [PySpark] Add isTransposed flag to DenseMatrix
Since sparse matrices now support an isTransposed flag for row-major data, DenseMatrices should do the same. Author: MechCoder <manojkumarsivaraj334@gmail.com> Closes #5455 from MechCoder/spark-6845 and squashes the following commits: 525c370 [MechCoder] minor 004a37f [MechCoder] Cast boolean to int 151f3b6 [MechCoder] [WIP] Add isTransposed to pickle DenseMatrix cc0b90a [MechCoder] [SPARK-6845] Add isTransposed flag to DenseMatrix
Diffstat (limited to 'python')
-rw-r--r-- python/pyspark/mllib/linalg.py 49
-rw-r--r-- python/pyspark/mllib/tests.py 16
2 files changed, 49 insertions, 16 deletions
diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
index ec8c879ea9..cc9a4cf8ba 100644
--- a/python/pyspark/mllib/linalg.py
+++ b/python/pyspark/mllib/linalg.py
@@ -638,9 +638,10 @@ class Matrix(object):
Represents a local matrix.
"""
- def __init__(self, numRows, numCols):
+ def __init__(self, numRows, numCols, isTransposed=False):
self.numRows = numRows
self.numCols = numCols
+ self.isTransposed = isTransposed
def toArray(self):
"""
@@ -662,14 +663,16 @@ class DenseMatrix(Matrix):
"""
Column-major dense matrix.
"""
- def __init__(self, numRows, numCols, values):
- Matrix.__init__(self, numRows, numCols)
+ def __init__(self, numRows, numCols, values, isTransposed=False):
+ Matrix.__init__(self, numRows, numCols, isTransposed)
values = self._convert_to_array(values, np.float64)
assert len(values) == numRows * numCols
self.values = values
def __reduce__(self):
- return DenseMatrix, (self.numRows, self.numCols, self.values.tostring())
+ return DenseMatrix, (
+ self.numRows, self.numCols, self.values.tostring(),
+ int(self.isTransposed))
def toArray(self):
"""
@@ -680,15 +683,23 @@ class DenseMatrix(Matrix):
array([[ 0., 2.],
[ 1., 3.]])
"""
- return self.values.reshape((self.numRows, self.numCols), order='F')
+ if self.isTransposed:
+ return np.asfortranarray(
+ self.values.reshape((self.numRows, self.numCols)))
+ else:
+ return self.values.reshape((self.numRows, self.numCols), order='F')
def toSparse(self):
"""Convert to SparseMatrix"""
- indices = np.nonzero(self.values)[0]
+ if self.isTransposed:
+ values = np.ravel(self.toArray(), order='F')
+ else:
+ values = self.values
+ indices = np.nonzero(values)[0]
colCounts = np.bincount(indices // self.numRows)
colPtrs = np.cumsum(np.hstack(
(0, colCounts, np.zeros(self.numCols - colCounts.size))))
- values = self.values[indices]
+ values = values[indices]
rowIndices = indices % self.numRows
return SparseMatrix(self.numRows, self.numCols, colPtrs, rowIndices, values)
@@ -701,21 +712,28 @@ class DenseMatrix(Matrix):
if j >= self.numCols or j < 0:
raise ValueError("Column index %d is out of range [0, %d)"
% (j, self.numCols))
- return self.values[i + j * self.numRows]
+
+ if self.isTransposed:
+ return self.values[i * self.numCols + j]
+ else:
+ return self.values[i + j * self.numRows]
def __eq__(self, other):
- return (isinstance(other, DenseMatrix) and
- self.numRows == other.numRows and
- self.numCols == other.numCols and
- all(self.values == other.values))
+ if (not isinstance(other, DenseMatrix) or
+ self.numRows != other.numRows or
+ self.numCols != other.numCols):
+ return False
+
+ self_values = np.ravel(self.toArray(), order='F')
+ other_values = np.ravel(other.toArray(), order='F')
+ return all(self_values == other_values)
class SparseMatrix(Matrix):
"""Sparse Matrix stored in CSC format."""
def __init__(self, numRows, numCols, colPtrs, rowIndices, values,
isTransposed=False):
- Matrix.__init__(self, numRows, numCols)
- self.isTransposed = isTransposed
+ Matrix.__init__(self, numRows, numCols, isTransposed)
self.colPtrs = self._convert_to_array(colPtrs, np.int32)
self.rowIndices = self._convert_to_array(rowIndices, np.int32)
self.values = self._convert_to_array(values, np.float64)
@@ -777,8 +795,7 @@ class SparseMatrix(Matrix):
return A
def toDense(self):
- densevals = np.reshape(
- self.toArray(), (self.numRows * self.numCols), order='F')
+ densevals = np.ravel(self.toArray(), order='F')
return DenseMatrix(self.numRows, self.numCols, densevals)
# TODO: More efficient implementation:
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 849c88341a..8f89e2cee0 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -195,6 +195,22 @@ class VectorTests(PySparkTestCase):
self.assertEquals(expected[i][j], sm1t[i, j])
self.assertTrue(array_equal(sm1t.toArray(), expected))
+ def test_dense_matrix_is_transposed(self):
+ mat1 = DenseMatrix(3, 2, [0, 4, 1, 6, 3, 9], isTransposed=True)
+ mat = DenseMatrix(3, 2, [0, 1, 3, 4, 6, 9])
+ self.assertEquals(mat1, mat)
+
+ expected = [[0, 4], [1, 6], [3, 9]]
+ for i in range(3):
+ for j in range(2):
+ self.assertEquals(mat1[i, j], expected[i][j])
+ self.assertTrue(array_equal(mat1.toArray(), expected))
+
+ sm = mat1.toSparse()
+ self.assertTrue(array_equal(sm.rowIndices, [1, 2, 0, 1, 2]))
+ self.assertTrue(array_equal(sm.colPtrs, [0, 2, 5]))
+ self.assertTrue(array_equal(sm.values, [1, 3, 4, 6, 9]))
+
class ListTests(PySparkTestCase):