From aa1e22b17b4ce885febe6970a2451c7d17d0acfb Mon Sep 17 00:00:00 2001 From: Reza Zadeh Date: Wed, 21 Jan 2015 09:48:38 -0800 Subject: [MLlib] [SPARK-5301] Missing conversions and operations on IndexedRowMatrix and CoordinateMatrix * Transpose is missing from CoordinateMatrix (this is cheap to compute, so it should be there) * IndexedRowMatrix should be convertible to CoordinateMatrix (conversion added) Tests for both added. Author: Reza Zadeh Closes #4089 from rezazadeh/matutils and squashes the following commits: ec5238b [Reza Zadeh] Array -> Iterator to avoid temp array 3ce0b5d [Reza Zadeh] Array -> Iterator bbc907a [Reza Zadeh] Use 'i' for index, and zipWithIndex cb10ae5 [Reza Zadeh] remove unnecessary import a7ae048 [Reza Zadeh] Missing linear algebra utilities --- .../mllib/linalg/distributed/CoordinateMatrix.scala | 5 +++++ .../mllib/linalg/distributed/IndexedRowMatrix.scala | 17 +++++++++++++++++ .../linalg/distributed/CoordinateMatrixSuite.scala | 5 +++++ .../linalg/distributed/IndexedRowMatrixSuite.scala | 8 ++++++++ 4 files changed, 35 insertions(+) (limited to 'mllib') diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala index 06d8915f3b..b60559c853 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrix.scala @@ -69,6 +69,11 @@ class CoordinateMatrix( nRows } + /** Transposes this CoordinateMatrix. */ + def transpose(): CoordinateMatrix = { + new CoordinateMatrix(entries.map(x => MatrixEntry(x.j, x.i, x.value)), numCols(), numRows()) + } + /** Converts to IndexedRowMatrix. The number of columns must be within the integer range. 
*/ def toIndexedRowMatrix(): IndexedRowMatrix = { val nl = numCols() diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala index 181f507516..c518271f04 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrix.scala @@ -75,6 +75,23 @@ class IndexedRowMatrix( new RowMatrix(rows.map(_.vector), 0L, nCols) } + /** + * Converts this matrix to a + * [[org.apache.spark.mllib.linalg.distributed.CoordinateMatrix]]. + */ + def toCoordinateMatrix(): CoordinateMatrix = { + val entries = rows.flatMap { row => + val rowIndex = row.index + row.vector match { + case SparseVector(size, indices, values) => + Iterator.tabulate(indices.size)(i => MatrixEntry(rowIndex, indices(i), values(i))) + case DenseVector(values) => + Iterator.tabulate(values.size)(i => MatrixEntry(rowIndex, i, values(i))) + } + } + new CoordinateMatrix(entries, numRows(), numCols()) + } + /** * Computes the singular value decomposition of this IndexedRowMatrix. * Denote this matrix by A (m x n), this will compute matrices U, S, V such that A = U * S * V'. 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrixSuite.scala index f8709751ef..80bef814ce 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/CoordinateMatrixSuite.scala @@ -73,6 +73,11 @@ class CoordinateMatrixSuite extends FunSuite with MLlibTestSparkContext { assert(mat.toBreeze() === expected) } + test("transpose") { + val transposed = mat.transpose() + assert(mat.toBreeze().t === transposed.toBreeze()) + } + test("toIndexedRowMatrix") { val indexedRowMatrix = mat.toIndexedRowMatrix() val expected = BDM( diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala index 741cd4997b..b86c2ca5ff 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/IndexedRowMatrixSuite.scala @@ -80,6 +80,14 @@ class IndexedRowMatrixSuite extends FunSuite with MLlibTestSparkContext { assert(rowMat.rows.collect().toSeq === data.map(_.vector).toSeq) } + test("toCoordinateMatrix") { + val idxRowMat = new IndexedRowMatrix(indexedRows) + val coordMat = idxRowMat.toCoordinateMatrix() + assert(coordMat.numRows() === m) + assert(coordMat.numCols() === n) + assert(coordMat.toBreeze() === idxRowMat.toBreeze()) + } + test("multiply a local matrix") { val A = new IndexedRowMatrix(indexedRows) val B = Matrices.dense(3, 2, Array(0.0, 1.0, 2.0, 3.0, 4.0, 5.0)) -- cgit v1.2.3