From ef8974b1b7ff177d9636d091770dff64fedc385f Mon Sep 17 00:00:00 2001
From: Burak Yavuz
Date: Sat, 31 Jan 2015 00:47:30 -0800
Subject: [SPARK-3975] Added support for BlockMatrix addition and multiplication

Support for multiplying and adding large distributed matrices!

Author: Burak Yavuz

Closes #4274 from brkyvz/SPARK-3975PR2 and squashes the following commits:

17abd59 [Burak Yavuz] added indices to error message
ac25783 [Burak Yavuz] merged master
b66fd8b [Burak Yavuz] merged master
e39baff [Burak Yavuz] addressed code review v1
2dba642 [Burak Yavuz] [SPARK-3975] Added support for BlockMatrix addition and multiplication
fb7624b [Burak Yavuz] merged master
98c58ea [Burak Yavuz] added tests
cdeb5df [Burak Yavuz] before adding tests
c9bf247 [Burak Yavuz] fixed merge conflicts
1cb0d06 [Burak Yavuz] [SPARK-3976] Added doc
f92a916 [Burak Yavuz] merge upstream
1a63b20 [Burak Yavuz] [SPARK-3974] Remove setPartition method. Isn't required
1e8bb2a [Burak Yavuz] [SPARK-3974] Change return type of cache and persist
e3d24c3 [Burak Yavuz] [SPARK-3976] Pulled upstream changes
fa3774f [Burak Yavuz] [SPARK-3976] updated matrix multiplication and addition implementation
239ab4b [Burak Yavuz] [SPARK-3974] Addressed @jkbradley's comments
add7b05 [Burak Yavuz] [SPARK-3976] Updated code according to upstream changes
e29acfd [Burak Yavuz] Merge branch 'master' of github.com:apache/spark into SPARK-3976
3127233 [Burak Yavuz] fixed merge conflicts with upstream
ba414d2 [Burak Yavuz] [SPARK-3974] fixed frobenius norm
ab6cde0 [Burak Yavuz] [SPARK-3974] Modifications cleaning code up, making size calculation more robust
9ae85aa [Burak Yavuz] [SPARK-3974] Made partitioner a variable inside BlockMatrix instead of a constructor variable
d033861 [Burak Yavuz] [SPARK-3974] Removed SubMatrixInfo and added constructor without partitioner
8e954ab [Burak Yavuz] save changes
bbeae8c [Burak Yavuz] merged master
987ea53 [Burak Yavuz] merged master
49b9586 [Burak Yavuz] [SPARK-3974] Updated testing utils from master
645afbe [Burak Yavuz] [SPARK-3974] Pull latest master
beb1edd [Burak Yavuz] merge conflicts fixed
f41d8db [Burak Yavuz] update tests
b05aabb [Burak Yavuz] [SPARK-3974] Updated tests to reflect changes
56b0546 [Burak Yavuz] updates from 3974 PR
b7b8a8f [Burak Yavuz] pull updates from master
b2dec63 [Burak Yavuz] Pull changes from 3974
19c17e8 [Burak Yavuz] [SPARK-3974] Changed blockIdRow and blockIdCol
5f062e6 [Burak Yavuz] updates with 3974
6729fbd [Burak Yavuz] Updated with respect to SPARK-3974 PR
589fbb6 [Burak Yavuz] [SPARK-3974] Code review feedback addressed
63a4858 [Burak Yavuz] added grid multiplication
aa8f086 [Burak Yavuz] [SPARK-3974] Additional comments added
7381b99 [Burak Yavuz] merge with PR1
f378e16 [Burak Yavuz] [SPARK-3974] Block Matrix Abstractions ready
b693209 [Burak Yavuz] Ready for Pull request
---
 .../scala/org/apache/spark/mllib/linalg/BLAS.scala |   7 +-
 .../mllib/linalg/distributed/BlockMatrix.scala     | 104 +++++++++++++++++++--
 .../linalg/distributed/BlockMatrixSuite.scala      | 102 +++++++++++++++++---
 3 files changed, 186 insertions(+), 27 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
index 34e0392f1b..079f7ca564 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
@@ -275,10 +275,8 @@ private[spark] object BLAS extends Serializable with Logging {
       logDebug("gemm: alpha is equal to 0. Returning C.")
     } else {
       A match {
-        case sparse: SparseMatrix =>
-          gemm(alpha, sparse, B, beta, C)
-        case dense: DenseMatrix =>
-          gemm(alpha, dense, B, beta, C)
+        case sparse: SparseMatrix => gemm(alpha, sparse, B, beta, C)
+        case dense: DenseMatrix => gemm(alpha, dense, B, beta, C)
         case _ =>
           throw new IllegalArgumentException(s"gemm doesn't support matrix type ${A.getClass}.")
       }
@@ -306,7 +304,6 @@ private[spark] object BLAS extends Serializable with Logging {
       s"The rows of C don't match the rows of A. C: ${C.numRows}, A: ${A.numRows}")
     require(B.numCols == C.numCols,
       s"The columns of C don't match the columns of B. C: ${C.numCols}, B: ${B.numCols}")
-
     nativeBLAS.dgemm(tAstr, tBstr, A.numRows, B.numCols, A.numCols, alpha, A.values, lda,
       B.values, ldb, beta, C.values, C.numRows)
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala
index a6405975eb..3871152d06 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala
@@ -22,7 +22,7 @@ import scala.collection.mutable.ArrayBuffer
 import breeze.linalg.{DenseMatrix => BDM}
 
 import org.apache.spark.{SparkException, Logging, Partitioner}
-import org.apache.spark.mllib.linalg.{DenseMatrix, Matrix}
+import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Matrix, SparseMatrix}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
 
@@ -45,8 +45,8 @@ private[mllib] class GridPartitioner(
   require(rowsPerPart > 0)
   require(colsPerPart > 0)
 
-  private val rowPartitions = math.ceil(rows / rowsPerPart).toInt
-  private val colPartitions = math.ceil(cols / colsPerPart).toInt
+  private val rowPartitions = math.ceil(rows * 1.0 / rowsPerPart).toInt
+  private val colPartitions = math.ceil(cols * 1.0 / colsPerPart).toInt
 
   override val numPartitions = rowPartitions * colPartitions
 
@@ -106,8 +106,9 @@ private[mllib] object GridPartitioner {
 /**
  * Represents a distributed matrix in blocks of local matrices.
  *
- * @param blocks The RDD of sub-matrix blocks (blockRowIndex, blockColIndex, sub-matrix) that form
- *               this distributed matrix.
+ * @param blocks The RDD of sub-matrix blocks ((blockRowIndex, blockColIndex), sub-matrix) that
+ *               form this distributed matrix. If multiple blocks with the same index exist, the
+ *               results for operations like add and multiply will be unpredictable.
  * @param rowsPerBlock Number of rows that make up each block. The blocks forming the final
  *                     rows are not required to have the given number of rows
  * @param colsPerBlock Number of columns that make up each block. The blocks forming the final
@@ -129,17 +130,19 @@ class BlockMatrix(
 
   /**
    * Alternate constructor for BlockMatrix without the input of the number of rows and columns.
    *
-   * @param rdd The RDD of SubMatrices (local matrices) that form this matrix
+   * @param blocks The RDD of sub-matrix blocks ((blockRowIndex, blockColIndex), sub-matrix) that
+   *               form this distributed matrix. If multiple blocks with the same index exist, the
+   *               results for operations like add and multiply will be unpredictable.
    * @param rowsPerBlock Number of rows that make up each block. The blocks forming the final
    *                     rows are not required to have the given number of rows
   * @param colsPerBlock Number of columns that make up each block. The blocks forming the final
   *                     columns are not required to have the given number of columns
   */
   def this(
-      rdd: RDD[((Int, Int), Matrix)],
+      blocks: RDD[((Int, Int), Matrix)],
       rowsPerBlock: Int,
       colsPerBlock: Int) = {
-    this(rdd, rowsPerBlock, colsPerBlock, 0L, 0L)
+    this(blocks, rowsPerBlock, colsPerBlock, 0L, 0L)
   }
 
   override def numRows(): Long = {
@@ -155,7 +158,7 @@ class BlockMatrix(
   val numRowBlocks = math.ceil(numRows() * 1.0 / rowsPerBlock).toInt
   val numColBlocks = math.ceil(numCols() * 1.0 / colsPerBlock).toInt
 
-  private[mllib] var partitioner: GridPartitioner =
+  private[mllib] def createPartitioner(): GridPartitioner =
     GridPartitioner(numRowBlocks, numColBlocks, suggestedNumPartitions = blocks.partitions.size)
 
   private lazy val blockInfo = blocks.mapValues(block => (block.numRows, block.numCols)).cache()
@@ -255,7 +258,6 @@ class BlockMatrix(
     val n = numCols().toInt
     val mem = m * n / 125000
     if (mem > 500) logWarning(s"Storing this matrix will require $mem MB of memory!")
-
     val localBlocks = blocks.collect()
     val values = new Array[Double](m * n)
     localBlocks.foreach { case ((blockRowIndex, blockColIndex), submat) =>
@@ -283,4 +285,86 @@ class BlockMatrix(
     val localMat = toLocalMatrix()
     new BDM[Double](localMat.numRows, localMat.numCols, localMat.toArray)
   }
+
+  /** Adds two block matrices together. The matrices must have the same size and matching
+    * `rowsPerBlock` and `colsPerBlock` values. If either of the sub-matrix blocks being added
+    * is an instance of [[SparseMatrix]], the resulting sub-matrix will also be a
+    * [[SparseMatrix]], even if it is being added to a [[DenseMatrix]]. If two dense
+    * sub-matrices are added, the output will also be a [[DenseMatrix]].
+    */
+  def add(other: BlockMatrix): BlockMatrix = {
+    require(numRows() == other.numRows(), "Both matrices must have the same number of rows. " +
+      s"A.numRows: ${numRows()}, B.numRows: ${other.numRows()}")
+    require(numCols() == other.numCols(), "Both matrices must have the same number of columns. " +
+      s"A.numCols: ${numCols()}, B.numCols: ${other.numCols()}")
+    if (rowsPerBlock == other.rowsPerBlock && colsPerBlock == other.colsPerBlock) {
+      val addedBlocks = blocks.cogroup(other.blocks, createPartitioner())
+        .map { case ((blockRowIndex, blockColIndex), (a, b)) =>
+          if (a.size > 1 || b.size > 1) {
+            throw new SparkException("There are multiple MatrixBlocks with indices: " +
+              s"($blockRowIndex, $blockColIndex). Please remove them.")
+          }
+          if (a.isEmpty) {
+            new MatrixBlock((blockRowIndex, blockColIndex), b.head)
+          } else if (b.isEmpty) {
+            new MatrixBlock((blockRowIndex, blockColIndex), a.head)
+          } else {
+            val result = a.head.toBreeze + b.head.toBreeze
+            new MatrixBlock((blockRowIndex, blockColIndex), Matrices.fromBreeze(result))
+          }
+        }
+      new BlockMatrix(addedBlocks, rowsPerBlock, colsPerBlock, numRows(), numCols())
+    } else {
+      throw new SparkException("Cannot add matrices with different block dimensions")
+    }
+  }
+
+  /** Multiplies this [[BlockMatrix]] by `other`, another [[BlockMatrix]], computing
+    * `this * other`. The `colsPerBlock` of this matrix must equal the `rowsPerBlock` of `other`.
+    * If `other` contains [[SparseMatrix]] blocks, they will have to be converted to
+    * [[DenseMatrix]] blocks. The output [[BlockMatrix]] will only consist of [[DenseMatrix]]
+    * blocks. This may cause some performance issues until support for multiplying two sparse
+    * matrices is added.
+    */
+  def multiply(other: BlockMatrix): BlockMatrix = {
+    require(numCols() == other.numRows(), "The number of columns of A and the number of rows " +
+      s"of B must be equal. A.numCols: ${numCols()}, B.numRows: ${other.numRows()}. If you " +
+      "think they should be equal, try setting the dimensions of A and B explicitly while " +
+      "initializing them.")
+    if (colsPerBlock == other.rowsPerBlock) {
+      val resultPartitioner = GridPartitioner(numRowBlocks, other.numColBlocks,
+        math.max(blocks.partitions.length, other.blocks.partitions.length))
+      // Each block of A must be multiplied with the corresponding blocks in each column of B.
+      // TODO: Optimize to send block to a partition once, similar to ALS
+      val flatA = blocks.flatMap { case ((blockRowIndex, blockColIndex), block) =>
+        Iterator.tabulate(other.numColBlocks)(j => ((blockRowIndex, j, blockColIndex), block))
+      }
+      // Each block of B must be multiplied with the corresponding blocks in each row of A.
+      val flatB = other.blocks.flatMap { case ((blockRowIndex, blockColIndex), block) =>
+        Iterator.tabulate(numRowBlocks)(i => ((i, blockColIndex, blockRowIndex), block))
+      }
+      val newBlocks: RDD[MatrixBlock] = flatA.cogroup(flatB, resultPartitioner)
+        .flatMap { case ((blockRowIndex, blockColIndex, _), (a, b)) =>
+          if (a.size > 1 || b.size > 1) {
+            throw new SparkException("There are multiple MatrixBlocks with indices: " +
+              s"($blockRowIndex, $blockColIndex). Please remove them.")
+          }
+          if (a.nonEmpty && b.nonEmpty) {
+            val C = b.head match {
+              case dense: DenseMatrix => a.head.multiply(dense)
+              case sparse: SparseMatrix => a.head.multiply(sparse.toDense())
+              case _ => throw new SparkException(s"Unrecognized matrix type ${b.head.getClass}.")
+            }
+            Iterator(((blockRowIndex, blockColIndex), C.toBreeze))
+          } else {
+            Iterator()
+          }
+        }.reduceByKey(resultPartitioner, (a, b) => a + b)
+        .mapValues(Matrices.fromBreeze)
+      // TODO: Try to use aggregateByKey instead of reduceByKey to get rid of intermediate matrices
+      new BlockMatrix(newBlocks, rowsPerBlock, other.colsPerBlock, numRows(), other.numCols())
+    } else {
+      throw new SparkException("colsPerBlock of A doesn't match rowsPerBlock of B. " +
" + + s"A.colsPerBlock: $colsPerBlock, B.rowsPerBlock: ${other.rowsPerBlock}") + } + } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala index 461f1f92df..949d1c9939 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala @@ -17,14 +17,15 @@ package org.apache.spark.mllib.linalg.distributed -import scala.util.Random +import java.{util => ju} import breeze.linalg.{DenseMatrix => BDM} import org.scalatest.FunSuite import org.apache.spark.SparkException -import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Matrix} +import org.apache.spark.mllib.linalg.{SparseMatrix, DenseMatrix, Matrices, Matrix} import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.mllib.util.TestingUtils._ class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext { @@ -37,7 +38,6 @@ class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext { override def beforeAll() { super.beforeAll() - val blocks: Seq[((Int, Int), Matrix)] = Seq( ((0, 0), new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 2.0))), ((0, 1), new DenseMatrix(2, 2, Array(0.0, 1.0, 0.0, 0.0))), @@ -54,7 +54,7 @@ class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext { } test("grid partitioner") { - val random = new Random() + val random = new ju.Random() // This should generate a 4x4 grid of 1x2 blocks. val part0 = GridPartitioner(4, 7, suggestedNumPartitions = 12) val expected0 = Array( @@ -148,6 +148,92 @@ class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext { assert(gridBasedMat.toBreeze() === expected) } + test("add") { + val blocks: Seq[((Int, Int), Matrix)] = Seq( + ((0, 0), new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 2.0))), + ((0, 1), new DenseMatrix(2, 2, Array(0.0, 1.0, 0.0, 0.0))), + ((1, 0), new DenseMatrix(2, 2, Array(3.0, 0.0, 1.0, 1.0))), + ((1, 1), new DenseMatrix(2, 2, Array(1.0, 2.0, 0.0, 1.0))), + ((2, 0), new DenseMatrix(1, 2, Array(1.0, 0.0))), // Added block that doesn't exist in A + ((2, 1), new DenseMatrix(1, 2, Array(1.0, 5.0)))) + val rdd = sc.parallelize(blocks, numPartitions) + val B = new BlockMatrix(rdd, rowPerPart, colPerPart) + + val expected = BDM( + (2.0, 0.0, 0.0, 0.0), + (0.0, 4.0, 2.0, 0.0), + (6.0, 2.0, 2.0, 0.0), + (0.0, 2.0, 4.0, 2.0), + (1.0, 0.0, 2.0, 10.0)) + + val AplusB = gridBasedMat.add(B) + assert(AplusB.numRows() === m) + assert(AplusB.numCols() === B.numCols()) + assert(AplusB.toBreeze() === expected) + + val C = new BlockMatrix(rdd, rowPerPart, colPerPart, m, n + 1) // columns don't match + intercept[IllegalArgumentException] { + gridBasedMat.add(C) + } + val largerBlocks: Seq[((Int, Int), Matrix)] = Seq( + ((0, 0), new DenseMatrix(4, 4, new Array[Double](16))), + ((1, 0), new DenseMatrix(1, 4, Array(1.0, 0.0, 1.0, 5.0)))) + val C2 = new BlockMatrix(sc.parallelize(largerBlocks, numPartitions), 4, 4, m, n) + intercept[SparkException] { // partitioning doesn't match + gridBasedMat.add(C2) + } + // adding BlockMatrices composed of SparseMatrices + val sparseBlocks = for (i <- 0 until 4) yield ((i / 2, i % 2), SparseMatrix.speye(4)) + val denseBlocks = for (i <- 0 until 4) yield ((i / 2, i % 2), DenseMatrix.eye(4)) + val sparseBM = new BlockMatrix(sc.makeRDD(sparseBlocks, 4), 4, 4, 8, 8) + val denseBM = new BlockMatrix(sc.makeRDD(denseBlocks, 4), 4, 4, 8, 8) + + 
+    assert(sparseBM.add(sparseBM).toBreeze() === sparseBM.add(denseBM).toBreeze())
+  }
+
+  test("multiply") {
+    // identity matrix
+    val blocks: Seq[((Int, Int), Matrix)] = Seq(
+      ((0, 0), new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 1.0))),
+      ((1, 1), new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 1.0))))
+    val rdd = sc.parallelize(blocks, 2)
+    val B = new BlockMatrix(rdd, colPerPart, rowPerPart)
+    val expected = BDM(
+      (1.0, 0.0, 0.0, 0.0),
+      (0.0, 2.0, 1.0, 0.0),
+      (3.0, 1.0, 1.0, 0.0),
+      (0.0, 1.0, 2.0, 1.0),
+      (0.0, 0.0, 1.0, 5.0))
+
+    val AtimesB = gridBasedMat.multiply(B)
+    assert(AtimesB.numRows() === m)
+    assert(AtimesB.numCols() === n)
+    assert(AtimesB.toBreeze() === expected)
+    val C = new BlockMatrix(rdd, rowPerPart, colPerPart, m + 1, n) // dimensions don't match
+    intercept[IllegalArgumentException] {
+      gridBasedMat.multiply(C)
+    }
+    val largerBlocks = Seq(((0, 0), DenseMatrix.eye(4)))
+    val C2 = new BlockMatrix(sc.parallelize(largerBlocks, numPartitions), 4, 4)
+    intercept[SparkException] {
+      // partitioning doesn't match
+      gridBasedMat.multiply(C2)
+    }
+    val rand = new ju.Random(42)
+    val largerAblocks = for (i <- 0 until 20) yield ((i % 5, i / 5), DenseMatrix.rand(6, 4, rand))
+    val largerBblocks = for (i <- 0 until 16) yield ((i % 4, i / 4), DenseMatrix.rand(4, 4, rand))
+
+    // Try it with increased number of partitions
+    val largeA = new BlockMatrix(sc.parallelize(largerAblocks, 10), 6, 4)
+    val largeB = new BlockMatrix(sc.parallelize(largerBblocks, 8), 4, 4)
+    val largeC = largeA.multiply(largeB)
+    val localC = largeC.toLocalMatrix()
+    val result = largeA.toLocalMatrix().multiply(largeB.toLocalMatrix().asInstanceOf[DenseMatrix])
+    assert(largeC.numRows() === largeA.numRows())
+    assert(largeC.numCols() === largeB.numCols())
+    assert(localC ~== result absTol 1e-8)
+  }
+
   test("validate") {
     // No error
     gridBasedMat.validate()
@@ -201,14 +287,6 @@ class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext {
     assert(AT.numCols() === gridBasedMat.numRows())
     assert(AT.toBreeze() === expected)
 
-    // partitioner must update as well
-    val originalPartitioner = gridBasedMat.partitioner
-    val ATpartitioner = AT.partitioner
-    assert(originalPartitioner.colsPerPart === ATpartitioner.rowsPerPart)
-    assert(originalPartitioner.rowsPerPart === ATpartitioner.colsPerPart)
-    assert(originalPartitioner.cols === ATpartitioner.rows)
-    assert(originalPartitioner.rows === ATpartitioner.cols)
-
     // make sure it works when matrices are cached as well
     gridBasedMat.cache()
     val AT2 = gridBasedMat.transpose
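Usage note (editorial addition, not part of the committed patch): the following is a minimal, hypothetical sketch of the two operations this patch adds, assuming a live SparkContext named `sc`; the block layout and values are illustrative only and mirror the identity-matrix setup used in the test suite above.

    import org.apache.spark.mllib.linalg.{DenseMatrix, Matrix}
    import org.apache.spark.mllib.linalg.distributed.BlockMatrix

    // A 4x4 identity matrix stored as two 2x2 diagonal blocks; all-zero
    // off-diagonal blocks are simply omitted from the RDD.
    val eyeBlocks: Seq[((Int, Int), Matrix)] = Seq(
      ((0, 0), DenseMatrix.eye(2)),
      ((1, 1), DenseMatrix.eye(2)))
    val A = new BlockMatrix(sc.parallelize(eyeBlocks, 2), 2, 2) // dimensions computed lazily
    val B = new BlockMatrix(sc.parallelize(eyeBlocks, 2), 2, 2)

    val sum = A.add(B)          // requires matching rowsPerBlock and colsPerBlock
    val product = A.multiply(B) // requires A.colsPerBlock == B.rowsPerBlock

    // toLocalMatrix() collects every block to the driver; only safe for small matrices.
    println(sum.toLocalMatrix())     // 2 * I
    println(product.toLocalMatrix()) // I

Since A and B here are identities, `sum` is twice the identity and `product` is the identity. Per the scaladoc in the patch, `add` preserves sparse sub-matrices, while `multiply` densifies any sparse blocks and always returns dense blocks.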