aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
authorBurak Yavuz <brkyvz@gmail.com>2015-01-31 00:47:30 -0800
committerXiangrui Meng <meng@databricks.com>2015-01-31 00:47:30 -0800
commitef8974b1b7ff177d9636d091770dff64fedc385f (patch)
tree38ce3d36dbb724fd812c30d49ab3be7467a9739d /mllib
parent34250a613cee39b03f05f2d54ae37029abd8a502 (diff)
downloadspark-ef8974b1b7ff177d9636d091770dff64fedc385f.tar.gz
spark-ef8974b1b7ff177d9636d091770dff64fedc385f.tar.bz2
spark-ef8974b1b7ff177d9636d091770dff64fedc385f.zip
[SPARK-3975] Added support for BlockMatrix addition and multiplication
Support for multiplying and adding large distributed matrices! Author: Burak Yavuz <brkyvz@gmail.com> Author: Burak Yavuz <brkyvz@dn51t42l.sunet> Author: Burak Yavuz <brkyvz@dn51t4rd.sunet> Author: Burak Yavuz <brkyvz@dn0a221430.sunet> Author: Burak Yavuz <brkyvz@dn0a22b17d.sunet> Closes #4274 from brkyvz/SPARK-3975PR2 and squashes the following commits: 17abd59 [Burak Yavuz] added indices to error message ac25783 [Burak Yavuz] merged masyer b66fd8b [Burak Yavuz] merged masyer e39baff [Burak Yavuz] addressed code review v1 2dba642 [Burak Yavuz] [SPARK-3975] Added support for BlockMatrix addition and multiplication fb7624b [Burak Yavuz] merged master 98c58ea [Burak Yavuz] added tests cdeb5df [Burak Yavuz] before adding tests c9bf247 [Burak Yavuz] fixed merge conflicts 1cb0d06 [Burak Yavuz] [SPARK-3976] Added doc f92a916 [Burak Yavuz] merge upstream 1a63b20 [Burak Yavuz] [SPARK-3974] Remove setPartition method. Isn't required 1e8bb2a [Burak Yavuz] [SPARK-3974] Change return type of cache and persist e3d24c3 [Burak Yavuz] [SPARK-3976] Pulled upstream changes fa3774f [Burak Yavuz] [SPARK-3976] updated matrix multiplication and addition implementation 239ab4b [Burak Yavuz] [SPARK-3974] Addressed @jkbradley's comments add7b05 [Burak Yavuz] [SPARK-3976] Updated code according to upstream changes e29acfd [Burak Yavuz] Merge branch 'master' of github.com:apache/spark into SPARK-3976 3127233 [Burak Yavuz] fixed merge conflicts with upstream ba414d2 [Burak Yavuz] [SPARK-3974] fixed frobenius norm ab6cde0 [Burak Yavuz] [SPARK-3974] Modifications cleaning code up, making size calculation more robust 9ae85aa [Burak Yavuz] [SPARK-3974] Made partitioner a variable inside BlockMatrix instead of a constructor variable d033861 [Burak Yavuz] [SPARK-3974] Removed SubMatrixInfo and added constructor without partitioner 8e954ab [Burak Yavuz] save changes bbeae8c [Burak Yavuz] merged master 987ea53 [Burak Yavuz] merged master 49b9586 [Burak Yavuz] [SPARK-3974] Updated testing utils from master 645afbe [Burak Yavuz] [SPARK-3974] Pull latest master beb1edd [Burak Yavuz] merge conflicts fixed f41d8db [Burak Yavuz] update tests b05aabb [Burak Yavuz] [SPARK-3974] Updated tests to reflect changes 56b0546 [Burak Yavuz] updates from 3974 PR b7b8a8f [Burak Yavuz] pull updates from master b2dec63 [Burak Yavuz] Pull changes from 3974 19c17e8 [Burak Yavuz] [SPARK-3974] Changed blockIdRow and blockIdCol 5f062e6 [Burak Yavuz] updates with 3974 6729fbd [Burak Yavuz] Updated with respect to SPARK-3974 PR 589fbb6 [Burak Yavuz] [SPARK-3974] Code review feedback addressed 63a4858 [Burak Yavuz] added grid multiplication aa8f086 [Burak Yavuz] [SPARK-3974] Additional comments added 7381b99 [Burak Yavuz] merge with PR1 f378e16 [Burak Yavuz] [SPARK-3974] Block Matrix Abstractions ready b693209 [Burak Yavuz] Ready for Pull request
Diffstat (limited to 'mllib')
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala7
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala104
-rw-r--r--mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala102
3 files changed, 186 insertions, 27 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
index 34e0392f1b..079f7ca564 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
@@ -275,10 +275,8 @@ private[spark] object BLAS extends Serializable with Logging {
logDebug("gemm: alpha is equal to 0. Returning C.")
} else {
A match {
- case sparse: SparseMatrix =>
- gemm(alpha, sparse, B, beta, C)
- case dense: DenseMatrix =>
- gemm(alpha, dense, B, beta, C)
+ case sparse: SparseMatrix => gemm(alpha, sparse, B, beta, C)
+ case dense: DenseMatrix => gemm(alpha, dense, B, beta, C)
case _ =>
throw new IllegalArgumentException(s"gemm doesn't support matrix type ${A.getClass}.")
}
@@ -306,7 +304,6 @@ private[spark] object BLAS extends Serializable with Logging {
s"The rows of C don't match the rows of A. C: ${C.numRows}, A: ${A.numRows}")
require(B.numCols == C.numCols,
s"The columns of C don't match the columns of B. C: ${C.numCols}, A: ${B.numCols}")
-
nativeBLAS.dgemm(tAstr, tBstr, A.numRows, B.numCols, A.numCols, alpha, A.values, lda,
B.values, ldb, beta, C.values, C.numRows)
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala
index a6405975eb..3871152d06 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala
@@ -22,7 +22,7 @@ import scala.collection.mutable.ArrayBuffer
import breeze.linalg.{DenseMatrix => BDM}
import org.apache.spark.{SparkException, Logging, Partitioner}
-import org.apache.spark.mllib.linalg.{DenseMatrix, Matrix}
+import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Matrix, SparseMatrix}
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
@@ -45,8 +45,8 @@ private[mllib] class GridPartitioner(
require(rowsPerPart > 0)
require(colsPerPart > 0)
- private val rowPartitions = math.ceil(rows / rowsPerPart).toInt
- private val colPartitions = math.ceil(cols / colsPerPart).toInt
+ private val rowPartitions = math.ceil(rows * 1.0 / rowsPerPart).toInt
+ private val colPartitions = math.ceil(cols * 1.0 / colsPerPart).toInt
override val numPartitions = rowPartitions * colPartitions
@@ -106,8 +106,9 @@ private[mllib] object GridPartitioner {
/**
* Represents a distributed matrix in blocks of local matrices.
*
- * @param blocks The RDD of sub-matrix blocks (blockRowIndex, blockColIndex, sub-matrix) that form
- * this distributed matrix.
+ * @param blocks The RDD of sub-matrix blocks ((blockRowIndex, blockColIndex), sub-matrix) that
+ * form this distributed matrix. If multiple blocks with the same index exist, the
+ * results for operations like add and multiply will be unpredictable.
* @param rowsPerBlock Number of rows that make up each block. The blocks forming the final
* rows are not required to have the given number of rows
* @param colsPerBlock Number of columns that make up each block. The blocks forming the final
@@ -129,17 +130,19 @@ class BlockMatrix(
/**
* Alternate constructor for BlockMatrix without the input of the number of rows and columns.
*
- * @param rdd The RDD of SubMatrices (local matrices) that form this matrix
+ * @param blocks The RDD of sub-matrix blocks ((blockRowIndex, blockColIndex), sub-matrix) that
+ * form this distributed matrix. If multiple blocks with the same index exist, the
+ * results for operations like add and multiply will be unpredictable.
* @param rowsPerBlock Number of rows that make up each block. The blocks forming the final
* rows are not required to have the given number of rows
* @param colsPerBlock Number of columns that make up each block. The blocks forming the final
* columns are not required to have the given number of columns
*/
def this(
- rdd: RDD[((Int, Int), Matrix)],
+ blocks: RDD[((Int, Int), Matrix)],
rowsPerBlock: Int,
colsPerBlock: Int) = {
- this(rdd, rowsPerBlock, colsPerBlock, 0L, 0L)
+ this(blocks, rowsPerBlock, colsPerBlock, 0L, 0L)
}
override def numRows(): Long = {
@@ -155,7 +158,7 @@ class BlockMatrix(
val numRowBlocks = math.ceil(numRows() * 1.0 / rowsPerBlock).toInt
val numColBlocks = math.ceil(numCols() * 1.0 / colsPerBlock).toInt
- private[mllib] var partitioner: GridPartitioner =
+ private[mllib] def createPartitioner(): GridPartitioner =
GridPartitioner(numRowBlocks, numColBlocks, suggestedNumPartitions = blocks.partitions.size)
private lazy val blockInfo = blocks.mapValues(block => (block.numRows, block.numCols)).cache()
@@ -255,7 +258,6 @@ class BlockMatrix(
val n = numCols().toInt
val mem = m * n / 125000
if (mem > 500) logWarning(s"Storing this matrix will require $mem MB of memory!")
-
val localBlocks = blocks.collect()
val values = new Array[Double](m * n)
localBlocks.foreach { case ((blockRowIndex, blockColIndex), submat) =>
@@ -283,4 +285,86 @@ class BlockMatrix(
val localMat = toLocalMatrix()
new BDM[Double](localMat.numRows, localMat.numCols, localMat.toArray)
}
+
+ /** Adds two block matrices together. The matrices must have the same size and matching
+ * `rowsPerBlock` and `colsPerBlock` values. If one of the blocks that are being added are
+ * instances of [[SparseMatrix]], the resulting sub matrix will also be a [[SparseMatrix]], even
+ * if it is being added to a [[DenseMatrix]]. If two dense matrices are added, the output will
+ * also be a [[DenseMatrix]].
+ */
+ def add(other: BlockMatrix): BlockMatrix = {
+ require(numRows() == other.numRows(), "Both matrices must have the same number of rows. " +
+ s"A.numRows: ${numRows()}, B.numRows: ${other.numRows()}")
+ require(numCols() == other.numCols(), "Both matrices must have the same number of columns. " +
+ s"A.numCols: ${numCols()}, B.numCols: ${other.numCols()}")
+ if (rowsPerBlock == other.rowsPerBlock && colsPerBlock == other.colsPerBlock) {
+ val addedBlocks = blocks.cogroup(other.blocks, createPartitioner())
+ .map { case ((blockRowIndex, blockColIndex), (a, b)) =>
+ if (a.size > 1 || b.size > 1) {
+ throw new SparkException("There are multiple MatrixBlocks with indices: " +
+ s"($blockRowIndex, $blockColIndex). Please remove them.")
+ }
+ if (a.isEmpty) {
+ new MatrixBlock((blockRowIndex, blockColIndex), b.head)
+ } else if (b.isEmpty) {
+ new MatrixBlock((blockRowIndex, blockColIndex), a.head)
+ } else {
+ val result = a.head.toBreeze + b.head.toBreeze
+ new MatrixBlock((blockRowIndex, blockColIndex), Matrices.fromBreeze(result))
+ }
+ }
+ new BlockMatrix(addedBlocks, rowsPerBlock, colsPerBlock, numRows(), numCols())
+ } else {
+ throw new SparkException("Cannot add matrices with different block dimensions")
+ }
+ }
+
+ /** Left multiplies this [[BlockMatrix]] to `other`, another [[BlockMatrix]]. The `colsPerBlock`
+ * of this matrix must equal the `rowsPerBlock` of `other`. If `other` contains
+ * [[SparseMatrix]], they will have to be converted to a [[DenseMatrix]]. The output
+ * [[BlockMatrix]] will only consist of blocks of [[DenseMatrix]]. This may cause
+ * some performance issues until support for multiplying two sparse matrices is added.
+ */
+ def multiply(other: BlockMatrix): BlockMatrix = {
+ require(numCols() == other.numRows(), "The number of columns of A and the number of rows " +
+ s"of B must be equal. A.numCols: ${numCols()}, B.numRows: ${other.numRows()}. If you " +
+ "think they should be equal, try setting the dimensions of A and B explicitly while " +
+ "initializing them.")
+ if (colsPerBlock == other.rowsPerBlock) {
+ val resultPartitioner = GridPartitioner(numRowBlocks, other.numColBlocks,
+ math.max(blocks.partitions.length, other.blocks.partitions.length))
+ // Each block of A must be multiplied with the corresponding blocks in each column of B.
+ // TODO: Optimize to send block to a partition once, similar to ALS
+ val flatA = blocks.flatMap { case ((blockRowIndex, blockColIndex), block) =>
+ Iterator.tabulate(other.numColBlocks)(j => ((blockRowIndex, j, blockColIndex), block))
+ }
+ // Each block of B must be multiplied with the corresponding blocks in each row of A.
+ val flatB = other.blocks.flatMap { case ((blockRowIndex, blockColIndex), block) =>
+ Iterator.tabulate(numRowBlocks)(i => ((i, blockColIndex, blockRowIndex), block))
+ }
+ val newBlocks: RDD[MatrixBlock] = flatA.cogroup(flatB, resultPartitioner)
+ .flatMap { case ((blockRowIndex, blockColIndex, _), (a, b)) =>
+ if (a.size > 1 || b.size > 1) {
+ throw new SparkException("There are multiple MatrixBlocks with indices: " +
+ s"($blockRowIndex, $blockColIndex). Please remove them.")
+ }
+ if (a.nonEmpty && b.nonEmpty) {
+ val C = b.head match {
+ case dense: DenseMatrix => a.head.multiply(dense)
+ case sparse: SparseMatrix => a.head.multiply(sparse.toDense())
+ case _ => throw new SparkException(s"Unrecognized matrix type ${b.head.getClass}.")
+ }
+ Iterator(((blockRowIndex, blockColIndex), C.toBreeze))
+ } else {
+ Iterator()
+ }
+ }.reduceByKey(resultPartitioner, (a, b) => a + b)
+ .mapValues(Matrices.fromBreeze)
+ // TODO: Try to use aggregateByKey instead of reduceByKey to get rid of intermediate matrices
+ new BlockMatrix(newBlocks, rowsPerBlock, other.colsPerBlock, numRows(), other.numCols())
+ } else {
+ throw new SparkException("colsPerBlock of A doesn't match rowsPerBlock of B. " +
+ s"A.colsPerBlock: $colsPerBlock, B.rowsPerBlock: ${other.rowsPerBlock}")
+ }
+ }
}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala
index 461f1f92df..949d1c9939 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala
@@ -17,14 +17,15 @@
package org.apache.spark.mllib.linalg.distributed
-import scala.util.Random
+import java.{util => ju}
import breeze.linalg.{DenseMatrix => BDM}
import org.scalatest.FunSuite
import org.apache.spark.SparkException
-import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Matrix}
+import org.apache.spark.mllib.linalg.{SparseMatrix, DenseMatrix, Matrices, Matrix}
import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext {
@@ -37,7 +38,6 @@ class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext {
override def beforeAll() {
super.beforeAll()
-
val blocks: Seq[((Int, Int), Matrix)] = Seq(
((0, 0), new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 2.0))),
((0, 1), new DenseMatrix(2, 2, Array(0.0, 1.0, 0.0, 0.0))),
@@ -54,7 +54,7 @@ class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext {
}
test("grid partitioner") {
- val random = new Random()
+ val random = new ju.Random()
// This should generate a 4x4 grid of 1x2 blocks.
val part0 = GridPartitioner(4, 7, suggestedNumPartitions = 12)
val expected0 = Array(
@@ -148,6 +148,92 @@ class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext {
assert(gridBasedMat.toBreeze() === expected)
}
+ test("add") {
+ val blocks: Seq[((Int, Int), Matrix)] = Seq(
+ ((0, 0), new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 2.0))),
+ ((0, 1), new DenseMatrix(2, 2, Array(0.0, 1.0, 0.0, 0.0))),
+ ((1, 0), new DenseMatrix(2, 2, Array(3.0, 0.0, 1.0, 1.0))),
+ ((1, 1), new DenseMatrix(2, 2, Array(1.0, 2.0, 0.0, 1.0))),
+ ((2, 0), new DenseMatrix(1, 2, Array(1.0, 0.0))), // Added block that doesn't exist in A
+ ((2, 1), new DenseMatrix(1, 2, Array(1.0, 5.0))))
+ val rdd = sc.parallelize(blocks, numPartitions)
+ val B = new BlockMatrix(rdd, rowPerPart, colPerPart)
+
+ val expected = BDM(
+ (2.0, 0.0, 0.0, 0.0),
+ (0.0, 4.0, 2.0, 0.0),
+ (6.0, 2.0, 2.0, 0.0),
+ (0.0, 2.0, 4.0, 2.0),
+ (1.0, 0.0, 2.0, 10.0))
+
+ val AplusB = gridBasedMat.add(B)
+ assert(AplusB.numRows() === m)
+ assert(AplusB.numCols() === B.numCols())
+ assert(AplusB.toBreeze() === expected)
+
+ val C = new BlockMatrix(rdd, rowPerPart, colPerPart, m, n + 1) // columns don't match
+ intercept[IllegalArgumentException] {
+ gridBasedMat.add(C)
+ }
+ val largerBlocks: Seq[((Int, Int), Matrix)] = Seq(
+ ((0, 0), new DenseMatrix(4, 4, new Array[Double](16))),
+ ((1, 0), new DenseMatrix(1, 4, Array(1.0, 0.0, 1.0, 5.0))))
+ val C2 = new BlockMatrix(sc.parallelize(largerBlocks, numPartitions), 4, 4, m, n)
+ intercept[SparkException] { // partitioning doesn't match
+ gridBasedMat.add(C2)
+ }
+ // adding BlockMatrices composed of SparseMatrices
+ val sparseBlocks = for (i <- 0 until 4) yield ((i / 2, i % 2), SparseMatrix.speye(4))
+ val denseBlocks = for (i <- 0 until 4) yield ((i / 2, i % 2), DenseMatrix.eye(4))
+ val sparseBM = new BlockMatrix(sc.makeRDD(sparseBlocks, 4), 4, 4, 8, 8)
+ val denseBM = new BlockMatrix(sc.makeRDD(denseBlocks, 4), 4, 4, 8, 8)
+
+ assert(sparseBM.add(sparseBM).toBreeze() === sparseBM.add(denseBM).toBreeze())
+ }
+
+ test("multiply") {
+ // identity matrix
+ val blocks: Seq[((Int, Int), Matrix)] = Seq(
+ ((0, 0), new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 1.0))),
+ ((1, 1), new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 1.0))))
+ val rdd = sc.parallelize(blocks, 2)
+ val B = new BlockMatrix(rdd, colPerPart, rowPerPart)
+ val expected = BDM(
+ (1.0, 0.0, 0.0, 0.0),
+ (0.0, 2.0, 1.0, 0.0),
+ (3.0, 1.0, 1.0, 0.0),
+ (0.0, 1.0, 2.0, 1.0),
+ (0.0, 0.0, 1.0, 5.0))
+
+ val AtimesB = gridBasedMat.multiply(B)
+ assert(AtimesB.numRows() === m)
+ assert(AtimesB.numCols() === n)
+ assert(AtimesB.toBreeze() === expected)
+ val C = new BlockMatrix(rdd, rowPerPart, colPerPart, m + 1, n) // dimensions don't match
+ intercept[IllegalArgumentException] {
+ gridBasedMat.multiply(C)
+ }
+ val largerBlocks = Seq(((0, 0), DenseMatrix.eye(4)))
+ val C2 = new BlockMatrix(sc.parallelize(largerBlocks, numPartitions), 4, 4)
+ intercept[SparkException] {
+ // partitioning doesn't match
+ gridBasedMat.multiply(C2)
+ }
+ val rand = new ju.Random(42)
+ val largerAblocks = for (i <- 0 until 20) yield ((i % 5, i / 5), DenseMatrix.rand(6, 4, rand))
+ val largerBblocks = for (i <- 0 until 16) yield ((i % 4, i / 4), DenseMatrix.rand(4, 4, rand))
+
+ // Try it with increased number of partitions
+ val largeA = new BlockMatrix(sc.parallelize(largerAblocks, 10), 6, 4)
+ val largeB = new BlockMatrix(sc.parallelize(largerBblocks, 8), 4, 4)
+ val largeC = largeA.multiply(largeB)
+ val localC = largeC.toLocalMatrix()
+ val result = largeA.toLocalMatrix().multiply(largeB.toLocalMatrix().asInstanceOf[DenseMatrix])
+ assert(largeC.numRows() === largeA.numRows())
+ assert(largeC.numCols() === largeB.numCols())
+ assert(localC ~== result absTol 1e-8)
+ }
+
test("validate") {
// No error
gridBasedMat.validate()
@@ -201,14 +287,6 @@ class BlockMatrixSuite extends FunSuite with MLlibTestSparkContext {
assert(AT.numCols() === gridBasedMat.numRows())
assert(AT.toBreeze() === expected)
- // partitioner must update as well
- val originalPartitioner = gridBasedMat.partitioner
- val ATpartitioner = AT.partitioner
- assert(originalPartitioner.colsPerPart === ATpartitioner.rowsPerPart)
- assert(originalPartitioner.rowsPerPart === ATpartitioner.colsPerPart)
- assert(originalPartitioner.cols === ATpartitioner.rows)
- assert(originalPartitioner.rows === ATpartitioner.cols)
-
// make sure it works when matrices are cached as well
gridBasedMat.cache()
val AT2 = gridBasedMat.transpose