aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
authorYuhao Yang <hhbyyh@gmail.com>2015-07-30 07:49:10 -0700
committerXiangrui Meng <meng@databricks.com>2015-07-30 07:49:10 -0700
commitd31c618e3c8838f8198556876b9dcbbbf835f7b2 (patch)
treef0a47803e957f5ea4450e496b21a2b03cc3ff59c /mllib
parent6175d6cfe795fbd88e3ee713fac375038a3993a8 (diff)
downloadspark-d31c618e3c8838f8198556876b9dcbbbf835f7b2.tar.gz
spark-d31c618e3c8838f8198556876b9dcbbbf835f7b2.tar.bz2
spark-d31c618e3c8838f8198556876b9dcbbbf835f7b2.zip
[SPARK-7368] [MLLIB] Add QR decomposition for RowMatrix
jira: https://issues.apache.org/jira/browse/SPARK-7368 Add QR decomposition for RowMatrix. I'm not sure what's the blueprint about the distributed Matrix from community and whether this will be a desirable feature , so I sent a prototype for discussion. I'll go on polish the code and provide ut and performance statistics if it's acceptable. The implementation refers to the [paper: https://www.cs.purdue.edu/homes/dgleich/publications/Benson%202013%20-%20direct-tsqr.pdf] Austin R. Benson, David F. Gleich, James Demmel. "Direct QR factorizations for tall-and-skinny matrices in MapReduce architectures", 2013 IEEE International Conference on Big Data, which is a stable algorithm with good scalability. Currently I tried it on a 400000 * 500 rowMatrix (16 partitions) and it can bring down the computation time from 8.8 mins (using breeze.linalg.qr.reduced) to 2.6 mins on a 4 worker cluster. I think there will still be some room for performance improvement. Any trial and suggestion is welcome. Author: Yuhao Yang <hhbyyh@gmail.com> Closes #5909 from hhbyyh/qrDecomposition and squashes the following commits: cec797b [Yuhao Yang] remove unnecessary qr 0fb1012 [Yuhao Yang] hierarchy R computing 3fbdb61 [Yuhao Yang] update qr to indirect and add ut 0d913d3 [Yuhao Yang] Merge remote-tracking branch 'upstream/master' into qrDecomposition 39213c3 [Yuhao Yang] Merge remote-tracking branch 'upstream/master' into qrDecomposition c0fc0c7 [Yuhao Yang] Merge remote-tracking branch 'upstream/master' into qrDecomposition 39b0b22 [Yuhao Yang] initial draft for discussion
Diffstat (limited to 'mllib')
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala8
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala46
-rw-r--r--mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala17
3 files changed, 70 insertions, 1 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala
index 9669c364ba..b416d50a56 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala
@@ -25,3 +25,11 @@ import org.apache.spark.annotation.Experimental
*/
@Experimental
case class SingularValueDecomposition[UType, VType](U: UType, s: Vector, V: VType)
+
+/**
+ * :: Experimental ::
+ * Represents QR factors.
+ */
+@Experimental
+case class QRDecomposition[UType, VType](Q: UType, R: VType)
+
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
index 1626da9c3d..bfc90c9ef8 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala
@@ -22,7 +22,7 @@ import java.util.Arrays
import scala.collection.mutable.ListBuffer
import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, SparseVector => BSV, axpy => brzAxpy,
- svd => brzSvd}
+ svd => brzSvd, MatrixSingularException, inv}
import breeze.numerics.{sqrt => brzSqrt}
import com.github.fommil.netlib.BLAS.{getInstance => blas}
@@ -498,6 +498,50 @@ class RowMatrix(
}
/**
+ * Compute QR decomposition for [[RowMatrix]]. The implementation is designed to optimize the QR
+ * decomposition (factorization) for the [[RowMatrix]] of a tall and skinny shape.
+ * Reference:
+ * Paul G. Constantine, David F. Gleich. "Tall and skinny QR factorizations in MapReduce
+ * architectures" ([[http://dx.doi.org/10.1145/1996092.1996103]])
+ *
+ * @param computeQ whether to computeQ
+ * @return QRDecomposition(Q, R), Q = null if computeQ = false.
+ */
+ def tallSkinnyQR(computeQ: Boolean = false): QRDecomposition[RowMatrix, Matrix] = {
+ val col = numCols().toInt
+ // split rows horizontally into smaller matrices, and compute QR for each of them
+ val blockQRs = rows.glom().map { partRows =>
+ val bdm = BDM.zeros[Double](partRows.length, col)
+ var i = 0
+ partRows.foreach { row =>
+ bdm(i, ::) := row.toBreeze.t
+ i += 1
+ }
+ breeze.linalg.qr.reduced(bdm).r
+ }
+
+ // combine the R part from previous results vertically into a tall matrix
+ val combinedR = blockQRs.treeReduce{ (r1, r2) =>
+ val stackedR = BDM.vertcat(r1, r2)
+ breeze.linalg.qr.reduced(stackedR).r
+ }
+ val finalR = Matrices.fromBreeze(combinedR.toDenseMatrix)
+ val finalQ = if (computeQ) {
+ try {
+ val invR = inv(combinedR)
+ this.multiply(Matrices.fromBreeze(invR))
+ } catch {
+ case err: MatrixSingularException =>
+ logWarning("R is not invertible and return Q as null")
+ null
+ }
+ } else {
+ null
+ }
+ QRDecomposition(finalQ, finalR)
+ }
+
+ /**
* Find all similar columns using the DIMSUM sampling algorithm, described in two papers
*
* http://arxiv.org/abs/1206.2082
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala
index b6cb53d0c7..283ffec1d4 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala
@@ -19,6 +19,7 @@ package org.apache.spark.mllib.linalg.distributed
import scala.util.Random
+import breeze.numerics.abs
import breeze.linalg.{DenseVector => BDV, DenseMatrix => BDM, norm => brzNorm, svd => brzSvd}
import org.apache.spark.SparkFunSuite
@@ -238,6 +239,22 @@ class RowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext {
}
}
}
+
+ test("QR Decomposition") {
+ for (mat <- Seq(denseMat, sparseMat)) {
+ val result = mat.tallSkinnyQR(true)
+ val expected = breeze.linalg.qr.reduced(mat.toBreeze())
+ val calcQ = result.Q
+ val calcR = result.R
+ assert(closeToZero(abs(expected.q) - abs(calcQ.toBreeze())))
+ assert(closeToZero(abs(expected.r) - abs(calcR.toBreeze.asInstanceOf[BDM[Double]])))
+ assert(closeToZero(calcQ.multiply(calcR).toBreeze - mat.toBreeze()))
+ // Decomposition without computing Q
+ val rOnly = mat.tallSkinnyQR(computeQ = false)
+ assert(rOnly.Q == null)
+ assert(closeToZero(abs(expected.r) - abs(rOnly.R.toBreeze.asInstanceOf[BDM[Double]])))
+ }
+ }
}
class RowMatrixClusterSuite extends SparkFunSuite with LocalClusterSparkContext {