diff options
author: Reza Zadeh <rizlar@gmail.com>, 2014-03-20 10:39:20 -0700
committer: Matei Zaharia <matei@databricks.com>, 2014-03-20 10:39:20 -0700
commit: 66a03e5fe0167f590d150e099b15902e826a188f (patch)
tree: dcefdd0a43f1fdb80b13a929ec13aec96d4682cb /mllib/src/test
parent: ffe272d97c22955fe7744b1c0132cd9877b6df96 (diff)
download: spark-66a03e5fe0167f590d150e099b15902e826a188f.tar.gz spark-66a03e5fe0167f590d150e099b15902e826a188f.tar.bz2 spark-66a03e5fe0167f590d150e099b15902e826a188f.zip
Principal Component Analysis
# Principal Component Analysis
Computes the top k principal component coefficients for the m-by-n data matrix X. Rows of X correspond to observations and columns correspond to variables. The returned coefficient matrix is n-by-k: each of its columns contains the coefficients for one principal component, and the columns are in descending order of component variance. This function centers the data and uses the singular value decomposition (SVD) algorithm.
## Testing
Tests included:
* All principal components
* Only top k principal components
* Dense SVD tests
* Dense/sparse matrix tests
The results are tested against MATLAB's pca: http://www.mathworks.com/help/stats/pca.html
## Documentation
Added to mllib-guide.md
## Example Usage
Added to examples directory under SparkPCA.scala
Author: Reza Zadeh <rizlar@gmail.com>
Closes #88 from rezazadeh/sparkpca and squashes the following commits:
e298700 [Reza Zadeh] reformat using IDE
3f23271 [Reza Zadeh] documentation and cleanup
b025ab2 [Reza Zadeh] documentation
e2667d4 [Reza Zadeh] assertMatrixApproximatelyEquals
3787bb4 [Reza Zadeh] stylin
c6ecc1f [Reza Zadeh] docs
aa2bbcb [Reza Zadeh] rename sparseToTallSkinnyDense
56975b0 [Reza Zadeh] docs
2df9bde [Reza Zadeh] docs update
8fb0015 [Reza Zadeh] rcond documentation
dbf7797 [Reza Zadeh] correct argument number
a9f1f62 [Reza Zadeh] documentation
4ce6caa [Reza Zadeh] style changes
9a56a02 [Reza Zadeh] use rcond relative to larget svalue
120f796 [Reza Zadeh] housekeeping
156ff78 [Reza Zadeh] string comprehension
2e1cf43 [Reza Zadeh] rename rcond
ea223a6 [Reza Zadeh] many style changes
f4002d7 [Reza Zadeh] more docs
bd53c7a [Reza Zadeh] proper accumulator
a8b5ecf [Reza Zadeh] Don't use for loops
0dc7980 [Reza Zadeh] filter zeros in sparse
6115610 [Reza Zadeh] More documentation
36d51e8 [Reza Zadeh] use JBLAS for UVS^-1 computation
bc4599f [Reza Zadeh] configurable rcond
86f7515 [Reza Zadeh] compute per parition, use while
09726b3 [Reza Zadeh] more style changes
4195e69 [Reza Zadeh] private, accumulator
17002be [Reza Zadeh] style changes
4ba7471 [Reza Zadeh] style change
f4982e6 [Reza Zadeh] Use dense matrix in example
2828d28 [Reza Zadeh] optimizations: normalize once, use inplace ops
72c9fa1 [Reza Zadeh] rename DenseMatrix to TallSkinnyDenseMatrix, lean
f807be9 [Reza Zadeh] fix typo
2d7ccde [Reza Zadeh] Array interface for dense svd and pca
cd290fa [Reza Zadeh] provide RDD[Array[Double]] support
398d123 [Reza Zadeh] style change
55abbfa [Reza Zadeh] docs fix
ef29644 [Reza Zadeh] bad chnage undo
472566e [Reza Zadeh] all files from old pr
555168f [Reza Zadeh] initial files
Diffstat (limited to 'mllib/src/test')
-rw-r--r-- mllib/src/test/scala/org/apache/spark/mllib/linalg/PCASuite.scala | 124
-rw-r--r-- mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala | 98
2 files changed, 191 insertions, 31 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/PCASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/PCASuite.scala new file mode 100644 index 0000000000..5e5086b1bf --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/PCASuite.scala @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.mllib.linalg + +import scala.util.Random + +import org.scalatest.BeforeAndAfterAll +import org.scalatest.FunSuite + +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ +import org.apache.spark.rdd.RDD + +import org.apache.spark.mllib.util._ + +import org.jblas._ + +class PCASuite extends FunSuite with BeforeAndAfterAll { + @transient private var sc: SparkContext = _ + + override def beforeAll() { + sc = new SparkContext("local", "test") + } + + override def afterAll() { + sc.stop() + System.clearProperty("spark.driver.port") + } + + val EPSILON = 1e-3 + + // Return jblas matrix from sparse matrix RDD + def getDenseMatrix(matrix: SparseMatrix) : DoubleMatrix = { + val data = matrix.data + val ret = DoubleMatrix.zeros(matrix.m, matrix.n) + matrix.data.collect().map(x => ret.put(x.i, x.j, x.mval)) + ret + } + + def assertMatrixApproximatelyEquals(a: DoubleMatrix, b: DoubleMatrix) { + assert(a.rows == b.rows && a.columns == b.columns, + "dimension mismatch: $a.rows vs $b.rows and $a.columns vs $b.columns") + for (i <- 0 until a.columns) { + val aCol = a.getColumn(i) + val bCol = b.getColumn(i) + val diff = Math.min(aCol.sub(bCol).norm1, aCol.add(bCol).norm1) + assert(diff < EPSILON, "matrix mismatch: " + diff) + } + } + + test("full rank matrix pca") { + val m = 5 + val n = 3 + val dataArr = Array.tabulate(m,n){ (a, b) => + MatrixEntry(a, b, Math.sin(a + b + a * b)) }.flatten + val data = sc.makeRDD(dataArr, 3) + val a = LAUtils.sparseToTallSkinnyDense(SparseMatrix(data, m, n)) + + val realPCAArray = Array((0,0,-0.2579), (0,1,-0.6602), (0,2,0.7054), + (1,0,-0.1448), (1,1,0.7483), (1,2,0.6474), + (2,0,0.9553), (2,1,-0.0649), (2,2,0.2886)) + val realPCA = sc.makeRDD(realPCAArray.map(x => MatrixEntry(x._1, x._2, x._3)), 3) + + val coeffs = new DoubleMatrix(new PCA().setK(n).compute(a)) + + assertMatrixApproximatelyEquals(getDenseMatrix(SparseMatrix(realPCA,n,n)), coeffs) + } + + test("sparse matrix full rank 
matrix pca") { + val m = 5 + val n = 3 + // the entry that gets dropped is zero to test sparse support + val dataArr = Array.tabulate(m,n){ (a, b) => + MatrixEntry(a, b, Math.sin(a + b + a * b)) }.flatten.drop(1) + val data = sc.makeRDD(dataArr, 3) + val a = LAUtils.sparseToTallSkinnyDense(SparseMatrix(data, m, n)) + + val realPCAArray = Array((0,0,-0.2579), (0,1,-0.6602), (0,2,0.7054), + (1,0,-0.1448), (1,1,0.7483), (1,2,0.6474), + (2,0,0.9553), (2,1,-0.0649), (2,2,0.2886)) + val realPCA = sc.makeRDD(realPCAArray.map(x => MatrixEntry(x._1, x._2, x._3))) + + val coeffs = new DoubleMatrix(new PCA().setK(n).compute(a)) + + assertMatrixApproximatelyEquals(getDenseMatrix(SparseMatrix(realPCA,n,n)), coeffs) + } + + test("truncated matrix pca") { + val m = 5 + val n = 3 + val dataArr = Array.tabulate(m,n){ (a, b) => + MatrixEntry(a, b, Math.sin(a + b + a * b)) }.flatten + + val data = sc.makeRDD(dataArr, 3) + val a = LAUtils.sparseToTallSkinnyDense(SparseMatrix(data, m, n)) + + val realPCAArray = Array((0,0,-0.2579), (0,1,-0.6602), + (1,0,-0.1448), (1,1,0.7483), + (2,0,0.9553), (2,1,-0.0649)) + val realPCA = sc.makeRDD(realPCAArray.map(x => MatrixEntry(x._1, x._2, x._3))) + + val k = 2 + val coeffs = new DoubleMatrix(new PCA().setK(k).compute(a)) + + assertMatrixApproximatelyEquals(getDenseMatrix(SparseMatrix(realPCA,n,k)), coeffs) + } +} + + diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala index a92386865a..20e2b0f84b 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/SVDSuite.scala @@ -28,6 +28,8 @@ import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD +import org.apache.spark.mllib.util._ + import org.jblas._ class SVDSuite extends FunSuite with BeforeAndAfterAll { @@ -54,43 +56,77 @@ class SVDSuite extends FunSuite with 
BeforeAndAfterAll { ret } - def assertMatrixEquals(a: DoubleMatrix, b: DoubleMatrix) { - assert(a.rows == b.rows && a.columns == b.columns, "dimension mismatch") - val diff = DoubleMatrix.zeros(a.rows, a.columns) - Array.tabulate(a.rows, a.columns){(i, j) => - diff.put(i, j, - Math.min(Math.abs(a.get(i, j) - b.get(i, j)), - Math.abs(a.get(i, j) + b.get(i, j)))) } - assert(diff.norm1 < EPSILON, "matrix mismatch: " + diff.norm1) + def assertMatrixApproximatelyEquals(a: DoubleMatrix, b: DoubleMatrix) { + assert(a.rows == b.rows && a.columns == b.columns, + "dimension mismatch: $a.rows vs $b.rows and $a.columns vs $b.columns") + for (i <- 0 until a.columns) { + val aCol = a.getColumn(i) + val bCol = b.getColumn(i) + val diff = Math.min(aCol.sub(bCol).norm1, aCol.add(bCol).norm1) + assert(diff < EPSILON, "matrix mismatch: " + diff) + } } test("full rank matrix svd") { val m = 10 val n = 3 - val data = sc.makeRDD(Array.tabulate(m,n){ (a, b) => - MatrixEntry(a, b, (a + 2).toDouble * (b + 1) / (1 + a + b)) }.flatten ) + val datarr = Array.tabulate(m,n){ (a, b) => + MatrixEntry(a, b, (a + 2).toDouble * (b + 1) / (1 + a + b)) }.flatten + val data = sc.makeRDD(datarr, 3) val a = SparseMatrix(data, m, n) - val decomposed = SVD.sparseSVD(a, n) + val decomposed = new SVD().setK(n).compute(a) val u = decomposed.U val v = decomposed.V val s = decomposed.S - val densea = getDenseMatrix(a) - val svd = Singular.sparseSVD(densea) + val denseA = getDenseMatrix(a) + val svd = Singular.sparseSVD(denseA) val retu = getDenseMatrix(u) val rets = getDenseMatrix(s) val retv = getDenseMatrix(v) - + + + // check individual decomposition + assertMatrixApproximatelyEquals(retu, svd(0)) + assertMatrixApproximatelyEquals(rets, DoubleMatrix.diag(svd(1))) + assertMatrixApproximatelyEquals(retv, svd(2)) + + // check multiplication guarantee + assertMatrixApproximatelyEquals(retu.mmul(rets).mmul(retv.transpose), denseA) + } + + test("dense full rank matrix svd") { + val m = 10 + val n = 3 + val datarr 
= Array.tabulate(m,n){ (a, b) => + MatrixEntry(a, b, (a + 2).toDouble * (b + 1) / (1 + a + b)) }.flatten + val data = sc.makeRDD(datarr, 3) + + val a = LAUtils.sparseToTallSkinnyDense(SparseMatrix(data, m, n)) + + val decomposed = new SVD().setK(n).setComputeU(true).compute(a) + val u = LAUtils.denseToSparse(decomposed.U) + val v = decomposed.V + val s = decomposed.S + + val denseA = getDenseMatrix(LAUtils.denseToSparse(a)) + val svd = Singular.sparseSVD(denseA) + + val retu = getDenseMatrix(u) + val rets = DoubleMatrix.diag(new DoubleMatrix(s)) + val retv = new DoubleMatrix(v) + + // check individual decomposition - assertMatrixEquals(retu, svd(0)) - assertMatrixEquals(rets, DoubleMatrix.diag(svd(1))) - assertMatrixEquals(retv, svd(2)) + assertMatrixApproximatelyEquals(retu, svd(0)) + assertMatrixApproximatelyEquals(rets, DoubleMatrix.diag(svd(1))) + assertMatrixApproximatelyEquals(retv, svd(2)) // check multiplication guarantee - assertMatrixEquals(retu.mmul(rets).mmul(retv.transpose), densea) + assertMatrixApproximatelyEquals(retu.mmul(rets).mmul(retv.transpose), denseA) } test("rank one matrix svd") { @@ -102,7 +138,7 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll { val a = SparseMatrix(data, m, n) - val decomposed = SVD.sparseSVD(a, k) + val decomposed = new SVD().setK(k).compute(a) val u = decomposed.U val s = decomposed.S val v = decomposed.V @@ -110,20 +146,20 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll { assert(retrank == 1, "rank returned not one") - val densea = getDenseMatrix(a) - val svd = Singular.sparseSVD(densea) + val denseA = getDenseMatrix(a) + val svd = Singular.sparseSVD(denseA) val retu = getDenseMatrix(u) val rets = getDenseMatrix(s) val retv = getDenseMatrix(v) // check individual decomposition - assertMatrixEquals(retu, svd(0).getColumn(0)) - assertMatrixEquals(rets, DoubleMatrix.diag(svd(1).getRow(0))) - assertMatrixEquals(retv, svd(2).getColumn(0)) + assertMatrixApproximatelyEquals(retu, svd(0).getColumn(0)) + 
assertMatrixApproximatelyEquals(rets, DoubleMatrix.diag(svd(1).getRow(0))) + assertMatrixApproximatelyEquals(retv, svd(2).getColumn(0)) // check multiplication guarantee - assertMatrixEquals(retu.mmul(rets).mmul(retv.transpose), densea) + assertMatrixApproximatelyEquals(retu.mmul(rets).mmul(retv.transpose), denseA) } test("truncated with k") { @@ -135,14 +171,14 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll { val k = 1 // only one svalue above this - val decomposed = SVD.sparseSVD(a, k) + val decomposed = new SVD().setK(k).compute(a) val u = decomposed.U val s = decomposed.S val v = decomposed.V val retrank = s.data.collect().length - val densea = getDenseMatrix(a) - val svd = Singular.sparseSVD(densea) + val denseA = getDenseMatrix(a) + val svd = Singular.sparseSVD(denseA) val retu = getDenseMatrix(u) val rets = getDenseMatrix(s) @@ -151,8 +187,8 @@ class SVDSuite extends FunSuite with BeforeAndAfterAll { assert(retrank == 1, "rank returned not one") // check individual decomposition - assertMatrixEquals(retu, svd(0).getColumn(0)) - assertMatrixEquals(rets, DoubleMatrix.diag(svd(1).getRow(0))) - assertMatrixEquals(retv, svd(2).getColumn(0)) + assertMatrixApproximatelyEquals(retu, svd(0).getColumn(0)) + assertMatrixApproximatelyEquals(rets, DoubleMatrix.diag(svd(1).getRow(0))) + assertMatrixApproximatelyEquals(retv, svd(2).getColumn(0)) } } |