diff options
author | Nick Pritchard <nicholas.pritchard@falkonry.com> | 2015-10-08 22:22:20 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2015-10-08 22:22:20 -0700 |
commit | 5994cfe81271a39294aa29fd47aa94c99aa56743 (patch) | |
tree | 181ec7bb82ad10fbc8e2dbc1bc98099964ba0f48 | |
parent | 5410747a84e9be1cea44159dfc2216d5e0728ab4 (diff) | |
download | spark-5994cfe81271a39294aa29fd47aa94c99aa56743.tar.gz spark-5994cfe81271a39294aa29fd47aa94c99aa56743.tar.bz2 spark-5994cfe81271a39294aa29fd47aa94c99aa56743.zip |
[SPARK-10875] [MLLIB] Computed covariance matrix should be symmetric
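Compute only the upper-triangular entries of the covariance matrix (including the diagonal), then copy each value to its mirrored lower-triangular position, so that the result is exactly symmetric.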
Author: Nick Pritchard <nicholas.pritchard@falkonry.com>
Closes #8940 from pnpritchard/SPARK-10875.
-rw-r--r-- | mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala | 6 | ||||
-rw-r--r-- | mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala | 18 |
2 files changed, 22 insertions, 2 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index 7c7d900af3..b8a7adceb1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -357,9 +357,11 @@ class RowMatrix @Since("1.0.0") ( var alpha = 0.0 while (i < n) { alpha = m / m1 * mean(i) - j = 0 + j = i while (j < n) { - G(i, j) = G(i, j) / m1 - alpha * mean(j) + val Gij = G(i, j) / m1 - alpha * mean(j) + G(i, j) = Gij + G(j, i) = Gij j += 1 } i += 1 diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala index 283ffec1d4..4abb98fb6f 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala @@ -24,6 +24,7 @@ import breeze.linalg.{DenseVector => BDV, DenseMatrix => BDM, norm => brzNorm, s import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Matrices, Vectors, Vector} +import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext} class RowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { @@ -255,6 +256,23 @@ class RowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { assert(closeToZero(abs(expected.r) - abs(rOnly.R.toBreeze.asInstanceOf[BDM[Double]]))) } } + + test("compute covariance") { + for (mat <- Seq(denseMat, sparseMat)) { + val result = mat.computeCovariance() + val expected = breeze.linalg.cov(mat.toBreeze()) + assert(closeToZero(abs(expected) - abs(result.toBreeze.asInstanceOf[BDM[Double]]))) + } + } + + test("covariance matrix is symmetric (SPARK-10875)") { + val rdd = 
RandomRDDs.normalVectorRDD(sc, 100, 10, 0, 0) + val matrix = new RowMatrix(rdd) + val cov = matrix.computeCovariance() + for (i <- 0 until cov.numRows; j <- 0 until i) { + assert(cov(i, j) === cov(j, i)) + } + } } class RowMatrixClusterSuite extends SparkFunSuite with LocalClusterSparkContext { |