From 5994cfe81271a39294aa29fd47aa94c99aa56743 Mon Sep 17 00:00:00 2001 From: Nick Pritchard Date: Thu, 8 Oct 2015 22:22:20 -0700 Subject: [SPARK-10875] [MLLIB] Computed covariance matrix should be symmetric Compute upper triangular values of the covariance matrix, then copy to lower triangular values. Author: Nick Pritchard Closes #8940 from pnpritchard/SPARK-10875. --- .../spark/mllib/linalg/distributed/RowMatrix.scala | 6 ++++-- .../mllib/linalg/distributed/RowMatrixSuite.scala | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index 7c7d900af3..b8a7adceb1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -357,9 +357,11 @@ class RowMatrix @Since("1.0.0") ( var alpha = 0.0 while (i < n) { alpha = m / m1 * mean(i) - j = 0 + j = i while (j < n) { - G(i, j) = G(i, j) / m1 - alpha * mean(j) + val Gij = G(i, j) / m1 - alpha * mean(j) + G(i, j) = Gij + G(j, i) = Gij j += 1 } i += 1 diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala index 283ffec1d4..4abb98fb6f 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala @@ -24,6 +24,7 @@ import breeze.linalg.{DenseVector => BDV, DenseMatrix => BDM, norm => brzNorm, s import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Matrices, Vectors, Vector} +import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext} class RowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { @@ -255,6 +256,23 @@ class RowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { assert(closeToZero(abs(expected.r) - abs(rOnly.R.toBreeze.asInstanceOf[BDM[Double]]))) } } + + test("compute covariance") { + for (mat <- Seq(denseMat, sparseMat)) { + val result = mat.computeCovariance() + val expected = breeze.linalg.cov(mat.toBreeze()) + assert(closeToZero(abs(expected) - abs(result.toBreeze.asInstanceOf[BDM[Double]]))) + } + } + + test("covariance matrix is symmetric (SPARK-10875)") { + val rdd = RandomRDDs.normalVectorRDD(sc, 100, 10, 0, 0) + val matrix = new RowMatrix(rdd) + val cov = matrix.computeCovariance() + for (i <- 0 until cov.numRows; j <- 0 until i) { + assert(cov(i, j) === cov(j, i)) + } + } } class RowMatrixClusterSuite extends SparkFunSuite with LocalClusterSparkContext { -- cgit v1.2.3