diff options
author | Sean Owen <sowen@cloudera.com> | 2016-04-30 00:15:41 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2016-04-30 00:15:41 -0700 |
commit | 5886b6217b7ac783ec605e38f5d960048d448976 (patch) | |
tree | 52d9238e7cb89997870aa248502058ebd9ec87bc /mllib/src/test/scala | |
parent | f86f71763c014aa23940510e1e4af5a9244271e6 (diff) | |
download | spark-5886b6217b7ac783ec605e38f5d960048d448976.tar.gz spark-5886b6217b7ac783ec605e38f5d960048d448976.tar.bz2 spark-5886b6217b7ac783ec605e38f5d960048d448976.zip |
[SPARK-14533][MLLIB] RowMatrix.computeCovariance inaccurate when values are very large (partial fix)
## What changes were proposed in this pull request?
Fix for part of SPARK-14533: trivial simplification and more accurate computation of column means. See also https://github.com/apache/spark/pull/12299 which contained a complete fix that was very slow. This PR does _not_ resolve SPARK-14533 entirely.
## How was this patch tested?
Existing tests.
Author: Sean Owen <sowen@cloudera.com>
Closes #12779 from srowen/SPARK-14533.2.
Diffstat (limited to 'mllib/src/test/scala')
-rw-r--r-- | mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala | 22 |
1 file changed, 15 insertions, 7 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala index eaa819c2e6..700f803490 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala @@ -22,6 +22,7 @@ import breeze.linalg.{DenseMatrix => BDM, Matrix => BM} import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.mllib.stat.correlation.{Correlations, PearsonCorrelation, SpearmanCorrelation} import org.apache.spark.mllib.util.MLlibTestSparkContext @@ -42,10 +43,10 @@ class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Log test("corr(x, y) pearson, 1 value in data") { val x = sc.parallelize(Array(1.0)) val y = sc.parallelize(Array(4.0)) - intercept[RuntimeException] { + intercept[IllegalArgumentException] { Statistics.corr(x, y, "pearson") } - intercept[RuntimeException] { + intercept[IllegalArgumentException] { Statistics.corr(x, y, "spearman") } } @@ -127,15 +128,22 @@ class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Log assert(Correlations.getCorrelationFromName("pearson") === pearson) assert(Correlations.getCorrelationFromName("spearman") === spearman) - // Should throw IllegalArgumentException - try { intercept[IllegalArgumentException] { Correlations.getCorrelationFromName("kendall") - assert(false) - } catch { - case ie: IllegalArgumentException => } } + ignore("Pearson correlation of very large uncorrelated values (SPARK-14533)") { + // The two RDDs should have 0 correlation because they're random; + // this should stay the same after shifting them by any amount + // In practice a large shift produces very large values which can reveal + // round-off problems + val a = RandomRDDs.normalRDD(sc, 100000, 10).map(_ + 1000000000.0) + val b = RandomRDDs.normalRDD(sc, 100000, 10).map(_ + 1000000000.0) + val p = Statistics.corr(a, b, method = "pearson") + assert(approxEqual(p, 0.0, 0.01)) + } + def approxEqual(v1: Double, v2: Double, threshold: Double = 1e-6): Boolean = { if (v1.isNaN) { v2.isNaN |