diff options
author | Sean Owen <sowen@cloudera.com> | 2016-04-30 00:15:41 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2016-04-30 00:15:41 -0700 |
commit | 5886b6217b7ac783ec605e38f5d960048d448976 (patch) | |
tree | 52d9238e7cb89997870aa248502058ebd9ec87bc /mllib/src/test/scala | |
parent | f86f71763c014aa23940510e1e4af5a9244271e6 (diff) | |
download | spark-5886b6217b7ac783ec605e38f5d960048d448976.tar.gz spark-5886b6217b7ac783ec605e38f5d960048d448976.tar.bz2 spark-5886b6217b7ac783ec605e38f5d960048d448976.zip |
[SPARK-14533][MLLIB] RowMatrix.computeCovariance inaccurate when values are very large (partial fix)
## What changes were proposed in this pull request?
Fix for part of SPARK-14533: trivial simplification and more accurate computation of column means. See also https://github.com/apache/spark/pull/12299 which contained a complete fix that was very slow. This PR does _not_ resolve SPARK-14533 entirely.
## How was this patch tested?
Existing tests.
Author: Sean Owen <sowen@cloudera.com>
Closes #12779 from srowen/SPARK-14533.2.
Diffstat (limited to 'mllib/src/test/scala')
-rw-r--r-- | mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala | 22 |
1 file changed, 15 insertions, 7 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala index eaa819c2e6..700f803490 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala @@ -22,6 +22,7 @@ import breeze.linalg.{DenseMatrix => BDM, Matrix => BM} import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.mllib.stat.correlation.{Correlations, PearsonCorrelation, SpearmanCorrelation} import org.apache.spark.mllib.util.MLlibTestSparkContext @@ -42,10 +43,10 @@ class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Log test("corr(x, y) pearson, 1 value in data") { val x = sc.parallelize(Array(1.0)) val y = sc.parallelize(Array(4.0)) - intercept[RuntimeException] { + intercept[IllegalArgumentException] { Statistics.corr(x, y, "pearson") } - intercept[RuntimeException] { + intercept[IllegalArgumentException] { Statistics.corr(x, y, "spearman") } } @@ -127,15 +128,22 @@ class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Log assert(Correlations.getCorrelationFromName("pearson") === pearson) assert(Correlations.getCorrelationFromName("spearman") === spearman) - // Should throw IllegalArgumentException - try { intercept[IllegalArgumentException] { Correlations.getCorrelationFromName("kendall") - assert(false) - } catch { - case ie: IllegalArgumentException => } } + ignore("Pearson correlation of very large uncorrelated values (SPARK-14533)") { + // The two RDDs should have 0 correlation because they're random; + // this should stay the same after shifting them by any amount + // In practice a large shift produces very large values which can reveal + // round-off problems + val a = RandomRDDs.normalRDD(sc, 100000, 10).map(_ + 1000000000.0) + val b = RandomRDDs.normalRDD(sc, 100000, 10).map(_ + 1000000000.0) + val p = Statistics.corr(a, b, method = "pearson") + assert(approxEqual(p, 0.0, 0.01)) + } + def approxEqual(v1: Double, v2: Double, threshold: Double = 1e-6): Boolean = { if (v1.isNaN) { v2.isNaN |