aboutsummaryrefslogtreecommitdiff
path: root/mllib/src/test/scala
diff options
context:
space:
mode:
authorSean Owen <sowen@cloudera.com>2016-04-30 00:15:41 -0700
committerXiangrui Meng <meng@databricks.com>2016-04-30 00:15:41 -0700
commit5886b6217b7ac783ec605e38f5d960048d448976 (patch)
tree52d9238e7cb89997870aa248502058ebd9ec87bc /mllib/src/test/scala
parentf86f71763c014aa23940510e1e4af5a9244271e6 (diff)
downloadspark-5886b6217b7ac783ec605e38f5d960048d448976.tar.gz
spark-5886b6217b7ac783ec605e38f5d960048d448976.tar.bz2
spark-5886b6217b7ac783ec605e38f5d960048d448976.zip
[SPARK-14533][MLLIB] RowMatrix.computeCovariance inaccurate when values are very large (partial fix)
## What changes were proposed in this pull request? Fix for part of SPARK-14533: trivial simplification and more accurate computation of column means. See also https://github.com/apache/spark/pull/12299 which contained a complete fix that was very slow. This PR does _not_ resolve SPARK-14533 entirely. ## How was this patch tested? Existing tests. Author: Sean Owen <sowen@cloudera.com> Closes #12779 from srowen/SPARK-14533.2.
Diffstat (limited to 'mllib/src/test/scala')
-rw-r--r--mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala22
1 file changed, 15 insertions, 7 deletions
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala
index eaa819c2e6..700f803490 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/CorrelationSuite.scala
@@ -22,6 +22,7 @@ import breeze.linalg.{DenseMatrix => BDM, Matrix => BM}
import org.apache.spark.SparkFunSuite
import org.apache.spark.internal.Logging
import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.mllib.stat.correlation.{Correlations, PearsonCorrelation,
SpearmanCorrelation}
import org.apache.spark.mllib.util.MLlibTestSparkContext
@@ -42,10 +43,10 @@ class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Log
test("corr(x, y) pearson, 1 value in data") {
val x = sc.parallelize(Array(1.0))
val y = sc.parallelize(Array(4.0))
- intercept[RuntimeException] {
+ intercept[IllegalArgumentException] {
Statistics.corr(x, y, "pearson")
}
- intercept[RuntimeException] {
+ intercept[IllegalArgumentException] {
Statistics.corr(x, y, "spearman")
}
}
@@ -127,15 +128,22 @@ class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Log
assert(Correlations.getCorrelationFromName("pearson") === pearson)
assert(Correlations.getCorrelationFromName("spearman") === spearman)
- // Should throw IllegalArgumentException
- try {
+ intercept[IllegalArgumentException] {
Correlations.getCorrelationFromName("kendall")
- assert(false)
- } catch {
- case ie: IllegalArgumentException =>
}
}
+ ignore("Pearson correlation of very large uncorrelated values (SPARK-14533)") {
+ // The two RDDs should have 0 correlation because they're random;
+  // this should stay the same after shifting them by any amount.
+ // In practice a large shift produces very large values which can reveal
+ // round-off problems
+ val a = RandomRDDs.normalRDD(sc, 100000, 10).map(_ + 1000000000.0)
+ val b = RandomRDDs.normalRDD(sc, 100000, 10).map(_ + 1000000000.0)
+ val p = Statistics.corr(a, b, method = "pearson")
+ assert(approxEqual(p, 0.0, 0.01))
+ }
+
def approxEqual(v1: Double, v2: Double, threshold: Double = 1e-6): Boolean = {
if (v1.isNaN) {
v2.isNaN