diff options
author | Burak Yavuz <brkyvz@gmail.com> | 2015-05-03 21:44:39 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-05-03 21:44:39 -0700 |
commit | 9646018bb4466433521b4e602b808f16e8d0ffdb (patch) | |
tree | fe4df54135dcc0806af5f801c4753d9b88dbcc16 /python/pyspark/sql/tests.py | |
parent | 1ffa8cb91f8badf12a8aa190dc25920715a00db7 (diff) | |
download | spark-9646018bb4466433521b4e602b808f16e8d0ffdb.tar.gz spark-9646018bb4466433521b4e602b808f16e8d0ffdb.tar.bz2 spark-9646018bb4466433521b4e602b808f16e8d0ffdb.zip |
[SPARK-7241] Pearson correlation for DataFrames
Submitting this PR from a phone, so please excuse the brevity.
Adds Pearson correlation to DataFrames, reusing the covariance calculation code.
cc mengxr rxin
Author: Burak Yavuz <brkyvz@gmail.com>
Closes #5858 from brkyvz/df-corr and squashes the following commits:
285b838 [Burak Yavuz] addressed comments v2.0
d10babb [Burak Yavuz] addressed comments v0.2
4b74b24 [Burak Yavuz] Merge branch 'master' of github.com:apache/spark into df-corr
4fe693b [Burak Yavuz] addressed comments v0.1
a682d06 [Burak Yavuz] ready for PR
Diffstat (limited to 'python/pyspark/sql/tests.py')
-rw-r--r-- | python/pyspark/sql/tests.py | 6 |
1 file changed, 6 insertions, 0 deletions
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 613efc0ac0..d652c302a5 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -394,6 +394,12 @@ class SQLTests(ReusedPySparkTestCase):
         self.assertTrue(95 < g.agg(functions.approxCountDistinct(df.key)).first()[0])
         self.assertEqual(100, g.agg(functions.countDistinct(df.value)).first()[0])
 
+    def test_corr(self):
+        import math
+        df = self.sc.parallelize([Row(a=i, b=math.sqrt(i)) for i in range(10)]).toDF()
+        corr = df.stat.corr("a", "b")
+        self.assertTrue(abs(corr - 0.95734012) < 1e-6)
+
     def test_cov(self):
         df = self.sc.parallelize([Row(a=i, b=2 * i) for i in range(10)]).toDF()
         cov = df.stat.cov("a", "b")