From 4dc8d74491b101a794cf8d386d8c5ebc6019b75f Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Fri, 1 May 2015 13:29:17 -0700 Subject: [SPARK-7240][SQL] Single pass covariance calculation for dataframes Added the calculation of covariance between two columns to DataFrames. cc mengxr rxin Author: Burak Yavuz Closes #5825 from brkyvz/df-cov and squashes the following commits: cb18046 [Burak Yavuz] changed to sample covariance f2e862b [Burak Yavuz] fixed failed test 51e39b8 [Burak Yavuz] moved implementation 0c6a759 [Burak Yavuz] addressed math comments 8456eca [Burak Yavuz] fix pyStyle3 aa2ad29 [Burak Yavuz] fix pyStyle2 4e97a50 [Burak Yavuz] Merge branch 'master' of github.com:apache/spark into df-cov e3b0b85 [Burak Yavuz] addressed comments v0.1 a7115f1 [Burak Yavuz] fix python style 7dc6dbc [Burak Yavuz] reorder imports 408cb77 [Burak Yavuz] initial commit --- python/pyspark/sql/tests.py | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'python/pyspark/sql/tests.py') diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 5640bb5ea2..44c8b6a1aa 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -387,6 +387,11 @@ class SQLTests(ReusedPySparkTestCase): self.assertTrue(95 < g.agg(functions.approxCountDistinct(df.key)).first()[0]) self.assertEqual(100, g.agg(functions.countDistinct(df.value)).first()[0]) + def test_cov(self): + df = self.sc.parallelize([Row(a=i, b=2 * i) for i in range(10)]).toDF() + cov = df.stat.cov("a", "b") + self.assertTrue(abs(cov - 55.0 / 3) < 1e-6) + def test_math_functions(self): df = self.sc.parallelize([Row(a=i, b=2 * i) for i in range(10)]).toDF() from pyspark.sql import mathfunctions as functions -- cgit v1.2.3