diff options
author | Burak Yavuz <brkyvz@gmail.com> | 2015-05-03 21:44:39 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-05-03 21:44:39 -0700 |
commit | 9646018bb4466433521b4e602b808f16e8d0ffdb (patch) | |
tree | fe4df54135dcc0806af5f801c4753d9b88dbcc16 /python/pyspark/sql/dataframe.py | |
parent | 1ffa8cb91f8badf12a8aa190dc25920715a00db7 (diff) | |
download | spark-9646018bb4466433521b4e602b808f16e8d0ffdb.tar.gz spark-9646018bb4466433521b4e602b808f16e8d0ffdb.tar.bz2 spark-9646018bb4466433521b4e602b808f16e8d0ffdb.zip |
[SPARK-7241] Pearson correlation for DataFrames
submitting this PR from a phone, excuse the brevity.
adds Pearson correlation to Dataframes, reusing the covariance calculation code
cc mengxr rxin
Author: Burak Yavuz <brkyvz@gmail.com>
Closes #5858 from brkyvz/df-corr and squashes the following commits:
285b838 [Burak Yavuz] addressed comments v2.0
d10babb [Burak Yavuz] addressed comments v0.2
4b74b24 [Burak Yavuz] Merge branch 'master' of github.com:apache/spark into df-corr
4fe693b [Burak Yavuz] addressed comments v0.1
a682d06 [Burak Yavuz] ready for PR
Diffstat (limited to 'python/pyspark/sql/dataframe.py')
-rw-r--r-- | python/pyspark/sql/dataframe.py | 26 |
1 files changed, 26 insertions, 0 deletions
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 8ddcff8fcd..aac5b8c4c5 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -875,6 +875,27 @@ class DataFrame(object): return DataFrame(self._jdf.na().fill(value, self._jseq(subset)), self.sql_ctx) + def corr(self, col1, col2, method=None): + """ + Calculates the correlation of two columns of a DataFrame as a double value. Currently only + supports the Pearson Correlation Coefficient. + :func:`DataFrame.corr` and :func:`DataFrameStatFunctions.corr` are aliases. + + :param col1: The name of the first column + :param col2: The name of the second column + :param method: The correlation method. Currently only supports "pearson" + """ + if not isinstance(col1, str): + raise ValueError("col1 should be a string.") + if not isinstance(col2, str): + raise ValueError("col2 should be a string.") + if not method: + method = "pearson" + if not method == "pearson": + raise ValueError("Currently only the calculation of the Pearson Correlation " + + "coefficient is supported.") + return self._jdf.stat().corr(col1, col2, method) + def cov(self, col1, col2): """ Calculate the sample covariance for the given columns, specified by their names, as a @@ -1359,6 +1380,11 @@ class DataFrameStatFunctions(object): def __init__(self, df): self.df = df + def corr(self, col1, col2, method=None): + return self.df.corr(col1, col2, method) + + corr.__doc__ = DataFrame.corr.__doc__ + def cov(self, col1, col2): return self.df.cov(col1, col2) |