diff options
author | Burak Yavuz <brkyvz@gmail.com> | 2015-05-01 13:29:17 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-05-01 13:29:17 -0700 |
commit | 4dc8d74491b101a794cf8d386d8c5ebc6019b75f (patch) | |
tree | 01733e92623635c80a0e3d7b50869b742f3f82a1 /python/pyspark/sql/dataframe.py | |
parent | 7b5dd3e3c0030087eea5a8224789352c03717c1d (diff) | |
download | spark-4dc8d74491b101a794cf8d386d8c5ebc6019b75f.tar.gz spark-4dc8d74491b101a794cf8d386d8c5ebc6019b75f.tar.bz2 spark-4dc8d74491b101a794cf8d386d8c5ebc6019b75f.zip |
[SPARK-7240][SQL] Single pass covariance calculation for dataframes
Added the calculation of covariance between two columns to DataFrames.
cc mengxr rxin
Author: Burak Yavuz <brkyvz@gmail.com>
Closes #5825 from brkyvz/df-cov and squashes the following commits:
cb18046 [Burak Yavuz] changed to sample covariance
f2e862b [Burak Yavuz] fixed failed test
51e39b8 [Burak Yavuz] moved implementation
0c6a759 [Burak Yavuz] addressed math comments
8456eca [Burak Yavuz] fix pyStyle3
aa2ad29 [Burak Yavuz] fix pyStyle2
4e97a50 [Burak Yavuz] Merge branch 'master' of github.com:apache/spark into df-cov
e3b0b85 [Burak Yavuz] addressed comments v0.1
a7115f1 [Burak Yavuz] fix python style
7dc6dbc [Burak Yavuz] reorder imports
408cb77 [Burak Yavuz] initial commit
Diffstat (limited to 'python/pyspark/sql/dataframe.py')
-rw-r--r-- | python/pyspark/sql/dataframe.py | 36 |
1 files changed, 35 insertions, 1 deletions
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 5908ebc990..1f08c2df93 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -34,7 +34,8 @@ from pyspark.sql.types import * from pyspark.sql.types import _create_cls, _parse_datatype_json_string -__all__ = ["DataFrame", "GroupedData", "Column", "SchemaRDD", "DataFrameNaFunctions"] +__all__ = ["DataFrame", "GroupedData", "Column", "SchemaRDD", "DataFrameNaFunctions", + "DataFrameStatFunctions"] class DataFrame(object): @@ -93,6 +94,12 @@ class DataFrame(object): """ return DataFrameNaFunctions(self) + @property + def stat(self): + """Returns a :class:`DataFrameStatFunctions` for statistic functions. + """ + return DataFrameStatFunctions(self) + @ignore_unicode_prefix def toJSON(self, use_unicode=True): """Converts a :class:`DataFrame` into a :class:`RDD` of string. @@ -868,6 +875,20 @@ class DataFrame(object): return DataFrame(self._jdf.na().fill(value, self._jseq(subset)), self.sql_ctx) + def cov(self, col1, col2): + """ + Calculate the sample covariance for the given columns, specified by their names, as a + double value. :func:`DataFrame.cov` and :func:`DataFrameStatFunctions.cov` are aliases. + + :param col1: The name of the first column + :param col2: The name of the second column + """ + if not isinstance(col1, str): + raise ValueError("col1 should be a string.") + if not isinstance(col2, str): + raise ValueError("col2 should be a string.") + return self._jdf.stat().cov(col1, col2) + @ignore_unicode_prefix def withColumn(self, colName, col): """Returns a new :class:`DataFrame` by adding a column. @@ -1311,6 +1332,19 @@ class DataFrameNaFunctions(object): fill.__doc__ = DataFrame.fillna.__doc__ +class DataFrameStatFunctions(object): + """Functionality for statistic functions with :class:`DataFrame`. + """ + + def __init__(self, df): + self.df = df + + def cov(self, col1, col2): + return self.df.cov(col1, col2) + + cov.__doc__ = DataFrame.cov.__doc__ + + def _test(): import doctest from pyspark.context import SparkContext |