author    Burak Yavuz <brkyvz@gmail.com>     2015-05-01 13:29:17 -0700
committer Reynold Xin <rxin@databricks.com>  2015-05-01 13:29:17 -0700
commit    4dc8d74491b101a794cf8d386d8c5ebc6019b75f
tree      01733e92623635c80a0e3d7b50869b742f3f82a1 /python/pyspark/sql/dataframe.py
parent    7b5dd3e3c0030087eea5a8224789352c03717c1d
[SPARK-7240][SQL] Single pass covariance calculation for dataframes
Added the calculation of covariance between two columns to DataFrames.

cc mengxr rxin

Author: Burak Yavuz <brkyvz@gmail.com>

Closes #5825 from brkyvz/df-cov and squashes the following commits:

cb18046 [Burak Yavuz] changed to sample covariance
f2e862b [Burak Yavuz] fixed failed test
51e39b8 [Burak Yavuz] moved implementation
0c6a759 [Burak Yavuz] addressed math comments
8456eca [Burak Yavuz] fix pyStyle3
aa2ad29 [Burak Yavuz] fix pyStyle2
4e97a50 [Burak Yavuz] Merge branch 'master' of github.com:apache/spark into df-cov
e3b0b85 [Burak Yavuz] addressed comments v0.1
a7115f1 [Burak Yavuz] fix python style
7dc6dbc [Burak Yavuz] reorder imports
408cb77 [Burak Yavuz] initial commit
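The Python side of the patch only adds the binding; the covariance itself is computed on the JVM side via self._jdf.stat().cov(col1, col2). As a rough sketch of what a single-pass sample covariance calculation looks like, here is the standard online co-moment update in Python (the function and variable names below are illustrative and not part of this commit):

    def single_pass_sample_cov(pairs):
        # One pass over (x, y) pairs: update the running means and the running
        # co-moment C = sum((x - mean_x) * (y - mean_y)) incrementally, so no
        # second pass over the data is needed to compute the means first.
        n = 0
        mean_x = 0.0
        mean_y = 0.0
        c = 0.0
        for x, y in pairs:
            n += 1
            dx = x - mean_x          # deviation from the pre-update mean of x
            mean_x += dx / n
            mean_y += (y - mean_y) / n
            c += dx * (y - mean_y)   # (y - mean_y) uses the post-update mean of y
        if n < 2:
            return float("nan")
        return c / (n - 1)           # sample covariance; divide by n for population

A distributed implementation additionally needs a mergeable form of this update so that partial (n, mean_x, mean_y, C) results computed per partition can be combined.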
Diffstat (limited to 'python/pyspark/sql/dataframe.py')
-rw-r--r--  python/pyspark/sql/dataframe.py | 36
1 file changed, 35 insertions(+), 1 deletion(-)
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 5908ebc990..1f08c2df93 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -34,7 +34,8 @@ from pyspark.sql.types import *
from pyspark.sql.types import _create_cls, _parse_datatype_json_string
-__all__ = ["DataFrame", "GroupedData", "Column", "SchemaRDD", "DataFrameNaFunctions"]
+__all__ = ["DataFrame", "GroupedData", "Column", "SchemaRDD", "DataFrameNaFunctions",
+ "DataFrameStatFunctions"]
class DataFrame(object):
@@ -93,6 +94,12 @@ class DataFrame(object):
"""
return DataFrameNaFunctions(self)
+ @property
+ def stat(self):
+ """Returns a :class:`DataFrameStatFunctions` for statistic functions.
+ """
+ return DataFrameStatFunctions(self)
+
@ignore_unicode_prefix
def toJSON(self, use_unicode=True):
"""Converts a :class:`DataFrame` into a :class:`RDD` of string.
@@ -868,6 +875,20 @@ class DataFrame(object):
return DataFrame(self._jdf.na().fill(value, self._jseq(subset)), self.sql_ctx)
+ def cov(self, col1, col2):
+ """
+ Calculate the sample covariance for the given columns, specified by their names, as a
+ double value. :func:`DataFrame.cov` and :func:`DataFrameStatFunctions.cov` are aliases.
+
+ :param col1: The name of the first column
+ :param col2: The name of the second column
+ """
+ if not isinstance(col1, str):
+ raise ValueError("col1 should be a string.")
+ if not isinstance(col2, str):
+ raise ValueError("col2 should be a string.")
+ return self._jdf.stat().cov(col1, col2)
+
@ignore_unicode_prefix
def withColumn(self, colName, col):
"""Returns a new :class:`DataFrame` by adding a column.
@@ -1311,6 +1332,19 @@ class DataFrameNaFunctions(object):
fill.__doc__ = DataFrame.fillna.__doc__
+class DataFrameStatFunctions(object):
+ """Functionality for statistic functions with :class:`DataFrame`.
+ """
+
+ def __init__(self, df):
+ self.df = df
+
+ def cov(self, col1, col2):
+ return self.df.cov(col1, col2)
+
+ cov.__doc__ = DataFrame.cov.__doc__
+
+
def _test():
import doctest
from pyspark.context import SparkContext
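DataFrameStatFunctions follows the same pattern as the existing DataFrameNaFunctions: a thin wrapper that delegates back to the DataFrame method and copies its docstring so both entry points stay documented identically. A minimal standalone sketch of that delegation pattern (the class and method names are illustrative, not from this module):

    class Owner(object):
        def work(self, x):
            """Return x doubled."""
            return x * 2

    class OwnerHelpers(object):
        def __init__(self, owner):
            self.owner = owner

        def work(self, x):
            return self.owner.work(x)

        # Copy the docstring so help(OwnerHelpers.work) matches help(Owner.work).
        work.__doc__ = Owner.work.__doc__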