aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorfelixcheung <felixcheung_m@hotmail.com>2015-11-10 15:47:10 -0800
committerDavies Liu <davies.liu@gmail.com>2015-11-10 15:47:10 -0800
commit32790fe7249b0efe2cbc5c4ee2df0fb687dcd624 (patch)
tree30a209d8061d0ca7cb3693b43af19a7adee65106 /python
parent638c51d9380081b3b8182be2c2460bd53b8b0a4f (diff)
downloadspark-32790fe7249b0efe2cbc5c4ee2df0fb687dcd624.tar.gz
spark-32790fe7249b0efe2cbc5c4ee2df0fb687dcd624.tar.bz2
spark-32790fe7249b0efe2cbc5c4ee2df0fb687dcd624.zip
[SPARK-11567] [PYTHON] Add Python API for corr Aggregate function
like `df.agg(corr("col1", "col2"))` davies Author: felixcheung <felixcheung_m@hotmail.com> Closes #9536 from felixcheung/pyfunc.
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/sql/functions.py16
1 files changed, 16 insertions, 0 deletions
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 6e1cbde423..c3da513c13 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -255,6 +255,22 @@ def coalesce(*cols):
return Column(jc)
+@since(1.6)
+def corr(col1, col2):
+ """Returns a new :class:`Column` for the Pearson Correlation Coefficient of ``col1``
+ and ``col2``; both arguments are converted with ``_to_java_column`` before the call.
+
+ >>> a = [x * x - 2 * x + 3.5 for x in range(20)]
+ >>> b = range(20)
+ >>> corrDf = sqlContext.createDataFrame(zip(a, b))
+ >>> corrDf = corrDf.agg(corr(corrDf._1, corrDf._2).alias('c'))
+ >>> corrDf.selectExpr('abs(c - 0.9572339139475857) < 1e-16 as t').collect()
+ [Row(t=True)]
+ """
+ sc = SparkContext._active_spark_context  # the JVM call below goes through this context
+ return Column(sc._jvm.functions.corr(_to_java_column(col1), _to_java_column(col2)))
+
+
@since(1.3)
def countDistinct(col, *cols):
"""Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.