diff options
author | felixcheung <felixcheung_m@hotmail.com> | 2015-11-10 15:47:10 -0800 |
---|---|---|
committer | Davies Liu <davies.liu@gmail.com> | 2015-11-10 15:47:10 -0800 |
commit | 32790fe7249b0efe2cbc5c4ee2df0fb687dcd624 (patch) | |
tree | 30a209d8061d0ca7cb3693b43af19a7adee65106 | |
parent | 638c51d9380081b3b8182be2c2460bd53b8b0a4f (diff) | |
download | spark-32790fe7249b0efe2cbc5c4ee2df0fb687dcd624.tar.gz spark-32790fe7249b0efe2cbc5c4ee2df0fb687dcd624.tar.bz2 spark-32790fe7249b0efe2cbc5c4ee2df0fb687dcd624.zip |
[SPARK-11567] [PYTHON] Add Python API for corr Aggregate function
like `df.agg(corr("col1", "col2"))`
davies
Author: felixcheung <felixcheung_m@hotmail.com>
Closes #9536 from felixcheung/pyfunc.
-rw-r--r-- | python/pyspark/sql/functions.py | 16 |
1 files changed, 16 insertions, 0 deletions
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 6e1cbde423..c3da513c13 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -255,6 +255,22 @@ def coalesce(*cols):
     return Column(jc)
 
 
+@since(1.6)
+def corr(col1, col2):
+    """Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1``
+    and ``col2``.
+
+    >>> a = [x * x - 2 * x + 3.5 for x in range(20)]
+    >>> b = range(20)
+    >>> corrDf = sqlContext.createDataFrame(zip(a, b))
+    >>> corrDf = corrDf.agg(corr(corrDf._1, corrDf._2).alias('c'))
+    >>> corrDf.selectExpr('abs(c - 0.9572339139475857) < 1e-16 as t').collect()
+    [Row(t=True)]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.corr(_to_java_column(col1), _to_java_column(col2)))
+
+
 @since(1.3)
 def countDistinct(col, *cols):
     """Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.