diff options
author | Olivier Girardot <o.girardot@lateral-thoughts.com> | 2015-05-07 10:58:35 -0700 |
---|---|---|
committer | Reynold Xin <rxin@databricks.com> | 2015-05-07 10:58:47 -0700 |
commit | 3038b26f1e8ad2c4bd90b630005c75e3cd862e1d (patch) | |
tree | 24d6352a5ccd079b60ae946c888eec6a6918994e /python | |
parent | 6b9737a830d505accd1922970e44558a70bc8329 (diff) | |
download | spark-3038b26f1e8ad2c4bd90b630005c75e3cd862e1d.tar.gz spark-3038b26f1e8ad2c4bd90b630005c75e3cd862e1d.tar.bz2 spark-3038b26f1e8ad2c4bd90b630005c75e3cd862e1d.zip |
[SPARK-7118] [Python] Add the coalesce Spark SQL function available in PySpark
This patch adds a proxy call from PySpark to the Spark SQL coalesce function; it comes out of a discussion on the Spark dev mailing list (dev@spark) with rxin.
This contribution is my original work and I license the work to the project under the project's open source license.
Olivier.
Author: Olivier Girardot <o.girardot@lateral-thoughts.com>
Closes #5698 from ogirardot/master and squashes the following commits:
d9a4439 [Olivier Girardot] SPARK-7118 Add the coalesce Spark SQL function available in PySpark
(cherry picked from commit 068c3158ac0c66e20d90a45e6a2a0b93108e08d5)
Signed-off-by: Reynold Xin <rxin@databricks.com>
Diffstat (limited to 'python')
-rw-r--r-- | python/pyspark/sql/functions.py | 37 |
1 file changed, 37 insertions, 0 deletions
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 274c410a1e..38a043a3c5 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -37,6 +37,7 @@ __all__ = [ 'rand', 'randn', 'sparkPartitionId', + 'coalesce', 'udf'] @@ -167,6 +168,42 @@ def approxCountDistinct(col, rsd=None): return Column(jc) +def coalesce(*cols): + """Returns the first column that is not null. + + >>> cDf = sqlContext.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b")) + >>> cDf.show() + +----+----+ + | a| b| + +----+----+ + |null|null| + | 1|null| + |null| 2| + +----+----+ + + >>> cDf.select(coalesce(cDf["a"], cDf["b"])).show() + +-------------+ + |Coalesce(a,b)| + +-------------+ + | null| + | 1| + | 2| + +-------------+ + + >>> cDf.select('*', coalesce(cDf["a"], lit(0.0))).show() + +----+----+---------------+ + | a| b|Coalesce(a,0.0)| + +----+----+---------------+ + |null|null| 0.0| + | 1|null| 1.0| + |null| 2| 0.0| + +----+----+---------------+ + """ + sc = SparkContext._active_spark_context + jc = sc._jvm.functions.coalesce(_to_seq(sc, cols, _to_java_column)) + return Column(jc) + + def countDistinct(col, *cols): """Returns a new :class:`Column` for distinct count of ``col`` or ``cols``. |