From 3038b26f1e8ad2c4bd90b630005c75e3cd862e1d Mon Sep 17 00:00:00 2001 From: Olivier Girardot Date: Thu, 7 May 2015 10:58:35 -0700 Subject: [SPARK-7118] [Python] Add the coalesce Spark SQL function available in PySpark This patch adds a proxy call from PySpark to the Spark SQL coalesce function and this patch comes out of a discussion on devspark with rxin This contribution is my original work and i license the work to the project under the project's open source license. Olivier. Author: Olivier Girardot Closes #5698 from ogirardot/master and squashes the following commits: d9a4439 [Olivier Girardot] SPARK-7118 Add the coalesce Spark SQL function available in PySpark (cherry picked from commit 068c3158ac0c66e20d90a45e6a2a0b93108e08d5) Signed-off-by: Reynold Xin --- python/pyspark/sql/functions.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'python') diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 274c410a1e..38a043a3c5 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -37,6 +37,7 @@ __all__ = [ 'rand', 'randn', 'sparkPartitionId', + 'coalesce', 'udf'] @@ -167,6 +168,42 @@ def approxCountDistinct(col, rsd=None): return Column(jc) +def coalesce(*cols): + """Returns the first column that is not null. + + >>> cDf = sqlContext.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b")) + >>> cDf.show() + +----+----+ + | a| b| + +----+----+ + |null|null| + | 1|null| + |null| 2| + +----+----+ + + >>> cDf.select(coalesce(cDf["a"], cDf["b"])).show() + +-------------+ + |Coalesce(a,b)| + +-------------+ + | null| + | 1| + | 2| + +-------------+ + + >>> cDf.select('*', coalesce(cDf["a"], lit(0.0))).show() + +----+----+---------------+ + | a| b|Coalesce(a,0.0)| + +----+----+---------------+ + |null|null| 0.0| + | 1|null| 1.0| + |null| 2| 0.0| + +----+----+---------------+ + """ + sc = SparkContext._active_spark_context + jc = sc._jvm.functions.coalesce(_to_seq(sc, cols, _to_java_column)) + return Column(jc) + + def countDistinct(col, *cols): """Returns a new :class:`Column` for distinct count of ``col`` or ``cols``. -- cgit v1.2.3