author     Olivier Girardot <o.girardot@lateral-thoughts.com>  2015-05-07 10:58:35 -0700
committer  Reynold Xin <rxin@databricks.com>                   2015-05-07 10:58:35 -0700
commit     068c3158ac0c66e20d90a45e6a2a0b93108e08d5 (patch)
tree       ee59b88e2f6e31bcaaf699dd397ab52ff9c36ed6 /python
parent     9e2ffb13287e6efe256b8d23a4654e4cc305e20b (diff)
[SPARK-7118] [Python] Add the coalesce Spark SQL function available in PySpark
This patch adds a proxy call from PySpark to the Spark SQL coalesce function; it comes out of a discussion on dev@spark with rxin. This contribution is my original work and I license the work to the project under the project's open source license. Olivier.

Author: Olivier Girardot <o.girardot@lateral-thoughts.com>

Closes #5698 from ogirardot/master and squashes the following commits:

d9a4439 [Olivier Girardot] SPARK-7118 Add the coalesce Spark SQL function available in PySpark
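For context: once merged, the new helper is called like any other pyspark.sql.functions wrapper. A minimal usage sketch, assuming a Spark 1.x driver where SQLContext is the entry point (the data mirrors the doctest in the diff below; nothing here is part of the patch itself):

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import coalesce, lit

sc = SparkContext("local", "coalesce-demo")   # illustrative app name
sqlContext = SQLContext(sc)

df = sqlContext.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b"))

# First non-null value per row across columns a and b.
df.select(coalesce(df["a"], df["b"])).show()

# Same, but fall back to a literal default instead of null.
df.select(coalesce(df["a"], lit(0.0))).show()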
Diffstat (limited to 'python')
-rw-r--r--  python/pyspark/sql/functions.py | 37
1 file changed, 37 insertions(+), 0 deletions(-)
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 274c410a1e..38a043a3c5 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -37,6 +37,7 @@ __all__ = [
     'rand',
     'randn',
     'sparkPartitionId',
+    'coalesce',
     'udf']
@@ -167,6 +168,42 @@ def approxCountDistinct(col, rsd=None):
     return Column(jc)
+def coalesce(*cols):
+    """Returns the first column that is not null.
+
+    >>> cDf = sqlContext.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b"))
+    >>> cDf.show()
+    +----+----+
+    |   a|   b|
+    +----+----+
+    |null|null|
+    |   1|null|
+    |null|   2|
+    +----+----+
+
+    >>> cDf.select(coalesce(cDf["a"], cDf["b"])).show()
+    +-------------+
+    |Coalesce(a,b)|
+    +-------------+
+    |         null|
+    |            1|
+    |            2|
+    +-------------+
+
+    >>> cDf.select('*', coalesce(cDf["a"], lit(0.0))).show()
+    +----+----+---------------+
+    |   a|   b|Coalesce(a,0.0)|
+    +----+----+---------------+
+    |null|null|            0.0|
+    |   1|null|            1.0|
+    |null|   2|            0.0|
+    +----+----+---------------+
+    """
+    sc = SparkContext._active_spark_context
+    jc = sc._jvm.functions.coalesce(_to_seq(sc, cols, _to_java_column))
+    return Column(jc)
+
+
 def countDistinct(col, *cols):
     """Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.