author     Cheng Lian <lian@databricks.com>    2016-02-21 22:53:15 +0800
committer  Cheng Lian <lian@databricks.com>    2016-02-21 22:53:15 +0800
commit     d9efe63ecdc60a9955f1924de0e8a00bcb6a559d (patch)
tree       218879fda9e2285db67a72ed9fc42ec3ab61afa5 /python/pyspark/sql/functions.py
parent     d806ed34365aa27895547297fff4cc48ecbeacdf (diff)
[SPARK-12799] Simplify various string output for expressions
This PR introduces several major changes:

1. Replace `Expression.prettyString` with `Expression.sql`.

   `prettyString` is mostly an internal, developer-facing facility for debugging purposes and shouldn't be exposed to users.

2. Use the SQL-like representation as the column name for selected fields that are not named expressions (with back-ticks and double quotes removed).

   Previously we used `prettyString` as the column name where possible, and the resulting column names could be odd. A few examples:

   Expression        | `prettyString` | `sql`     | Note
   ------------------ | -------------- | ---------- | ---------------
   `a && b`          | `a && b`       | `a AND b` |
   `a.getField("f")` | `a[f]`         | `a.f`     | `a` is a struct

3. Add a trait `NonSQLExpression`, extending `Expression`, for expressions that have no SQL representation (e.g. Scala UDFs/UDAFs and the Java/Scala object expressions used for encoders).

   `NonSQLExpression.sql` may return an arbitrary user-facing string representation of the expression.

Author: Cheng Lian <lian@databricks.com>

Closes #10757 from liancheng/spark-12799.simplify-expression-string-methods.
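For illustration (an editorial sketch, not part of the patch): the renaming is visible from PySpark, where a column built from an unnamed expression is labelled with its SQL representation. Assuming a live `SparkContext` bound to `sc`, as in the doctests below:

    from pyspark.sql import SQLContext
    from pyspark.sql.functions import coalesce

    sqlContext = SQLContext(sc)  # assumes an existing SparkContext `sc`
    cDf = sqlContext.createDataFrame(
        [(None, None), (1, None), (None, 2)], ('a', 'b'))

    # The generated column name now comes from Expression.sql, so it reads
    # "coalesce(a, b)" (with a space) rather than the old "coalesce(a,b)".
    print(cDf.select(coalesce(cDf['a'], cDf['b'])).columns)
    # ['coalesce(a, b)']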
Diffstat (limited to 'python/pyspark/sql/functions.py')
-rw-r--r--  python/pyspark/sql/functions.py  30
1 file changed, 15 insertions(+), 15 deletions(-)
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 5fc1cc2cae..fdae05d98c 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -223,22 +223,22 @@ def coalesce(*cols):
+----+----+
>>> cDf.select(coalesce(cDf["a"], cDf["b"])).show()
- +-------------+
- |coalesce(a,b)|
- +-------------+
- | null|
- | 1|
- | 2|
- +-------------+
+ +--------------+
+ |coalesce(a, b)|
+ +--------------+
+ | null|
+ | 1|
+ | 2|
+ +--------------+
>>> cDf.select('*', coalesce(cDf["a"], lit(0.0))).show()
- +----+----+---------------+
- | a| b|coalesce(a,0.0)|
- +----+----+---------------+
- |null|null| 0.0|
- | 1|null| 1.0|
- |null| 2| 0.0|
- +----+----+---------------+
+ +----+----+----------------+
+ | a| b|coalesce(a, 0.0)|
+ +----+----+----------------+
+ |null|null| 0.0|
+ | 1|null| 1.0|
+ |null| 2| 0.0|
+ +----+----+----------------+
"""
sc = SparkContext._active_spark_context
jc = sc._jvm.functions.coalesce(_to_seq(sc, cols, _to_java_column))
@@ -1528,7 +1528,7 @@ def array_contains(col, value):
>>> df = sqlContext.createDataFrame([(["a", "b", "c"],), ([],)], ['data'])
>>> df.select(array_contains(df.data, "a")).collect()
- [Row(array_contains(data,a)=True), Row(array_contains(data,a)=False)]
+ [Row(array_contains(data, a)=True), Row(array_contains(data, a)=False)]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.array_contains(_to_java_column(col), value))
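A practical note (editorial, not from the patch): because generated column names change here (e.g. `array_contains(data,a)` becomes `array_contains(data, a)`), any code that referenced such auto-generated names literally would break across this commit. Aliasing the expression gives a stable, explicit name instead; `contains_a` below is a hypothetical label:

    from pyspark.sql.functions import array_contains

    # `df` as in the doctest above, with an array column named `data`.
    rows = df.select(array_contains(df.data, "a").alias('contains_a')).collect()
    # [Row(contains_a=True), Row(contains_a=False)]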