path: root/python/pyspark/sql/context.py
author    Cheng Lian <lian@databricks.com>    2016-02-21 22:53:15 +0800
committer Cheng Lian <lian@databricks.com>    2016-02-21 22:53:15 +0800
commit    d9efe63ecdc60a9955f1924de0e8a00bcb6a559d (patch)
tree      218879fda9e2285db67a72ed9fc42ec3ab61afa5 /python/pyspark/sql/context.py
parent    d806ed34365aa27895547297fff4cc48ecbeacdf (diff)
[SPARK-12799] Simplify various string output for expressions
This PR introduces several major changes:

1. Replacing `Expression.prettyString` with `Expression.sql`

   The `prettyString` method is mostly an internal, developer-facing facility for debugging purposes, and shouldn't be exposed to users.

2. Using SQL-like representations as column names for selected fields that are not named expressions (back-ticks and double quotes should be removed)

   Before, we were using `prettyString` as column names when possible, and sometimes the resulting column names could be weird. Here are several examples:

   Expression        | `prettyString` | `sql`     | Note
   ------------------|----------------|-----------|-----------------
   `a && b`          | `a && b`       | `a AND b` |
   `a.getField("f")` | `a[f]`         | `a.f`     | `a` is a struct

3. Adding trait `NonSQLExpression`, extending from `Expression`, for expressions that don't have a SQL representation (e.g. Scala UDF/UDAF and Java/Scala object expressions used for encoders)

   `NonSQLExpression.sql` may return an arbitrary user-facing string representation of the expression.

Author: Cheng Lian <lian@databricks.com>

Closes #10757 from liancheng/spark-12799.simplify-expression-string-methods.
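For a rough sense of how this surfaces in PySpark, here is a minimal doctest-style sketch (not part of this patch; it assumes the usual `sqlContext` from the doctest environment, and the exact rendered names depend on the expression and its column types, as in the updated doctests in the diff below):

>>> df = sqlContext.createDataFrame([(True, False)], ["a", "b"])
>>> df.select(df.a & df.b).columns  # unnamed expression; name now comes from Expression.sql
['(a AND b)']

Before this change the same column would have been named through `prettyString`, e.g. using `&&` rather than `AND`, per the table above.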
Diffstat (limited to 'python/pyspark/sql/context.py')
-rw-r--r--  python/pyspark/sql/context.py  10
1 file changed, 5 insertions, 5 deletions
diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
index 7f6fb410ab..89bf1443a6 100644
--- a/python/pyspark/sql/context.py
+++ b/python/pyspark/sql/context.py
@@ -92,8 +92,8 @@ class SQLContext(object):
>>> df.registerTempTable("allTypes")
>>> sqlContext.sql('select i+1, d+1, not b, list[1], dict["s"], time, row.a '
... 'from allTypes where b and i > 0').collect()
- [Row(_c0=2, _c1=2.0, _c2=False, _c3=2, _c4=0, \
- time=datetime.datetime(2014, 8, 1, 14, 1, 5), a=1)]
+ [Row((i + CAST(1 AS BIGINT))=2, (d + CAST(1 AS DOUBLE))=2.0, (NOT b)=False, list[1]=2, \
+ dict[s]=0, time=datetime.datetime(2014, 8, 1, 14, 1, 5), a=1)]
>>> df.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, x.row.a, x.list)).collect()
[(1, u'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])]
"""
@@ -210,17 +210,17 @@ class SQLContext(object):
>>> sqlContext.registerFunction("stringLengthString", lambda x: len(x))
>>> sqlContext.sql("SELECT stringLengthString('test')").collect()
- [Row(_c0=u'4')]
+ [Row(stringLengthString(test)=u'4')]
>>> from pyspark.sql.types import IntegerType
>>> sqlContext.registerFunction("stringLengthInt", lambda x: len(x), IntegerType())
>>> sqlContext.sql("SELECT stringLengthInt('test')").collect()
- [Row(_c0=4)]
+ [Row(stringLengthInt(test)=4)]
>>> from pyspark.sql.types import IntegerType
>>> sqlContext.udf.register("stringLengthInt", lambda x: len(x), IntegerType())
>>> sqlContext.sql("SELECT stringLengthInt('test')").collect()
- [Row(_c0=4)]
+ [Row(stringLengthInt(test)=4)]
"""
udf = UserDefinedFunction(f, returnType, name)
self._ssql_ctx.udf().registerPython(name, udf._judf)