author     gatorsmile <gatorsmile@gmail.com>    2015-11-25 23:24:33 -0800
committer  Reynold Xin <rxin@databricks.com>    2015-11-25 23:24:33 -0800
commit     068b6438d6886ce5b4aa698383866f466d913d66 (patch)
tree       34ff7bc7a9862806007bc2b4f68b83226f54f818 /python
parent     d1930ec01ab5a9d83f801f8ae8d4f15a38d98b76 (diff)
[SPARK-11980][SPARK-10621][SQL] Fix json_tuple and add test cases for
Added Python test cases for the functions `isnan`, `isnull`, `nanvl`, and `json_tuple`, and fixed a bug in the function `json_tuple`.

rxin, could you help me review my changes? Please let me know if anything is missing. Thank you! Have a good Thanksgiving day!

Author: gatorsmile <gatorsmile@gmail.com>

Closes #9977 from gatorsmile/json_tuple.
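The substance of the `json_tuple` fix: the old wrapper was declared `json_tuple(col, fields)` and passed the Python list `fields` straight through Py4J, while the JVM side exposes a Scala varargs signature (`json_tuple(json: Column, fields: String*)`), so the call could not be matched. The new wrapper takes `*fields` and converts them with `_to_seq` before crossing into the JVM. A minimal usage sketch in doctest style, mirroring the tests added below; it assumes a live SparkContext `sc`, as the pyspark doctest harness provides:

>>> from pyspark.sql import SQLContext
>>> from pyspark.sql.functions import json_tuple
>>> sqlContext = SQLContext(sc)  # assumes an already-running SparkContext `sc`
>>> df = sqlContext.createDataFrame(
...     [("1", '{"f1": "value1", "f2": "value2"}')], ("key", "jstring"))
>>> df.select(json_tuple(df.jstring, 'f1', 'f2')).collect()  # varargs, not a list
[Row(c0=u'value1', c1=u'value2')]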
Diffstat (limited to 'python')
-rw-r--r--  python/pyspark/sql/functions.py | 44
1 file changed, 34 insertions(+), 10 deletions(-)
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index e3786e0fa5..90625949f7 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -286,14 +286,6 @@ def countDistinct(col, *cols):
     return Column(jc)
 
 
-@since(1.4)
-def monotonicallyIncreasingId():
-    """
-    .. note:: Deprecated in 1.6, use monotonically_increasing_id instead.
-    """
-    return monotonically_increasing_id()
-
-
 @since(1.6)
 def input_file_name():
     """Creates a string column for the file name of the current Spark task.
@@ -305,6 +297,10 @@ def input_file_name():
 @since(1.6)
 def isnan(col):
     """An expression that returns true iff the column is NaN.
+
+    >>> df = sqlContext.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b"))
+    >>> df.select(isnan("a").alias("r1"), isnan(df.a).alias("r2")).collect()
+    [Row(r1=False, r2=False), Row(r1=True, r2=True)]
     """
     sc = SparkContext._active_spark_context
     return Column(sc._jvm.functions.isnan(_to_java_column(col)))
@@ -313,11 +309,23 @@ def isnan(col):
 @since(1.6)
 def isnull(col):
     """An expression that returns true iff the column is null.
+
+    >>> df = sqlContext.createDataFrame([(1, None), (None, 2)], ("a", "b"))
+    >>> df.select(isnull("a").alias("r1"), isnull(df.a).alias("r2")).collect()
+    [Row(r1=False, r2=False), Row(r1=True, r2=True)]
     """
     sc = SparkContext._active_spark_context
     return Column(sc._jvm.functions.isnull(_to_java_column(col)))
 
 
+@since(1.4)
+def monotonicallyIncreasingId():
+    """
+    .. note:: Deprecated in 1.6, use monotonically_increasing_id instead.
+    """
+    return monotonically_increasing_id()
+
+
 @since(1.6)
 def monotonically_increasing_id():
     """A column that generates monotonically increasing 64-bit integers.
@@ -344,6 +352,10 @@ def nanvl(col1, col2):
     """Returns col1 if it is not NaN, or col2 if col1 is NaN.
 
     Both inputs should be floating point columns (DoubleType or FloatType).
+
+    >>> df = sqlContext.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b"))
+    >>> df.select(nanvl("a", "b").alias("r1"), nanvl(df.a, df.b).alias("r2")).collect()
+    [Row(r1=1.0, r2=1.0), Row(r1=2.0, r2=2.0)]
     """
     sc = SparkContext._active_spark_context
     return Column(sc._jvm.functions.nanvl(_to_java_column(col1), _to_java_column(col2)))
@@ -1460,6 +1472,7 @@ def explode(col):
     return Column(jc)
 
 
+@ignore_unicode_prefix
 @since(1.6)
 def get_json_object(col, path):
     """
@@ -1468,22 +1481,33 @@ def get_json_object(col, path):
 
     :param col: string column in json format
     :param path: path to the json object to extract
+
+    >>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')]
+    >>> df = sqlContext.createDataFrame(data, ("key", "jstring"))
+    >>> df.select(df.key, get_json_object(df.jstring, '$.f1').alias("c0"), \
+                          get_json_object(df.jstring, '$.f2').alias("c1")).collect()
+    [Row(key=u'1', c0=u'value1', c1=u'value2'), Row(key=u'2', c0=u'value12', c1=None)]
     """
     sc = SparkContext._active_spark_context
     jc = sc._jvm.functions.get_json_object(_to_java_column(col), path)
     return Column(jc)
 
 
+@ignore_unicode_prefix
 @since(1.6)
-def json_tuple(col, fields):
+def json_tuple(col, *fields):
     """Creates a new row for a json column according to the given field names.
 
     :param col: string column in json format
     :param fields: list of fields to extract
 
+    >>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')]
+    >>> df = sqlContext.createDataFrame(data, ("key", "jstring"))
+    >>> df.select(df.key, json_tuple(df.jstring, 'f1', 'f2')).collect()
+    [Row(key=u'1', c0=u'value1', c1=u'value2'), Row(key=u'2', c0=u'value12', c1=None)]
     """
     sc = SparkContext._active_spark_context
-    jc = sc._jvm.functions.json_tuple(_to_java_column(col), fields)
+    jc = sc._jvm.functions.json_tuple(_to_java_column(col), _to_seq(sc, fields))
    return Column(jc)
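The one-line change in the body is the crux of the fix: `_to_seq(sc, fields)` converts the Python tuple gathered by `*fields` into a JVM sequence that Py4J can match against the Scala varargs parameter; passing the raw Python collection, as the old code did, finds no matching JVM overload. A self-contained sketch of the calling pattern with hypothetical stand-ins (no Spark required; `jvm_json_tuple` is an invented placeholder for the JVM-side function):

>>> def jvm_json_tuple(col, fields):
...     # stand-in for the JVM varargs function: expects a real sequence
...     return ['%s=%s' % (f, col) for f in fields]
>>> def json_tuple(col, *fields):
...     # fixed wrapper: gather varargs, convert explicitly (as _to_seq does)
...     return jvm_json_tuple(col, list(fields))
>>> json_tuple('jstring', 'f1', 'f2')
['f1=jstring', 'f2=jstring']

The `@ignore_unicode_prefix` decorator added to both functions supports the new doctests: under Python 3 it strips the `u''` prefixes from the expected output in the docstrings, so the same examples pass on Python 2 and 3.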