diff options
author | zhichao.li <zhichao.li@intel.com> | 2015-08-01 08:48:46 -0700 |
---|---|---|
committer | Davies Liu <davies.liu@gmail.com> | 2015-08-01 08:48:46 -0700 |
commit | c5166f7a69faeaa8a41a774c73c1ed4d4c2cf0ce (patch) | |
tree | c1b3ddbb2f8697743cd8e11aaeedcdf80d1adec7 /python/pyspark/sql | |
parent | cf6c9ca32a89422e25007d333bc8714d9b0ae6d8 (diff) | |
download | spark-c5166f7a69faeaa8a41a774c73c1ed4d4c2cf0ce.tar.gz spark-c5166f7a69faeaa8a41a774c73c1ed4d4c2cf0ce.tar.bz2 spark-c5166f7a69faeaa8a41a774c73c1ed4d4c2cf0ce.zip |
[SPARK-8263] [SQL] substr/substring should also support binary type
This is based on #7641, thanks to zhichao-li
Closes #7641
Author: zhichao.li <zhichao.li@intel.com>
Author: Davies Liu <davies@databricks.com>
Closes #7848 from davies/substr and squashes the following commits:
461b709 [Davies Liu] remove bytearry from tests
b45377a [Davies Liu] Merge branch 'master' of github.com:apache/spark into substr
01d795e [zhichao.li] scala style
99aa130 [zhichao.li] add substring to dataframe
4f68bfe [zhichao.li] add binary type support for substring
Diffstat (limited to 'python/pyspark/sql')
-rw-r--r-- | python/pyspark/sql/functions.py | 18 |
1 files changed, 17 insertions, 1 deletions
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 81dc7d832e..96975f54ff 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -64,7 +64,7 @@ __all__ += [
     'year', 'quarter', 'month', 'hour', 'minute', 'second',
     'dayofmonth', 'dayofyear', 'weekofyear']
 
-__all__ += ['soundex']
+__all__ += ['soundex', 'substring', 'substring_index']
 
 
 def _create_function(name, doc=""):
@@ -925,6 +925,22 @@ def trunc(date, format):
 
 @since(1.5)
 @ignore_unicode_prefix
+def substring(str, pos, len):
+    """
+    Substring starts at `pos` and is of length `len` when str is String type or
+    returns the slice of byte array that starts at `pos` in byte and is of length `len`
+    when str is Binary type
+
+    >>> df = sqlContext.createDataFrame([('abcd',)], ['s',])
+    >>> df.select(substring(df.s, 1, 2).alias('s')).collect()
+    [Row(s=u'ab')]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.substring(_to_java_column(str), pos, len))
+
+
+@since(1.5)
+@ignore_unicode_prefix
 def substring_index(str, delim, count):
     """
     Returns the substring from string str before count occurrences of the delimiter delim.