about summary refs log tree commit diff
path: root/python
diff options
context:
space:
mode:
authorzhichao.li <zhichao.li@intel.com>2015-08-01 08:48:46 -0700
committerDavies Liu <davies.liu@gmail.com>2015-08-01 08:48:46 -0700
commitc5166f7a69faeaa8a41a774c73c1ed4d4c2cf0ce (patch)
treec1b3ddbb2f8697743cd8e11aaeedcdf80d1adec7 /python
parentcf6c9ca32a89422e25007d333bc8714d9b0ae6d8 (diff)
downloadspark-c5166f7a69faeaa8a41a774c73c1ed4d4c2cf0ce.tar.gz
spark-c5166f7a69faeaa8a41a774c73c1ed4d4c2cf0ce.tar.bz2
spark-c5166f7a69faeaa8a41a774c73c1ed4d4c2cf0ce.zip
[SPARK-8263] [SQL] substr/substring should also support binary type
This is based on #7641, thanks to zhichao-li Closes #7641 Author: zhichao.li <zhichao.li@intel.com> Author: Davies Liu <davies@databricks.com> Closes #7848 from davies/substr and squashes the following commits: 461b709 [Davies Liu] remove bytearry from tests b45377a [Davies Liu] Merge branch 'master' of github.com:apache/spark into substr 01d795e [zhichao.li] scala style 99aa130 [zhichao.li] add substring to dataframe 4f68bfe [zhichao.li] add binary type support for substring
Diffstat (limited to 'python')
-rw-r--r--  python/pyspark/sql/functions.py | 18
1 file changed, 17 insertions, 1 deletion
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 81dc7d832e..96975f54ff 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -64,7 +64,7 @@ __all__ += [
'year', 'quarter', 'month', 'hour', 'minute', 'second',
'dayofmonth', 'dayofyear', 'weekofyear']
-__all__ += ['soundex']
+__all__ += ['soundex', 'substring', 'substring_index']
def _create_function(name, doc=""):
@@ -925,6 +925,22 @@ def trunc(date, format):
@since(1.5)
@ignore_unicode_prefix
+def substring(str, pos, len):
+ """
+ Substring starts at `pos` and is of length `len` when str is String type or
+ returns the slice of byte array that starts at `pos` in byte and is of length `len`
+ when str is Binary type
+
+ >>> df = sqlContext.createDataFrame([('abcd',)], ['s',])
+ >>> df.select(substring(df.s, 1, 2).alias('s')).collect()
+ [Row(s=u'ab')]
+ """
+ sc = SparkContext._active_spark_context
+ return Column(sc._jvm.functions.substring(_to_java_column(str), pos, len))
+
+
+@since(1.5)
+@ignore_unicode_prefix
def substring_index(str, delim, count):
"""
Returns the substring from string str before count occurrences of the delimiter delim.