aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/sql
diff options
context:
space:
mode:
author: Sean Owen <sowen@cloudera.com> 2016-08-10 10:14:43 +0100
committer: Sean Owen <sowen@cloudera.com> 2016-08-10 10:14:43 +0100
commit: 0578ff9681edbaab4ae68f67272dc3d4d890d53b (patch)
tree: d54b571ca32c769dbcfdf116aaf51a178c2785ae /python/pyspark/sql
parent: eca58755fbbc11937b335ad953a3caff89b818e6 (diff)
download: spark-0578ff9681edbaab4ae68f67272dc3d4d890d53b.tar.gz
spark-0578ff9681edbaab4ae68f67272dc3d4d890d53b.tar.bz2
spark-0578ff9681edbaab4ae68f67272dc3d4d890d53b.zip
[SPARK-16324][SQL] regexp_extract should doc that it returns empty string when match fails
## What changes were proposed in this pull request?

Doc that regexp_extract returns empty string when regex or group does not match

## How was this patch tested?

Jenkins test, with a few new test cases

Author: Sean Owen <sowen@cloudera.com>

Closes #14525 from srowen/SPARK-16324.
Diffstat (limited to 'python/pyspark/sql')
-rw-r--r-- python/pyspark/sql/functions.py 6
1 file changed, 5 insertions, 1 deletion
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 8a01805ec8..4ea83e24bb 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -1440,11 +1440,15 @@ def split(str, pattern):
@ignore_unicode_prefix
@since(1.5)
def regexp_extract(str, pattern, idx):
- """Extract a specific(idx) group identified by a java regex, from the specified string column.
+ """Extract a specific group matched by a Java regex, from the specified string column.
+ If the regex did not match, or the specified group did not match, an empty string is returned.
>>> df = spark.createDataFrame([('100-200',)], ['str'])
>>> df.select(regexp_extract('str', '(\d+)-(\d+)', 1).alias('d')).collect()
[Row(d=u'100')]
+ >>> df = spark.createDataFrame([('foo',)], ['str'])
+ >>> df.select(regexp_extract('str', '(\d+)', 1).alias('d')).collect()
+ [Row(d=u'')]
>>> df = spark.createDataFrame([('aaaac',)], ['str'])
>>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect()
[Row(d=u'')]