diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/pyspark/sql/functions.py | 30 |
1 files changed, 30 insertions, 0 deletions
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 031745a1c4..3c134faa0a 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -46,6 +46,8 @@ __all__ = [
     'monotonicallyIncreasingId',
     'rand',
     'randn',
+    'regexp_extract',
+    'regexp_replace',
     'sha1',
     'sha2',
     'sparkPartitionId',
@@ -345,6 +347,34 @@ def levenshtein(left, right):
 
 @ignore_unicode_prefix
 @since(1.5)
+def regexp_extract(str, pattern, idx):
+    r"""Extract the group at index idx matched by a Java regex from the specified string column.
+
+    >>> df = sqlContext.createDataFrame([('100-200',)], ['str'])
+    >>> df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect()
+    [Row(d=u'100')]
+    """
+    sc = SparkContext._active_spark_context
+    jc = sc._jvm.functions.regexp_extract(_to_java_column(str), pattern, idx)
+    return Column(jc)
+
+
+@ignore_unicode_prefix
+@since(1.5)
+def regexp_replace(str, pattern, replacement):
+    r"""Replace all substrings of the specified string value that match pattern with replacement.
+
+    >>> df = sqlContext.createDataFrame([('100-200',)], ['str'])
+    >>> df.select(regexp_replace('str', r'(\d+)', '##').alias('d')).collect()
+    [Row(d=u'##-##')]
+    """
+    sc = SparkContext._active_spark_context
+    jc = sc._jvm.functions.regexp_replace(_to_java_column(str), pattern, replacement)
+    return Column(jc)
+
+
+@ignore_unicode_prefix
+@since(1.5)
 def md5(col):
     """Calculates the MD5 digest and returns the value as a 32 character hex string.
 