diff options
author | Sean Owen <sowen@cloudera.com> | 2016-08-07 12:20:07 +0100 |
---|---|---|
committer | Sean Owen <sowen@cloudera.com> | 2016-08-07 12:20:07 +0100 |
commit | 8d8725208771a8815a60160a5a30dc6ea87a7e6a (patch) | |
tree | 191042d5279e59aeaadc60f3585c32ebaf5ca3ee /sql | |
parent | bdfab9f942dcad7c1f3de9b6df5c01dee2392055 (diff) | |
download | spark-8d8725208771a8815a60160a5a30dc6ea87a7e6a.tar.gz spark-8d8725208771a8815a60160a5a30dc6ea87a7e6a.tar.bz2 spark-8d8725208771a8815a60160a5a30dc6ea87a7e6a.zip |
[SPARK-16409][SQL] regexp_extract with optional groups causes NPE
## What changes were proposed in this pull request?
regexp_extract incorrectly returns null when the regex matches but the requested optional group does not take part in the match. This change makes it return an empty string in that case, as apparently designed.
## How was this patch tested?
Added a unit test covering a non-matching optional group.
Author: Sean Owen <sowen@cloudera.com>
Closes #14504 from srowen/SPARK-16409.
Diffstat (limited to 'sql')
-rw-r--r-- | sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala | 13 | ||||
-rw-r--r-- | sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala | 8 |
2 files changed, 19 insertions, 2 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index be82b3b8f4..d25da3fd58 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -329,7 +329,12 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio val m = pattern.matcher(s.toString) if (m.find) { val mr: MatchResult = m.toMatchResult - UTF8String.fromString(mr.group(r.asInstanceOf[Int])) + val group = mr.group(r.asInstanceOf[Int]) + if (group == null) { // Pattern matched, but not optional group + UTF8String.EMPTY_UTF8 + } else { + UTF8String.fromString(group) + } } else { UTF8String.EMPTY_UTF8 } @@ -367,7 +372,11 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio ${termPattern}.matcher($subject.toString()); if (${matcher}.find()) { java.util.regex.MatchResult ${matchResult} = ${matcher}.toMatchResult(); - ${ev.value} = UTF8String.fromString(${matchResult}.group($idx)); + if (${matchResult}.group($idx) == null) { + ${ev.value} = UTF8String.EMPTY_UTF8; + } else { + ${ev.value} = UTF8String.fromString(${matchResult}.group($idx)); + } $setEvNotNull } else { ${ev.value} = UTF8String.EMPTY_UTF8; diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 57ca5d9c4d..3b76aaf7d0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -94,6 +94,14 @@ class StringFunctionsSuite extends QueryTest with SharedSQLContext { Row("300", "100") :: Row("400", "100") :: Row("400-400", "100") :: Nil) } + test("non-matching optional group") { + val df = 
Seq("aaaac").toDF("s") + checkAnswer( + df.select(regexp_extract($"s", "(a+)(b)?(c)", 2)), + Row("") + ) + } + test("string ascii function") { val df = Seq(("abc", "")).toDF("a", "b") checkAnswer( |