diff options
author | Jakob Odersky <jakob@odersky.com> | 2016-10-18 15:07:55 -0700 |
---|---|---|
committer | Jakob Odersky <jakob@odersky.com> | 2016-12-09 11:34:52 -0800 |
commit | 8bca7a5e0ec60957c0a847e18f5ed40b763c752e (patch) | |
tree | 39fd4bcb00281b0c8747613f76a61f88d2aa9893 | |
parent | 8a04504a13257f5a4f00908c1a0dd486df60808d (diff) | |
download | spark-8bca7a5e0ec60957c0a847e18f5ed40b763c752e.tar.gz spark-8bca7a5e0ec60957c0a847e18f5ed40b763c752e.tar.bz2 spark-8bca7a5e0ec60957c0a847e18f5ed40b763c752e.zip |
Implement Hive's 'like' behaviour
3 files changed, 36 insertions, 37 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 1504c7c62e..8ee987e5ba 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -70,20 +70,18 @@ trait StringRegexExpression extends ImplicitCastInputTypes { @ExpressionDescription( usage = "str _FUNC_ pattern - Returns true if str matches pattern, " + "null if any arguments are null, false otherwise.", - extended = """ - |The pattern is a string which is matched literally, with exception to the - |following symbols: - | - | _ matches any one character in the input (similar to . in posix - | regular expressions) - | - | % matches zero ore more characters in the input (similar to .* in - | posix regular expressions) - | - |The default escape character is '\\'. Any character after the escape - |character will be matched against literally. - | - |Use RLIKE to match with standard regular expressions.""") + extended = + "The pattern is a string which is matched literally, with exception to the " + + "following special symbols:\n\n" + + " _ matches any one character in the input (similar to . in posix " + + "regular expressions)\n\n" + + " % matches zero or more characters in the input (similar to .* in " + + "posix regular expressions)\n\n" + + "The default escape character is '\\'. If an escape character precedes a special symbol or " + + "another escape character, the following character is matched literally, otherwise the " + + "escape character is treated literally. I.e. 
'\\%' would match '%', whereas '\\a' matches " + + "'\\a'.\n\n" + + "Use RLIKE to match with standard regular expressions.") case class Like(left: Expression, right: Expression) extends BinaryExpression with StringRegexExpression { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala index c03cc4c718..79c61023b4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala @@ -23,27 +23,27 @@ import org.apache.spark.unsafe.types.UTF8String object StringUtils { - // replace the _ with .{1} exactly match 1 time of any character - // replace the % with .*, match 0 or more times with any character + /** Convert 'like' pattern to Java regex. */ def escapeLikeRegex(str: String): String = { - val builder = new StringBuilder() - var escaping = false - for (next <- str) { - if (escaping) { - builder ++= Pattern.quote(Character.toString(next)) - escaping = false - } else if (next == '\\') { - escaping = true - } else { - builder ++= (next match { - case '_' => "." - case '%' => ".*" - case _ => Pattern.quote(Character.toString(next)) - }) - escaping = false + val in = str.toIterator + val out = new StringBuilder() + + while (in.hasNext) { + in.next match { + case '\\' if in.hasNext => + in.next match { + case '\\' => out ++= Pattern.quote("\\") + case '_' => out ++= Pattern.quote("_") + case '%' => out ++= Pattern.quote("%") + // escape before non-escapable character treated literally + case c => out ++= Pattern.quote("\\" + c) + } + case '_' => out ++= "." + case '%' => out ++= ".*" + case c => out ++= Pattern.quote(Character.toString(c)) } } - "(?s)" + builder.result() // (?s) enables dotall mode, causing "." to match new lines + "(?s)" + out.result() // (?s) enables dotall mode, causing "." 
to match new lines } private[this] val trueStrings = Set("t", "true", "y", "yes", "1").map(UTF8String.fromString) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 65c08d90ea..71292dd701 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -62,7 +62,6 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation("a" like "", false) checkEvaluation("" like "a", false) - // SI-17647 double-escaping backslash checkEvaluation("""\\\\""" like """%\\%""", true) // triple quotes to avoid java string escaping checkEvaluation("""%%""" like """%%""", true) @@ -78,8 +77,9 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation("a\u20ACa" like "_€_", true) // scalastyle:on nonascii - // escaping at end position - checkEvaluation("""a\""" like """a\""", false) // TODO: should throw an exception? + // escaping non-escapable should match literally + checkEvaluation("""\a""" like """\a""", true) + checkEvaluation("""a\""" like """a\""", true) // case checkEvaluation("A" like "a%", false) @@ -125,8 +125,9 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation("a\u20ACa" like regEx, true, create_row("_€_")) // scalastyle:on nonascii - // TODO: should throw an exception? - checkEvaluation("""a\""" like regEx, false, create_row("""a\""")) + // escaping non-escapable should match literally + checkEvaluation("""\a""" like regEx, true, create_row("""\a""")) + checkEvaluation("""a\""" like regEx, true, create_row("""a\""")) checkEvaluation("A" like regEx, false, create_row("a%")) checkEvaluation("a" like regEx, false, create_row("A%")) |