From 8bca7a5e0ec60957c0a847e18f5ed40b763c752e Mon Sep 17 00:00:00 2001 From: Jakob Odersky Date: Tue, 18 Oct 2016 15:07:55 -0700 Subject: Implement Hive's 'like' behaviour --- .../catalyst/expressions/regexpExpressions.scala | 26 ++++++++-------- .../spark/sql/catalyst/util/StringUtils.scala | 36 +++++++++++----------- .../expressions/RegexpExpressionsSuite.scala | 11 ++++--- 3 files changed, 36 insertions(+), 37 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 1504c7c62e..8ee987e5ba 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -70,20 +70,18 @@ trait StringRegexExpression extends ImplicitCastInputTypes { @ExpressionDescription( usage = "str _FUNC_ pattern - Returns true if str matches pattern, " + "null if any arguments are null, false otherwise.", - extended = """ - |The pattern is a string which is matched literally, with exception to the - |following symbols: - | - | _ matches any one character in the input (similar to . in posix - | regular expressions) - | - | % matches zero ore more characters in the input (similar to .* in - | posix regular expressions) - | - |The default escape character is '\\'. Any character after the escape - |character will be matched against literally. - | - |Use RLIKE to match with standard regular expressions.""") + extended = + "The pattern is a string which is matched literally, with exception to the " + + "following special symbols:\n\n" + + " _ matches any one character in the input (similar to . in posix " + + "regular expressions)\n\n" + + " % matches zero ore more characters in the input (similar to .* in " + + "posix regular expressions\n\n" + + "The default escape character is '\\'. If an escape character precedes a special symbol or " + + "another escape character, the following character is matched literally, otherwise the " + + "escape character is treated literally. I.e. '\\%' would match '%', whereas '\\a' matches " + + "'\\a'.\n\n" + + "Use RLIKE to match with standard regular expressions.") case class Like(left: Expression, right: Expression) extends BinaryExpression with StringRegexExpression { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala index c03cc4c718..79c61023b4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala @@ -23,27 +23,27 @@ import org.apache.spark.unsafe.types.UTF8String object StringUtils { - // replace the _ with .{1} exactly match 1 time of any character - // replace the % with .*, match 0 or more times with any character + /** Convert 'like' pattern to Java regex. */ def escapeLikeRegex(str: String): String = { - val builder = new StringBuilder() - var escaping = false - for (next <- str) { - if (escaping) { - builder ++= Pattern.quote(Character.toString(next)) - escaping = false - } else if (next == '\\') { - escaping = true - } else { - builder ++= (next match { - case '_' => "." - case '%' => ".*" - case _ => Pattern.quote(Character.toString(next)) - }) - escaping = false + val in = str.toIterator + val out = new StringBuilder() + + while (in.hasNext) { + in.next match { + case '\\' if in.hasNext => + in.next match { + case '\\' => out ++= Pattern.quote("\\") + case '_' => out ++= Pattern.quote("_") + case '%' => out ++= Pattern.quote("%") + // escape before non-escapable character treated literally + case c => out ++= Pattern.quote("\\" + c) + } + case '_' => out ++= "." + case '%' => out ++= ".*" + case c => out ++= Pattern.quote(Character.toString(c)) } } - "(?s)" + builder.result() // (?s) enables dotall mode, causing "." to match new lines + "(?s)" + out.result() // (?s) enables dotall mode, causing "." to match new lines } private[this] val trueStrings = Set("t", "true", "y", "yes", "1").map(UTF8String.fromString) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 65c08d90ea..71292dd701 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -62,7 +62,6 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation("a" like "", false) checkEvaluation("" like "a", false) - // SI-17647 double-escaping backslash checkEvaluation("""\\\\""" like """%\\%""", true) // triple quotes to avoid java string escaping checkEvaluation("""%%""" like """%%""", true) @@ -78,8 +77,9 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation("a\u20ACa" like "_€_", true) // scalastyle:on nonascii - // escaping at end position - checkEvaluation("""a\""" like """a\""", false) // TODO: should throw an exception? + // escaping non-escapable should match literally + checkEvaluation("""\a""" like """\a""", true) + checkEvaluation("""a\""" like """a\""", true) // case checkEvaluation("A" like "a%", false) @@ -125,8 +125,9 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation("a\u20ACa" like regEx, true, create_row("_€_")) // scalastyle:on nonascii - // TODO: should throw an exception? - checkEvaluation("""a\""" like regEx, false, create_row("""a\""")) + // escaping non-escapable should match literally + checkEvaluation("""\a""" like regEx, true, create_row("""\a""")) + checkEvaluation("""a\""" like regEx, true, create_row("""a\""")) checkEvaluation("A" like regEx, false, create_row("a%")) checkEvaluation("a" like regEx, false, create_row("A%")) -- cgit v1.2.3