aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jakob Odersky <jakob@odersky.com> 2016-10-18 15:07:55 -0700
committer Jakob Odersky <jakob@odersky.com> 2016-12-09 11:34:52 -0800
commit 8bca7a5e0ec60957c0a847e18f5ed40b763c752e (patch)
tree 39fd4bcb00281b0c8747613f76a61f88d2aa9893
parent 8a04504a13257f5a4f00908c1a0dd486df60808d (diff)
download spark-8bca7a5e0ec60957c0a847e18f5ed40b763c752e.tar.gz
spark-8bca7a5e0ec60957c0a847e18f5ed40b763c752e.tar.bz2
spark-8bca7a5e0ec60957c0a847e18f5ed40b763c752e.zip
Implement Hive's 'like' behaviour
-rw-r--r-- sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala 26
-rw-r--r-- sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala 36
-rw-r--r-- sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala 11
3 files changed, 36 insertions, 37 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
index 1504c7c62e..8ee987e5ba 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
@@ -70,20 +70,18 @@ trait StringRegexExpression extends ImplicitCastInputTypes {
@ExpressionDescription(
usage = "str _FUNC_ pattern - Returns true if str matches pattern, " +
"null if any arguments are null, false otherwise.",
- extended = """
- |The pattern is a string which is matched literally, with exception to the
- |following symbols:
- |
- | _ matches any one character in the input (similar to . in posix
- | regular expressions)
- |
- | % matches zero ore more characters in the input (similar to .* in
- | posix regular expressions)
- |
- |The default escape character is '\\'. Any character after the escape
- |character will be matched against literally.
- |
- |Use RLIKE to match with standard regular expressions.""")
+ extended =
+ "The pattern is a string which is matched literally, with exception to the " +
+ "following special symbols:\n\n" +
+ " _ matches any one character in the input (similar to . in posix " +
+ "regular expressions)\n\n" +
+ " % matches zero or more characters in the input (similar to .* in " +
+ "posix regular expressions)\n\n" +
+ "The default escape character is '\\'. If an escape character precedes a special symbol or " +
+ "another escape character, the following character is matched literally, otherwise the " +
+ "escape character is treated literally. I.e. '\\%' would match '%', whereas '\\a' matches " +
+ "'\\a'.\n\n" +
+ "Use RLIKE to match with standard regular expressions.")
case class Like(left: Expression, right: Expression)
extends BinaryExpression with StringRegexExpression {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
index c03cc4c718..79c61023b4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
@@ -23,27 +23,27 @@ import org.apache.spark.unsafe.types.UTF8String
object StringUtils {
- // replace the _ with .{1} exactly match 1 time of any character
- // replace the % with .*, match 0 or more times with any character
+ /** Convert 'like' pattern to Java regex. */
def escapeLikeRegex(str: String): String = {
- val builder = new StringBuilder()
- var escaping = false
- for (next <- str) {
- if (escaping) {
- builder ++= Pattern.quote(Character.toString(next))
- escaping = false
- } else if (next == '\\') {
- escaping = true
- } else {
- builder ++= (next match {
- case '_' => "."
- case '%' => ".*"
- case _ => Pattern.quote(Character.toString(next))
- })
- escaping = false
+ val in = str.toIterator
+ val out = new StringBuilder()
+
+ while (in.hasNext) {
+ in.next match {
+ case '\\' if in.hasNext =>
+ in.next match {
+ case '\\' => out ++= Pattern.quote("\\")
+ case '_' => out ++= Pattern.quote("_")
+ case '%' => out ++= Pattern.quote("%")
+ // escape before non-escapable character treated literally
+ case c => out ++= Pattern.quote("\\" + c)
+ }
+ case '_' => out ++= "."
+ case '%' => out ++= ".*"
+ case c => out ++= Pattern.quote(Character.toString(c))
}
}
- "(?s)" + builder.result() // (?s) enables dotall mode, causing "." to match new lines
+ "(?s)" + out.result() // (?s) enables dotall mode, causing "." to match new lines
}
private[this] val trueStrings = Set("t", "true", "y", "yes", "1").map(UTF8String.fromString)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
index 65c08d90ea..71292dd701 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
@@ -62,7 +62,6 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
checkEvaluation("a" like "", false)
checkEvaluation("" like "a", false)
-
// SPARK-17647 double-escaping backslash
checkEvaluation("""\\\\""" like """%\\%""", true) // triple quotes to avoid java string escaping
checkEvaluation("""%%""" like """%%""", true)
@@ -78,8 +77,9 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
checkEvaluation("a\u20ACa" like "_€_", true)
// scalastyle:on nonascii
- // escaping at end position
- checkEvaluation("""a\""" like """a\""", false) // TODO: should throw an exception?
+ // escaping non-escapable should match literally
+ checkEvaluation("""\a""" like """\a""", true)
+ checkEvaluation("""a\""" like """a\""", true)
// case
checkEvaluation("A" like "a%", false)
@@ -125,8 +125,9 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
checkEvaluation("a\u20ACa" like regEx, true, create_row("_€_"))
// scalastyle:on nonascii
- // TODO: should throw an exception?
- checkEvaluation("""a\""" like regEx, false, create_row("""a\"""))
+ // escaping non-escapable should match literally
+ checkEvaluation("""\a""" like regEx, true, create_row("""\a"""))
+ checkEvaluation("""a\""" like regEx, true, create_row("""a\"""))
checkEvaluation("A" like regEx, false, create_row("a%"))
checkEvaluation("a" like regEx, false, create_row("A%"))