aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jakob Odersky <jakob@odersky.com> 2016-10-24 17:07:14 -0700
committer Jakob Odersky <jakob@odersky.com> 2016-12-09 11:34:53 -0800
commit dbd305c195ba1c1661f82e3e19085778b8783794 (patch)
tree 2601fc24126f5ddbbae3f2d6e1510b08cf3d8327
parent ba6d0584e4a6417cd9e130d831219cccc194f791 (diff)
download spark-SPARK-17647.tar.gz
spark-SPARK-17647.tar.bz2
spark-SPARK-17647.zip
Refactor tests and add documentation [SPARK-17647]
-rw-r--r-- sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala 34
-rw-r--r-- sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala 18
-rw-r--r-- sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala 187
3 files changed, 111 insertions, 128 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
index 3df6effb6f..0325d0e837 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
@@ -70,18 +70,28 @@ trait StringRegexExpression extends ImplicitCastInputTypes {
@ExpressionDescription(
usage = "str _FUNC_ pattern - Returns true if str matches pattern, " +
"null if any arguments are null, false otherwise.",
- extended =
- "The pattern is a string which is matched literally, with exception to the following " +
- "special symbols:\n\n" +
- " _ matches any one character in the input (similar to . in posix " +
- "regular expressions)\n\n" +
- " % matches zero ore more characters in the input (similar to .* in " +
- "posix regular expressions)\n\n" +
- "The escape character is '\\'. If an escape character precedes a special symbol or " +
- "another escape character, the following character is matched literally, For example, " +
- "the expression `<path> like \\%SystemDrive\\%\\\\Users%` will match any `<path>` that " +
- "starts with '%SystemDrive%\\Users'. It is invalid to escape any other character.\n\n" +
- "Use RLIKE to match with standard regular expressions.")
+ extended = """
+ Arguments:
+ str - a string expression
+ pattern - a string expression. The pattern is a string which is matched literally, with the
+ exception of the following special symbols:
+
+ _ matches any one character in the input (similar to . in posix regular expressions)
+
+ % matches zero or more characters in the input (similar to .* in posix regular
+ expressions)
+
+ The escape character is '\'. If an escape character precedes a special symbol or another
+ escape character, the following character is matched literally. It is invalid to escape
+ any other character.
+
+ Examples:
+ > SELECT '%SystemDrive%\Users\John' _FUNC_ '\%SystemDrive\%\\Users%'
+ true
+
+ See also:
+ Use RLIKE to match with standard regular expressions.
+""")
case class Like(left: Expression, right: Expression)
extends BinaryExpression with StringRegexExpression {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
index b760b994f2..ca22ea2420 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
@@ -24,13 +24,23 @@ import org.apache.spark.unsafe.types.UTF8String
object StringUtils {
- /** Convert 'like' pattern to Java regex. */
- def escapeLikeRegex(str: String): String = {
- val in = str.toIterator
+ /**
+ * Validate and convert a SQL 'like' pattern to a Java regular expression.
+ *
+ * Underscores (_) are converted to '.' and percent signs (%) are converted to '.*'; other
+ * characters are quoted literally. Escaping is done according to the rules specified in
+ * [[org.apache.spark.sql.catalyst.expressions.Like]] usage documentation. An invalid pattern will
+ * throw an [[AnalysisException]].
+ *
+ * @param pattern the SQL pattern to convert
+ * @return the equivalent Java regular expression of the pattern
+ */
+ def escapeLikeRegex(pattern: String): String = {
+ val in = pattern.toIterator
val out = new StringBuilder()
def fail(message: String) = throw new AnalysisException(
- s"the pattern '$str' is invalid, $message")
+ s"the pattern '$pattern' is invalid, $message")
while (in.hasNext) {
in.next match {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
index 6de2ba2131..1ce150e091 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
@@ -27,10 +27,29 @@ import org.apache.spark.sql.types.{IntegerType, StringType}
*/
class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
- test("LIKE literal Regular Expression") {
+ /**
+ * Check that an expression evaluates to the expected output, both when the input is supplied
+ * as a literal and when it is read from a row.
+ * @tparam A type of input
+ * @param mkExpr function that builds the expression under test from an input expression
+ * @param input value that will be used to create the expression, as literal and in the form
+ * of a row
+ * @param expected the expected output of the expression
+ * @param inputToExpression an implicit conversion from the input type to its corresponding
+ * sql expression
+ */
+ def checkLiteralRow[A](mkExpr: Expression => Expression, input: A, expected: Any)
+ (implicit inputToExpression: A => Expression): Unit = {
+ checkEvaluation(mkExpr(input), expected) // check literal input
+
+ val regex = 'a.string.at(0)
+ checkEvaluation(mkExpr(regex), expected, create_row(input)) // check row input
+ }
+
+ test("LIKE Pattern") {
// null handling
- checkEvaluation(Literal.create(null, StringType).like("a"), null)
+ checkLiteralRow(Literal.create(null, StringType).like(_), "a", null)
checkEvaluation(Literal.create("a", StringType).like(Literal.create(null, StringType)), null)
checkEvaluation(Literal.create(null, StringType).like(Literal.create(null, StringType)), null)
checkEvaluation(
@@ -43,109 +62,63 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
Literal.create(null, StringType).like(NonFoldableLiteral.create(null, StringType)), null)
// simple patterns
- checkEvaluation("abdef" like "abdef", true)
- checkEvaluation("a_%b" like "a\\__b", true)
- checkEvaluation("addb" like "a_%b", true)
- checkEvaluation("addb" like "a\\__b", false)
- checkEvaluation("addb" like "a%\\%b", false)
- checkEvaluation("a_%b" like "a%\\%b", true)
- checkEvaluation("addb" like "a%", true)
- checkEvaluation("addb" like "**", false)
- checkEvaluation("abc" like "a%", true)
- checkEvaluation("abc" like "b%", false)
- checkEvaluation("abc" like "bc%", false)
- checkEvaluation("a\nb" like "a_b", true)
- checkEvaluation("ab" like "a%b", true)
- checkEvaluation("a\nb" like "a%b", true)
+ checkLiteralRow("abdef" like _, "abdef", true)
+ checkLiteralRow("a_%b" like _, "a\\__b", true)
+ checkLiteralRow("addb" like _, "a_%b", true)
+ checkLiteralRow("addb" like _, "a\\__b", false)
+ checkLiteralRow("addb" like _, "a%\\%b", false)
+ checkLiteralRow("a_%b" like _, "a%\\%b", true)
+ checkLiteralRow("addb" like _, "a%", true)
+ checkLiteralRow("addb" like _, "**", false)
+ checkLiteralRow("abc" like _, "a%", true)
+ checkLiteralRow("abc" like _, "b%", false)
+ checkLiteralRow("abc" like _, "bc%", false)
+ checkLiteralRow("a\nb" like _, "a_b", true)
+ checkLiteralRow("ab" like _, "a%b", true)
+ checkLiteralRow("a\nb" like _, "a%b", true)
// empty input
- checkEvaluation("" like "", true)
- checkEvaluation("a" like "", false)
- checkEvaluation("" like "a", false)
+ checkLiteralRow("" like _, "", true)
+ checkLiteralRow("a" like _, "", false)
+ checkLiteralRow("" like _, "a", false)
// SPARK-17647 double-escaping backslash
- checkEvaluation("""\\\\""" like """%\\%""", true) // triple quotes to avoid java string escaping
- checkEvaluation("""%%""" like """%%""", true)
- checkEvaluation("""\__""" like """\\\__""", true)
- checkEvaluation("""\\\__""" like """%\\%\%""", false)
- checkEvaluation("""_\\\%""" like """%\\""", false)
+ checkLiteralRow("""\\\\""" like _, """%\\%""", true)
+ checkLiteralRow("""%%""" like _, """%%""", true)
+ checkLiteralRow("""\__""" like _, """\\\__""", true)
+ checkLiteralRow("""\\\__""" like _, """%\\%\%""", false)
+ checkLiteralRow("""_\\\%""" like _, """%\\""", false)
// unicode
// scalastyle:off nonascii
- checkEvaluation("a\u20ACa" like "_\u20AC_", true)
- checkEvaluation("a€a" like "_€_", true)
- checkEvaluation("a€a" like "_\u20AC_", true)
- checkEvaluation("a\u20ACa" like "_€_", true)
+ checkLiteralRow("a\u20ACa" like _, "_\u20AC_", true)
+ checkLiteralRow("a€a" like _, "_€_", true)
+ checkLiteralRow("a€a" like _, "_\u20AC_", true)
+ checkLiteralRow("a\u20ACa" like _, "_€_", true)
// scalastyle:on nonascii
// invalid escaping
- intercept[AnalysisException] {
+ val invalidEscape = intercept[AnalysisException] {
evaluate("""a""" like """\a""")
}
- intercept[AnalysisException] {
+ assert(invalidEscape.getMessage.contains("pattern"))
+
+ val endEscape = intercept[AnalysisException] {
evaluate("""a""" like """a\""")
}
+ assert(endEscape.getMessage.contains("pattern"))
// case
- checkEvaluation("A" like "a%", false)
- checkEvaluation("a" like "A%", false)
- checkEvaluation("AaA" like "_a_", true)
-
- }
-
- test("LIKE Non-literal Regular Expression") {
- val regEx = 'a.string.at(0)
- checkEvaluation("abcd" like regEx, null, create_row(null))
- checkEvaluation("abdef" like regEx, true, create_row("abdef"))
- checkEvaluation("a_%b" like regEx, true, create_row("a\\__b"))
- checkEvaluation("addb" like regEx, true, create_row("a_%b"))
- checkEvaluation("addb" like regEx, false, create_row("a\\__b"))
- checkEvaluation("addb" like regEx, false, create_row("a%\\%b"))
- checkEvaluation("a_%b" like regEx, true, create_row("a%\\%b"))
- checkEvaluation("addb" like regEx, true, create_row("a%"))
- checkEvaluation("addb" like regEx, false, create_row("**"))
- checkEvaluation("abc" like regEx, true, create_row("a%"))
- checkEvaluation("abc" like regEx, false, create_row("b%"))
- checkEvaluation("abc" like regEx, false, create_row("bc%"))
- checkEvaluation("a\nb" like regEx, true, create_row("a_b"))
- checkEvaluation("ab" like regEx, true, create_row("a%b"))
- checkEvaluation("a\nb" like regEx, true, create_row("a%b"))
-
- checkEvaluation(Literal.create(null, StringType) like regEx, null, create_row("bc%"))
-
- checkEvaluation("" like regEx, true, create_row(""))
- checkEvaluation("a" like regEx, false, create_row(""))
- checkEvaluation("" like regEx, false, create_row("a"))
-
- checkEvaluation("""\\\\""" like regEx, true, create_row("""%\\%"""))
- checkEvaluation("""%%""" like regEx, true, create_row("""%%"""))
- checkEvaluation("""\__""" like regEx, true, create_row("""\\\__"""))
- checkEvaluation("""\\\__""" like regEx, false, create_row("""%\\%\%"""))
- checkEvaluation("""_\\\%""" like regEx, false, create_row("""%\\"""))
-
- // scalastyle:off nonascii
- checkEvaluation("a\u20ACa" like regEx, true, create_row("_\u20AC_"))
- checkEvaluation("a€a" like regEx, true, create_row("_€_"))
- checkEvaluation("a€a" like regEx, true, create_row("_\u20AC_"))
- checkEvaluation("a\u20ACa" like regEx, true, create_row("_€_"))
- // scalastyle:on nonascii
-
- // invalid escaping
- intercept[AnalysisException] {
- evaluate("""a""" like regEx, create_row("""\a"""))
- }
- intercept[AnalysisException] {
- evaluate("""a""" like regEx, create_row("""a\"""))
- }
-
- checkEvaluation("A" like regEx, false, create_row("a%"))
- checkEvaluation("a" like regEx, false, create_row("A%"))
- checkEvaluation("AaA" like regEx, true, create_row("_a_"))
+ checkLiteralRow("A" like _, "a%", false)
+ checkLiteralRow("a" like _, "A%", false)
+ checkLiteralRow("AaA" like _, "_a_", true)
+ // example
+ checkLiteralRow("""%SystemDrive%\Users\John""" like _, """\%SystemDrive\%\\Users%""", true)
}
- test("RLIKE literal Regular Expression") {
- checkEvaluation(Literal.create(null, StringType) rlike "abdef", null)
+ test("RLIKE Regular Expression") {
+ checkLiteralRow(Literal.create(null, StringType) rlike _, "abdef", null)
checkEvaluation("abdef" rlike Literal.create(null, StringType), null)
checkEvaluation(Literal.create(null, StringType) rlike Literal.create(null, StringType), null)
checkEvaluation("abdef" rlike NonFoldableLiteral.create("abdef", StringType), true)
@@ -155,42 +128,32 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
checkEvaluation(
Literal.create(null, StringType) rlike NonFoldableLiteral.create(null, StringType), null)
- checkEvaluation("abdef" rlike "abdef", true)
- checkEvaluation("abbbbc" rlike "a.*c", true)
+ checkLiteralRow("abdef" rlike _, "abdef", true)
+ checkLiteralRow("abbbbc" rlike _, "a.*c", true)
- checkEvaluation("fofo" rlike "^fo", true)
- checkEvaluation("fo\no" rlike "^fo\no$", true)
- checkEvaluation("Bn" rlike "^Ba*n", true)
- checkEvaluation("afofo" rlike "fo", true)
- checkEvaluation("afofo" rlike "^fo", false)
- checkEvaluation("Baan" rlike "^Ba?n", false)
- checkEvaluation("axe" rlike "pi|apa", false)
- checkEvaluation("pip" rlike "^(pi)*$", false)
+ checkLiteralRow("fofo" rlike _, "^fo", true)
+ checkLiteralRow("fo\no" rlike _, "^fo\no$", true)
+ checkLiteralRow("Bn" rlike _, "^Ba*n", true)
+ checkLiteralRow("afofo" rlike _, "fo", true)
+ checkLiteralRow("afofo" rlike _, "^fo", false)
+ checkLiteralRow("Baan" rlike _, "^Ba?n", false)
+ checkLiteralRow("axe" rlike _, "pi|apa", false)
+ checkLiteralRow("pip" rlike _, "^(pi)*$", false)
- checkEvaluation("abc" rlike "^ab", true)
- checkEvaluation("abc" rlike "^bc", false)
- checkEvaluation("abc" rlike "^ab", true)
- checkEvaluation("abc" rlike "^bc", false)
+ checkLiteralRow("abc" rlike _, "^ab", true)
+ checkLiteralRow("abc" rlike _, "^bc", false)
+ checkLiteralRow("abc" rlike _, "^ab", true)
+ checkLiteralRow("abc" rlike _, "^bc", false)
intercept[java.util.regex.PatternSyntaxException] {
evaluate("abbbbc" rlike "**")
}
- }
-
- test("RLIKE Non-literal Regular Expression") {
- val regEx = 'a.string.at(0)
- checkEvaluation("abdef" rlike regEx, true, create_row("abdef"))
- checkEvaluation("abbbbc" rlike regEx, true, create_row("a.*c"))
- checkEvaluation("fofo" rlike regEx, true, create_row("^fo"))
- checkEvaluation("fo\no" rlike regEx, true, create_row("^fo\no$"))
- checkEvaluation("Bn" rlike regEx, true, create_row("^Ba*n"))
-
intercept[java.util.regex.PatternSyntaxException] {
- evaluate("abbbbc" rlike regEx, create_row("**"))
+ val regex = 'a.string.at(0)
+ evaluate("abbbbc" rlike regex, create_row("**"))
}
}
-
test("RegexReplace") {
val row1 = create_row("100-200", "(\\d+)", "num")
val row2 = create_row("100-200", "(\\d+)", "###")