From dbd305c195ba1c1661f82e3e19085778b8783794 Mon Sep 17 00:00:00 2001 From: Jakob Odersky Date: Mon, 24 Oct 2016 17:07:14 -0700 Subject: Refactor tests and add documentation --- .../catalyst/expressions/regexpExpressions.scala | 34 ++-- .../spark/sql/catalyst/util/StringUtils.scala | 18 +- .../expressions/RegexpExpressionsSuite.scala | 187 +++++++++------------ 3 files changed, 111 insertions(+), 128 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 3df6effb6f..0325d0e837 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -70,18 +70,28 @@ trait StringRegexExpression extends ImplicitCastInputTypes { @ExpressionDescription( usage = "str _FUNC_ pattern - Returns true if str matches pattern, " + "null if any arguments are null, false otherwise.", - extended = - "The pattern is a string which is matched literally, with exception to the following " + - "special symbols:\n\n" + - " _ matches any one character in the input (similar to . in posix " + - "regular expressions)\n\n" + - " % matches zero ore more characters in the input (similar to .* in " + - "posix regular expressions)\n\n" + - "The escape character is '\\'. If an escape character precedes a special symbol or " + - "another escape character, the following character is matched literally, For example, " + - "the expression ` like \\%SystemDrive\\%\\\\Users%` will match any `` that " + - "starts with '%SystemDrive%\\Users'. It is invalid to escape any other character.\n\n" + - "Use RLIKE to match with standard regular expressions.") + extended = """ + Arguments: + str - a string expression + pattern - a string expression. 
The pattern is a string which is matched literally, with + the exception of the following special symbols: + + _ matches any one character in the input (similar to . in posix regular expressions) + + % matches zero or more characters in the input (similar to .* in posix regular + expressions) + + The escape character is '\'. If an escape character precedes a special symbol or another + escape character, the following character is matched literally. It is invalid to escape + any other character. + + Examples: + > SELECT '%SystemDrive%\Users\John' _FUNC_ '\%SystemDrive\%\\Users%' + true + + See also: + Use RLIKE to match with standard regular expressions. +""") case class Like(left: Expression, right: Expression) extends BinaryExpression with StringRegexExpression { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala index b760b994f2..ca22ea2420 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala @@ -24,13 +24,23 @@ import org.apache.spark.unsafe.types.UTF8String object StringUtils { - /** Convert 'like' pattern to Java regex. */ - def escapeLikeRegex(str: String): String = { - val in = str.toIterator + /** + * Validate and convert SQL 'like' pattern to a Java regular expression. + * + * Underscores (_) are converted to '.' and percent signs (%) are converted to '.*', other + * characters are quoted literally. Escaping is done according to the rules specified in + * [[org.apache.spark.sql.catalyst.expressions.Like]] usage documentation. An invalid pattern will + * throw an [[AnalysisException]]. 
+ * + * @param pattern the SQL pattern to convert + * @return the equivalent Java regular expression of the pattern + */ + def escapeLikeRegex(pattern: String): String = { + val in = pattern.toIterator val out = new StringBuilder() def fail(message: String) = throw new AnalysisException( - s"the pattern '$str' is invalid, $message") + s"the pattern '$pattern' is invalid, $message") while (in.hasNext) { in.next match { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 6de2ba2131..1ce150e091 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -27,10 +27,29 @@ import org.apache.spark.sql.types.{IntegerType, StringType} */ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { - test("LIKE literal Regular Expression") { + /** + * Check if a given expression evaluates to an expected output, in case the input is + * a literal and in case the input is in the form of a row. 
+ * @tparam A type of input + * @param mkExpr the expression to test for a given input + * @param input value that will be used to create the expression, as literal and in the form + * of a row + * @param expected the expected output of the expression + * @param inputToExpression an implicit conversion from the input type to its corresponding + * sql expression + */ + def checkLiteralRow[A](mkExpr: Expression => Expression, input: A, expected: Any) + (implicit inputToExpression: A => Expression): Unit = { + checkEvaluation(mkExpr(input), expected) // check literal input + + val regex = 'a.string.at(0) + checkEvaluation(mkExpr(regex), expected, create_row(input)) // check row input + } + + test("LIKE Pattern") { // null handling - checkEvaluation(Literal.create(null, StringType).like("a"), null) + checkLiteralRow(Literal.create(null, StringType).like(_), "a", null) checkEvaluation(Literal.create("a", StringType).like(Literal.create(null, StringType)), null) checkEvaluation(Literal.create(null, StringType).like(Literal.create(null, StringType)), null) checkEvaluation( @@ -43,109 +62,63 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { Literal.create(null, StringType).like(NonFoldableLiteral.create(null, StringType)), null) // simple patterns - checkEvaluation("abdef" like "abdef", true) - checkEvaluation("a_%b" like "a\\__b", true) - checkEvaluation("addb" like "a_%b", true) - checkEvaluation("addb" like "a\\__b", false) - checkEvaluation("addb" like "a%\\%b", false) - checkEvaluation("a_%b" like "a%\\%b", true) - checkEvaluation("addb" like "a%", true) - checkEvaluation("addb" like "**", false) - checkEvaluation("abc" like "a%", true) - checkEvaluation("abc" like "b%", false) - checkEvaluation("abc" like "bc%", false) - checkEvaluation("a\nb" like "a_b", true) - checkEvaluation("ab" like "a%b", true) - checkEvaluation("a\nb" like "a%b", true) + checkLiteralRow("abdef" like _, "abdef", true) + checkLiteralRow("a_%b" like _, "a\\__b", 
true) + checkLiteralRow("addb" like _, "a_%b", true) + checkLiteralRow("addb" like _, "a\\__b", false) + checkLiteralRow("addb" like _, "a%\\%b", false) + checkLiteralRow("a_%b" like _, "a%\\%b", true) + checkLiteralRow("addb" like _, "a%", true) + checkLiteralRow("addb" like _, "**", false) + checkLiteralRow("abc" like _, "a%", true) + checkLiteralRow("abc" like _, "b%", false) + checkLiteralRow("abc" like _, "bc%", false) + checkLiteralRow("a\nb" like _, "a_b", true) + checkLiteralRow("ab" like _, "a%b", true) + checkLiteralRow("a\nb" like _, "a%b", true) // empty input - checkEvaluation("" like "", true) - checkEvaluation("a" like "", false) - checkEvaluation("" like "a", false) + checkLiteralRow("" like _, "", true) + checkLiteralRow("a" like _, "", false) + checkLiteralRow("" like _, "a", false) // SI-17647 double-escaping backslash - checkEvaluation("""\\\\""" like """%\\%""", true) // triple quotes to avoid java string escaping - checkEvaluation("""%%""" like """%%""", true) - checkEvaluation("""\__""" like """\\\__""", true) - checkEvaluation("""\\\__""" like """%\\%\%""", false) - checkEvaluation("""_\\\%""" like """%\\""", false) + checkLiteralRow("""\\\\""" like _, """%\\%""", true) + checkLiteralRow("""%%""" like _, """%%""", true) + checkLiteralRow("""\__""" like _, """\\\__""", true) + checkLiteralRow("""\\\__""" like _, """%\\%\%""", false) + checkLiteralRow("""_\\\%""" like _, """%\\""", false) // unicode // scalastyle:off nonascii - checkEvaluation("a\u20ACa" like "_\u20AC_", true) - checkEvaluation("a€a" like "_€_", true) - checkEvaluation("a€a" like "_\u20AC_", true) - checkEvaluation("a\u20ACa" like "_€_", true) + checkLiteralRow("a\u20ACa" like _, "_\u20AC_", true) + checkLiteralRow("a€a" like _, "_€_", true) + checkLiteralRow("a€a" like _, "_\u20AC_", true) + checkLiteralRow("a\u20ACa" like _, "_€_", true) // scalastyle:on nonascii // invalid escaping - intercept[AnalysisException] { + val invalidEscape = intercept[AnalysisException] { 
evaluate("""a""" like """\a""") } - intercept[AnalysisException] { + assert(invalidEscape.getMessage.contains("pattern")) + + val endEscape = intercept[AnalysisException] { evaluate("""a""" like """a\""") } + assert(endEscape.getMessage.contains("pattern")) // case - checkEvaluation("A" like "a%", false) - checkEvaluation("a" like "A%", false) - checkEvaluation("AaA" like "_a_", true) - - } - - test("LIKE Non-literal Regular Expression") { - val regEx = 'a.string.at(0) - checkEvaluation("abcd" like regEx, null, create_row(null)) - checkEvaluation("abdef" like regEx, true, create_row("abdef")) - checkEvaluation("a_%b" like regEx, true, create_row("a\\__b")) - checkEvaluation("addb" like regEx, true, create_row("a_%b")) - checkEvaluation("addb" like regEx, false, create_row("a\\__b")) - checkEvaluation("addb" like regEx, false, create_row("a%\\%b")) - checkEvaluation("a_%b" like regEx, true, create_row("a%\\%b")) - checkEvaluation("addb" like regEx, true, create_row("a%")) - checkEvaluation("addb" like regEx, false, create_row("**")) - checkEvaluation("abc" like regEx, true, create_row("a%")) - checkEvaluation("abc" like regEx, false, create_row("b%")) - checkEvaluation("abc" like regEx, false, create_row("bc%")) - checkEvaluation("a\nb" like regEx, true, create_row("a_b")) - checkEvaluation("ab" like regEx, true, create_row("a%b")) - checkEvaluation("a\nb" like regEx, true, create_row("a%b")) - - checkEvaluation(Literal.create(null, StringType) like regEx, null, create_row("bc%")) - - checkEvaluation("" like regEx, true, create_row("")) - checkEvaluation("a" like regEx, false, create_row("")) - checkEvaluation("" like regEx, false, create_row("a")) - - checkEvaluation("""\\\\""" like regEx, true, create_row("""%\\%""")) - checkEvaluation("""%%""" like regEx, true, create_row("""%%""")) - checkEvaluation("""\__""" like regEx, true, create_row("""\\\__""")) - checkEvaluation("""\\\__""" like regEx, false, create_row("""%\\%\%""")) - checkEvaluation("""_\\\%""" like 
regEx, false, create_row("""%\\""")) - - // scalastyle:off nonascii - checkEvaluation("a\u20ACa" like regEx, true, create_row("_\u20AC_")) - checkEvaluation("a€a" like regEx, true, create_row("_€_")) - checkEvaluation("a€a" like regEx, true, create_row("_\u20AC_")) - checkEvaluation("a\u20ACa" like regEx, true, create_row("_€_")) - // scalastyle:on nonascii - - // invalid escaping - intercept[AnalysisException] { - evaluate("""a""" like regEx, create_row("""\a""")) - } - intercept[AnalysisException] { - evaluate("""a""" like regEx, create_row("""a\""")) - } - - checkEvaluation("A" like regEx, false, create_row("a%")) - checkEvaluation("a" like regEx, false, create_row("A%")) - checkEvaluation("AaA" like regEx, true, create_row("_a_")) + checkLiteralRow("A" like _, "a%", false) + checkLiteralRow("a" like _, "A%", false) + checkLiteralRow("AaA" like _, "_a_", true) + // example + checkLiteralRow("""%SystemDrive%\Users\John""" like _, """\%SystemDrive\%\\Users%""", true) } - test("RLIKE literal Regular Expression") { - checkEvaluation(Literal.create(null, StringType) rlike "abdef", null) + test("RLIKE Regular Expression") { + checkLiteralRow(Literal.create(null, StringType) rlike _, "abdef", null) checkEvaluation("abdef" rlike Literal.create(null, StringType), null) checkEvaluation(Literal.create(null, StringType) rlike Literal.create(null, StringType), null) checkEvaluation("abdef" rlike NonFoldableLiteral.create("abdef", StringType), true) @@ -155,42 +128,32 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation( Literal.create(null, StringType) rlike NonFoldableLiteral.create(null, StringType), null) - checkEvaluation("abdef" rlike "abdef", true) - checkEvaluation("abbbbc" rlike "a.*c", true) + checkLiteralRow("abdef" rlike _, "abdef", true) + checkLiteralRow("abbbbc" rlike _, "a.*c", true) - checkEvaluation("fofo" rlike "^fo", true) - checkEvaluation("fo\no" rlike "^fo\no$", true) - checkEvaluation("Bn" rlike "^Ba*n", 
true) - checkEvaluation("afofo" rlike "fo", true) - checkEvaluation("afofo" rlike "^fo", false) - checkEvaluation("Baan" rlike "^Ba?n", false) - checkEvaluation("axe" rlike "pi|apa", false) - checkEvaluation("pip" rlike "^(pi)*$", false) + checkLiteralRow("fofo" rlike _, "^fo", true) + checkLiteralRow("fo\no" rlike _, "^fo\no$", true) + checkLiteralRow("Bn" rlike _, "^Ba*n", true) + checkLiteralRow("afofo" rlike _, "fo", true) + checkLiteralRow("afofo" rlike _, "^fo", false) + checkLiteralRow("Baan" rlike _, "^Ba?n", false) + checkLiteralRow("axe" rlike _, "pi|apa", false) + checkLiteralRow("pip" rlike _, "^(pi)*$", false) - checkEvaluation("abc" rlike "^ab", true) - checkEvaluation("abc" rlike "^bc", false) - checkEvaluation("abc" rlike "^ab", true) - checkEvaluation("abc" rlike "^bc", false) + checkLiteralRow("abc" rlike _, "^ab", true) + checkLiteralRow("abc" rlike _, "^bc", false) + checkLiteralRow("abc" rlike _, "^ab", true) + checkLiteralRow("abc" rlike _, "^bc", false) intercept[java.util.regex.PatternSyntaxException] { evaluate("abbbbc" rlike "**") } - } - - test("RLIKE Non-literal Regular Expression") { - val regEx = 'a.string.at(0) - checkEvaluation("abdef" rlike regEx, true, create_row("abdef")) - checkEvaluation("abbbbc" rlike regEx, true, create_row("a.*c")) - checkEvaluation("fofo" rlike regEx, true, create_row("^fo")) - checkEvaluation("fo\no" rlike regEx, true, create_row("^fo\no$")) - checkEvaluation("Bn" rlike regEx, true, create_row("^Ba*n")) - intercept[java.util.regex.PatternSyntaxException] { - evaluate("abbbbc" rlike regEx, create_row("**")) + val regex = 'a.string.at(0) + evaluate("abbbbc" rlike regex, create_row("**")) } } - test("RegexReplace") { val row1 = create_row("100-200", "(\\d+)", "num") val row2 = create_row("100-200", "(\\d+)", "###") -- cgit v1.2.3