about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Jakob Odersky <jakob@odersky.com> 2016-10-21 18:19:04 -0700
committer Jakob Odersky <jakob@odersky.com> 2016-12-09 11:34:53 -0800
commit ba6d0584e4a6417cd9e130d831219cccc194f791 (patch)
tree 0f00df25c0fcacd7bdff7d1261d649c16cb03958
parent 8bca7a5e0ec60957c0a847e18f5ed40b763c752e (diff)
download spark-ba6d0584e4a6417cd9e130d831219cccc194f791.tar.gz
spark-ba6d0584e4a6417cd9e130d831219cccc194f791.tar.bz2
spark-ba6d0584e4a6417cd9e130d831219cccc194f791.zip
Throw error on invalid escape pattern
-rw-r--r-- sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala 14
-rw-r--r-- sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala 15
-rw-r--r-- sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala 21
3 files changed, 31 insertions, 19 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
index 8ee987e5ba..3df6effb6f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
@@ -71,16 +71,16 @@ trait StringRegexExpression extends ImplicitCastInputTypes {
usage = "str _FUNC_ pattern - Returns true if str matches pattern, " +
"null if any arguments are null, false otherwise.",
extended =
- "The pattern is a string which is matched literally, with exception to the " +
- "following special symbols:\n\n" +
+ "The pattern is a string which is matched literally, with exception to the following " +
+ "special symbols:\n\n" +
" _ matches any one character in the input (similar to . in posix " +
"regular expressions)\n\n" +
" % matches zero ore more characters in the input (similar to .* in " +
- "posix regular expressions\n\n" +
- "The default escape character is '\\'. If an escape character precedes a special symbol or " +
- "another escape character, the following character is matched literally, otherwise the " +
- "escape character is treated literally. I.e. '\\%' would match '%', whereas '\\a' matches " +
- "'\\a'.\n\n" +
+ "posix regular expressions)\n\n" +
+ "The escape character is '\\'. If an escape character precedes a special symbol or " +
+ "another escape character, the following character is matched literally. For example, " +
+ "the expression `<path> like \\%SystemDrive\\%\\\\Users%` will match any `<path>` that " +
+ "starts with '%SystemDrive%\\Users'. It is invalid to escape any other character.\n\n" +
"Use RLIKE to match with standard regular expressions.")
case class Like(left: Expression, right: Expression)
extends BinaryExpression with StringRegexExpression {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
index 79c61023b4..b760b994f2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.util
import java.util.regex.{Pattern, PatternSyntaxException}
+import org.apache.spark.sql.AnalysisException
import org.apache.spark.unsafe.types.UTF8String
object StringUtils {
@@ -28,16 +29,18 @@ object StringUtils {
val in = str.toIterator
val out = new StringBuilder()
+ def fail(message: String) = throw new AnalysisException(
+ s"the pattern '$str' is invalid, $message")
+
while (in.hasNext) {
in.next match {
case '\\' if in.hasNext =>
- in.next match {
- case '\\' => out ++= Pattern.quote("\\")
- case '_' => out ++= Pattern.quote("_")
- case '%' => out ++= Pattern.quote("%")
- // escape before non-escapable character treated literally
- case c => out ++= Pattern.quote("\\" + c)
+ val c = in.next
+ c match {
+ case '_' | '%' | '\\' => out ++= Pattern.quote(Character.toString(c))
+ case _ => fail(s"the escape character is not allowed to precede '$c'")
}
+ case '\\' => fail("it is not allowed to end with the escape character")
case '_' => out ++= "."
case '%' => out ++= ".*"
case c => out ++= Pattern.quote(Character.toString(c))
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
index 71292dd701..6de2ba2131 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
@@ -18,6 +18,7 @@
package org.apache.spark.sql.catalyst.expressions
import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.types.{IntegerType, StringType}
@@ -77,9 +78,13 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
checkEvaluation("a\u20ACa" like "_€_", true)
// scalastyle:on nonascii
- // escaping non-escapable should match literally
- checkEvaluation("""\a""" like """\a""", true)
- checkEvaluation("""a\""" like """a\""", true)
+ // invalid escaping
+ intercept[AnalysisException] {
+ evaluate("""a""" like """\a""")
+ }
+ intercept[AnalysisException] {
+ evaluate("""a""" like """a\""")
+ }
// case
checkEvaluation("A" like "a%", false)
@@ -125,9 +130,13 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
checkEvaluation("a\u20ACa" like regEx, true, create_row("_€_"))
// scalastyle:on nonascii
- // escaping non-escapable should match literally
- checkEvaluation("""\a""" like regEx, true, create_row("""\a"""))
- checkEvaluation("""a\""" like regEx, true, create_row("""a\"""))
+ // invalid escaping
+ intercept[AnalysisException] {
+ evaluate("""a""" like regEx, create_row("""\a"""))
+ }
+ intercept[AnalysisException] {
+ evaluate("""a""" like regEx, create_row("""a\"""))
+ }
checkEvaluation("A" like regEx, false, create_row("a%"))
checkEvaluation("a" like regEx, false, create_row("A%"))