From ba6d0584e4a6417cd9e130d831219cccc194f791 Mon Sep 17 00:00:00 2001 From: Jakob Odersky Date: Fri, 21 Oct 2016 18:19:04 -0700 Subject: Throw error on invalid escape pattern --- .../catalyst/expressions/regexpExpressions.scala | 14 +++++++------- .../spark/sql/catalyst/util/StringUtils.scala | 15 +++++++++------ .../expressions/RegexpExpressionsSuite.scala | 21 +++++++++++++++------ 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 8ee987e5ba..3df6effb6f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -71,16 +71,16 @@ trait StringRegexExpression extends ImplicitCastInputTypes { usage = "str _FUNC_ pattern - Returns true if str matches pattern, " + "null if any arguments are null, false otherwise.", extended = - "The pattern is a string which is matched literally, with exception to the " + - "following special symbols:\n\n" + + "The pattern is a string which is matched literally, with exception to the following " + + "special symbols:\n\n" + " _ matches any one character in the input (similar to . in posix " + "regular expressions)\n\n" + " % matches zero ore more characters in the input (similar to .* in " + - "posix regular expressions\n\n" + - "The default escape character is '\\'. If an escape character precedes a special symbol or " + - "another escape character, the following character is matched literally, otherwise the " + - "escape character is treated literally. I.e. '\\%' would match '%', whereas '\\a' matches " + - "'\\a'.\n\n" + + "posix regular expressions)\n\n" + + "The escape character is '\\'. 
If an escape character precedes a special symbol or " + + "another escape character, the following character is matched literally, For example, " + + "the expression `<str> like \\%SystemDrive\\%\\\\Users%` will match any `<str>` that " + + "starts with '%SystemDrive%\\Users'. It is invalid to escape any other character.\n\n" + "Use RLIKE to match with standard regular expressions.") case class Like(left: Expression, right: Expression) extends BinaryExpression with StringRegexExpression { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala index 79c61023b4..b760b994f2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/StringUtils.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.util import java.util.regex.{Pattern, PatternSyntaxException} +import org.apache.spark.sql.AnalysisException import org.apache.spark.unsafe.types.UTF8String object StringUtils { @@ -28,16 +29,18 @@ object StringUtils { val in = str.toIterator val out = new StringBuilder() + def fail(message: String) = throw new AnalysisException( + s"the pattern '$str' is invalid, $message") + while (in.hasNext) { in.next match { case '\\' if in.hasNext => - in.next match { - case '\\' => out ++= Pattern.quote("\\") - case '_' => out ++= Pattern.quote("_") - case '%' => out ++= Pattern.quote("%") - // escape before non-escapable character treated literally - case c => out ++= Pattern.quote("\\" + c) + val c = in.next + c match { + case '_' | '%' | '\\' => out ++= Pattern.quote(Character.toString(c)) + case _ => fail(s"the escape character is not allowed to precede '$c'") } + case '\\' => fail("it is not allowed to end with the escape character") case '_' => out ++= "." 
case '%' => out ++= ".*" case c => out ++= Pattern.quote(Character.toString(c)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 71292dd701..6de2ba2131 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.types.{IntegerType, StringType} @@ -77,9 +78,13 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation("a\u20ACa" like "_€_", true) // scalastyle:on nonascii - // escaping non-escapable should match literally - checkEvaluation("""\a""" like """\a""", true) - checkEvaluation("""a\""" like """a\""", true) + // invalid escaping + intercept[AnalysisException] { + evaluate("""a""" like """\a""") + } + intercept[AnalysisException] { + evaluate("""a""" like """a\""") + } // case checkEvaluation("A" like "a%", false) @@ -125,9 +130,13 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation("a\u20ACa" like regEx, true, create_row("_€_")) // scalastyle:on nonascii - // escaping non-escapable should match literally - checkEvaluation("""\a""" like regEx, true, create_row("""\a""")) - checkEvaluation("""a\""" like regEx, true, create_row("""a\""")) + // invalid escaping + intercept[AnalysisException] { + evaluate("""a""" like regEx, create_row("""\a""")) + } + intercept[AnalysisException] { + evaluate("""a""" like regEx, create_row("""a\""")) + } checkEvaluation("A" like regEx, false, create_row("a%")) checkEvaluation("a" like regEx, 
false, create_row("A%")) -- cgit v1.2.3