From af3746ce0d724dc624658a2187bde188ab26d084 Mon Sep 17 00:00:00 2001 From: Cheng Hao Date: Sat, 29 Mar 2014 15:12:43 -0700 Subject: Implement the RLike & Like in catalyst This PR includes: 1) Unify the unit test for expression evaluation 2) Add implementation of RLike & Like Author: Cheng Hao Closes #224 from chenghao-intel/string_expression and squashes the following commits: 84f72e9 [Cheng Hao] fix bug in RLike/Like & Simplify the unit test aeeb1d7 [Cheng Hao] Simplify the implementation/unit test of RLike/Like 319edb7 [Cheng Hao] change to spark code style 91cfd33 [Cheng Hao] add implementation for rlike/like 2c8929e [Cheng Hao] Update the unit test for expression evaluation --- .../org/apache/spark/sql/catalyst/SqlParser.scala | 6 ++ .../apache/spark/sql/catalyst/dsl/package.scala | 10 ++- .../catalyst/expressions/stringOperations.scala | 98 +++++++++++++++++++++- .../expressions/ExpressionEvaluationSuite.scala | 83 ++++++++++++++++++ .../scala/org/apache/spark/sql/hive/HiveQl.scala | 9 +- 5 files changed, 196 insertions(+), 10 deletions(-) (limited to 'sql') diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala index 9dec4e3d9e..0c851c2ee2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala @@ -114,6 +114,9 @@ class SqlParser extends StandardTokenParsers { protected val NULL = Keyword("NULL") protected val ON = Keyword("ON") protected val OR = Keyword("OR") + protected val LIKE = Keyword("LIKE") + protected val RLIKE = Keyword("RLIKE") + protected val REGEXP = Keyword("REGEXP") protected val ORDER = Keyword("ORDER") protected val OUTER = Keyword("OUTER") protected val RIGHT = Keyword("RIGHT") @@ -267,6 +270,9 @@ class SqlParser extends StandardTokenParsers { termExpression ~ ">=" ~ termExpression ^^ { case e1 ~ _ ~ e2 => GreaterThanOrEqual(e1, e2) } | termExpression ~ "!=" ~ termExpression ^^ { case e1 ~ _ ~ e2 => Not(Equals(e1, e2)) } | termExpression ~ "<>" ~ termExpression ^^ { case e1 ~ _ ~ e2 => Not(Equals(e1, e2)) } | + termExpression ~ RLIKE ~ termExpression ^^ { case e1 ~ _ ~ e2 => RLike(e1, e2) } | + termExpression ~ REGEXP ~ termExpression ^^ { case e1 ~ _ ~ e2 => RLike(e1, e2) } | + termExpression ~ LIKE ~ termExpression ^^ { case e1 ~ _ ~ e2 => Like(e1, e2) } | termExpression ~ IN ~ "(" ~ rep1sep(termExpression, ",") <~ ")" ^^ { case e1 ~ _ ~ _ ~ e2 => In(e1, e2) } | diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 67cddb351c..44abe671c0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -70,6 +70,9 @@ package object dsl { def === (other: Expression) = Equals(expr, other) def != (other: Expression) = Not(Equals(expr, other)) + def like(other: Expression) = Like(expr, other) + def rlike(other: Expression) = RLike(expr, other) + def asc = SortOrder(expr, Ascending) def desc = SortOrder(expr, Descending) @@ -90,7 +93,10 @@ package object dsl { implicit def symbolToUnresolvedAttribute(s: Symbol) = analysis.UnresolvedAttribute(s.name) implicit class DslSymbol(sym: Symbol) extends ImplicitAttribute { def s = sym.name } - implicit class DslString(val s: String) extends ImplicitAttribute + implicit class DslString(val s: String) extends ImplicitOperators { + def expr: Expression = Literal(s) + def attr = analysis.UnresolvedAttribute(s) + } abstract class ImplicitAttribute extends ImplicitOperators { def s: String @@ -110,6 +116,8 @@ package object dsl { // Protobuf terminology def required = a.withNullability(false) + + def at(ordinal: Int) = BoundReference(ordinal, a) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala index e195f2ac7e..42b7a9b125 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala @@ -17,11 +17,103 @@ package org.apache.spark.sql.catalyst.expressions +import java.util.regex.Pattern + +import org.apache.spark.sql.catalyst.types.DataType +import org.apache.spark.sql.catalyst.types.StringType import org.apache.spark.sql.catalyst.types.BooleanType +import org.apache.spark.sql.catalyst.trees.TreeNode +import org.apache.spark.sql.catalyst.errors.`package`.TreeNodeException + + +trait StringRegexExpression { + self: BinaryExpression => + + type EvaluatedType = Any + + def escape(v: String): String + def matches(regex: Pattern, str: String): Boolean + + def nullable: Boolean = true + def dataType: DataType = BooleanType + + // try cache the pattern for Literal + private lazy val cache: Pattern = right match { + case x @ Literal(value: String, StringType) => compile(value) + case _ => null + } + + protected def compile(str: String): Pattern = if(str == null) { + null + } else { + // Let it raise exception if couldn't compile the regex string + Pattern.compile(escape(str)) + } -case class Like(left: Expression, right: Expression) extends BinaryExpression { - def dataType = BooleanType - def nullable = left.nullable // Right cannot be null. + protected def pattern(str: String) = if(cache == null) compile(str) else cache + + override def apply(input: Row): Any = { + val l = left.apply(input) + if(l == null) { + null + } else { + val r = right.apply(input) + if(r == null) { + null + } else { + val regex = pattern(r.asInstanceOf[String]) + if(regex == null) { + null + } else { + matches(regex, l.asInstanceOf[String]) + } + } + } + } +} + +/** + * Simple RegEx pattern matching function + */ +case class Like(left: Expression, right: Expression) + extends BinaryExpression with StringRegexExpression { + def symbol = "LIKE" + + // replace the _ with .{1} exactly match 1 time of any character + // replace the % with .*, match 0 or more times with any character + override def escape(v: String) = { + val sb = new StringBuilder() + var i = 0; + while (i < v.length) { + // Make a special case for "\\_" and "\\%" + val n = v.charAt(i); + if (n == '\\' && i + 1 < v.length && (v.charAt(i + 1) == '_' || v.charAt(i + 1) == '%')) { + sb.append(v.charAt(i + 1)) + i += 1 + } else { + if (n == '_') { + sb.append("."); + } else if (n == '%') { + sb.append(".*"); + } else { + sb.append(Pattern.quote(Character.toString(n))); + } + } + + i += 1 + } + + sb.toString() + } + + override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).matches() } +case class RLike(left: Expression, right: Expression) + extends BinaryExpression with StringRegexExpression { + + def symbol = "RLIKE" + override def escape(v: String): String = v + override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).find(0) +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala index 94894adf81..52a205be3e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala @@ -109,4 +109,87 @@ class ExpressionEvaluationSuite extends FunSuite { } } } + + def evaluate(expression: Expression, inputRow: Row = EmptyRow): Any = { + expression.apply(inputRow) + } + + def checkEvaluation(expression: Expression, expected: Any, inputRow: Row = EmptyRow): Unit = { + val actual = try evaluate(expression, inputRow) catch { + case e: Exception => fail(s"Exception evaluating $expression", e) + } + if(actual != expected) { + val input = if(inputRow == EmptyRow) "" else s", input: $inputRow" + fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input") + } + } + + test("LIKE literal Regular Expression") { + checkEvaluation(Literal(null, StringType).like("a"), null) + checkEvaluation(Literal(null, StringType).like(Literal(null, StringType)), null) + checkEvaluation("abdef" like "abdef", true) + checkEvaluation("a_%b" like "a\\__b", true) + checkEvaluation("addb" like "a_%b", true) + checkEvaluation("addb" like "a\\__b", false) + checkEvaluation("addb" like "a%\\%b", false) + checkEvaluation("a_%b" like "a%\\%b", true) + checkEvaluation("addb" like "a%", true) + checkEvaluation("addb" like "**", false) + checkEvaluation("abc" like "a%", true) + checkEvaluation("abc" like "b%", false) + checkEvaluation("abc" like "bc%", false) + } + + test("LIKE Non-literal Regular Expression") { + val regEx = 'a.string.at(0) + checkEvaluation("abcd" like regEx, null, new GenericRow(Array[Any](null))) + checkEvaluation("abdef" like regEx, true, new GenericRow(Array[Any]("abdef"))) + checkEvaluation("a_%b" like regEx, true, new GenericRow(Array[Any]("a\\__b"))) + checkEvaluation("addb" like regEx, true, new GenericRow(Array[Any]("a_%b"))) + checkEvaluation("addb" like regEx, false, new GenericRow(Array[Any]("a\\__b"))) + checkEvaluation("addb" like regEx, false, new GenericRow(Array[Any]("a%\\%b"))) + checkEvaluation("a_%b" like regEx, true, new GenericRow(Array[Any]("a%\\%b"))) + checkEvaluation("addb" like regEx, true, new GenericRow(Array[Any]("a%"))) + checkEvaluation("addb" like regEx, false, new GenericRow(Array[Any]("**"))) + checkEvaluation("abc" like regEx, true, new GenericRow(Array[Any]("a%"))) + checkEvaluation("abc" like regEx, false, new GenericRow(Array[Any]("b%"))) + checkEvaluation("abc" like regEx, false, new GenericRow(Array[Any]("bc%"))) + } + + test("RLIKE literal Regular Expression") { + checkEvaluation("abdef" rlike "abdef", true) + checkEvaluation("abbbbc" rlike "a.*c", true) + + checkEvaluation("fofo" rlike "^fo", true) + checkEvaluation("fo\no" rlike "^fo\no$", true) + checkEvaluation("Bn" rlike "^Ba*n", true) + checkEvaluation("afofo" rlike "fo", true) + checkEvaluation("afofo" rlike "^fo", false) + checkEvaluation("Baan" rlike "^Ba?n", false) + checkEvaluation("axe" rlike "pi|apa", false) + checkEvaluation("pip" rlike "^(pi)*$", false) + + checkEvaluation("abc" rlike "^ab", true) + checkEvaluation("abc" rlike "^bc", false) + checkEvaluation("abc" rlike "^ab", true) + checkEvaluation("abc" rlike "^bc", false) + + intercept[java.util.regex.PatternSyntaxException] { + evaluate("abbbbc" rlike "**") + } + } + + test("RLIKE Non-literal Regular Expression") { + val regEx = 'a.string.at(0) + checkEvaluation("abdef" rlike regEx, true, new GenericRow(Array[Any]("abdef"))) + checkEvaluation("abbbbc" rlike regEx, true, new GenericRow(Array[Any]("a.*c"))) + checkEvaluation("fofo" rlike regEx, true, new GenericRow(Array[Any]("^fo"))) + checkEvaluation("fo\no" rlike regEx, true, new GenericRow(Array[Any]("^fo\no$"))) + checkEvaluation("Bn" rlike regEx, true, new GenericRow(Array[Any]("^Ba*n"))) + + intercept[java.util.regex.PatternSyntaxException] { + evaluate("abbbbc" rlike regEx, new GenericRow(Array[Any]("**"))) + } + } } + diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index b70ec897e4..490a592a58 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -847,12 +847,9 @@ object HiveQl { case Token(">=", left :: right:: Nil) => GreaterThanOrEqual(nodeToExpr(left), nodeToExpr(right)) case Token("<", left :: right:: Nil) => LessThan(nodeToExpr(left), nodeToExpr(right)) case Token("<=", left :: right:: Nil) => LessThanOrEqual(nodeToExpr(left), nodeToExpr(right)) - case Token("LIKE", left :: right:: Nil) => - UnresolvedFunction("LIKE", Seq(nodeToExpr(left), nodeToExpr(right))) - case Token("RLIKE", left :: right:: Nil) => - UnresolvedFunction("RLIKE", Seq(nodeToExpr(left), nodeToExpr(right))) - case Token("REGEXP", left :: right:: Nil) => - UnresolvedFunction("REGEXP", Seq(nodeToExpr(left), nodeToExpr(right))) + case Token("LIKE", left :: right:: Nil) => Like(nodeToExpr(left), nodeToExpr(right)) + case Token("RLIKE", left :: right:: Nil) => RLike(nodeToExpr(left), nodeToExpr(right)) + case Token("REGEXP", left :: right:: Nil) => RLike(nodeToExpr(left), nodeToExpr(right)) case Token("TOK_FUNCTION", Token("TOK_ISNOTNULL", Nil) :: child :: Nil) => IsNotNull(nodeToExpr(child)) case Token("TOK_FUNCTION", Token("TOK_ISNULL", Nil) :: child :: Nil) => -- cgit v1.2.3