author     Cheng Hao <hao.cheng@intel.com>    2014-03-29 15:12:43 -0700
committer  Reynold Xin <rxin@apache.org>      2014-03-29 15:12:43 -0700
commit     af3746ce0d724dc624658a2187bde188ab26d084
tree       155966da9bd752eaf5c6dd2f05c504c474f8c7d4
parent     1617816090e7b20124a512a43860a21232ebf511
Implement RLike & Like in Catalyst
This PR includes:
1) Unify the unit test for expression evaluation
2) Add implementation of RLike & Like

Author: Cheng Hao <hao.cheng@intel.com>

Closes #224 from chenghao-intel/string_expression and squashes the following commits:

84f72e9 [Cheng Hao] fix bug in RLike/Like & Simplify the unit test
aeeb1d7 [Cheng Hao] Simplify the implementation/unit test of RLike/Like
319edb7 [Cheng Hao] change to spark code style
91cfd33 [Cheng Hao] add implementation for rlike/like
2c8929e [Cheng Hao] Update the unit test for expression evaluation
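As a quick orientation to the semantics the patch implements, here is a minimal sketch (illustration only, not part of the patch) built from the expression classes added below; it assumes the usual imports from org.apache.spark.sql.catalyst.expressions and that EmptyRow is visible from that package, as in the new test suite:

  import org.apache.spark.sql.catalyst.expressions._
  import org.apache.spark.sql.catalyst.types.StringType

  // LIKE uses SQL wildcards: '_' matches exactly one character, '%' matches any
  // sequence, and the whole input string has to match the pattern.
  Like(Literal("addb", StringType), Literal("a_%b", StringType)).apply(EmptyRow)    // true
  // RLIKE (and its alias REGEXP) takes a Java regular expression and succeeds if
  // the pattern is found anywhere in the input.
  RLike(Literal("abbbbc", StringType), Literal("a.*c", StringType)).apply(EmptyRow) // true
  // A null input string or a null pattern evaluates to null.
  Like(Literal(null, StringType), Literal("a%", StringType)).apply(EmptyRow)        // null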
Diffstat (limited to 'sql')
-rw-r--r--  sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala                              |  6
-rw-r--r--  sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala                            | 10
-rw-r--r--  sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala           | 98
-rw-r--r--  sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala  | 83
-rw-r--r--  sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala                                         |  9
5 files changed, 196 insertions, 10 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index 9dec4e3d9e..0c851c2ee2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -114,6 +114,9 @@ class SqlParser extends StandardTokenParsers {
protected val NULL = Keyword("NULL")
protected val ON = Keyword("ON")
protected val OR = Keyword("OR")
+ protected val LIKE = Keyword("LIKE")
+ protected val RLIKE = Keyword("RLIKE")
+ protected val REGEXP = Keyword("REGEXP")
protected val ORDER = Keyword("ORDER")
protected val OUTER = Keyword("OUTER")
protected val RIGHT = Keyword("RIGHT")
@@ -267,6 +270,9 @@ class SqlParser extends StandardTokenParsers {
termExpression ~ ">=" ~ termExpression ^^ { case e1 ~ _ ~ e2 => GreaterThanOrEqual(e1, e2) } |
termExpression ~ "!=" ~ termExpression ^^ { case e1 ~ _ ~ e2 => Not(Equals(e1, e2)) } |
termExpression ~ "<>" ~ termExpression ^^ { case e1 ~ _ ~ e2 => Not(Equals(e1, e2)) } |
+ termExpression ~ RLIKE ~ termExpression ^^ { case e1 ~ _ ~ e2 => RLike(e1, e2) } |
+ termExpression ~ REGEXP ~ termExpression ^^ { case e1 ~ _ ~ e2 => RLike(e1, e2) } |
+ termExpression ~ LIKE ~ termExpression ^^ { case e1 ~ _ ~ e2 => Like(e1, e2) } |
termExpression ~ IN ~ "(" ~ rep1sep(termExpression, ",") <~ ")" ^^ {
case e1 ~ _ ~ _ ~ e2 => In(e1, e2)
} |
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
index 67cddb351c..44abe671c0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
@@ -70,6 +70,9 @@ package object dsl {
def === (other: Expression) = Equals(expr, other)
def != (other: Expression) = Not(Equals(expr, other))
+ def like(other: Expression) = Like(expr, other)
+ def rlike(other: Expression) = RLike(expr, other)
+
def asc = SortOrder(expr, Ascending)
def desc = SortOrder(expr, Descending)
@@ -90,7 +93,10 @@ package object dsl {
implicit def symbolToUnresolvedAttribute(s: Symbol) = analysis.UnresolvedAttribute(s.name)
implicit class DslSymbol(sym: Symbol) extends ImplicitAttribute { def s = sym.name }
- implicit class DslString(val s: String) extends ImplicitAttribute
+ implicit class DslString(val s: String) extends ImplicitOperators {
+ def expr: Expression = Literal(s)
+ def attr = analysis.UnresolvedAttribute(s)
+ }
abstract class ImplicitAttribute extends ImplicitOperators {
def s: String
@@ -110,6 +116,8 @@ package object dsl {
// Protobuf terminology
def required = a.withNullability(false)
+
+ def at(ordinal: Int) = BoundReference(ordinal, a)
}
}
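A minimal sketch of the DSL sugar added above (illustrative, not part of the patch); it assumes the implicits of the dsl package object are in scope and mirrors the usage in the new test suite, where 'a.string.at(0) builds a BoundReference that is resolved against the input row at evaluation time:

  import org.apache.spark.sql.catalyst.dsl._
  import org.apache.spark.sql.catalyst.expressions.{GenericRow, Row}

  // String literals and symbols pick up like/rlike through ImplicitOperators.
  val likeExpr  = "abdef" like "abd%"      // Like(Literal("abdef"), Literal("abd%"))
  val rlikeExpr = "abbbbc" rlike "a.*c"    // RLike(Literal("abbbbc"), Literal("a.*c"))

  // 'a.string.at(0) binds the string attribute 'a to ordinal 0 of the input row,
  // so the pattern can come from data rather than from a literal.
  val dynamic = "abdef" like 'a.string.at(0)
  val row: Row = new GenericRow(Array[Any]("abd%"))
  dynamic.apply(row)                       // evaluates to true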
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index e195f2ac7e..42b7a9b125 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -17,11 +17,103 @@
package org.apache.spark.sql.catalyst.expressions
+import java.util.regex.Pattern
+
+import org.apache.spark.sql.catalyst.types.DataType
+import org.apache.spark.sql.catalyst.types.StringType
import org.apache.spark.sql.catalyst.types.BooleanType
+import org.apache.spark.sql.catalyst.trees.TreeNode
+import org.apache.spark.sql.catalyst.errors.`package`.TreeNodeException
+
+
+trait StringRegexExpression {
+ self: BinaryExpression =>
+
+ type EvaluatedType = Any
+
+ def escape(v: String): String
+ def matches(regex: Pattern, str: String): Boolean
+
+ def nullable: Boolean = true
+ def dataType: DataType = BooleanType
+
+ // try cache the pattern for Literal
+ private lazy val cache: Pattern = right match {
+ case x @ Literal(value: String, StringType) => compile(value)
+ case _ => null
+ }
+
+ protected def compile(str: String): Pattern = if(str == null) {
+ null
+ } else {
+ // Let it raise exception if couldn't compile the regex string
+ Pattern.compile(escape(str))
+ }
-case class Like(left: Expression, right: Expression) extends BinaryExpression {
- def dataType = BooleanType
- def nullable = left.nullable // Right cannot be null.
+ protected def pattern(str: String) = if(cache == null) compile(str) else cache
+
+ override def apply(input: Row): Any = {
+ val l = left.apply(input)
+ if(l == null) {
+ null
+ } else {
+ val r = right.apply(input)
+ if(r == null) {
+ null
+ } else {
+ val regex = pattern(r.asInstanceOf[String])
+ if(regex == null) {
+ null
+ } else {
+ matches(regex, l.asInstanceOf[String])
+ }
+ }
+ }
+ }
+}
+
+/**
+ * Simple RegEx pattern matching function
+ */
+case class Like(left: Expression, right: Expression)
+ extends BinaryExpression with StringRegexExpression {
+
def symbol = "LIKE"
+
+ // replace the _ with .{1} exactly match 1 time of any character
+ // replace the % with .*, match 0 or more times with any character
+ override def escape(v: String) = {
+ val sb = new StringBuilder()
+ var i = 0;
+ while (i < v.length) {
+ // Make a special case for "\\_" and "\\%"
+ val n = v.charAt(i);
+ if (n == '\\' && i + 1 < v.length && (v.charAt(i + 1) == '_' || v.charAt(i + 1) == '%')) {
+ sb.append(v.charAt(i + 1))
+ i += 1
+ } else {
+ if (n == '_') {
+ sb.append(".");
+ } else if (n == '%') {
+ sb.append(".*");
+ } else {
+ sb.append(Pattern.quote(Character.toString(n)));
+ }
+ }
+
+ i += 1
+ }
+
+ sb.toString()
+ }
+
+ override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).matches()
}
+case class RLike(left: Expression, right: Expression)
+ extends BinaryExpression with StringRegexExpression {
+
+ def symbol = "RLIKE"
+ override def escape(v: String): String = v
+ override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).find(0)
+}
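To make the escaping rule above concrete, here is a small standalone sketch (plain java.util.regex, independent of the patch) of what escape produces for a LIKE pattern and how the Like and RLike matching modes differ:

  import java.util.regex.Pattern

  // escape("a_%b") quotes ordinary characters and rewrites the wildcards
  // ('_' -> '.', '%' -> '.*'), yielding the Java regex \Qa\E..*\Qb\E.
  val likeRegex = Pattern.compile("\\Qa\\E..*\\Qb\\E")
  likeRegex.matcher("addb").matches()                 // true: Like requires a full match
  likeRegex.matcher("addbX").matches()                // false: trailing characters break it

  // RLike compiles the pattern verbatim and uses find(0), so a partial match suffices.
  Pattern.compile("bc").matcher("abc").matches()      // false under Like-style matching
  Pattern.compile("bc").matcher("abc").find(0)        // true under RLike-style matching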
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
index 94894adf81..52a205be3e 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
@@ -109,4 +109,87 @@ class ExpressionEvaluationSuite extends FunSuite {
}
}
}
+
+ def evaluate(expression: Expression, inputRow: Row = EmptyRow): Any = {
+ expression.apply(inputRow)
+ }
+
+ def checkEvaluation(expression: Expression, expected: Any, inputRow: Row = EmptyRow): Unit = {
+ val actual = try evaluate(expression, inputRow) catch {
+ case e: Exception => fail(s"Exception evaluating $expression", e)
+ }
+ if(actual != expected) {
+ val input = if(inputRow == EmptyRow) "" else s", input: $inputRow"
+ fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
+ }
+ }
+
+ test("LIKE literal Regular Expression") {
+ checkEvaluation(Literal(null, StringType).like("a"), null)
+ checkEvaluation(Literal(null, StringType).like(Literal(null, StringType)), null)
+ checkEvaluation("abdef" like "abdef", true)
+ checkEvaluation("a_%b" like "a\\__b", true)
+ checkEvaluation("addb" like "a_%b", true)
+ checkEvaluation("addb" like "a\\__b", false)
+ checkEvaluation("addb" like "a%\\%b", false)
+ checkEvaluation("a_%b" like "a%\\%b", true)
+ checkEvaluation("addb" like "a%", true)
+ checkEvaluation("addb" like "**", false)
+ checkEvaluation("abc" like "a%", true)
+ checkEvaluation("abc" like "b%", false)
+ checkEvaluation("abc" like "bc%", false)
+ }
+
+ test("LIKE Non-literal Regular Expression") {
+ val regEx = 'a.string.at(0)
+ checkEvaluation("abcd" like regEx, null, new GenericRow(Array[Any](null)))
+ checkEvaluation("abdef" like regEx, true, new GenericRow(Array[Any]("abdef")))
+ checkEvaluation("a_%b" like regEx, true, new GenericRow(Array[Any]("a\\__b")))
+ checkEvaluation("addb" like regEx, true, new GenericRow(Array[Any]("a_%b")))
+ checkEvaluation("addb" like regEx, false, new GenericRow(Array[Any]("a\\__b")))
+ checkEvaluation("addb" like regEx, false, new GenericRow(Array[Any]("a%\\%b")))
+ checkEvaluation("a_%b" like regEx, true, new GenericRow(Array[Any]("a%\\%b")))
+ checkEvaluation("addb" like regEx, true, new GenericRow(Array[Any]("a%")))
+ checkEvaluation("addb" like regEx, false, new GenericRow(Array[Any]("**")))
+ checkEvaluation("abc" like regEx, true, new GenericRow(Array[Any]("a%")))
+ checkEvaluation("abc" like regEx, false, new GenericRow(Array[Any]("b%")))
+ checkEvaluation("abc" like regEx, false, new GenericRow(Array[Any]("bc%")))
+ }
+
+ test("RLIKE literal Regular Expression") {
+ checkEvaluation("abdef" rlike "abdef", true)
+ checkEvaluation("abbbbc" rlike "a.*c", true)
+
+ checkEvaluation("fofo" rlike "^fo", true)
+ checkEvaluation("fo\no" rlike "^fo\no$", true)
+ checkEvaluation("Bn" rlike "^Ba*n", true)
+ checkEvaluation("afofo" rlike "fo", true)
+ checkEvaluation("afofo" rlike "^fo", false)
+ checkEvaluation("Baan" rlike "^Ba?n", false)
+ checkEvaluation("axe" rlike "pi|apa", false)
+ checkEvaluation("pip" rlike "^(pi)*$", false)
+
+ checkEvaluation("abc" rlike "^ab", true)
+ checkEvaluation("abc" rlike "^bc", false)
+ checkEvaluation("abc" rlike "^ab", true)
+ checkEvaluation("abc" rlike "^bc", false)
+
+ intercept[java.util.regex.PatternSyntaxException] {
+ evaluate("abbbbc" rlike "**")
+ }
+ }
+
+ test("RLIKE Non-literal Regular Expression") {
+ val regEx = 'a.string.at(0)
+ checkEvaluation("abdef" rlike regEx, true, new GenericRow(Array[Any]("abdef")))
+ checkEvaluation("abbbbc" rlike regEx, true, new GenericRow(Array[Any]("a.*c")))
+ checkEvaluation("fofo" rlike regEx, true, new GenericRow(Array[Any]("^fo")))
+ checkEvaluation("fo\no" rlike regEx, true, new GenericRow(Array[Any]("^fo\no$")))
+ checkEvaluation("Bn" rlike regEx, true, new GenericRow(Array[Any]("^Ba*n")))
+
+ intercept[java.util.regex.PatternSyntaxException] {
+ evaluate("abbbbc" rlike regEx, new GenericRow(Array[Any]("**")))
+ }
+ }
}
+
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index b70ec897e4..490a592a58 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -847,12 +847,9 @@ object HiveQl {
case Token(">=", left :: right:: Nil) => GreaterThanOrEqual(nodeToExpr(left), nodeToExpr(right))
case Token("<", left :: right:: Nil) => LessThan(nodeToExpr(left), nodeToExpr(right))
case Token("<=", left :: right:: Nil) => LessThanOrEqual(nodeToExpr(left), nodeToExpr(right))
- case Token("LIKE", left :: right:: Nil) =>
- UnresolvedFunction("LIKE", Seq(nodeToExpr(left), nodeToExpr(right)))
- case Token("RLIKE", left :: right:: Nil) =>
- UnresolvedFunction("RLIKE", Seq(nodeToExpr(left), nodeToExpr(right)))
- case Token("REGEXP", left :: right:: Nil) =>
- UnresolvedFunction("REGEXP", Seq(nodeToExpr(left), nodeToExpr(right)))
+ case Token("LIKE", left :: right:: Nil) => Like(nodeToExpr(left), nodeToExpr(right))
+ case Token("RLIKE", left :: right:: Nil) => RLike(nodeToExpr(left), nodeToExpr(right))
+ case Token("REGEXP", left :: right:: Nil) => RLike(nodeToExpr(left), nodeToExpr(right))
case Token("TOK_FUNCTION", Token("TOK_ISNOTNULL", Nil) :: child :: Nil) =>
IsNotNull(nodeToExpr(child))
case Token("TOK_FUNCTION", Token("TOK_ISNULL", Nil) :: child :: Nil) =>