author     Wenchen Fan <cloud0fan@163.com>    2014-09-10 12:56:59 -0700
committer  Michael Armbrust <michael@databricks.com>    2014-09-10 12:56:59 -0700
commit     e4f4886d7148bf48f9e3462b83bfb1ecc7edbe31 (patch)
tree       ef2eb99181260a633e898c6476475b26086e0c80 /sql/catalyst/src/main/scala/org/apache
parent     1f4a648d4e30e837d6cf3ea8de1808e2254ad70b (diff)
[SPARK-2096][SQL] Correctly parse dot notations
First, let me write down the current `projections` grammar of Spark SQL:

    expression           : orExpression
    orExpression         : andExpression {"or" andExpression}
    andExpression        : comparisonExpression {"and" comparisonExpression}
    comparisonExpression : termExpression | termExpression "=" termExpression | termExpression ">" termExpression | ...
    termExpression       : productExpression {"+" | "-" productExpression}
    productExpression    : baseExpression {"*" | "/" | "%" baseExpression}
    baseExpression       : expression "[" expression "]" | ... | ident | ...
    ident                : identChar {identChar | digit} | delimiters | ...
    identChar            : letter | "_" | "."
    delimiters           : "," | ";" | "(" | ")" | "[" | "]" | ...
    projection           : expression [["AS"] ident]
    projections          : projection {"," projection}

Something like `a.b.c[1]` parses fine under this grammar: since `identChar` includes `.`, the whole `a.b.c` is lexed as a single identifier, and `[1]` becomes an item access on it. But something like `a[1].b` cannot be parsed correctly.

A simple solution is written in `ParquetQuerySuite#NestedSqlParser`; the changed rules are:

    delimiters     : "." | "," | ";" | "(" | ")" | "[" | "]" | ...
    identChar      : letter | "_"
    baseExpression : expression "[" expression "]" | expression "." ident | ... | ident | ...

This works well, but it cannot cover corner cases like `select t.a.b from table as t`: with this grammar, `t.a.b` is parsed as `GetField(GetField(UnResolved("t"), "a"), "b")` instead of `GetField(UnResolved("t.a"), "b")`. However, we cannot resolve `t`, as it is not a field but the whole table. (If we could, then `select t from table as t` would be legal, which is not what we want.)

My solution is:

    dotExpressionHeader : ident "." ident
    baseExpression      : expression "[" expression "]" | expression "." ident | ... | dotExpressionHeader | ident | ...

I passed all test cases under sql locally and added a more complex one. "arrayOfStruct.field1 to access all values of field1" is not supported yet; since this PR has already changed a lot of code, I will open another PR for it. I am not familiar with the later optimizer phase, so please correct me if I missed something.

Author: Wenchen Fan <cloud0fan@163.com>
Author: Michael Armbrust <michael@databricks.com>

Closes #2230 from cloud-fan/dot and squashes the following commits:

e1a8898 [Wenchen Fan] remove support for arbitrary nested arrays
ee8a724 [Wenchen Fan] rollback LogicalPlan, support dot operation on nested array type
a58df40 [Michael Armbrust] add regression test for doubly nested data
16bc4c6 [Wenchen Fan] some enhance
95d733f [Wenchen Fan] split long line
dc31698 [Wenchen Fan] SPARK-2096 Correctly parse dot notations
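To make the intended behavior concrete, here is a minimal standalone sketch using Scala's parser combinators (it assumes the scala-parser-combinators library; the `Expr` classes are simplified stand-ins for Catalyst's expressions, and the grammar is reduced to just the rules discussed above, so this is an illustration, not the Spark parser). It keeps only the two-identifier `dotExpressionHeader` form from the grammar above; the committed parser in the diff below additionally absorbs further `"." ident` parts into the attribute name.

    import scala.util.parsing.combinator.RegexParsers

    // Simplified stand-ins for Catalyst's expression classes.
    sealed trait Expr
    case class UnresolvedAttribute(name: String) extends Expr
    case class GetField(child: Expr, fieldName: String) extends Expr
    case class GetItem(child: Expr, ordinal: Int) extends Expr

    object DotNotationSketch extends RegexParsers {
      private val ident = "[a-zA-Z_][a-zA-Z0-9_]*".r

      // dotExpressionHeader: ident "." ident -- keeps "t.a" as one attribute name,
      // so resolution can later decide whether "t" is a table or a column.
      private val dotExpressionHeader: Parser[Expr] =
        (ident <~ ".") ~ ident ^^ { case i1 ~ i2 => UnresolvedAttribute(i1 + "." + i2) }

      private val base: Parser[Expr] =
        dotExpressionHeader | ident ^^ { name => UnresolvedAttribute(name) }

      // Suffixes ".field" and "[ordinal]" are applied left to right on the base.
      private val suffix: Parser[Expr => Expr] =
        "." ~> ident ^^ { f => (e: Expr) => GetField(e, f) } |
        "[" ~> "[0-9]+".r <~ "]" ^^ { n => (e: Expr) => GetItem(e, n.toInt) }

      val expr: Parser[Expr] = base ~ rep(suffix) ^^ {
        case b ~ suffixes => suffixes.foldLeft(b)((e, f) => f(e))
      }

      def main(args: Array[String]): Unit = {
        println(parseAll(expr, "t.a.b"))  // parsed: GetField(UnresolvedAttribute(t.a),b)
        println(parseAll(expr, "a[1].b")) // parsed: GetField(GetItem(UnresolvedAttribute(a),1),b)
      }
    }

The key point is that only the leading `ident "." ident` is glued into one attribute name; every later `.field` or `[ordinal]` becomes an explicit node, which is exactly what `select t.a.b from table as t` and `a[1].b` need.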
Diffstat (limited to 'sql/catalyst/src/main/scala/org/apache')
-rwxr-xr-x  sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala                     13
-rw-r--r--  sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala      6
2 files changed, 12 insertions, 7 deletions
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index a04b4a938d..ca69531c69 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -357,16 +357,25 @@ class SqlParser extends StandardTokenParsers with PackratParsers {
expression ~ "[" ~ expression <~ "]" ^^ {
case base ~ _ ~ ordinal => GetItem(base, ordinal)
} |
+ (expression <~ ".") ~ ident ^^ {
+ case base ~ fieldName => GetField(base, fieldName)
+ } |
TRUE ^^^ Literal(true, BooleanType) |
FALSE ^^^ Literal(false, BooleanType) |
cast |
"(" ~> expression <~ ")" |
function |
"-" ~> literal ^^ UnaryMinus |
+ dotExpressionHeader |
ident ^^ UnresolvedAttribute |
"*" ^^^ Star(None) |
literal
+ protected lazy val dotExpressionHeader: Parser[Expression] =
+ (ident <~ ".") ~ ident ~ rep("." ~> ident) ^^ {
+ case i1 ~ i2 ~ rest => UnresolvedAttribute(i1 + "." + i2 + rest.mkString(".", ".", ""))
+ }
+
protected lazy val dataType: Parser[DataType] =
STRING ^^^ StringType | TIMESTAMP ^^^ TimestampType
}
@@ -380,7 +389,7 @@ class SqlLexical(val keywords: Seq[String]) extends StdLexical {
delimiters += (
"@", "*", "+", "-", "<", "=", "<>", "!=", "<=", ">=", ">", "/", "(", ")",
- ",", ";", "%", "{", "}", ":", "[", "]"
+ ",", ";", "%", "{", "}", ":", "[", "]", "."
)
override lazy val token: Parser[Token] = (
@@ -401,7 +410,7 @@ class SqlLexical(val keywords: Seq[String]) extends StdLexical {
| failure("illegal character")
)
- override def identChar = letter | elem('_') | elem('.')
+ override def identChar = letter | elem('_')
override def whitespace: Parser[Any] = rep(
whitespaceChar
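As a side note on the `dotExpressionHeader` action added in the hunk above: its semantic action simply re-joins the matched identifiers with dots. A plain-Scala illustration with made-up values:

    // Suppose the parser matched `a.b.c.d`: i1 = "a", i2 = "b", rest = List("c", "d").
    val i1 = "a"
    val i2 = "b"
    val rest = List("c", "d")
    // rest.mkString(".", ".", "") prefixes a dot and joins the remaining parts: ".c.d"
    val name = i1 + "." + i2 + rest.mkString(".", ".", "")
    // name == "a.b.c.d", which is wrapped as UnresolvedAttribute("a.b.c.d")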
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
index f81d911194..bae491f07c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -104,11 +104,7 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] {
case Seq((a, Nil)) => Some(a) // One match, no nested fields, use it.
// One match, but we also need to extract the requested nested field.
case Seq((a, nestedFields)) =>
- a.dataType match {
- case StructType(fields) =>
- Some(Alias(nestedFields.foldLeft(a: Expression)(GetField), nestedFields.last)())
- case _ => None // Don't know how to resolve these field references
- }
+ Some(Alias(nestedFields.foldLeft(a: Expression)(GetField), nestedFields.last)())
case Seq() => None // No matches.
case ambiguousReferences =>
throw new TreeNodeException(
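For context on this last hunk: with the StructType check removed, the matched nested field names are always folded into GetField expressions, which lines up with the "rollback LogicalPlan, support dot operation on nested array type" commit above. A minimal sketch of that foldLeft shape, using simplified stand-in types rather than Catalyst's:

    // Simplified stand-ins; Catalyst's Expression/GetField/Alias carry much more.
    sealed trait Expression
    case class AttributeStub(name: String) extends Expression
    case class GetField(child: Expression, fieldName: String) extends Expression

    object ResolveNestedFieldSketch {
      def main(args: Array[String]): Unit = {
        val a: Expression = AttributeStub("arrayOfStruct")
        val nestedFields = Seq("field1")
        // Same shape as the diff: nestedFields.foldLeft(a: Expression)(GetField)
        val resolved = nestedFields.foldLeft(a)(GetField)
        println(resolved) // GetField(AttributeStub(arrayOfStruct),field1)
      }
    }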