From e4f4886d7148bf48f9e3462b83bfb1ecc7edbe31 Mon Sep 17 00:00:00 2001
From: Wenchen Fan <cloud0fan@163.com>
Date: Wed, 10 Sep 2014 12:56:59 -0700
Subject: [SPARK-2096][SQL] Correctly parse dot notations

First let me write down the current `projections` grammar of spark sql:

    expression                : orExpression
    orExpression              : andExpression {"or" andExpression}
    andExpression             : comparisonExpression {"and" comparisonExpression}
    comparisonExpression      : termExpression | termExpression "=" termExpression | termExpression ">" termExpression | ...
    termExpression            : productExpression {"+"|"-" productExpression}
    productExpression         : baseExpression {"*"|"/"|"%" baseExpression}
    baseExpression            : expression "[" expression "]" | ... | ident | ...
    ident                     : identChar {identChar | digit} | delimiters | ...
    identChar                 : letter | "_" | "."
    delimiters                : "," | ";" | "(" | ")" | "[" | "]" | ...
    projection                : expression [["AS"] ident]
    projections               : projection { "," projection}

For something like `a.b.c[1]`, it will be parsed as:
<img src="http://img51.imgspice.com/i/03008/4iltjsnqgmtt_t.jpg" border=0>
But for something like `a[1].b`, the current grammar can't parse it correctly.
A simple solution is written in `ParquetQuerySuite#NestedSqlParser`, changed grammars are:

    delimiters                : "." | "," | ";" | "(" | ")" | "[" | "]" | ...
    identChar                 : letter | "_"
    baseExpression            : expression "[" expression "]" | expression "." ident | ... | ident | ...
This works well, but can't cover some corner case like `select t.a.b from table as t`:
<img src="http://img51.imgspice.com/i/03008/v2iau3hoxoxg_t.jpg" border=0>
`t.a.b` parsed as `GetField(GetField(UnResolved("t"), "a"), "b")` instead of `GetField(UnResolved("t.a"), "b")` using this new grammar.
However, we can't resolve `t` as it's not a filed, but the whole table.(if we could do this, then `select t from table as t` is legal, which is unexpected)
My solution is:

    dotExpressionHeader       : ident "." ident
    baseExpression            : expression "[" expression "]" | expression "." ident | ... | dotExpressionHeader  | ident | ...
I passed all test cases under sql locally and add a more complex case.
"arrayOfStruct.field1 to access all values of field1" is not supported yet. Since this PR has changed a lot of code, I will open another PR for it.
I'm not familiar with the latter optimize phase, please correct me if I missed something.

Author: Wenchen Fan <cloud0fan@163.com>
Author: Michael Armbrust <michael@databricks.com>

Closes #2230 from cloud-fan/dot and squashes the following commits:

e1a8898 [Wenchen Fan] remove support for arbitrary nested arrays
ee8a724 [Wenchen Fan] rollback LogicalPlan, support dot operation on nested array type
a58df40 [Michael Armbrust] add regression test for doubly nested data
16bc4c6 [Wenchen Fan] some enhance
95d733f [Wenchen Fan] split long line
dc31698 [Wenchen Fan] SPARK-2096 Correctly parse dot notations
---
 .../scala/org/apache/spark/sql/catalyst/SqlParser.scala     | 13 +++++++++++--
 .../spark/sql/catalyst/plans/logical/LogicalPlan.scala      |  6 +-----
 2 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'sql/catalyst/src/main/scala/org/apache')
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index a04b4a938d..ca69531c69 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -357,16 +357,25 @@ class SqlParser extends StandardTokenParsers with PackratParsers {
     expression ~ "[" ~ expression <~ "]" ^^ {
       case base ~ _ ~ ordinal => GetItem(base, ordinal)
     } |
+    (expression <~ ".") ~ ident ^^ {
+      case base ~ fieldName => GetField(base, fieldName)
+    } |
     TRUE ^^^ Literal(true, BooleanType) |
     FALSE ^^^ Literal(false, BooleanType) |
     cast |
     "(" ~> expression <~ ")" |
     function |
     "-" ~> literal ^^ UnaryMinus |
+    dotExpressionHeader |
     ident ^^ UnresolvedAttribute |
     "*" ^^^ Star(None) |
     literal
 
+  protected lazy val dotExpressionHeader: Parser[Expression] =
+    (ident <~ ".") ~ ident ~ rep("." ~> ident) ^^ {
+      case i1 ~ i2 ~ rest => UnresolvedAttribute(i1 + "." + i2 + rest.mkString(".", ".", ""))
+    }
+
   protected lazy val dataType: Parser[DataType] =
     STRING ^^^ StringType | TIMESTAMP ^^^ TimestampType
 }
@@ -380,7 +389,7 @@ class SqlLexical(val keywords: Seq[String]) extends StdLexical {
 
   delimiters += (
       "@", "*", "+", "-", "<", "=", "<>", "!=", "<=", ">=", ">", "/", "(", ")",
-      ",", ";", "%", "{", "}", ":", "[", "]"
+      ",", ";", "%", "{", "}", ":", "[", "]", "."
   )
 
   override lazy val token: Parser[Token] = (
@@ -401,7 +410,7 @@ class SqlLexical(val keywords: Seq[String]) extends StdLexical {
       | failure("illegal character")
     )
 
-  override def identChar = letter | elem('_') | elem('.')
+  override def identChar = letter | elem('_')
 
   override def whitespace: Parser[Any] = rep(
     whitespaceChar
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
index f81d911194..bae491f07c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -104,11 +104,7 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] {
       case Seq((a, Nil)) => Some(a) // One match, no nested fields, use it.
       // One match, but we also need to extract the requested nested field.
       case Seq((a, nestedFields)) =>
-        a.dataType match {
-          case StructType(fields) =>
-            Some(Alias(nestedFields.foldLeft(a: Expression)(GetField), nestedFields.last)())
-          case _ => None // Don't know how to resolve these field references
-        }
+        Some(Alias(nestedFields.foldLeft(a: Expression)(GetField), nestedFields.last)())
       case Seq() => None         // No matches.
       case ambiguousReferences =>
         throw new TreeNodeException(
-- 
cgit v1.2.3