From 71296c041e59159bd7c5836cf652c02843974077 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 3 May 2016 23:20:18 +0200 Subject: [SPARK-15056][SQL] Parse Unsupported Sampling Syntax and Issue Better Exceptions #### What changes were proposed in this pull request? Compared with the current Spark parser, there are two extra syntax are supported in Hive for sampling - In `On` clauses, `rand()` is used for indicating sampling on the entire row instead of an individual column. For example, ```SQL SELECT * FROM source TABLESAMPLE(BUCKET 3 OUT OF 32 ON rand()) s; ``` - Users can specify the total length to be read. For example, ```SQL SELECT * FROM source TABLESAMPLE(100M) s; ``` Below is the link for references: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Sampling This PR is to parse and capture these two extra syntax, and issue a better error message. #### How was this patch tested? Added test cases to verify the thrown exceptions Author: gatorsmile Closes #12838 from gatorsmile/bucketOnRand. --- .../antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 | 7 ++++++- .../org/apache/spark/sql/catalyst/parser/AstBuilder.scala | 12 +++++++++++- .../apache/spark/sql/catalyst/parser/PlanParserSuite.scala | 6 +++++- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index cc4e5c853e..3ab448dd9e 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -389,7 +389,8 @@ sample : TABLESAMPLE '(' ( (percentage=(INTEGER_VALUE | DECIMAL_VALUE) sampleType=PERCENTLIT) | (expression sampleType=ROWS) - | (sampleType=BUCKET numerator=INTEGER_VALUE OUT OF denominator=INTEGER_VALUE (ON identifier)?)) + | sampleType=BYTELENGTH_LITERAL + | (sampleType=BUCKET numerator=INTEGER_VALUE OUT OF denominator=INTEGER_VALUE (ON (identifier | qualifiedName '(' ')'))?)) ')' ; @@ -895,6 +896,10 @@ TINYINT_LITERAL : DIGIT+ 'Y' ; +BYTELENGTH_LITERAL + : DIGIT+ ('B' | 'K' | 'M' | 'G') + ; + INTEGER_VALUE : DIGIT+ ; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index c3974625aa..1d4e1ec3b8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -632,8 +632,18 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { val fraction = ctx.percentage.getText.toDouble sample(fraction / 100.0d) + case SqlBaseParser.BYTELENGTH_LITERAL => + throw new ParseException( + "TABLESAMPLE(byteLengthLiteral) is not supported", ctx) + case SqlBaseParser.BUCKET if ctx.ON != null => - throw new ParseException("TABLESAMPLE(BUCKET x OUT OF y ON id) is not supported", ctx) + if (ctx.identifier != null) { + throw new ParseException( + "TABLESAMPLE(BUCKET x OUT OF y ON colname) is not supported", ctx) + } else { + throw new ParseException( + "TABLESAMPLE(BUCKET x OUT OF y ON function) is not supported", ctx) + } case SqlBaseParser.BUCKET => sample(ctx.numerator.getText.toDouble / ctx.denominator.getText.toDouble) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala index b7af2ceda6..aaf84268af 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala @@ -372,9 +372,13 @@ class PlanParserSuite extends PlanTest { assertEqual(s"$sql tablesample(bucket 4 out of 10) as x", Sample(0, .4d, withReplacement = false, 10L, table("t").as("x"))(true).select(star())) intercept(s"$sql tablesample(bucket 4 out of 10 on x) as x", - "TABLESAMPLE(BUCKET x OUT OF y ON id) is not supported") + "TABLESAMPLE(BUCKET x OUT OF y ON colname) is not supported") intercept(s"$sql tablesample(bucket 11 out of 10) as x", s"Sampling fraction (${11.0/10.0}) must be on interval [0, 1]") + intercept("SELECT * FROM parquet_t0 TABLESAMPLE(300M) s", + "TABLESAMPLE(byteLengthLiteral) is not supported") + intercept("SELECT * FROM parquet_t0 TABLESAMPLE(BUCKET 3 OUT OF 32 ON rand()) s", + "TABLESAMPLE(BUCKET x OUT OF y ON function) is not supported") } test("sub-query") { -- cgit v1.2.3