aboutsummaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
authorgatorsmile <gatorsmile@gmail.com>2016-06-13 13:22:46 -0700
committerYin Huai <yhuai@databricks.com>2016-06-13 13:22:46 -0700
commit3b7fb84cf88bcae56713fd56396db537fa18f2e5 (patch)
treecac22b65ce993a8b44731a9c9e852292b1b9dc14 /sql
parenta6a18a4573515e76d78534f1a19fcc2c3819f6c5 (diff)
downloadspark-3b7fb84cf88bcae56713fd56396db537fa18f2e5.tar.gz
spark-3b7fb84cf88bcae56713fd56396db537fa18f2e5.tar.bz2
spark-3b7fb84cf88bcae56713fd56396db537fa18f2e5.zip
[SPARK-15676][SQL] Disallow Column Names as Partition Columns For Hive Tables
#### What changes were proposed in this pull request? When creating a Hive Table (not data source tables), a common error users might make is to specify an existing column name as a partition column. Below is what Hive returns in this case: ``` hive> CREATE TABLE partitioned (id bigint, data string) PARTITIONED BY (data string, part string); FAILED: SemanticException [Error 10035]: Column repeated in partitioning columns ``` Currently, the error we issued is very confusing: ``` org.apache.spark.sql.AnalysisException: org.apache.hadoop.hive.ql.metadata.HiveException: MetaException(message:For direct MetaStore DB connections, we don't support retries at the client level.); ``` This PR is to fix the above issue by capturing the usage error in `Parser`. #### How was this patch tested? Added a test case to `DDLCommandSuite` Author: gatorsmile <gatorsmile@gmail.com> Closes #13415 from gatorsmile/partitionColumnsInTableSchema.
Diffstat (limited to 'sql')
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala17
-rw-r--r--sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala14
2 files changed, 31 insertions, 0 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
index 06d8f15dc3..a0508ad601 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
@@ -903,6 +903,23 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder {
val properties = Option(ctx.tablePropertyList).map(visitPropertyKeyValues).getOrElse(Map.empty)
val selectQuery = Option(ctx.query).map(plan)
+ // Ensure that no duplicate name is used in the table definition
+ val colNames = cols.map(_.name)
+ if (colNames.length != colNames.distinct.length) {
+ val duplicateColumns = colNames.groupBy(identity).collect {
+ case (x, ys) if ys.length > 1 => "\"" + x + "\""
+ }
+ throw operationNotAllowed(s"Duplicated column names found in table definition of $name: " +
+ duplicateColumns.mkString("[", ",", "]"), ctx)
+ }
+
+ // For Hive tables, partition columns must not be part of the schema
+ val badPartCols = partitionCols.map(_.name).toSet.intersect(colNames.toSet)
+ if (badPartCols.nonEmpty) {
+ throw operationNotAllowed(s"Partition columns may not be specified in the schema: " +
+ badPartCols.map("\"" + _ + "\"").mkString("[", ",", "]"), ctx)
+ }
+
// Note: Hive requires partition columns to be distinct from the schema, so we need
// to include the partition columns here explicitly
val schema = cols ++ partitionCols
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala
index aec7e99d9d..5bee28b446 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala
@@ -334,6 +334,20 @@ class DDLCommandSuite extends PlanTest {
assert(ct.table.storage.locationUri == Some("/something/anything"))
}
+ test("create table - column repeated in partitioning columns") {
+ val query = "CREATE TABLE tab1 (key INT, value STRING) PARTITIONED BY (key INT, hr STRING)"
+ val e = intercept[ParseException] { parser.parsePlan(query) }
+ assert(e.getMessage.contains(
+ "Operation not allowed: Partition columns may not be specified in the schema: [\"key\"]"))
+ }
+
+ test("create table - duplicate column names in the table definition") {
+ val query = "CREATE TABLE default.tab1 (key INT, key STRING)"
+ val e = intercept[ParseException] { parser.parsePlan(query) }
+ assert(e.getMessage.contains("Operation not allowed: Duplicated column names found in " +
+ "table definition of `default`.`tab1`: [\"key\"]"))
+ }
+
test("create table using - with partitioned by") {
val query = "CREATE TABLE my_tab(a INT, b STRING) USING parquet PARTITIONED BY (a)"
val expected = CreateTableUsing(