author    gatorsmile <gatorsmile@gmail.com>    2016-01-25 13:38:09 -0800
committer Michael Armbrust <michael@databricks.com>    2016-01-25 13:38:09 -0800
commit    9348431da212ec3ab7be2b8e89a952a48b4e2a31 (patch)
tree      73081cab9e62beaf17ca98783556e7b20eae6470 /sql/core
parent    00026fa9912ecee5637f1e7dd222f977f31f6766 (diff)
[SPARK-12975][SQL] Throwing Exception when Bucketing Columns are part of Partitioning Columns
When users call `partitionBy` and `bucketBy` at the same time, some bucketing columns might also be partitioning columns. For example:

```
df.write
  .format(source)
  .partitionBy("i")
  .bucketBy(8, "i", "k")
  .saveAsTable("bucketed_table")
```

In the above case, adding column `i` to `bucketBy` is useless: it only wastes extra CPU when reading or writing bucketed tables. Thus, like Hive, we should throw an exception and let users make the change. Also added a test case checking that the `sortBy` and `bucketBy` column information is correctly saved in the metastore table. Could you check if my understanding is correct? cloud-fan rxin marmbrus Thanks!

Author: gatorsmile <gatorsmile@gmail.com>

Closes #10891 from gatorsmile/commonKeysInPartitionByBucketBy.
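A minimal sketch of the behavior after this patch. The session bootstrap, the `parquet` source, and the example DataFrame below are illustrative assumptions, not part of this commit:

```scala
import org.apache.spark.sql.SparkSession

// Illustrative local session; any session that can write metastore tables works.
val spark = SparkSession.builder()
  .master("local[*]")
  .appName("SPARK-12975-demo")
  .enableHiveSupport()
  .getOrCreate()
import spark.implicits._

val df = Seq((1, 2, 3)).toDF("i", "j", "k")

// With this patch, overlapping columns now fail fast with an AnalysisException:
//   df.write.format("parquet").partitionBy("i").bucketBy(8, "i", "k")
//     .saveAsTable("bucketed_table")

// Dropping the overlap gives the same physical layout without the wasted bucketing work:
df.write.format("parquet").partitionBy("i").bucketBy(8, "j", "k")
  .saveAsTable("bucketed_table")
```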
Diffstat (limited to 'sql/core')
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala | 9
1 file changed, 9 insertions, 0 deletions
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index ab63fe4aa8..12eb239363 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -240,6 +240,15 @@ final class DataFrameWriter private[sql](df: DataFrame) {
       n <- numBuckets
     } yield {
       require(n > 0 && n < 100000, "Bucket number must be greater than 0 and less than 100000.")
+
+      // partitionBy columns cannot be used in bucketBy
+      if (normalizedParCols.nonEmpty &&
+        normalizedBucketColNames.get.toSet.intersect(normalizedParCols.get.toSet).nonEmpty) {
+        throw new AnalysisException(
+          s"bucketBy columns '${bucketColumnNames.get.mkString(", ")}' should not be part of " +
+          s"partitionBy columns '${partitioningColumns.get.mkString(", ")}'")
+      }
+
       BucketSpec(n, normalizedBucketColNames.get, normalizedSortColNames.getOrElse(Nil))
     }
   }
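For reference, the guard added above boils down to a set intersection over the normalized column names. A standalone sketch in plain Scala, with hypothetical literal values standing in for the writer's internal `Option` fields:

```scala
// Hypothetical stand-ins for the writer's normalized column name fields.
val normalizedParCols: Option[Seq[String]] = Some(Seq("i"))
val normalizedBucketColNames: Option[Seq[String]] = Some(Seq("i", "k"))

// Same intersection test as the patch performs.
val overlap = normalizedBucketColNames.get.toSet
  .intersect(normalizedParCols.get.toSet)

// "i" appears in both lists, so the new guard fires and the writer
// throws an AnalysisException instead of silently wasting work.
assert(overlap == Set("i"))
```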