aboutsummaryrefslogtreecommitdiff
path: root/sql/hive/src/test/scala/org/apache
diff options
context:
space:
mode:
authorWenchen Fan <wenchen@databricks.com>2017-02-09 00:39:22 -0500
committergatorsmile <gatorsmile@gmail.com>2017-02-09 00:39:22 -0500
commit50a991264c16e4c4126e88668ef4fbd048c782b8 (patch)
tree5844d588e8b85bc792e7d9257b6fe15fa2b853c9 /sql/hive/src/test/scala/org/apache
parent64cae22f7cbba793e32d2c8ccb4b7981208070fd (diff)
downloadspark-50a991264c16e4c4126e88668ef4fbd048c782b8.tar.gz
spark-50a991264c16e4c4126e88668ef4fbd048c782b8.tar.bz2
spark-50a991264c16e4c4126e88668ef4fbd048c782b8.zip
[SPARK-19359][SQL] renaming partition should not leave useless directories
## What changes were proposed in this pull request? The Hive metastore is not case-preserving and keeps partition columns with lower-case names. If Spark SQL creates a table with upper-case partition column names using `HiveExternalCatalog`, when we rename a partition, it first calls the HiveClient to renamePartition, which will create a new lower-case partition path; then Spark SQL renames the lower-case path to upper-case. However, when we rename a nested path, different file systems have different behaviors. e.g., on Jenkins, renaming `a=1/b=2` to `A=2/B=2` will succeed, but leaves an empty directory `a=1`. On macOS, the renaming doesn't work as expected and results in `a=1/B=2`. This PR renames the partition directory recursively from the first partition column in `HiveExternalCatalog`, to be most compatible with different file systems. ## How was this patch tested? new regression test Author: Wenchen Fan <wenchen@databricks.com> Closes #16837 from cloud-fan/partition.
Diffstat (limited to 'sql/hive/src/test/scala/org/apache')
-rw-r--r--sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala29
1 files changed, 29 insertions, 0 deletions
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
index dca207a72d..96385961c9 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionProviderCompatibilitySuite.scala
@@ -19,8 +19,11 @@ package org.apache.spark.sql.hive
import java.io.File
+import org.apache.hadoop.fs.Path
+
import org.apache.spark.metrics.source.HiveCatalogMetrics
import org.apache.spark.sql.{AnalysisException, QueryTest}
+import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SQLTestUtils
@@ -481,4 +484,30 @@ class PartitionProviderCompatibilitySuite
assert(spark.sql("show partitions test").count() == 5)
}
}
+
+ test("SPARK-19359: renaming partition should not leave useless directories") {
+ withTable("t", "t1") {
+ Seq((1, 2, 3)).toDF("id", "A", "B").write.partitionBy("A", "B").saveAsTable("t")
+ spark.sql("alter table t partition(A=2, B=3) rename to partition(A=4, B=5)")
+
+ var table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
+ var tablePath = new Path(table.location)
+ val fs = tablePath.getFileSystem(spark.sessionState.newHadoopConf())
+ // the `A=2` directory is still there, we follow this behavior from hive.
+ assert(fs.listStatus(tablePath)
+ .filterNot(_.getPath.toString.contains("A=2")).count(_.isDirectory) == 1)
+ assert(fs.listStatus(new Path(tablePath, "A=4")).count(_.isDirectory) == 1)
+
+
+ Seq((1, 2, 3, 4)).toDF("id", "A", "b", "C").write.partitionBy("A", "b", "C").saveAsTable("t1")
+ spark.sql("alter table t1 partition(A=2, b=3, C=4) rename to partition(A=4, b=5, C=6)")
+ table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t1"))
+ tablePath = new Path(table.location)
+ // the `A=2` directory is still there, we follow this behavior from hive.
+ assert(fs.listStatus(tablePath)
+ .filterNot(_.getPath.toString.contains("A=2")).count(_.isDirectory) == 1)
+ assert(fs.listStatus(new Path(tablePath, "A=4")).count(_.isDirectory) == 1)
+ assert(fs.listStatus(new Path(new Path(tablePath, "A=4"), "b=5")).count(_.isDirectory) == 1)
+ }
+ }
}