From e14817b528ccab4b4685b45a95e2325630b5fc53 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 19 Jan 2016 10:44:51 -0800 Subject: [SPARK-12870][SQL] better format bucket id in file name for normal parquet file without bucket, it's file name ends with a jobUUID which maybe all numbers and mistakeny regarded as bucket id. This PR improves the format of bucket id in file name by using a different seperator, `_`, so that the regex is more robust. Author: Wenchen Fan Closes #10799 from cloud-fan/fix-bucket. --- .../apache/spark/sql/execution/datasources/bucket.scala | 14 ++++++++++---- .../sql/execution/datasources/json/JSONRelation.scala | 2 +- .../execution/datasources/parquet/ParquetRelation.scala | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) (limited to 'sql/core') diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/bucket.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/bucket.scala index c7ecd6125d..3e0d484b74 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/bucket.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/bucket.scala @@ -57,15 +57,21 @@ private[sql] abstract class BucketedOutputWriterFactory extends OutputWriterFact private[sql] object BucketingUtils { // The file name of bucketed data should have 3 parts: - // 1. some other information in the head of file name, ends with `-` - // 2. bucket id part, some numbers + // 1. some other information in the head of file name + // 2. bucket id part, some numbers, starts with "_" + // * The other-information part may use `-` as separator and may have numbers at the end, + // e.g. a normal parquet file without bucketing may have name: + // part-r-00000-2dd664f9-d2c4-4ffe-878f-431234567891.gz.parquet, and we will mistakenly + // treat `431234567891` as bucket id. So here we pick `_` as separator. // 3. optional file extension part, in the tail of file name, starts with `.` // An example of bucketed parquet file name with bucket id 3: - // part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb-00003.gz.parquet - private val bucketedFileName = """.*-(\d+)(?:\..*)?$""".r + // part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet + private val bucketedFileName = """.*_(\d+)(?:\..*)?$""".r def getBucketId(fileName: String): Option[Int] = fileName match { case bucketedFileName(bucketId) => Some(bucketId.toInt) case other => None } + + def bucketIdToString(id: Int): String = f"_$id%05d" } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala index 20c60b9c43..31c5620c9a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONRelation.scala @@ -193,7 +193,7 @@ private[json] class JsonOutputWriter( val uniqueWriteJobId = configuration.get("spark.sql.sources.writeJobUUID") val taskAttemptId = context.getTaskAttemptID val split = taskAttemptId.getTaskID.getId - val bucketString = bucketId.map(id => f"-$id%05d").getOrElse("") + val bucketString = bucketId.map(BucketingUtils.bucketIdToString).getOrElse("") new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$bucketString$extension") } }.getRecordWriter(context) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala index 30ddec686c..b460ec1d26 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala @@ -90,7 +90,7 @@ private[sql] class ParquetOutputWriter( val uniqueWriteJobId = configuration.get("spark.sql.sources.writeJobUUID") val taskAttemptId = context.getTaskAttemptID val split = taskAttemptId.getTaskID.getId - val bucketString = bucketId.map(id => f"-$id%05d").getOrElse("") + val bucketString = bucketId.map(BucketingUtils.bucketIdToString).getOrElse("") new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$bucketString$extension") } } -- cgit v1.2.3