aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShixiong Zhu <shixiong@databricks.com>2016-12-07 10:30:05 -0800
committerShixiong Zhu <shixiong@databricks.com>2016-12-07 10:30:05 -0800
commitdbf3e298a1a35c0243f087814ddf88034ff96d66 (patch)
tree4ce7a9b30a94c549633e863230a71fda18f7a35f
parentf1fca81b165c5a673f7d86b268e04ea42a6c267e (diff)
downloadspark-dbf3e298a1a35c0243f087814ddf88034ff96d66.tar.gz
spark-dbf3e298a1a35c0243f087814ddf88034ff96d66.tar.bz2
spark-dbf3e298a1a35c0243f087814ddf88034ff96d66.zip
[SPARK-18764][CORE] Add a warning log when skipping a corrupted file
## What changes were proposed in this pull request? It's better to add a warning log when skipping a corrupted file. It will be helpful when we want to finish the job first, then find them in the log and fix these files. ## How was this patch tested? Jenkins Author: Shixiong Zhu <shixiong@databricks.com> Closes #16192 from zsxwing/SPARK-18764.
-rw-r--r--core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala4
-rw-r--r--core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala6
-rw-r--r--sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala1
3 files changed, 9 insertions, 2 deletions
diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
index e3d81a6be5..6e87233cd9 100644
--- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
@@ -259,7 +259,9 @@ class HadoopRDD[K, V](
try {
finished = !reader.next(key, value)
} catch {
- case e: IOException if ignoreCorruptFiles => finished = true
+ case e: IOException if ignoreCorruptFiles =>
+ logWarning(s"Skipped the rest content in the corrupted file: ${split.inputSplit}", e)
+ finished = true
}
if (!finished) {
inputMetrics.incRecordsRead(1)
diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
index e90e84c459..e805192bb6 100644
--- a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
@@ -189,7 +189,11 @@ class NewHadoopRDD[K, V](
try {
finished = !reader.nextKeyValue
} catch {
- case e: IOException if ignoreCorruptFiles => finished = true
+ case e: IOException if ignoreCorruptFiles =>
+ logWarning(
+ s"Skipped the rest content in the corrupted file: ${split.serializableHadoopSplit}",
+ e)
+ finished = true
}
if (finished) {
// Close and release the reader here; close() will also be called when the task
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala
index 306dc6527e..6d8cd81431 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala
@@ -139,6 +139,7 @@ class FileScanRDD(
}
} catch {
case e: IOException =>
+ logWarning(s"Skipped the rest content in the corrupted file: $currentFile", e)
finished = true
null
}