author     Yash Datta <Yash.Datta@guavus.com>    2014-11-14 15:16:36 -0800
committer  Michael Armbrust <michael@databricks.com>    2014-11-14 15:16:40 -0800
commit     63ca3af66f9680fd12adee82fb4d342caae5cea4 (patch)
tree       cd6e888ec019d89b6c0c75d0ae8124a9b5d252a3 /sql
parent     f76b9683706232c3d4e8e6e61627b8188dcb79dc (diff)
download   spark-63ca3af66f9680fd12adee82fb4d342caae5cea4.tar.gz
           spark-63ca3af66f9680fd12adee82fb4d342caae5cea4.tar.bz2
           spark-63ca3af66f9680fd12adee82fb4d342caae5cea4.zip
[SPARK-4365][SQL] Remove unnecessary filter call on records returned from parquet library
Since the parquet library has been updated, we no longer need to filter the records returned from the parquet library for null records; the library now skips those itself. From parquet-hadoop/src/main/java/parquet/hadoop/InternalParquetRecordReader.java:

    public boolean nextKeyValue() throws IOException, InterruptedException {
      boolean recordFound = false;
      while (!recordFound) {
        // no more records left
        if (current >= total) {
          return false;
        }
        try {
          checkRead();
          currentValue = recordReader.read();
          current ++;
          if (recordReader.shouldSkipCurrentRecord()) {
            // this record is being filtered via the filter2 package
            if (DEBUG) LOG.debug("skipping record");
            continue;
          }
          if (currentValue == null) {
            // only happens with FilteredRecordReader at end of block
            current = totalCountLoadedSoFar;
            if (DEBUG) LOG.debug("filtered record reader reached end of block");
            continue;
          }
          recordFound = true;
          if (DEBUG) LOG.debug("read value: " + currentValue);
        } catch (RuntimeException e) {
          throw new ParquetDecodingException(format("Can not read value at %d in block %d in file %s", current, currentBlock, file), e);
        }
      }
      return true;
    }

Author: Yash Datta <Yash.Datta@guavus.com>

Closes #3229 from saucam/remove_filter and squashes the following commits:

8909ae9 [Yash Datta] SPARK-4365: Remove unnecessary filter call on records returned from parquet library
Diffstat (limited to 'sql')
-rw-r--r--  sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
index 5f93279a08..f6bed5016f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
@@ -159,7 +159,7 @@ case class ParquetTableScan(
}
} else {
baseRDD.map(_._2)
- }.filter(_ != null) // Parquet's record filters may produce null values
+ }
}
/**
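
The net effect of the patch is that ParquetTableScan no longer makes a defensive pass over every record just to drop nulls. Below is a minimal, self-contained sketch (plain Scala collections with hypothetical values, not the actual Spark or Parquet classes) of the before/after behaviour the change relies on:

    // Sketch only: oldReaderOutput / newReaderOutput are hypothetical stand-ins
    // for the values produced by the Parquet record reader.
    object NullFilterSketch {
      def main(args: Array[String]): Unit = {
        // Before the parquet-mr fix, a filtered record reader could surface null
        // records, so the caller had to drop them with an extra filter pass:
        val oldReaderOutput: Seq[String] = Seq("row1", null, "row2")
        val oldRows = oldReaderOutput.filter(_ != null)

        // With the updated library, nextKeyValue() skips filtered and null records
        // internally, so the caller can consume the values directly:
        val newReaderOutput: Seq[String] = Seq("row1", "row2")
        val newRows = newReaderOutput // no .filter(_ != null) needed

        assert(oldRows == newRows)
        println(newRows.mkString(", "))
      }
    }

Dropping the filter avoids one extra pass over every scanned record without changing the results, which is why the diff above can simply delete the .filter(_ != null) call.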