author    Davies Liu <davies@databricks.com>        2016-03-08 17:42:52 -0800
committer Michael Armbrust <michael@databricks.com> 2016-03-08 17:42:52 -0800
commit    982ef2b87e3a9e0f3f252a1a0f30970cafe58c52 (patch)
tree      b8c2f4129cd3eb5bf5434b124f6adaaaba8fd6d0
parent    d8813fa043e8b8f7cbf6921d4c7ec889634f7abd (diff)
[SPARK-13750][SQL] fix sizeInBytes of HadoopFsRelation
## What changes were proposed in this pull request?

This PR fixes the sizeInBytes of HadoopFsRelation.

## How was this patch tested?

Added a regression test for it.

Author: Davies Liu <davies@databricks.com>

Closes #11590 from davies/fix_sizeInBytes.
-rw-r--r-- sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala                      |  2
-rw-r--r-- sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala | 42
2 files changed, 44 insertions(+), 0 deletions(-)
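Context note (not part of the commit): `sizeInBytes` feeds a relation's logical-plan statistics, which the planner compares against `spark.sql.autoBroadcastJoinThreshold` (10 MB by default) when deciding whether a table is small enough to broadcast in a join. A minimal sketch of that kind of check, using only the public `getConf` API; `canBroadcast` is a hypothetical helper, not Spark code:

```scala
import org.apache.spark.sql.SQLContext

// Sketch only: how a size estimate is typically compared against the
// broadcast threshold. A non-positive threshold disables auto-broadcast.
def canBroadcast(sqlContext: SQLContext, sizeInBytes: BigInt): Boolean = {
  val threshold =
    sqlContext.getConf("spark.sql.autoBroadcastJoinThreshold", "10485760").toLong
  threshold > 0 && sizeInBytes <= BigInt(threshold)
}
```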
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
index 60b0c64c7f..e251b52f6c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -419,6 +419,8 @@ case class HadoopFsRelation(
   /** Returns the list of files that will be read when scanning this relation. */
   override def inputFiles: Array[String] =
     location.allFiles().map(_.getPath.toUri.toString).toArray
+
+  override def sizeInBytes: Long = location.allFiles().map(_.getLen).sum
 }

 /**
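Without this override, the relation fell back to `BaseRelation.sizeInBytes`, which reports `spark.sql.defaultSizeInBytes` — by default larger than the auto-broadcast threshold — so file-based relations always looked too big to broadcast. The same summation can be sketched standalone against Hadoop's `FileSystem` API; the helper below is illustrative, not the patch's code:

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

// Hypothetical standalone equivalent: sum the lengths of the visible files
// under a directory, skipping hidden dot-files as the test below does.
def totalSizeInBytes(dir: String): Long = {
  val path = new Path(dir)
  val fs = FileSystem.get(path.toUri, new Configuration())
  fs.listStatus(path)
    .filterNot(_.getPath.getName.startsWith("."))
    .map(_.getLen)
    .sum
}
```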
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala
new file mode 100644
index 0000000000..297731c70c
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import java.io.{File, FilenameFilter}
+
+import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.test.SharedSQLContext
+
+class HadoopFsRelationSuite extends QueryTest with SharedSQLContext {
+
+ test("sizeInBytes should be the total size of all files") {
+ withTempDir{ dir =>
+ dir.delete()
+ sqlContext.range(1000).write.parquet(dir.toString)
+ // ignore hidden files
+ val allFiles = dir.listFiles(new FilenameFilter {
+ override def accept(dir: File, name: String): Boolean = {
+ !name.startsWith(".")
+ }
+ })
+ val totalSize = allFiles.map(_.length()).sum
+ val df = sqlContext.read.parquet(dir.toString)
+ assert(df.queryExecution.logical.statistics.sizeInBytes === BigInt(totalSize))
+ }
+ }
+}
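One way to see the fix end to end (hypothetical paths, assuming the default 10 MB threshold): with an accurate size estimate, a small Parquet table should now qualify for a broadcast join, which shows up in the physical plan.

```scala
// Hypothetical usage; paths and data are placeholders. Before this patch the
// small table's size was reported as the (large) default, so the planner would
// not broadcast it; afterwards explain() should show a broadcast join.
val small = sqlContext.read.parquet("/tmp/small_table") // a few KB on disk
val large = sqlContext.read.parquet("/tmp/large_table")
large.join(small, "id").explain()
```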