From d6a52176ade92853f37167ad27631977dc79bc76 Mon Sep 17 00:00:00 2001
From: Sameer Agarwal
Date: Mon, 25 Jul 2016 22:31:01 +0800
Subject: [SPARK-16668][TEST] Test parquet reader for row groups containing
 both dictionary and plain encoded pages

## What changes were proposed in this pull request?

This patch adds an explicit test for [SPARK-14217] by setting the parquet
dictionary and page size such that the generated parquet file spans across
3 pages (within a single row group), where the first page is dictionary
encoded and the remaining two are plain encoded.

## How was this patch tested?

1. ParquetEncodingSuite
2. Also manually tested that this test fails without https://github.com/apache/spark/pull/12279

Author: Sameer Agarwal

Closes #14304 from sameeragarwal/hybrid-encoding-test.
---
 .../datasources/parquet/ParquetEncodingSuite.scala | 29 ++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala
index 88fcfce0ec..c7541889f2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala
@@ -16,6 +16,10 @@
  */
 package org.apache.spark.sql.execution.datasources.parquet
 
+import scala.collection.JavaConverters._
+
+import org.apache.parquet.hadoop.ParquetOutputFormat
+
 import org.apache.spark.sql.test.SharedSQLContext
 
 // TODO: this needs a lot more testing but it's currently not easy to test with the parquet
@@ -78,4 +82,29 @@ class ParquetEncodingSuite extends ParquetCompatibilityTest with SharedSQLContex
       }}
     }
   }
+
+  test("Read row group containing both dictionary and plain encoded pages") {
+    withSQLConf(ParquetOutputFormat.DICTIONARY_PAGE_SIZE -> "2048",
+      ParquetOutputFormat.PAGE_SIZE -> "4096") {
+      withTempPath { dir =>
+        // In order to explicitly test for SPARK-14217, we set the parquet dictionary and page size
+        // such that the following data spans across 3 pages (within a single row group) where the
+        // first page is dictionary encoded and the remaining two are plain encoded.
+        val data = (0 until 512).flatMap(i => Seq.fill(3)(i.toString))
+        data.toDF("f").coalesce(1).write.parquet(dir.getCanonicalPath)
+        val file = SpecificParquetRecordReaderBase.listDirectory(dir).asScala.head
+
+        val reader = new VectorizedParquetRecordReader
+        reader.initialize(file, null /* set columns to null to project all columns */)
+        val column = reader.resultBatch().column(0)
+        assert(reader.nextBatch())
+
+        (0 until 512).foreach { i =>
+          assert(column.getUTF8String(3 * i).toString == i.toString)
+          assert(column.getUTF8String(3 * i + 1).toString == i.toString)
+          assert(column.getUTF8String(3 * i + 2).toString == i.toString)
+        }
+      }
+    }
+  }
 }
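
As a minimal standalone sketch of the same technique (not part of the patch): the test sets the two size knobs via `withSQLConf`, but the same effect can be had by setting them on the Hadoop configuration, which `ParquetOutputFormat` consults at write time. The intuition from the commit message is that 512 distinct strings overflow the 2048-byte dictionary page budget within the single row group, so the writer emits the first page dictionary encoded and falls back to plain encoding for the rest. The local session, object name, and output path below are assumptions for illustration.

```scala
import org.apache.parquet.hadoop.ParquetOutputFormat
import org.apache.spark.sql.SparkSession

// Hypothetical standalone reproduction of the data layout exercised by the test.
object HybridEncodingSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("hybrid-encoding-sketch")
      .getOrCreate()
    import spark.implicits._

    // The same two settings the test applies via withSQLConf;
    // ParquetOutputFormat resolves them from the Hadoop configuration.
    val conf = spark.sparkContext.hadoopConfiguration
    conf.set(ParquetOutputFormat.DICTIONARY_PAGE_SIZE, "2048")
    conf.set(ParquetOutputFormat.PAGE_SIZE, "4096")

    // 512 distinct strings, each repeated 3 times: enough to overflow the
    // 2048-byte dictionary page so the later pages are written plain encoded.
    val data = (0 until 512).flatMap(i => Seq.fill(3)(i.toString))
    data.toDF("f").coalesce(1).write.parquet("/tmp/hybrid-encoding-sketch")

    spark.stop()
  }
}
```

The resulting page encodings can then be inspected externally, for example with `parquet-tools meta` on the written file, to confirm the row group mixes dictionary and plain encoded pages.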