author      Tathagata Das <tathagata.das1565@gmail.com>  2016-06-20 14:52:28 -0700
committer   Shixiong Zhu <shixiong@databricks.com>       2016-06-20 14:52:28 -0700
commit      b99129cc452defc266f6d357f5baab5f4ff37a36 (patch)
tree        de4e6e356930aeacee94b541530be063d178707c /sql/core/src/test/java
parent      6df8e3886063a9d8c2e8499456ea9166245d5640 (diff)
download    spark-b99129cc452defc266f6d357f5baab5f4ff37a36.tar.gz
            spark-b99129cc452defc266f6d357f5baab5f4ff37a36.tar.bz2
            spark-b99129cc452defc266f6d357f5baab5f4ff37a36.zip
[SPARK-15982][SPARK-16009][SPARK-16007][SQL] Harmonize the behavior of DataFrameReader.text/csv/json/parquet/orc
## What changes were proposed in this pull request?

Issues with the current reader behavior:
- `text()` without args returns an empty DataFrame with no columns. This is inconsistent: `text()` is expected to always return a DataFrame with a `value` string field.
- `textFile()` without args fails with an exception for the same reason; it expects the DataFrame returned by `text()` to have a `value` field.
- `orc()` does not take varargs, which is inconsistent with the other formats.
- `json(single-arg)` was removed, which caused source compatibility issues - [SPARK-16009](https://issues.apache.org/jira/browse/SPARK-16009).
- A user-specified schema was not respected when `text/csv/...` were used with no args - [SPARK-16007](https://issues.apache.org/jira/browse/SPARK-16007).

The solution implemented here is the following:
- For each format, there is a single-argument method and a vararg method. For json, parquet, csv, and text this means adding `json(String)`, etc.; for orc it means adding `orc(varargs)`.
- Remove the special handling in `text()`, `csv()`, etc. that returned an empty DataFrame with no fields. Instead, pass the empty sequence of paths on to the data source and let each data source handle it correctly. For example, the text data source should return an empty DataFrame with schema `(value: string)`.
- Deduplicated the docs and fixed their formatting.

## How was this patch tested?

Added new unit tests for Scala and Java.

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #13727 from tdas/SPARK-15982.
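As a rough illustration of the harmonized reader surface described above (a minimal sketch, not part of the patch; the `SparkSession` setup, class name, and the commented-out paths are assumptions for illustration):

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ReaderHarmonizationSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local[*]")
        .appName("reader-harmonization-sketch")
        .getOrCreate();

    // After this change, text() with no paths returns an empty DataFrame
    // that still carries the expected schema (value: string), instead of a
    // DataFrame with no columns, so textFile() no longer throws.
    Dataset<Row> empty = spark.read().text();
    empty.printSchema();  // root |-- value: string (nullable = true)

    // Each format now exposes both a single-argument and a vararg overload
    // (the paths below are hypothetical):
    // spark.read().json("/data/one.json");       // single arg, restored
    // spark.read().json("/data/a", "/data/b");   // varargs
    // spark.read().orc("/data/a", "/data/b");    // orc() gains varargs too

    spark.stop();
  }
}
```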
Diffstat (limited to 'sql/core/src/test/java')
-rw-r--r--  sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameReaderWriterSuite.java | 158
1 file changed, 158 insertions(+), 0 deletions(-)
diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameReaderWriterSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameReaderWriterSuite.java
new file mode 100644
index 0000000000..7babf7573c
--- /dev/null
+++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameReaderWriterSuite.java
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package test.org.apache.spark.sql;
+
+import java.io.File;
+import java.util.HashMap;
+
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.test.TestSparkSession;
+import org.apache.spark.sql.types.StructType;
+import org.apache.spark.util.Utils;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+public class JavaDataFrameReaderWriterSuite {
+ private SparkSession spark = new TestSparkSession();
+ private StructType schema = new StructType().add("s", "string");
+ private transient String input;
+ private transient String output;
+
+ @Before
+ public void setUp() {
+ input = Utils.createTempDir(System.getProperty("java.io.tmpdir"), "input").toString();
+ File f = Utils.createTempDir(System.getProperty("java.io.tmpdir"), "output");
+ f.delete();
+ output = f.toString();
+ }
+
+ @After
+ public void tearDown() {
+ spark.stop();
+ spark = null;
+ }
+
+ @Test
+ public void testFormatAPI() {
+ spark
+ .read()
+ .format("org.apache.spark.sql.test")
+ .load()
+ .write()
+ .format("org.apache.spark.sql.test")
+ .save();
+ }
+
+ @Test
+ public void testOptionsAPI() {
+ HashMap<String, String> map = new HashMap<String, String>();
+ map.put("e", "1");
+ spark
+ .read()
+ .option("a", "1")
+ .option("b", 1)
+ .option("c", 1.0)
+ .option("d", true)
+ .options(map)
+ .text()
+ .write()
+ .option("a", "1")
+ .option("b", 1)
+ .option("c", 1.0)
+ .option("d", true)
+ .options(map)
+ .format("org.apache.spark.sql.test")
+ .save();
+ }
+
+ @Test
+ public void testSaveModeAPI() {
+ spark
+ .range(10)
+ .write()
+ .format("org.apache.spark.sql.test")
+ .mode(SaveMode.ErrorIfExists)
+ .save();
+ }
+
+ @Test
+ public void testLoadAPI() {
+ spark.read().format("org.apache.spark.sql.test").load();
+ spark.read().format("org.apache.spark.sql.test").load(input);
+ spark.read().format("org.apache.spark.sql.test").load(input, input, input);
+ spark.read().format("org.apache.spark.sql.test").load(new String[]{input, input});
+ }
+
+ @Test
+ public void testTextAPI() {
+ spark.read().text();
+ spark.read().text(input);
+ spark.read().text(input, input, input);
+ spark.read().text(new String[]{input, input})
+ .write().text(output);
+ }
+
+ @Test
+ public void testTextFileAPI() {
+ spark.read().textFile();
+ spark.read().textFile(input);
+ spark.read().textFile(input, input, input);
+ spark.read().textFile(new String[]{input, input});
+ }
+
+ @Test
+ public void testCsvAPI() {
+ spark.read().schema(schema).csv();
+ spark.read().schema(schema).csv(input);
+ spark.read().schema(schema).csv(input, input, input);
+ spark.read().schema(schema).csv(new String[]{input, input})
+ .write().csv(output);
+ }
+
+ @Test
+ public void testJsonAPI() {
+ spark.read().schema(schema).json();
+ spark.read().schema(schema).json(input);
+ spark.read().schema(schema).json(input, input, input);
+ spark.read().schema(schema).json(new String[]{input, input})
+ .write().json(output);
+ }
+
+ @Test
+ public void testParquetAPI() {
+ spark.read().schema(schema).parquet();
+ spark.read().schema(schema).parquet(input);
+ spark.read().schema(schema).parquet(input, input, input);
+ spark.read().schema(schema).parquet(new String[] { input, input })
+ .write().parquet(output);
+ }
+
+ /**
+ * This only tests that the API compiles; it is deliberately not annotated
+ * with @Test, since orc() cannot run without the Hive classes.
+ */
+ public void testOrcAPI() {
+ spark.read().schema(schema).orc();
+ spark.read().schema(schema).orc(input);
+ spark.read().schema(schema).orc(input, input, input);
+ spark.read().schema(schema).orc(new String[]{input, input})
+ .write().orc(output);
+ }
+}