[SPARK-11274] [SQL] Text data source support for Spark SQL.

This adds API for reading and writing text files, similar to SparkContext.textFile and RDD.saveAsTextFile. ``` SQLContext.read.text("/path/to/something.txt") DataFrame.write.text("/path/to/write.txt") ``` Using the new Dataset API, this also supports ``` val ds: Dataset[String] = SQLContext.read.text("/path/to/something.txt").as[String] ``` Author: Reynold Xin <rxin@databricks.com> Closes #9240 from rxin/SPARK-11274.
author: Reynold Xin <rxin@databricks.com> 2015-10-23 13:04:06 -0700
committer: Yin Huai <yhuai@databricks.com> 2015-10-23 13:04:06 -0700
commit: e1a897b657eb62e837026f7b3efafb9a6424ec4f (patch)
tree: 6c9553d281c65342db5ff692cbe3f4814a539eed /sql/core/src/test
parent: 4e38defae13b2b13e196b4d172722ef5e6266c66 (diff)
download: spark-e1a897b657eb62e837026f7b3efafb9a6424ec4f.tar.gz
spark-e1a897b657eb62e837026f7b3efafb9a6424ec4f.tar.bz2
spark-e1a897b657eb62e837026f7b3efafb9a6424ec4f.zip
2 files changed, 85 insertions, 0 deletions
diff --git a/sql/core/src/test/resources/text-suite.txt b/sql/core/src/test/resources/text-suite.txt
new file mode 100644
index 0000000000..e8fd967197
--- /dev/null
+++ b/sql/core/src/test/resources/text-suite.txt
@@ -0,0 +1,4 @@
+This is a test file for the text data source
+1+1
+数据砖头
+"doh"
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/TextSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/TextSuite.scala
new file mode 100644
index 0000000000..0a2306c066
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/TextSuite.scala
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources.text
+
+import org.apache.spark.sql.test.SharedSQLContext
+import org.apache.spark.sql.types.{StringType, StructType}
+import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row}
+import org.apache.spark.util.Utils
+
+
+class TextSuite extends QueryTest with SharedSQLContext {
+
+  test("reading text file") {
+    verifyFrame(sqlContext.read.format("text").load(testFile))
+  }
+
+  test("SQLContext.read.text() API") {
+    verifyFrame(sqlContext.read.text(testFile))
+  }
+
+  test("writing") {
+    val df = sqlContext.read.text(testFile)
+
+    val tempFile = Utils.createTempDir()
+    tempFile.delete()
+    df.write.text(tempFile.getCanonicalPath)
+    verifyFrame(sqlContext.read.text(tempFile.getCanonicalPath))
+
+    Utils.deleteRecursively(tempFile)
+  }
+
+  test("error handling for invalid schema") {
+    val tempFile = Utils.createTempDir()
+    tempFile.delete()
+
+    val df = sqlContext.range(2)
+    intercept[AnalysisException] {
+      df.write.text(tempFile.getCanonicalPath)
+    }
+
+    intercept[AnalysisException] {
+      sqlContext.range(2).select(df("id"), df("id") + 1).write.text(tempFile.getCanonicalPath)
+    }
+  }
+
+  private def testFile: String = {
+    Thread.currentThread().getContextClassLoader.getResource("text-suite.txt").toString
+  }
+
+  /** Verifies data and schema. */
+  private def verifyFrame(df: DataFrame): Unit = {
+    // schema
+    assert(df.schema == new StructType().add("text", StringType))
+
+    // verify content
+    val data = df.collect()
+    assert(data(0) == Row("This is a test file for the text data source"))
+    assert(data(1) == Row("1+1"))
+    // non ascii characters are not allowed in the code, so we disable the scalastyle here.
+    // scalastyle:off
+    assert(data(2) == Row("数据砖头"))
+    // scalastyle:on
+    assert(data(3) == Row("\"doh\""))
+    assert(data.length == 4)
+  }
+}
author	Reynold Xin <rxin@databricks.com>	2015-10-23 13:04:06 -0700
committer	Yin Huai <yhuai@databricks.com>	2015-10-23 13:04:06 -0700
commit	e1a897b657eb62e837026f7b3efafb9a6424ec4f (patch)
tree	6c9553d281c65342db5ff692cbe3f4814a539eed /sql/core/src/test
parent	4e38defae13b2b13e196b4d172722ef5e6266c66 (diff)
download	spark-e1a897b657eb62e837026f7b3efafb9a6424ec4f.tar.gz spark-e1a897b657eb62e837026f7b3efafb9a6424ec4f.tar.bz2 spark-e1a897b657eb62e837026f7b3efafb9a6424ec4f.zip