From 5aa05219118e3d3525fb703a4716ae8e04f3da72 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 28 Oct 2015 14:28:38 -0700 Subject: [SPARK-11292] [SQL] Python API for text data source Adds DataFrameReader.text and DataFrameWriter.text. Author: Reynold Xin Closes #9259 from rxin/SPARK-11292. --- python/pyspark/sql/readwriter.py | 27 +++++++++++++++++++++++++-- python/test_support/sql/text-test.txt | 2 ++ 2 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 python/test_support/sql/text-test.txt (limited to 'python') diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 93832d4c71..97bd90c4db 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -23,6 +23,7 @@ if sys.version >= '3': from py4j.java_gateway import JavaClass from pyspark import RDD, since +from pyspark.rdd import ignore_unicode_prefix from pyspark.sql.column import _to_seq from pyspark.sql.types import * @@ -193,10 +194,22 @@ class DataFrameReader(object): """ return self._df(self._jreader.parquet(_to_seq(self._sqlContext._sc, paths))) + @ignore_unicode_prefix + @since(1.6) + def text(self, path): + """Loads a text file and returns a :class:`DataFrame` with a single string column "text". + + Each line in the text file is a new row in the resulting DataFrame. + + >>> df = sqlContext.read.text('python/test_support/sql/text-test.txt') + >>> df.collect() + [Row(text=u'hello'), Row(text=u'this')] + """ + return self._df(self._jreader.text(path)) + @since(1.5) def orc(self, path): - """ - Loads an ORC file, returning the result as a :class:`DataFrame`. + """Loads an ORC file, returning the result as a :class:`DataFrame`. ::Note: Currently ORC support is only available together with :class:`HiveContext`. @@ -432,6 +445,16 @@ class DataFrameWriter(object): self.partitionBy(partitionBy) self._jwrite.parquet(path) + @since(1.6) + def text(self, path): + """Saves the content of the DataFrame in a text file at the specified path. 
+ + The DataFrame must have only one column that is of string type. + Each row becomes a new line in the output file. + """ + self._jwrite.text(path) + + @since(1.5) def orc(self, path, mode=None, partitionBy=None): """Saves the content of the :class:`DataFrame` in ORC format at the specified path. diff --git a/python/test_support/sql/text-test.txt b/python/test_support/sql/text-test.txt new file mode 100644 index 0000000000..ae1e76c9e9 --- /dev/null +++ b/python/test_support/sql/text-test.txt @@ -0,0 +1,2 @@ +hello +this \ No newline at end of file -- cgit v1.2.3