author    Reynold Xin <rxin@databricks.com>  2015-11-24 18:16:07 -0800
committer Reynold Xin <rxin@databricks.com>  2015-11-24 18:16:07 -0800
commit    25bbd3c16e8e8be4d2c43000223d54650e9a3696 (patch)
tree      36a82e11908fa11f1a45447d6e493b80af240a2a /python/pyspark/sql/readwriter.py
parent    238ae51b66ac12d15fba6aff061804004c5ca6cb (diff)
[SPARK-11967][SQL] Consistent use of varargs for multiple paths in DataFrameReader
This patch makes it consistent to use varargs in all DataFrameReader methods, including Parquet, JSON, text, and the generic load function. It also adds a few more API tests for the Java API.

Author: Reynold Xin <rxin@databricks.com>

Closes #9945 from rxin/SPARK-11967.
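For illustration, a minimal sketch of the unified API from the Python side, assuming a live sqlContext; the file names below are hypothetical:

    # load(), json(), and text() now all accept either a single path
    # or a list of paths
    df = sqlContext.read.format('json').load(['a.json', 'b.json'])
    df = sqlContext.read.json(['a.json', 'b.json'])
    df = sqlContext.read.text(['a.txt', 'b.txt'])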
Diffstat (limited to 'python/pyspark/sql/readwriter.py')
-rw-r--r--  python/pyspark/sql/readwriter.py | 19 ++++++++++++-------
1 file changed, 12 insertions(+), 7 deletions(-)
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index e8f0d7ec77..2e75f0c8a1 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -109,7 +109,7 @@ class DataFrameReader(object):
def load(self, path=None, format=None, schema=None, **options):
"""Loads data from a data source and returns it as a :class`DataFrame`.
- :param path: optional string for file-system backed data sources.
+ :param path: optional string or a list of strings for file-system backed data sources.
:param format: optional string for format of the data source. Defaults to 'parquet'.
:param schema: optional :class:`StructType` for the input schema.
:param options: all other string options
@@ -118,6 +118,7 @@ class DataFrameReader(object):
... opt2=1, opt3='str')
>>> df.dtypes
[('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
+
>>> df = sqlContext.read.format('json').load(['python/test_support/sql/people.json',
... 'python/test_support/sql/people1.json'])
>>> df.dtypes
@@ -130,10 +131,8 @@ class DataFrameReader(object):
self.options(**options)
if path is not None:
if type(path) == list:
- paths = path
- gateway = self._sqlContext._sc._gateway
- jpaths = utils.toJArray(gateway, gateway.jvm.java.lang.String, paths)
- return self._df(self._jreader.load(jpaths))
+ return self._df(
+ self._jreader.load(self._sqlContext._sc._jvm.PythonUtils.toSeq(path)))
else:
return self._df(self._jreader.load(path))
else:
@@ -175,6 +174,8 @@ class DataFrameReader(object):
self.schema(schema)
if isinstance(path, basestring):
return self._df(self._jreader.json(path))
+ elif type(path) == list:
+ return self._df(self._jreader.json(self._sqlContext._sc._jvm.PythonUtils.toSeq(path)))
elif isinstance(path, RDD):
return self._df(self._jreader.json(path._jrdd))
else:
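The same list form works for json(); a doctest-style sketch reusing the test files already exercised in the load() example above:

    >>> df = sqlContext.read.json(['python/test_support/sql/people.json',
    ...                            'python/test_support/sql/people1.json'])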
@@ -205,16 +206,20 @@ class DataFrameReader(object):
@ignore_unicode_prefix
@since(1.6)
- def text(self, path):
+ def text(self, paths):
"""Loads a text file and returns a [[DataFrame]] with a single string column named "text".
Each line in the text file is a new row in the resulting DataFrame.
+ :param paths: string, or list of strings, for input path(s).
+
>>> df = sqlContext.read.text('python/test_support/sql/text-test.txt')
>>> df.collect()
[Row(value=u'hello'), Row(value=u'this')]
"""
- return self._df(self._jreader.text(path))
+ if isinstance(paths, basestring):
+ paths = [paths]
+ return self._df(self._jreader.text(self._sqlContext._sc._jvm.PythonUtils.toSeq(paths)))
@since(1.5)
def orc(self, path):
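For completeness, the updated text() accepts a list as well; a minimal doctest-style sketch reusing the test file from the docstring above, where the output assumes that same file:

    >>> df = sqlContext.read.text(['python/test_support/sql/text-test.txt'])
    >>> df.collect()
    [Row(value=u'hello'), Row(value=u'this')]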