aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorCheng Lian <lian@databricks.com>2015-07-21 15:08:44 +0800
committerCheng Lian <lian@databricks.com>2015-07-21 15:08:44 +0800
commitd38c5029a2ca845e2782096044a6412b653c9f95 (patch)
tree80c71a2f152e5bdd348edf66599f0a3892236ad6 /python
parent1ddd0f2f1688560f88470e312b72af04364e2d49 (diff)
downloadspark-d38c5029a2ca845e2782096044a6412b653c9f95.tar.gz
spark-d38c5029a2ca845e2782096044a6412b653c9f95.tar.bz2
spark-d38c5029a2ca845e2782096044a6412b653c9f95.zip
[SPARK-9100] [SQL] Adds DataFrame reader/writer shortcut methods for ORC
This PR adds DataFrame reader/writer shortcut methods for ORC in both Scala and Python. Author: Cheng Lian <lian@databricks.com> Closes #7444 from liancheng/spark-9100 and squashes the following commits: 284d043 [Cheng Lian] Fixes PySpark test cases and addresses PR comments e0b09fb [Cheng Lian] Adds DataFrame reader/writer shortcut methods for ORC
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/sql/readwriter.py44
-rw-r--r--python/test_support/sql/orc_partitioned/._SUCCESS.crcbin0 -> 8 bytes
-rwxr-xr-xpython/test_support/sql/orc_partitioned/_SUCCESS0
-rw-r--r--python/test_support/sql/orc_partitioned/b=0/c=0/.part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc.crcbin0 -> 12 bytes
-rwxr-xr-xpython/test_support/sql/orc_partitioned/b=0/c=0/part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orcbin0 -> 168 bytes
-rw-r--r--python/test_support/sql/orc_partitioned/b=1/c=1/.part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc.crcbin0 -> 12 bytes
-rwxr-xr-xpython/test_support/sql/orc_partitioned/b=1/c=1/part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orcbin0 -> 168 bytes
7 files changed, 41 insertions, 3 deletions
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 882a03090e..dea8bad79e 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -146,14 +146,28 @@ class DataFrameReader(object):
return self._df(self._jreader.table(tableName))
@since(1.4)
- def parquet(self, *path):
+ def parquet(self, *paths):
"""Loads a Parquet file, returning the result as a :class:`DataFrame`.
>>> df = sqlContext.read.parquet('python/test_support/sql/parquet_partitioned')
>>> df.dtypes
[('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
"""
- return self._df(self._jreader.parquet(_to_seq(self._sqlContext._sc, path)))
+ return self._df(self._jreader.parquet(_to_seq(self._sqlContext._sc, paths)))
+
+ @since(1.5)
+ def orc(self, path):
+ """
+ Loads an ORC file, returning the result as a :class:`DataFrame`.
+
+ ::Note: Currently ORC support is only available together with
+ :class:`HiveContext`.
+
+ >>> df = hiveContext.read.orc('python/test_support/sql/orc_partitioned')
+ >>> df.dtypes
+ [('a', 'bigint'), ('b', 'int'), ('c', 'int')]
+ """
+ return self._df(self._jreader.orc(path))
@since(1.4)
def jdbc(self, url, table, column=None, lowerBound=None, upperBound=None, numPartitions=None,
@@ -378,6 +392,29 @@ class DataFrameWriter(object):
self.partitionBy(partitionBy)
self._jwrite.parquet(path)
+ def orc(self, path, mode=None, partitionBy=None):
+ """Saves the content of the :class:`DataFrame` in ORC format at the specified path.
+
+ ::Note: Currently ORC support is only available together with
+ :class:`HiveContext`.
+
+ :param path: the path in any Hadoop supported file system
+ :param mode: specifies the behavior of the save operation when data already exists.
+
+ * ``append``: Append contents of this :class:`DataFrame` to existing data.
+ * ``overwrite``: Overwrite existing data.
+ * ``ignore``: Silently ignore this operation if data already exists.
+ * ``error`` (default case): Throw an exception if data already exists.
+ :param partitionBy: names of partitioning columns
+
+ >>> orc_df = hiveContext.read.orc('python/test_support/sql/orc_partitioned')
+ >>> orc_df.write.orc(os.path.join(tempfile.mkdtemp(), 'data'))
+ """
+ self.mode(mode)
+ if partitionBy is not None:
+ self.partitionBy(partitionBy)
+ self._jwrite.orc(path)
+
@since(1.4)
def jdbc(self, url, table, mode=None, properties={}):
"""Saves the content of the :class:`DataFrame` to a external database table via JDBC.
@@ -408,7 +445,7 @@ def _test():
import os
import tempfile
from pyspark.context import SparkContext
- from pyspark.sql import Row, SQLContext
+ from pyspark.sql import Row, SQLContext, HiveContext
import pyspark.sql.readwriter
os.chdir(os.environ["SPARK_HOME"])
@@ -420,6 +457,7 @@ def _test():
globs['os'] = os
globs['sc'] = sc
globs['sqlContext'] = SQLContext(sc)
+ globs['hiveContext'] = HiveContext(sc)
globs['df'] = globs['sqlContext'].read.parquet('python/test_support/sql/parquet_partitioned')
(failure_count, test_count) = doctest.testmod(
diff --git a/python/test_support/sql/orc_partitioned/._SUCCESS.crc b/python/test_support/sql/orc_partitioned/._SUCCESS.crc
new file mode 100644
index 0000000000..3b7b044936
--- /dev/null
+++ b/python/test_support/sql/orc_partitioned/._SUCCESS.crc
Binary files differ
diff --git a/python/test_support/sql/orc_partitioned/_SUCCESS b/python/test_support/sql/orc_partitioned/_SUCCESS
new file mode 100755
index 0000000000..e69de29bb2
--- /dev/null
+++ b/python/test_support/sql/orc_partitioned/_SUCCESS
diff --git a/python/test_support/sql/orc_partitioned/b=0/c=0/.part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc.crc b/python/test_support/sql/orc_partitioned/b=0/c=0/.part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc.crc
new file mode 100644
index 0000000000..834cf0b7f2
--- /dev/null
+++ b/python/test_support/sql/orc_partitioned/b=0/c=0/.part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc.crc
Binary files differ
diff --git a/python/test_support/sql/orc_partitioned/b=0/c=0/part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc b/python/test_support/sql/orc_partitioned/b=0/c=0/part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc
new file mode 100755
index 0000000000..4943801873
--- /dev/null
+++ b/python/test_support/sql/orc_partitioned/b=0/c=0/part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc
Binary files differ
diff --git a/python/test_support/sql/orc_partitioned/b=1/c=1/.part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc.crc b/python/test_support/sql/orc_partitioned/b=1/c=1/.part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc.crc
new file mode 100644
index 0000000000..693dceeee3
--- /dev/null
+++ b/python/test_support/sql/orc_partitioned/b=1/c=1/.part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc.crc
Binary files differ
diff --git a/python/test_support/sql/orc_partitioned/b=1/c=1/part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc b/python/test_support/sql/orc_partitioned/b=1/c=1/part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc
new file mode 100755
index 0000000000..4cbb95ae02
--- /dev/null
+++ b/python/test_support/sql/orc_partitioned/b=1/c=1/part-r-00000-829af031-b970-49d6-ad39-30460a0be2c8.orc
Binary files differ