Author:    Reynold Xin <rxin@databricks.com>  2015-06-03 00:23:34 -0700
Committer: Reynold Xin <rxin@databricks.com>  2015-06-03 00:23:34 -0700
Commit:    ce320cb2dbf28825f80795ce569735888f98d6e8 (patch)
Tree:      020c2f7b8a0868fb0cc3bef50a284c4f06b8777b /python/pyspark/sql/dataframe.py
Parent:    452eb82dd722e5dfd00ee47bb8b6353933b0016e (diff)
[SPARK-8060] Improve DataFrame Python test coverage and documentation.
Author: Reynold Xin <rxin@databricks.com>

Closes #6601 from rxin/python-read-write-test-and-doc and squashes the following commits:

baa8ad5 [Reynold Xin] Code review feedback.
f081d47 [Reynold Xin] More documentation updates.
c9902fa [Reynold Xin] [SPARK-8060] Improve DataFrame Python reader/writer interface doc and testing.
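For orientation, the change moves callers from the old free-standing load/save methods to the unified reader/writer interface. A minimal sketch of the replacements, assuming a Spark 1.4 sqlContext is in scope and the paths are placeholders:

    # Reading: sqlContext.read returns a DataFrameReader.
    df = sqlContext.read.parquet("/tmp/people.parquet")   # was sqlContext.parquetFile(...)

    # Writing: df.write returns a DataFrameWriter.
    df.write.parquet("/tmp/people_out")                   # was df.saveAsParquetFile(...)
    df.write.format("json").mode("overwrite").save("/tmp/people_json")  # was df.save(...)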
Diffstat (limited to 'python/pyspark/sql/dataframe.py')
-rw-r--r--  python/pyspark/sql/dataframe.py  82
1 file changed, 27 insertions(+), 55 deletions(-)
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index a82b6b87c4..7673153abe 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -44,7 +44,7 @@ class DataFrame(object):
A :class:`DataFrame` is equivalent to a relational table in Spark SQL,
and can be created using various functions in :class:`SQLContext`::
- people = sqlContext.parquetFile("...")
+ people = sqlContext.read.parquet("...")
Once created, it can be manipulated using the various domain-specific-language
(DSL) functions defined in: :class:`DataFrame`, :class:`Column`.
@@ -56,8 +56,8 @@ class DataFrame(object):
A more concrete example::
# To create DataFrame using SQLContext
- people = sqlContext.parquetFile("...")
- department = sqlContext.parquetFile("...")
+ people = sqlContext.read.parquet("...")
+ department = sqlContext.read.parquet("...")
people.filter(people.age > 30).join(department, people.deptId == department.id) \
.groupBy(department.name, "gender").agg({"salary": "avg", "age": "max"})
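A runnable version of the docstring example above, under the assumption that sqlContext exists and the placeholder paths hold Parquet data with age, deptId, id, name, gender, and salary columns:

    people = sqlContext.read.parquet("/data/people")
    department = sqlContext.read.parquet("/data/department")

    # Filter, join on the department key, then aggregate per department/gender.
    result = people.filter(people.age > 30) \
        .join(department, people.deptId == department.id) \
        .groupBy(department.name, "gender") \
        .agg({"salary": "avg", "age": "max"})
    result.show()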
@@ -120,21 +120,12 @@ class DataFrame(object):
rdd = self._jdf.toJSON()
return RDD(rdd.toJavaRDD(), self._sc, UTF8Deserializer(use_unicode))
- @since(1.3)
def saveAsParquetFile(self, path):
"""Saves the contents as a Parquet file, preserving the schema.
- Files that are written out using this method can be read back in as
- a :class:`DataFrame` using :func:`SQLContext.parquetFile`.
-
- >>> import tempfile, shutil
- >>> parquetFile = tempfile.mkdtemp()
- >>> shutil.rmtree(parquetFile)
- >>> df.saveAsParquetFile(parquetFile)
- >>> df2 = sqlContext.parquetFile(parquetFile)
- >>> sorted(df2.collect()) == sorted(df.collect())
- True
+ .. note:: Deprecated in 1.4, use :func:`DataFrameWriter.parquet` instead.
"""
+ warnings.warn("saveAsParquetFile is deprecated. Use write.parquet() instead.")
self._jdf.saveAsParquetFile(path)
@since(1.3)
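The removed doctest translates directly to the new writer API; a sketch assuming df and sqlContext are bound as in the module's doctest globals:

    import tempfile, shutil
    path = tempfile.mkdtemp()
    shutil.rmtree(path)                  # the target path must not already exist
    df.write.parquet(path)               # was: df.saveAsParquetFile(path)
    df2 = sqlContext.read.parquet(path)  # was: sqlContext.parquetFile(path)
    assert sorted(df2.collect()) == sorted(df.collect())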
@@ -151,69 +142,45 @@ class DataFrame(object):
"""
self._jdf.registerTempTable(name)
- @since(1.3)
def registerAsTable(self, name):
- """DEPRECATED: use :func:`registerTempTable` instead"""
- warnings.warn("Use registerTempTable instead of registerAsTable.", DeprecationWarning)
+ """
+ .. note:: Deprecated in 1.4, use :func:`registerTempTable` instead.
+ """
+ warnings.warn("Use registerTempTable instead of registerAsTable.")
self.registerTempTable(name)
- @since(1.3)
def insertInto(self, tableName, overwrite=False):
"""Inserts the contents of this :class:`DataFrame` into the specified table.
- Optionally overwriting any existing data.
+ .. note:: Deprecated in 1.4, use :func:`DataFrameWriter.insertInto` instead.
"""
+ warnings.warn("insertInto is deprecated. Use write.insertInto() instead.")
self.write.insertInto(tableName, overwrite)
- @since(1.3)
def saveAsTable(self, tableName, source=None, mode="error", **options):
"""Saves the contents of this :class:`DataFrame` to a data source as a table.
- The data source is specified by the ``source`` and a set of ``options``.
- If ``source`` is not specified, the default data source configured by
- ``spark.sql.sources.default`` will be used.
-
- Additionally, mode is used to specify the behavior of the saveAsTable operation when
- table already exists in the data source. There are four modes:
-
- * `append`: Append contents of this :class:`DataFrame` to existing data.
- * `overwrite`: Overwrite existing data.
- * `error`: Throw an exception if data already exists.
- * `ignore`: Silently ignore this operation if data already exists.
+ .. note:: Deprecated in 1.4, use :func:`DataFrameWriter.saveAsTable` instead.
"""
+ warnings.warn("insertInto is deprecated. Use write.saveAsTable() instead.")
self.write.saveAsTable(tableName, source, mode, **options)
@since(1.3)
def save(self, path=None, source=None, mode="error", **options):
"""Saves the contents of the :class:`DataFrame` to a data source.
- The data source is specified by the ``source`` and a set of ``options``.
- If ``source`` is not specified, the default data source configured by
- ``spark.sql.sources.default`` will be used.
-
- Additionally, mode is used to specify the behavior of the save operation when
- data already exists in the data source. There are four modes:
-
- * `append`: Append contents of this :class:`DataFrame` to existing data.
- * `overwrite`: Overwrite existing data.
- * `error`: Throw an exception if data already exists.
- * `ignore`: Silently ignore this operation if data already exists.
+ .. note:: Deprecated in 1.4, use :func:`DataFrameWriter.save` instead.
"""
+ warnings.warn("insertInto is deprecated. Use write.save() instead.")
return self.write.save(path, source, mode, **options)
@property
@since(1.4)
def write(self):
"""
- Interface for saving the content of the :class:`DataFrame` out
- into external storage.
-
- :return :class:`DataFrameWriter`
+ Interface for saving the content of the :class:`DataFrame` out into external storage.
- .. note:: Experimental
-
- >>> df.write
- <pyspark.sql.readwriter.DataFrameWriter object at ...>
+ :return: :class:`DataFrameWriter`
"""
return DataFrameWriter(self)
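Since write returns a :class:`DataFrameWriter`, configuration calls chain ahead of the terminal action. A brief sketch (the path and table name are placeholders):

    # mode()/format()/option() each return the writer, so they compose;
    # the terminal call (save, parquet, saveAsTable, ...) performs the write.
    df.write.format("parquet").mode("append").save("/tmp/out")
    df.write.mode("error").saveAsTable("people_backup")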
@@ -636,6 +603,9 @@ class DataFrame(object):
This includes count, mean, stddev, min, and max. If no columns are
given, this function computes statistics for all numerical columns.
+ .. note:: This function is meant for exploratory data analysis, as we make no \
+ guarantee about the backward compatibility of the schema of the resulting DataFrame.
+
>>> df.describe().show()
+-------+---+
|summary|age|
@@ -653,9 +623,11 @@ class DataFrame(object):
@ignore_unicode_prefix
@since(1.3)
def head(self, n=None):
- """
- Returns the first ``n`` rows as a list of :class:`Row`,
- or the first :class:`Row` if ``n`` is ``None.``
+ """Returns the first ``n`` rows.
+
+ :param n: int, default ``None``. Number of rows to return.
+ :return: If n is ``None``, return a single :class:`Row`.
+ Otherwise, return a list of :class:`Row`.
>>> df.head()
Row(age=2, name=u'Alice')
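To make the contract concrete, a behavior sketch against the module's example df (outputs shown as comments):

    df.head()   # Row(age=2, name=u'Alice')   -- a single Row when n is omitted
    df.head(1)  # [Row(age=2, name=u'Alice')] -- any explicit n yields a list
    df.head(2)  # the first two rows, as a list of Row objects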
@@ -1170,8 +1142,8 @@ class DataFrame(object):
"http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou".
:func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases.
- This function is meant for exploratory data analysis, as we make no guarantee about the
- backward compatibility of the schema of the resulting DataFrame.
+ .. note:: This function is meant for exploratory data analysis, as we make no \
+ guarantee about the backward compatibility of the schema of the resulting DataFrame.
:param cols: Names of the columns to calculate frequent items for as a list or tuple of
strings.
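For completeness, a hedged use of freqItems against the module's two-row example df; support is the minimum frequency for an item to count as frequent (the implementation defaults to 1%):

    # Result columns are named <col>_freqItems; values are approximate and
    # may contain false positives, per the algorithm cited above.
    freq = df.freqItems(["age", "name"], support=0.5)
    print(freq.collect())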