author     Nick Pentreath <nickp@za.ibm.com>          2016-10-14 15:07:32 -0700
committer  Michael Armbrust <michael@databricks.com>  2016-10-14 15:09:49 -0700
commit     5aeb7384c7aa5f487f031f9ae07d3f1653399d14 (patch)
tree       22c2fb40cc786812883239ba32cd8b7097eda57c /python
parent     da9aeb0fde589f7c21c2f4a32036a68c0353965d (diff)
[SPARK-16063][SQL] Add storageLevel to Dataset
[SPARK-11905](https://issues.apache.org/jira/browse/SPARK-11905) added support for `persist`/`cache` on `Dataset`. However, there was no user-facing API to check whether a `Dataset` is cached and, if so, at what storage level. This PR adds a `storageLevel` property to `Dataset`, analogous to `RDD.getStorageLevel`, and updates `DatasetCacheSuite` accordingly.

Author: Nick Pentreath <nickp@za.ibm.com>

Closes #13780 from MLnick/ds-storagelevel.

Signed-off-by: Michael Armbrust <michael@databricks.com>
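For context, here is how the new property reads from PySpark once this patch is applied. A minimal REPL sketch; the `spark` session and the `df`/`df2` DataFrames are illustrative assumptions, and the expected values mirror the docstring examples in the diff below:

```python
>>> from pyspark.sql import SparkSession
>>> from pyspark import StorageLevel
>>> spark = SparkSession.builder.getOrCreate()
>>> df = spark.range(10)        # illustrative DataFrame
>>> df.storageLevel             # not persisted yet
StorageLevel(False, False, False, False, 1)
>>> df.cache().storageLevel     # cache() uses the 2.0 default, MEMORY_AND_DISK
StorageLevel(True, True, False, True, 1)
>>> df2 = spark.range(10)
>>> df2.persist(StorageLevel.DISK_ONLY_2).storageLevel
StorageLevel(True, False, False, False, 2)
```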
Diffstat (limited to 'python')
-rw-r--r--  python/pyspark/sql/dataframe.py | 36
1 file changed, 30 insertions(+), 6 deletions(-)
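A reading aid for the docstring examples in the diff: PySpark's `StorageLevel` repr prints the constructor fields in the order `(useDisk, useMemory, useOffHeap, deserialized, replication)`. A quick illustrative check:

```python
>>> from pyspark import StorageLevel
>>> StorageLevel.DISK_ONLY_2    # disk only, 2x replicated
StorageLevel(True, False, False, False, 2)
>>> StorageLevel.DISK_ONLY_2.useDisk, StorageLevel.DISK_ONLY_2.replication
(True, 2)
```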
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index ce277eb204..7606ac08ba 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -407,24 +407,48 @@ class DataFrame(object):
@since(1.3)
def cache(self):
- """ Persists with the default storage level (C{MEMORY_ONLY}).
+ """Persists the :class:`DataFrame` with the default storage level (C{MEMORY_AND_DISK}).
+
+ .. note:: the default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0.
"""
self.is_cached = True
self._jdf.cache()
return self
@since(1.3)
- def persist(self, storageLevel=StorageLevel.MEMORY_ONLY):
- """Sets the storage level to persist its values across operations
- after the first time it is computed. This can only be used to assign
- a new storage level if the RDD does not have a storage level set yet.
- If no storage level is specified defaults to (C{MEMORY_ONLY}).
+ def persist(self, storageLevel=StorageLevel.MEMORY_AND_DISK):
+ """Sets the storage level to persist the contents of the :class:`DataFrame` across
+ operations after the first time it is computed. This can only be used to assign
+ a new storage level if the :class:`DataFrame` does not have a storage level set yet.
+ If no storage level is specified defaults to (C{MEMORY_AND_DISK}).
+
+ .. note:: the default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0.
"""
self.is_cached = True
javaStorageLevel = self._sc._getJavaStorageLevel(storageLevel)
self._jdf.persist(javaStorageLevel)
return self
+ @property
+ @since(2.1)
+ def storageLevel(self):
+ """Get the :class:`DataFrame`'s current storage level.
+
+ >>> df.storageLevel
+ StorageLevel(False, False, False, False, 1)
+ >>> df.cache().storageLevel
+ StorageLevel(True, True, False, True, 1)
+ >>> df2.persist(StorageLevel.DISK_ONLY_2).storageLevel
+ StorageLevel(True, False, False, False, 2)
+ """
+ java_storage_level = self._jdf.storageLevel()
+ storage_level = StorageLevel(java_storage_level.useDisk(),
+ java_storage_level.useMemory(),
+ java_storage_level.useOffHeap(),
+ java_storage_level.deserialized(),
+ java_storage_level.replication())
+ return storage_level
+
@since(1.3)
def unpersist(self, blocking=False):
"""Marks the :class:`DataFrame` as non-persistent, and remove all blocks for it from