author     Nick Pentreath <nickp@za.ibm.com>          2016-10-14 15:07:32 -0700
committer  Michael Armbrust <michael@databricks.com>  2016-10-14 15:09:49 -0700
commit     5aeb7384c7aa5f487f031f9ae07d3f1653399d14 (patch)
tree       22c2fb40cc786812883239ba32cd8b7097eda57c /python
parent     da9aeb0fde589f7c21c2f4a32036a68c0353965d (diff)
[SPARK-16063][SQL] Add storageLevel to Dataset
[SPARK-11905](https://issues.apache.org/jira/browse/SPARK-11905) added support for `persist`/`cache` on `Dataset`. However, there was no user-facing API to check whether a `Dataset` is cached and, if so, at what storage level. This PR adds a `storageLevel` property to `Dataset`, analogous to `RDD.getStorageLevel`, and updates `DatasetCacheSuite` accordingly.

Author: Nick Pentreath <nickp@za.ibm.com>

Closes #13780 from MLnick/ds-storagelevel.

Signed-off-by: Michael Armbrust <michael@databricks.com>
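For context, here is how the new property reads from PySpark once this patch is applied. A minimal REPL sketch; the `spark` session and the `df`/`df2` DataFrames are illustrative assumptions, and the expected values mirror the docstring examples in the diff below:

```python
>>> from pyspark.sql import SparkSession
>>> from pyspark import StorageLevel
>>> spark = SparkSession.builder.getOrCreate()
>>> df = spark.range(10)        # illustrative DataFrame
>>> df.storageLevel             # not persisted yet
StorageLevel(False, False, False, False, 1)
>>> df.cache().storageLevel     # cache() uses the 2.0 default, MEMORY_AND_DISK
StorageLevel(True, True, False, True, 1)
>>> df2 = spark.range(10)
>>> df2.persist(StorageLevel.DISK_ONLY_2).storageLevel
StorageLevel(True, False, False, False, 2)
```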
Diffstat (limited to 'python')
-rw-r--r--  python/pyspark/sql/dataframe.py | 36
1 file changed, 30 insertions(+), 6 deletions(-)
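A reading aid for the docstring examples in the diff: PySpark's `StorageLevel` repr prints the constructor fields in the order `(useDisk, useMemory, useOffHeap, deserialized, replication)`. A quick illustrative check:

```python
>>> from pyspark import StorageLevel
>>> StorageLevel.DISK_ONLY_2    # disk only, 2x replicated
StorageLevel(True, False, False, False, 2)
>>> StorageLevel.DISK_ONLY_2.useDisk, StorageLevel.DISK_ONLY_2.replication
(True, 2)
```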
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index ce277eb204..7606ac08ba 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -407,24 +407,48 @@ class DataFrame(object):
@since(1.3)
def cache(self):
- """ Persists with the default storage level (C{MEMORY_ONLY}).
+ """Persists the :class:`DataFrame` with the default storage level (C{MEMORY_AND_DISK}).
+
+ .. note:: the default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0.
"""
self.is_cached = True
self._jdf.cache()
return self
@since(1.3)
- def persist(self, storageLevel=StorageLevel.MEMORY_ONLY):
- """Sets the storage level to persist its values across operations
- after the first time it is computed. This can only be used to assign
- a new storage level if the RDD does not have a storage level set yet.
- If no storage level is specified defaults to (C{MEMORY_ONLY}).
+ def persist(self, storageLevel=StorageLevel.MEMORY_AND_DISK):
+ """Sets the storage level to persist the contents of the :class:`DataFrame` across
+ operations after the first time it is computed. This can only be used to assign
+ a new storage level if the :class:`DataFrame` does not have a storage level set yet.
+ If no storage level is specified defaults to (C{MEMORY_AND_DISK}).
+
+ .. note:: the default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0.
"""
self.is_cached = True
javaStorageLevel = self._sc._getJavaStorageLevel(storageLevel)
self._jdf.persist(javaStorageLevel)
return self
+ @property
+ @since(2.1)
+ def storageLevel(self):
+ """Get the :class:`DataFrame`'s current storage level.
+
+ >>> df.storageLevel
+ StorageLevel(False, False, False, False, 1)
+ >>> df.cache().storageLevel
+ StorageLevel(True, True, False, True, 1)
+ >>> df2.persist(StorageLevel.DISK_ONLY_2).storageLevel
+ StorageLevel(True, False, False, False, 2)
+ """
+ java_storage_level = self._jdf.storageLevel()
+ storage_level = StorageLevel(java_storage_level.useDisk(),
+ java_storage_level.useMemory(),
+ java_storage_level.useOffHeap(),
+ java_storage_level.deserialized(),
+ java_storage_level.replication())
+ return storage_level
+
@since(1.3)
def unpersist(self, blocking=False):
"""Marks the :class:`DataFrame` as non-persistent, and remove all blocks for it from