author | Nick Pentreath <nickp@za.ibm.com> | 2016-10-14 15:07:32 -0700
---|---|---
committer | Michael Armbrust <michael@databricks.com> | 2016-10-14 15:09:49 -0700
commit | 5aeb7384c7aa5f487f031f9ae07d3f1653399d14 |
tree | 22c2fb40cc786812883239ba32cd8b7097eda57c /python/pyspark |
parent | da9aeb0fde589f7c21c2f4a32036a68c0353965d |
[SPARK-16063][SQL] Add storageLevel to Dataset
[SPARK-11905](https://issues.apache.org/jira/browse/SPARK-11905) added support for `persist`/`cache` on `Dataset`. However, there was no user-facing API to check whether a `Dataset` is cached and, if so, at what storage level. This PR adds `storageLevel` to `Dataset`, analogous to `RDD.getStorageLevel`.
Updated `DatasetCacheSuite`.
Author: Nick Pentreath <nickp@za.ibm.com>
Closes #13780 from MLnick/ds-storagelevel.
Signed-off-by: Michael Armbrust <michael@databricks.com>
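
For a sense of how the new property behaves from PySpark, here is a minimal sketch; the `spark` session, the app name, and the `df`/`df2` names are assumptions for illustration, with the expected values taken from the doctests in the diff below:

```python
from pyspark import StorageLevel
from pyspark.sql import SparkSession

# Hypothetical session and DataFrames for illustration only.
spark = SparkSession.builder.appName("storage-level-demo").getOrCreate()
df = spark.range(10)

# An unpersisted DataFrame reports a "no storage" level.
print(df.storageLevel)           # StorageLevel(False, False, False, False, 1)

# cache() persists at the default level (MEMORY_AND_DISK as of 2.0).
print(df.cache().storageLevel)   # StorageLevel(True, True, False, True, 1)

# An explicitly requested level is reflected back.
df2 = spark.range(10).persist(StorageLevel.DISK_ONLY_2)
print(df2.storageLevel)          # StorageLevel(True, False, False, False, 2)
```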
Diffstat (limited to 'python/pyspark')
-rw-r--r-- | python/pyspark/sql/dataframe.py | 36
1 file changed, 30 insertions(+), 6 deletions(-)
```diff
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index ce277eb204..7606ac08ba 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -407,24 +407,48 @@ class DataFrame(object):
 
     @since(1.3)
     def cache(self):
-        """ Persists with the default storage level (C{MEMORY_ONLY}).
+        """Persists the :class:`DataFrame` with the default storage level (C{MEMORY_AND_DISK}).
+
+        .. note:: the default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0.
         """
         self.is_cached = True
         self._jdf.cache()
         return self
 
     @since(1.3)
-    def persist(self, storageLevel=StorageLevel.MEMORY_ONLY):
-        """Sets the storage level to persist its values across operations
-        after the first time it is computed. This can only be used to assign
-        a new storage level if the RDD does not have a storage level set yet.
-        If no storage level is specified defaults to (C{MEMORY_ONLY}).
+    def persist(self, storageLevel=StorageLevel.MEMORY_AND_DISK):
+        """Sets the storage level to persist the contents of the :class:`DataFrame` across
+        operations after the first time it is computed. This can only be used to assign
+        a new storage level if the :class:`DataFrame` does not have a storage level set yet.
+        If no storage level is specified defaults to (C{MEMORY_AND_DISK}).
+
+        .. note:: the default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0.
         """
         self.is_cached = True
         javaStorageLevel = self._sc._getJavaStorageLevel(storageLevel)
         self._jdf.persist(javaStorageLevel)
         return self
 
+    @property
+    @since(2.1)
+    def storageLevel(self):
+        """Get the :class:`DataFrame`'s current storage level.
+
+        >>> df.storageLevel
+        StorageLevel(False, False, False, False, 1)
+        >>> df.cache().storageLevel
+        StorageLevel(True, True, False, True, 1)
+        >>> df2.persist(StorageLevel.DISK_ONLY_2).storageLevel
+        StorageLevel(True, False, False, False, 2)
+        """
+        java_storage_level = self._jdf.storageLevel()
+        storage_level = StorageLevel(java_storage_level.useDisk(),
+                                     java_storage_level.useMemory(),
+                                     java_storage_level.useOffHeap(),
+                                     java_storage_level.deserialized(),
+                                     java_storage_level.replication())
+        return storage_level
+
     @since(1.3)
     def unpersist(self, blocking=False):
         """Marks the :class:`DataFrame` as non-persistent, and remove all blocks for it from
```
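
As a side note on the conversion at the end of the new property: the five positional arguments read back from the JVM level match the Python `StorageLevel` constructor. A small sketch (the `level` name is illustrative):

```python
from pyspark import StorageLevel

# StorageLevel(useDisk, useMemory, useOffHeap, deserialized, replication):
# the same five flags the new property reads back from the JVM level.
level = StorageLevel(True, False, False, False, 2)

print(level)              # StorageLevel(True, False, False, False, 2)
print(level.useDisk)      # True
print(level.replication)  # 2
```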