aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/rdd.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/pyspark/rdd.py')
-rw-r--r-- python/pyspark/rdd.py 17
1 files changed, 5 insertions, 12 deletions
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 2a2ff9b271..7b6ab956ee 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -52,18 +52,11 @@ class RDD(object):
def checkpoint(self):
"""
- Mark this RDD for checkpointing. The RDD will be saved to a file inside
- `checkpointDir` (set using setCheckpointDir()) and all references to
- its parent RDDs will be removed. This is used to truncate very long
- lineages. In the current implementation, Spark will save this RDD to
- a file (using saveAsObjectFile()) after the first job using this RDD is
- done. Hence, it is strongly recommended to use checkpoint() on RDDs
- when
-
- (i) checkpoint() is called before the any job has been executed on this
- RDD.
-
- (ii) This RDD has been made to persist in memory. Otherwise saving it
+ Mark this RDD for checkpointing. It will be saved to a file inside the
+ checkpoint directory set with L{SparkContext.setCheckpointDir()} and
+ all references to its parent RDDs will be removed. This function must
+ be called before any job has been executed on this RDD. It is strongly
+ recommended that this RDD is persisted in memory, otherwise saving it
on a file will require recomputation.
"""
self.is_checkpointed = True