diff options
author | Josh Rosen <joshrosen@eecs.berkeley.edu> | 2013-01-20 15:31:41 -0800 |
---|---|---|
committer | Josh Rosen <joshrosen@eecs.berkeley.edu> | 2013-01-20 15:31:41 -0800 |
commit | 5b6ea9e9a04994553d0319c541ca356e2e3064a7 (patch) | |
tree | a2af005f7ec7524707bdaf649290c035febed0dd /python/pyspark/rdd.py | |
parent | d0ba80dc727d00b2b7627dcefd2c77009af55f7d (diff) | |
download | spark-5b6ea9e9a04994553d0319c541ca356e2e3064a7.tar.gz spark-5b6ea9e9a04994553d0319c541ca356e2e3064a7.tar.bz2 spark-5b6ea9e9a04994553d0319c541ca356e2e3064a7.zip |
Update checkpointing API docs in Python/Java.
Diffstat (limited to 'python/pyspark/rdd.py')
-rw-r--r-- | python/pyspark/rdd.py | 17 |
1 file changed, 5 insertions, 12 deletions
```diff
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 2a2ff9b271..7b6ab956ee 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -52,18 +52,11 @@ class RDD(object):
 
     def checkpoint(self):
         """
-        Mark this RDD for checkpointing. The RDD will be saved to a file inside
-        `checkpointDir` (set using setCheckpointDir()) and all references to
-        its parent RDDs will be removed. This is used to truncate very long
-        lineages. In the current implementation, Spark will save this RDD to
-        a file (using saveAsObjectFile()) after the first job using this RDD is
-        done. Hence, it is strongly recommended to use checkpoint() on RDDs
-        when
-
-        (i) checkpoint() is called before the any job has been executed on this
-        RDD.
-
-        (ii) This RDD has been made to persist in memory. Otherwise saving it
+        Mark this RDD for checkpointing. It will be saved to a file inside the
+        checkpoint directory set with L{SparkContext.setCheckpointDir()} and
+        all references to its parent RDDs will be removed. This function must
+        be called before any job has been executed on this RDD. It is strongly
+        recommended that this RDD is persisted in memory, otherwise saving it
         on a file will require recomputation.
         """
         self.is_checkpointed = True
```