path: root/python/pyspark/ml/param
author    Yanbo Liang <ybliang8@gmail.com>    2015-11-19 22:14:01 -0800
committer Xiangrui Meng <meng@databricks.com>    2015-11-19 22:14:01 -0800
commit    7216f405454f6f3557b5b1f72df8f393605faf60
tree      456ae090ae728f1f74b345f0c6ea55ce75efe7be /python/pyspark/ml/param
parent    3b7f056da87a23f3a96f0311b3a947a9b698f38b
[SPARK-11875][ML][PYSPARK] Update doc for PySpark HasCheckpointInterval
* Update doc for PySpark ```HasCheckpointInterval``` so that users can understand how to disable checkpointing.
* Update doc for PySpark ```cacheNodeIds``` of ```DecisionTreeParams``` to note the relationship between ```cacheNodeIds``` and ```checkpointInterval```.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #9856 from yanboliang/spark-11875.
Diffstat (limited to 'python/pyspark/ml/param')
-rw-r--r-- python/pyspark/ml/param/_shared_params_code_gen.py |  6
-rw-r--r-- python/pyspark/ml/param/shared.py                  | 14
2 files changed, 11 insertions, 9 deletions
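For context, a minimal sketch of how the documented values behave in practice, assuming a live SparkContext named ```sc``` and a training DataFrame ```train``` with the default "features"/"label" columns (both hypothetical here): a positive interval checkpoints the internal cache every that many iterations, while -1 disables checkpointing.

```python
from pyspark.ml.classification import GBTClassifier

# Checkpointing only takes effect once a checkpoint directory is configured.
sc.setCheckpointDir("/tmp/spark-checkpoints")  # hypothetical path

gbt = GBTClassifier(maxIter=20)
gbt.setCheckpointInterval(10)    # checkpoint the cache every 10 iterations
# gbt.setCheckpointInterval(-1)  # or disable checkpointing entirely
model = gbt.fit(train)           # `train` is an assumed DataFrame with features/label columns
```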
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py
index 070c5db01a..0528dc1e3a 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -118,7 +118,8 @@ if __name__ == "__main__":
("inputCols", "input column names.", None),
("outputCol", "output column name.", "self.uid + '__output'"),
("numFeatures", "number of features.", None),
- ("checkpointInterval", "checkpoint interval (>= 1).", None),
+ ("checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). " +
+ "E.g. 10 means that the cache will get checkpointed every 10 iterations.", None),
("seed", "random seed.", "hash(type(self).__name__)"),
("tol", "the convergence tolerance for iterative algorithms.", None),
("stepSize", "Step size to be used for each iteration of optimization.", None),
@@ -157,7 +158,8 @@ if __name__ == "__main__":
("maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation."),
("cacheNodeIds", "If false, the algorithm will pass trees to executors to match " +
"instances with nodes. If true, the algorithm will cache node IDs for each instance. " +
- "Caching can speed up training of deeper trees.")]
+ "Caching can speed up training of deeper trees. Users can set how often should the " +
+ "cache be checkpointed or disable it by setting checkpointInterval.")]
decisionTreeCode = '''class DecisionTreeParams(Params):
"""
diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py
index 4bdf2a8cc5..4d96080150 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -325,16 +325,16 @@ class HasNumFeatures(Params):
class HasCheckpointInterval(Params):
"""
- Mixin for param checkpointInterval: checkpoint interval (>= 1).
+ Mixin for param checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.
"""
# a placeholder to make it appear in the generated doc
- checkpointInterval = Param(Params._dummy(), "checkpointInterval", "checkpoint interval (>= 1).")
+ checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.")
def __init__(self):
super(HasCheckpointInterval, self).__init__()
- #: param for checkpoint interval (>= 1).
- self.checkpointInterval = Param(self, "checkpointInterval", "checkpoint interval (>= 1).")
+ #: param for set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.
+ self.checkpointInterval = Param(self, "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.")
def setCheckpointInterval(self, value):
"""
@@ -636,7 +636,7 @@ class DecisionTreeParams(Params):
minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.")
minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
- cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
+ cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.")
def __init__(self):
@@ -651,8 +651,8 @@ class DecisionTreeParams(Params):
self.minInfoGain = Param(self, "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
#: param for Maximum memory in MB allocated to histogram aggregation.
self.maxMemoryInMB = Param(self, "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
- #: param for If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.
- self.cacheNodeIds = Param(self, "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
+ #: param for If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.
+ self.cacheNodeIds = Param(self, "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.")
def setMaxDepth(self, value):
"""