aboutsummaryrefslogtreecommitdiff
path: root/examples/src/main
diff options
context:
space:
mode:
author: Sung Chung <schung@alpinenow.com> 2014-11-01 16:58:26 -0700
committer: Xiangrui Meng <meng@databricks.com> 2014-11-01 16:58:26 -0700
commit: 56f2c61cde3f5d906c2a58e9af1a661222f2c679 (patch)
tree: caa90f8479249480567dd75cec39629ff4f865ab /examples/src/main
parent: d8176b1c2f22247ee724041aefa1af9118cf861d (diff)
download: spark-56f2c61cde3f5d906c2a58e9af1a661222f2c679.tar.gz
spark-56f2c61cde3f5d906c2a58e9af1a661222f2c679.tar.bz2
spark-56f2c61cde3f5d906c2a58e9af1a661222f2c679.zip
[SPARK-3161][MLLIB] Adding a node Id caching mechanism for training decision trees.
jkbradley mengxr chouqin Please review this.
Author: Sung Chung <schung@alpinenow.com>
Closes #2868 from codedeft/SPARK-3161 and squashes the following commits:
5f5a156 [Sung Chung] [SPARK-3161][MLLIB] Adding a node Id caching mechanism for training decision trees.
Diffstat (limited to 'examples/src/main')
-rw-r--r-- examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala | 25
1 file changed, 23 insertions, 2 deletions
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala
index f98730366b..49751a3049 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala
@@ -62,7 +62,10 @@ object DecisionTreeRunner {
minInfoGain: Double = 0.0,
numTrees: Int = 1,
featureSubsetStrategy: String = "auto",
- fracTest: Double = 0.2) extends AbstractParams[Params]
+ fracTest: Double = 0.2,
+ useNodeIdCache: Boolean = false,
+ checkpointDir: Option[String] = None,
+ checkpointInterval: Int = 10) extends AbstractParams[Params]
def main(args: Array[String]) {
val defaultParams = Params()
@@ -102,6 +105,21 @@ object DecisionTreeRunner {
.text(s"fraction of data to hold out for testing. If given option testInput, " +
s"this option is ignored. default: ${defaultParams.fracTest}")
.action((x, c) => c.copy(fracTest = x))
+ opt[Boolean]("useNodeIdCache")
+ .text(s"whether to use node Id cache during training, " +
+ s"default: ${defaultParams.useNodeIdCache}")
+ .action((x, c) => c.copy(useNodeIdCache = x))
+ opt[String]("checkpointDir")
+ .text(s"checkpoint directory where intermediate node Id caches will be stored, " +
+ s"default: ${defaultParams.checkpointDir match {
+ case Some(strVal) => strVal
+ case None => "None"
+ }}")
+ .action((x, c) => c.copy(checkpointDir = Some(x)))
+ opt[Int]("checkpointInterval")
+ .text(s"how often to checkpoint the node Id cache, " +
+ s"default: ${defaultParams.checkpointInterval}")
+ .action((x, c) => c.copy(checkpointInterval = x))
opt[String]("testInput")
.text(s"input path to test dataset. If given, option fracTest is ignored." +
s" default: ${defaultParams.testInput}")
@@ -236,7 +254,10 @@ object DecisionTreeRunner {
maxBins = params.maxBins,
numClassesForClassification = numClasses,
minInstancesPerNode = params.minInstancesPerNode,
- minInfoGain = params.minInfoGain)
+ minInfoGain = params.minInfoGain,
+ useNodeIdCache = params.useNodeIdCache,
+ checkpointDir = params.checkpointDir,
+ checkpointInterval = params.checkpointInterval)
if (params.numTrees == 1) {
val startTime = System.nanoTime()
val model = DecisionTree.train(training, strategy)