aboutsummaryrefslogtreecommitdiff
path: root/docs/mllib-decision-tree.md
diff options
context:
space:
mode:
authorXiangrui Meng <meng@databricks.com>2014-09-08 18:59:57 -0700
committerXiangrui Meng <meng@databricks.com>2014-09-08 18:59:57 -0700
commit50a4fa774a0e8a17d7743b33ce8941bf4041144d (patch)
tree18089ba49e1450cf1b76238c9b435883f7003474 /docs/mllib-decision-tree.md
parent7db53391f1b349d1f49844197b34f94806f5e336 (diff)
downloadspark-50a4fa774a0e8a17d7743b33ce8941bf4041144d.tar.gz
spark-50a4fa774a0e8a17d7743b33ce8941bf4041144d.tar.bz2
spark-50a4fa774a0e8a17d7743b33ce8941bf4041144d.zip
[SPARK-3443][MLLIB] update default values of tree:
Adjust the default values of decision tree, based on the memory requirement discussed in https://github.com/apache/spark/pull/2125 : 1. maxMemoryInMB: 128 -> 256 2. maxBins: 100 -> 32 3. maxDepth: 4 -> 5 (in some example code) jkbradley Author: Xiangrui Meng <meng@databricks.com> Closes #2322 from mengxr/tree-defaults and squashes the following commits: cda453a [Xiangrui Meng] fix tests 5900445 [Xiangrui Meng] update comments 8c81831 [Xiangrui Meng] update default values of tree:
Diffstat (limited to 'docs/mllib-decision-tree.md')
-rw-r--r--docs/mllib-decision-tree.md16
1 file changed, 8 insertions, 8 deletions
diff --git a/docs/mllib-decision-tree.md b/docs/mllib-decision-tree.md
index 1166d9cd15..12a6afbeea 100644
--- a/docs/mllib-decision-tree.md
+++ b/docs/mllib-decision-tree.md
@@ -80,7 +80,7 @@ The ordered splits create "bins" and the maximum number of such
bins can be specified using the `maxBins` parameter.
Note that the number of bins cannot be greater than the number of instances `$N$` (a rare scenario
-since the default `maxBins` value is 100). The tree algorithm automatically reduces the number of
+since the default `maxBins` value is 32). The tree algorithm automatically reduces the number of
bins if the condition is not satisfied.
**Categorical features**
@@ -117,7 +117,7 @@ all nodes at each level of the tree. This could lead to high memory requirements
of the tree, potentially leading to memory overflow errors. To alleviate this problem, a `maxMemoryInMB`
training parameter specifies the maximum amount of memory at the workers (twice as much at the
master) to be allocated to the histogram computation. The default value is conservatively chosen to
-be 128 MB to allow the decision algorithm to work in most scenarios. Once the memory requirements
+be 256 MB to allow the decision algorithm to work in most scenarios. Once the memory requirements
for a level-wise computation cross the `maxMemoryInMB` threshold, the node training tasks at each
subsequent level are split into smaller tasks.
@@ -167,7 +167,7 @@ val numClasses = 2
val categoricalFeaturesInfo = Map[Int, Int]()
val impurity = "gini"
val maxDepth = 5
-val maxBins = 100
+val maxBins = 32
val model = DecisionTree.trainClassifier(data, numClasses, categoricalFeaturesInfo, impurity,
maxDepth, maxBins)
@@ -213,7 +213,7 @@ Integer numClasses = 2;
HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
String impurity = "gini";
Integer maxDepth = 5;
-Integer maxBins = 100;
+Integer maxBins = 32;
// Train a DecisionTree model for classification.
final DecisionTreeModel model = DecisionTree.trainClassifier(data, numClasses,
@@ -250,7 +250,7 @@ data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt').cache()
# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={},
- impurity='gini', maxDepth=5, maxBins=100)
+ impurity='gini', maxDepth=5, maxBins=32)
# Evaluate model on training instances and compute training error
predictions = model.predict(data.map(lambda x: x.features))
@@ -293,7 +293,7 @@ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").cache
val categoricalFeaturesInfo = Map[Int, Int]()
val impurity = "variance"
val maxDepth = 5
-val maxBins = 100
+val maxBins = 32
val model = DecisionTree.trainRegressor(data, categoricalFeaturesInfo, impurity,
maxDepth, maxBins)
@@ -338,7 +338,7 @@ JavaSparkContext sc = new JavaSparkContext(sparkConf);
HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
String impurity = "variance";
Integer maxDepth = 5;
-Integer maxBins = 100;
+Integer maxBins = 32;
// Train a DecisionTree model.
final DecisionTreeModel model = DecisionTree.trainRegressor(data,
@@ -380,7 +380,7 @@ data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt').cache()
# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(data, categoricalFeaturesInfo={},
- impurity='variance', maxDepth=5, maxBins=100)
+ impurity='variance', maxDepth=5, maxBins=32)
# Evaluate model on training instances and compute training error
predictions = model.predict(data.map(lambda x: x.features))