diff options
author | sethah <seth.hendrickson16@gmail.com> | 2016-03-11 09:54:23 +0200 |
---|---|---|
committer | Nick Pentreath <nick.pentreath@gmail.com> | 2016-03-11 09:54:23 +0200 |
commit | 234f781ae19370ce1ab485e364a6fdab7ed2598c (patch) | |
tree | 51f3ceb0e1120968bd541afe61948e90d664bd13 /python/pyspark/ml | |
parent | 0b713e0455d01999d5a027ddc2ea8527eb085b34 (diff) | |
download | spark-234f781ae19370ce1ab485e364a6fdab7ed2598c.tar.gz spark-234f781ae19370ce1ab485e364a6fdab7ed2598c.tar.bz2 spark-234f781ae19370ce1ab485e364a6fdab7ed2598c.zip |
[SPARK-13787][ML][PYSPARK] Pyspark feature importances for decision tree and random forest
## What changes were proposed in this pull request?
This patch adds a `featureImportances` property to the Pyspark API for `DecisionTreeRegressionModel`, `DecisionTreeClassificationModel`, `RandomForestRegressionModel` and `RandomForestClassificationModel`.
## How was this patch tested?
Python doc tests for the affected classes were updated to check feature importances.
Author: sethah <seth.hendrickson16@gmail.com>
Closes #11622 from sethah/SPARK-13787.
Diffstat (limited to 'python/pyspark/ml')
-rw-r--r-- | python/pyspark/ml/classification.py | 44 | ||||
-rw-r--r-- | python/pyspark/ml/regression.py | 44 |
2 files changed, 88 insertions, 0 deletions
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 29d1d203f2..ec8834a89e 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -285,6 +285,8 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred 3 >>> model.depth 1 + >>> model.featureImportances + SparseVector(1, {0: 1.0}) >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> result = model.transform(test0).head() >>> result.prediction @@ -352,6 +354,27 @@ class DecisionTreeClassificationModel(DecisionTreeModel): .. versionadded:: 1.4.0 """ + @property + @since("2.0.0") + def featureImportances(self): + """ + Estimate of the importance of each feature. + + This generalizes the idea of "Gini" importance to other losses, + following the explanation of Gini importance from "Random Forests" documentation + by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn. + + This feature importance is calculated as follows: + - importance(feature j) = sum (over nodes which split on feature j) of the gain, + where gain is scaled by the number of instances passing through node + - Normalize importances for tree to sum to 1. + + Note: Feature importance for single decision trees can have high variance due to + correlated predictor variables. Consider using a :class:`RandomForestClassifier` + to determine feature importance instead. 
+ """ + return self._call_java("featureImportances") + @inherit_doc class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed, @@ -375,6 +398,8 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred >>> td = si_model.transform(df) >>> rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed", seed=42) >>> model = rf.fit(td) + >>> model.featureImportances + SparseVector(1, {0: 1.0}) >>> allclose(model.treeWeights, [1.0, 1.0, 1.0]) True >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) @@ -443,6 +468,25 @@ class RandomForestClassificationModel(TreeEnsembleModels): .. versionadded:: 1.4.0 """ + @property + @since("2.0.0") + def featureImportances(self): + """ + Estimate of the importance of each feature. + + This generalizes the idea of "Gini" importance to other losses, + following the explanation of Gini importance from "Random Forests" documentation + by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn. + + This feature importance is calculated as follows: + - Average over trees: + - importance(feature j) = sum (over nodes which split on feature j) of the gain, + where gain is scaled by the number of instances passing through node + - Normalize importances for tree to sum to 1. + - Normalize feature importance vector to sum to 1. 
+ """ + return self._call_java("featureImportances") + @inherit_doc class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 6b994fe9f9..6e23393f91 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -401,6 +401,8 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi 1 >>> model.numNodes 3 + >>> model.featureImportances + SparseVector(1, {0: 1.0}) >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -499,6 +501,27 @@ class DecisionTreeRegressionModel(DecisionTreeModel): .. versionadded:: 1.4.0 """ + @property + @since("2.0.0") + def featureImportances(self): + """ + Estimate of the importance of each feature. + + This generalizes the idea of "Gini" importance to other losses, + following the explanation of Gini importance from "Random Forests" documentation + by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn. + + This feature importance is calculated as follows: + - importance(feature j) = sum (over nodes which split on feature j) of the gain, + where gain is scaled by the number of instances passing through node + - Normalize importances for tree to sum to 1. + + Note: Feature importance for single decision trees can have high variance due to + correlated predictor variables. Consider using a :class:`RandomForestRegressor` + to determine feature importance instead. + """ + return self._call_java("featureImportances") + @inherit_doc class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed, @@ -515,6 +538,8 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi ... 
(0.0, Vectors.sparse(1, [], []))], ["label", "features"]) >>> rf = RandomForestRegressor(numTrees=2, maxDepth=2, seed=42) >>> model = rf.fit(df) + >>> model.featureImportances + SparseVector(1, {0: 1.0}) >>> allclose(model.treeWeights, [1.0, 1.0]) True >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) @@ -579,6 +604,25 @@ class RandomForestRegressionModel(TreeEnsembleModels): .. versionadded:: 1.4.0 """ + @property + @since("2.0.0") + def featureImportances(self): + """ + Estimate of the importance of each feature. + + This generalizes the idea of "Gini" importance to other losses, + following the explanation of Gini importance from "Random Forests" documentation + by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn. + + This feature importance is calculated as follows: + - Average over trees: + - importance(feature j) = sum (over nodes which split on feature j) of the gain, + where gain is scaled by the number of instances passing through node + - Normalize importances for tree to sum to 1. + - Normalize feature importance vector to sum to 1. + """ + return self._call_java("featureImportances") + @inherit_doc class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, |