author     sethah <seth.hendrickson16@gmail.com>          2016-03-11 09:54:23 +0200
committer  Nick Pentreath <nick.pentreath@gmail.com>      2016-03-11 09:54:23 +0200
commit     234f781ae19370ce1ab485e364a6fdab7ed2598c (patch)
tree       51f3ceb0e1120968bd541afe61948e90d664bd13 /python
parent     0b713e0455d01999d5a027ddc2ea8527eb085b34 (diff)
[SPARK-13787][ML][PYSPARK] Pyspark feature importances for decision tree and random forest
## What changes were proposed in this pull request?

This patch adds a `featureImportances` property to the PySpark API for `DecisionTreeRegressionModel`, `DecisionTreeClassificationModel`, `RandomForestRegressionModel`, and `RandomForestClassificationModel`.

## How was this patch tested?

Python doc tests for the affected classes were updated to check feature importances.

Author: sethah <seth.hendrickson16@gmail.com>

Closes #11622 from sethah/SPARK-13787.
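As a quick usage sketch of the new property, the snippet below mirrors the doctests in the diff; the app name and toy data are illustrative only, and `Vectors` is imported from `pyspark.mllib.linalg` as the doctests of this era do:

```python
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier

sc = SparkContext(appName="feature-importances-demo")  # illustrative app name
sqlContext = SQLContext(sc)

# Toy data: a single feature that perfectly separates the two labels.
df = sqlContext.createDataFrame(
    [(1.0, Vectors.dense(1.0)),
     (0.0, Vectors.sparse(1, [], []))],
    ["label", "features"])

# Index the label column so the classifier knows the number of classes.
si_model = StringIndexer(inputCol="label", outputCol="indexed").fit(df)
td = si_model.transform(df)

model = DecisionTreeClassifier(maxDepth=2, labelCol="indexed").fit(td)

# The property added by this patch: a vector of per-feature importances.
print(model.featureImportances)  # SparseVector(1, {0: 1.0})
```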
Diffstat (limited to 'python')
-rw-r--r--   python/pyspark/ml/classification.py   44
-rw-r--r--   python/pyspark/ml/regression.py       44
2 files changed, 88 insertions(+), 0 deletions(-)
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 29d1d203f2..ec8834a89e 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -285,6 +285,8 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
3
>>> model.depth
1
+ >>> model.featureImportances
+ SparseVector(1, {0: 1.0})
>>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
>>> result = model.transform(test0).head()
>>> result.prediction
@@ -352,6 +354,27 @@ class DecisionTreeClassificationModel(DecisionTreeModel):
.. versionadded:: 1.4.0
"""
+ @property
+ @since("2.0.0")
+ def featureImportances(self):
+ """
+ Estimate of the importance of each feature.
+
+ This generalizes the idea of "Gini" importance to other losses,
+ following the explanation of Gini importance from "Random Forests" documentation
+ by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn.
+
+ This feature importance is calculated as follows:
+ - importance(feature j) = sum (over nodes which split on feature j) of the gain,
+ where gain is scaled by the number of instances passing through node
+ - Normalize importances for tree to sum to 1.
+
+ Note: Feature importance for single decision trees can have high variance due to
+ correlated predictor variables. Consider using a :class:`RandomForestClassifier`
+ to determine feature importance instead.
+ """
+ return self._call_java("featureImportances")
+
@inherit_doc
class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed,
@@ -375,6 +398,8 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
>>> td = si_model.transform(df)
>>> rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed", seed=42)
>>> model = rf.fit(td)
+ >>> model.featureImportances
+ SparseVector(1, {0: 1.0})
>>> allclose(model.treeWeights, [1.0, 1.0, 1.0])
True
>>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
@@ -443,6 +468,25 @@ class RandomForestClassificationModel(TreeEnsembleModels):
.. versionadded:: 1.4.0
"""
+ @property
+ @since("2.0.0")
+ def featureImportances(self):
+ """
+ Estimate of the importance of each feature.
+
+ This generalizes the idea of "Gini" importance to other losses,
+ following the explanation of Gini importance from "Random Forests" documentation
+ by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn.
+
+ This feature importance is calculated as follows:
+ - Average over trees:
+ - importance(feature j) = sum (over nodes which split on feature j) of the gain,
+ where gain is scaled by the number of instances passing through node
+ - Normalize importances for tree to sum to 1.
+ - Normalize feature importance vector to sum to 1.
+ """
+ return self._call_java("featureImportances")
+
@inherit_doc
class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter,
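To make the calculation described in the docstrings above concrete, here is a small hand-rolled sketch of the per-tree and per-ensemble steps. The `(feature_index, gain, num_instances)` node tuples and both helper functions are hypothetical illustrations, not part of Spark's API or of its internal tree representation:

```python
from collections import defaultdict

def tree_feature_importances(nodes, num_features):
    """Gain-based importance for one tree.

    `nodes` is a list of (feature_index, gain, num_instances) tuples,
    one per internal (splitting) node -- a made-up representation used
    only for illustration.
    """
    importances = defaultdict(float)
    for feature, gain, num_instances in nodes:
        # Gain is scaled by the number of instances passing through the node.
        importances[feature] += gain * num_instances
    total = sum(importances.values())
    # Normalize importances for the tree to sum to 1.
    return [importances[j] / total if total > 0 else 0.0
            for j in range(num_features)]

def forest_feature_importances(trees, num_features):
    """Average the per-tree vectors, then normalize the result to sum to 1."""
    avg = [0.0] * num_features
    for nodes in trees:
        per_tree = tree_feature_importances(nodes, num_features)
        for j, imp in enumerate(per_tree):
            avg[j] += imp / len(trees)
    total = sum(avg)
    return [v / total if total > 0 else 0.0 for v in avg]

# Two features; feature 0 carries most of the gain in both trees.
tree1 = [(0, 0.5, 100), (1, 0.1, 40)]
tree2 = [(0, 0.4, 100)]
print(forest_feature_importances([tree1, tree2], num_features=2))
```

On the single-feature doctest data, every split uses feature 0, so both variants trivially return `[1.0]`, which matches the `SparseVector(1, {0: 1.0})` shown in the doctests.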
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 6b994fe9f9..6e23393f91 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -401,6 +401,8 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
1
>>> model.numNodes
3
+ >>> model.featureImportances
+ SparseVector(1, {0: 1.0})
>>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
>>> model.transform(test0).head().prediction
0.0
@@ -499,6 +501,27 @@ class DecisionTreeRegressionModel(DecisionTreeModel):
.. versionadded:: 1.4.0
"""
+ @property
+ @since("2.0.0")
+ def featureImportances(self):
+ """
+ Estimate of the importance of each feature.
+
+ This generalizes the idea of "Gini" importance to other losses,
+ following the explanation of Gini importance from "Random Forests" documentation
+ by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn.
+
+ This feature importance is calculated as follows:
+ - importance(feature j) = sum (over nodes which split on feature j) of the gain,
+ where gain is scaled by the number of instances passing through node
+ - Normalize importances for tree to sum to 1.
+
+ Note: Feature importance for single decision trees can have high variance due to
+ correlated predictor variables. Consider using a :class:`RandomForestRegressor`
+ to determine feature importance instead.
+ """
+ return self._call_java("featureImportances")
+
@inherit_doc
class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed,
@@ -515,6 +538,8 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
... (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
>>> rf = RandomForestRegressor(numTrees=2, maxDepth=2, seed=42)
>>> model = rf.fit(df)
+ >>> model.featureImportances
+ SparseVector(1, {0: 1.0})
>>> allclose(model.treeWeights, [1.0, 1.0])
True
>>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
@@ -579,6 +604,25 @@ class RandomForestRegressionModel(TreeEnsembleModels):
.. versionadded:: 1.4.0
"""
+ @property
+ @since("2.0.0")
+ def featureImportances(self):
+ """
+ Estimate of the importance of each feature.
+
+ This generalizes the idea of "Gini" importance to other losses,
+ following the explanation of Gini importance from "Random Forests" documentation
+ by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn.
+
+ This feature importance is calculated as follows:
+ - Average over trees:
+ - importance(feature j) = sum (over nodes which split on feature j) of the gain,
+ where gain is scaled by the number of instances passing through node
+ - Normalize importances for tree to sum to 1.
+ - Normalize feature importance vector to sum to 1.
+ """
+ return self._call_java("featureImportances")
+
@inherit_doc
class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter,