aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorPeng, Meng <peng.meng@intel.com>2016-09-21 10:17:38 +0100
committerSean Owen <sowen@cloudera.com>2016-09-21 10:17:38 +0100
commitb366f18496e1ce8bd20fe58a0245ef7d91819a03 (patch)
tree3a2e189a94ad1fc49040d721eb586724bc493097 /python
parent28fafa3ee8f3478fa441e7bd6c8fd4ab482ca98e (diff)
downloadspark-b366f18496e1ce8bd20fe58a0245ef7d91819a03.tar.gz
spark-b366f18496e1ce8bd20fe58a0245ef7d91819a03.tar.bz2
spark-b366f18496e1ce8bd20fe58a0245ef7d91819a03.zip
[SPARK-17017][MLLIB][ML] add a chiSquare Selector based on False Positive Rate (FPR) test
## What changes were proposed in this pull request?
Univariate feature selection works by selecting the best features based on univariate statistical tests. False Positive Rate (FPR) is a popular univariate statistical test for feature selection. This PR adds a chi-squared selector based on the False Positive Rate (FPR) test, as implemented in scikit-learn: http://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection
## How was this patch tested?
Added Scala unit tests.
Author: Peng, Meng <peng.meng@intel.com> Closes #14597 from mpjlu/fprChiSquare.
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/mllib/feature.py71
1 file changed, 66 insertions, 5 deletions
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 5d99644fca..077c11370e 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -271,11 +271,22 @@ class ChiSqSelectorModel(JavaVectorTransformer):
return JavaVectorTransformer.transform(self, vector)
+class ChiSqSelectorType:
+ """
+ This class defines the selector types of Chi Square Selector.
+ """
+ KBest, Percentile, FPR = range(3)
+
+
class ChiSqSelector(object):
"""
Creates a ChiSquared feature selector.
-
- :param numTopFeatures: number of features that selector will select.
+ The selector supports three selection methods: `KBest`, `Percentile` and `FPR`.
+ `KBest` chooses the `k` top features according to a chi-squared test.
+ `Percentile` is similar but chooses a fraction of all features instead of a fixed number.
+ `FPR` chooses all features whose false positive rate meets some threshold.
+ By default, the selection method is `KBest`, the default number of top features is 50.
+ Users can call setNumTopFeatures, setPercentile, or setAlpha to choose a selection method.
>>> data = [
... LabeledPoint(0.0, SparseVector(3, {0: 8.0, 1: 7.0})),
@@ -283,16 +294,58 @@ class ChiSqSelector(object):
... LabeledPoint(1.0, [0.0, 9.0, 8.0]),
... LabeledPoint(2.0, [8.0, 9.0, 5.0])
... ]
- >>> model = ChiSqSelector(1).fit(sc.parallelize(data))
+ >>> model = ChiSqSelector().setNumTopFeatures(1).fit(sc.parallelize(data))
+ >>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0}))
+ SparseVector(1, {0: 6.0})
+ >>> model.transform(DenseVector([8.0, 9.0, 5.0]))
+ DenseVector([5.0])
+ >>> model = ChiSqSelector().setPercentile(0.34).fit(sc.parallelize(data))
>>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0}))
SparseVector(1, {0: 6.0})
>>> model.transform(DenseVector([8.0, 9.0, 5.0]))
DenseVector([5.0])
+ >>> data = [
+ ... LabeledPoint(0.0, SparseVector(4, {0: 8.0, 1: 7.0})),
+ ... LabeledPoint(1.0, SparseVector(4, {1: 9.0, 2: 6.0, 3: 4.0})),
+ ... LabeledPoint(1.0, [0.0, 9.0, 8.0, 4.0]),
+ ... LabeledPoint(2.0, [8.0, 9.0, 5.0, 9.0])
+ ... ]
+ >>> model = ChiSqSelector().setAlpha(0.1).fit(sc.parallelize(data))
+ >>> model.transform(DenseVector([1.0,2.0,3.0,4.0]))
+ DenseVector([4.0])
.. versionadded:: 1.4.0
"""
- def __init__(self, numTopFeatures):
+ def __init__(self, numTopFeatures=50):
+ self.numTopFeatures = numTopFeatures
+ self.selectorType = ChiSqSelectorType.KBest
+
+ @since('2.1.0')
+ def setNumTopFeatures(self, numTopFeatures):
+ """
+ set numTopFeatures for feature selection by number of top features
+ """
self.numTopFeatures = int(numTopFeatures)
+ self.selectorType = ChiSqSelectorType.KBest
+ return self
+
+ @since('2.1.0')
+ def setPercentile(self, percentile):
+ """
+ set percentile [0.0, 1.0] for feature selection by percentile
+ """
+ self.percentile = float(percentile)
+ self.selectorType = ChiSqSelectorType.Percentile
+ return self
+
+ @since('2.1.0')
+ def setAlpha(self, alpha):
+ """
+ set alpha [0.0, 1.0] for feature selection by FPR
+ """
+ self.alpha = float(alpha)
+ self.selectorType = ChiSqSelectorType.FPR
+ return self
@since('1.4.0')
def fit(self, data):
@@ -304,7 +357,15 @@ class ChiSqSelector(object):
treated as categorical for each distinct value.
Apply feature discretizer before using this function.
"""
- jmodel = callMLlibFunc("fitChiSqSelector", self.numTopFeatures, data)
+ if self.selectorType == ChiSqSelectorType.KBest:
+ jmodel = callMLlibFunc("fitChiSqSelectorKBest", self.numTopFeatures, data)
+ elif self.selectorType == ChiSqSelectorType.Percentile:
+ jmodel = callMLlibFunc("fitChiSqSelectorPercentile", self.percentile, data)
+ elif self.selectorType == ChiSqSelectorType.FPR:
+ jmodel = callMLlibFunc("fitChiSqSelectorFPR", self.alpha, data)
+ else:
+ raise ValueError("ChiSqSelector type supports KBest(0), Percentile(1) and"
+ " FPR(2), the current value is: %s" % self.selectorType)
return ChiSqSelectorModel(jmodel)