diff options
author | Peng, Meng <peng.meng@intel.com> | 2016-09-21 10:17:38 +0100 |
---|---|---|
committer | Sean Owen <sowen@cloudera.com> | 2016-09-21 10:17:38 +0100 |
commit | b366f18496e1ce8bd20fe58a0245ef7d91819a03 (patch) | |
tree | 3a2e189a94ad1fc49040d721eb586724bc493097 /python/pyspark | |
parent | 28fafa3ee8f3478fa441e7bd6c8fd4ab482ca98e (diff) | |
download | spark-b366f18496e1ce8bd20fe58a0245ef7d91819a03.tar.gz spark-b366f18496e1ce8bd20fe58a0245ef7d91819a03.tar.bz2 spark-b366f18496e1ce8bd20fe58a0245ef7d91819a03.zip |
[SPARK-17017][MLLIB][ML] add a chiSquare Selector based on False Positive Rate (FPR) test
## What changes were proposed in this pull request?
Univariate feature selection works by selecting the best features based on univariate statistical tests. The False Positive Rate (FPR) is a popular univariate statistical test for feature selection. We add a chi-square selector based on the False Positive Rate (FPR) test in this PR, similar to the implementation in scikit-learn.
http://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection
## How was this patch tested?
Add Scala unit tests
Author: Peng, Meng <peng.meng@intel.com>
Closes #14597 from mpjlu/fprChiSquare.
Diffstat (limited to 'python/pyspark')
-rw-r--r-- | python/pyspark/mllib/feature.py | 71 |
1 file changed, 66 insertions, 5 deletions
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 5d99644fca..077c11370e 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -271,11 +271,22 @@ class ChiSqSelectorModel(JavaVectorTransformer): return JavaVectorTransformer.transform(self, vector) +class ChiSqSelectorType: + """ + This class defines the selector types of Chi Square Selector. + """ + KBest, Percentile, FPR = range(3) + + class ChiSqSelector(object): """ Creates a ChiSquared feature selector. - - :param numTopFeatures: number of features that selector will select. + The selector supports three selection methods: `KBest`, `Percentile` and `FPR`. + `KBest` chooses the `k` top features according to a chi-squared test. + `Percentile` is similar but chooses a fraction of all features instead of a fixed number. + `FPR` chooses all features whose false positive rate meets some threshold. + By default, the selection method is `KBest`, the default number of top features is 50. + User can use setNumTopFeatures, setPercentile and setAlpha to set different selection methods. >>> data = [ ... LabeledPoint(0.0, SparseVector(3, {0: 8.0, 1: 7.0})), @@ -283,16 +294,58 @@ class ChiSqSelector(object): ... LabeledPoint(1.0, [0.0, 9.0, 8.0]), ... LabeledPoint(2.0, [8.0, 9.0, 5.0]) ... ] - >>> model = ChiSqSelector(1).fit(sc.parallelize(data)) + >>> model = ChiSqSelector().setNumTopFeatures(1).fit(sc.parallelize(data)) + >>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0})) + SparseVector(1, {0: 6.0}) + >>> model.transform(DenseVector([8.0, 9.0, 5.0])) + DenseVector([5.0]) + >>> model = ChiSqSelector().setPercentile(0.34).fit(sc.parallelize(data)) >>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0})) SparseVector(1, {0: 6.0}) >>> model.transform(DenseVector([8.0, 9.0, 5.0])) DenseVector([5.0]) + >>> data = [ + ... LabeledPoint(0.0, SparseVector(4, {0: 8.0, 1: 7.0})), + ... LabeledPoint(1.0, SparseVector(4, {1: 9.0, 2: 6.0, 3: 4.0})), + ... LabeledPoint(1.0, [0.0, 9.0, 8.0, 4.0]), + ... LabeledPoint(2.0, [8.0, 9.0, 5.0, 9.0]) + ... ] + >>> model = ChiSqSelector().setAlpha(0.1).fit(sc.parallelize(data)) + >>> model.transform(DenseVector([1.0,2.0,3.0,4.0])) + DenseVector([4.0]) .. versionadded:: 1.4.0 """ - def __init__(self, numTopFeatures): + def __init__(self, numTopFeatures=50): + self.numTopFeatures = numTopFeatures + self.selectorType = ChiSqSelectorType.KBest + + @since('2.1.0') + def setNumTopFeatures(self, numTopFeatures): + """ + set numTopFeature for feature selection by number of top features + """ self.numTopFeatures = int(numTopFeatures) + self.selectorType = ChiSqSelectorType.KBest + return self + + @since('2.1.0') + def setPercentile(self, percentile): + """ + set percentile [0.0, 1.0] for feature selection by percentile + """ + self.percentile = float(percentile) + self.selectorType = ChiSqSelectorType.Percentile + return self + + @since('2.1.0') + def setAlpha(self, alpha): + """ + set alpha [0.0, 1.0] for feature selection by FPR + """ + self.alpha = float(alpha) + self.selectorType = ChiSqSelectorType.FPR + return self @since('1.4.0') def fit(self, data): @@ -304,7 +357,15 @@ class ChiSqSelector(object): treated as categorical for each distinct value. Apply feature discretizer before using this function. """ - jmodel = callMLlibFunc("fitChiSqSelector", self.numTopFeatures, data) + if self.selectorType == ChiSqSelectorType.KBest: + jmodel = callMLlibFunc("fitChiSqSelectorKBest", self.numTopFeatures, data) + elif self.selectorType == ChiSqSelectorType.Percentile: + jmodel = callMLlibFunc("fitChiSqSelectorPercentile", self.percentile, data) + elif self.selectorType == ChiSqSelectorType.FPR: + jmodel = callMLlibFunc("fitChiSqSelectorFPR", self.alpha, data) + else: + raise ValueError("ChiSqSelector type supports KBest(0), Percentile(1) and" + " FPR(2), the current value is: %s" % self.selectorType) return ChiSqSelectorModel(jmodel) |