diff options
author | Joseph K. Bradley <joseph@databricks.com> | 2016-11-01 17:00:00 -0700 |
---|---|---|
committer | Joseph K. Bradley <joseph@databricks.com> | 2016-11-01 17:00:00 -0700 |
commit | 91c33a0ca5c8287f710076ed7681e5aa13ca068f (patch) | |
tree | ea3e24b067e3b7ba1f340f0ed7906c80a64a36bd /python | |
parent | b929537b6eb0f8f34497c3dbceea8045bf5dffdb (diff) | |
download | spark-91c33a0ca5c8287f710076ed7681e5aa13ca068f.tar.gz spark-91c33a0ca5c8287f710076ed7681e5aa13ca068f.tar.bz2 spark-91c33a0ca5c8287f710076ed7681e5aa13ca068f.zip |
[SPARK-18088][ML] Various ChiSqSelector cleanups
## What changes were proposed in this pull request?
- Renamed kbest to numTopFeatures
- Renamed alpha to fpr
- Added missing Since annotations
- Doc cleanups
## How was this patch tested?
Added new standardized unit tests for spark.ml.
Improved existing unit test coverage a bit.
Author: Joseph K. Bradley <joseph@databricks.com>
Closes #15647 from jkbradley/chisqselector-follow-ups.
Diffstat (limited to 'python')
-rwxr-xr-x | python/pyspark/ml/feature.py | 37 | ||||
-rw-r--r-- | python/pyspark/mllib/feature.py | 58 |
2 files changed, 46 insertions, 49 deletions
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 94afe82a36..635cf13045 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2606,42 +2606,43 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja selectorType = Param(Params._dummy(), "selectorType", "The selector type of the ChisqSelector. " + - "Supported options: kbest (default), percentile and fpr.", + "Supported options: numTopFeatures (default), percentile and fpr.", typeConverter=TypeConverters.toString) numTopFeatures = \ Param(Params._dummy(), "numTopFeatures", - "Number of features that selector will select, ordered by statistics value " + - "descending. If the number of features is < numTopFeatures, then this will select " + + "Number of features that selector will select, ordered by ascending p-value. " + + "If the number of features is < numTopFeatures, then this will select " + "all features.", typeConverter=TypeConverters.toInt) percentile = Param(Params._dummy(), "percentile", "Percentile of features that selector " + - "will select, ordered by statistics value descending.", + "will select, ordered by ascending p-value.", typeConverter=TypeConverters.toFloat) - alpha = Param(Params._dummy(), "alpha", "The highest p-value for features to be kept.", - typeConverter=TypeConverters.toFloat) + fpr = Param(Params._dummy(), "fpr", "The highest p-value for features to be kept.", + typeConverter=TypeConverters.toFloat) @keyword_only def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, - labelCol="label", selectorType="kbest", percentile=0.1, alpha=0.05): + labelCol="label", selectorType="numTopFeatures", percentile=0.1, fpr=0.05): """ __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, \ - labelCol="label", selectorType="kbest", percentile=0.1, alpha=0.05) + labelCol="label", selectorType="numTopFeatures", percentile=0.1, fpr=0.05) """ super(ChiSqSelector, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ChiSqSelector", self.uid) - self._setDefault(numTopFeatures=50, selectorType="kbest", percentile=0.1, alpha=0.05) + self._setDefault(numTopFeatures=50, selectorType="numTopFeatures", percentile=0.1, + fpr=0.05) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @keyword_only @since("2.0.0") def setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, - labelCol="labels", selectorType="kbest", percentile=0.1, alpha=0.05): + labelCol="labels", selectorType="numTopFeatures", percentile=0.1, fpr=0.05): """ setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, \ - labelCol="labels", selectorType="kbest", percentile=0.1, alpha=0.05) + labelCol="labels", selectorType="numTopFeatures", percentile=0.1, fpr=0.05) Sets params for this ChiSqSelector. """ kwargs = self.setParams._input_kwargs @@ -2665,7 +2666,7 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja def setNumTopFeatures(self, value): """ Sets the value of :py:attr:`numTopFeatures`. - Only applicable when selectorType = "kbest". + Only applicable when selectorType = "numTopFeatures". """ return self._set(numTopFeatures=value) @@ -2692,19 +2693,19 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja return self.getOrDefault(self.percentile) @since("2.1.0") - def setAlpha(self, value): + def setFpr(self, value): """ - Sets the value of :py:attr:`alpha`. + Sets the value of :py:attr:`fpr`. Only applicable when selectorType = "fpr". """ - return self._set(alpha=value) + return self._set(fpr=value) @since("2.1.0") - def getAlpha(self): + def getFpr(self): """ - Gets the value of alpha or its default value. + Gets the value of fpr or its default value. """ - return self.getOrDefault(self.alpha) + return self.getOrDefault(self.fpr) def _create_model(self, java_model): return ChiSqSelectorModel(java_model) diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 50ef7c7901..7eaa2282cb 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -274,52 +274,48 @@ class ChiSqSelectorModel(JavaVectorTransformer): class ChiSqSelector(object): """ Creates a ChiSquared feature selector. - The selector supports three selection methods: `KBest`, `Percentile` and `FPR`. - `kbest` chooses the `k` top features according to a chi-squared test. + The selector supports different selection methods: `numTopFeatures`, `percentile`, `fpr`. + `numTopFeatures` chooses a fixed number of top features according to a chi-squared test. `percentile` is similar but chooses a fraction of all features instead of a fixed number. - `fpr` chooses all features whose false positive rate meets some threshold. - By default, the selection method is `kbest`, the default number of top features is 50. + `fpr` chooses all features whose p-value is below a threshold, thus controlling the false + positive rate of selection. + By default, the selection method is `numTopFeatures`, with the default number of top features + set to 50. - >>> data = [ + >>> data = sc.parallelize([ ... LabeledPoint(0.0, SparseVector(3, {0: 8.0, 1: 7.0})), ... LabeledPoint(1.0, SparseVector(3, {1: 9.0, 2: 6.0})), ... LabeledPoint(1.0, [0.0, 9.0, 8.0]), - ... LabeledPoint(2.0, [8.0, 9.0, 5.0]) - ... ] - >>> model = ChiSqSelector().setNumTopFeatures(1).fit(sc.parallelize(data)) + ... LabeledPoint(2.0, [7.0, 9.0, 5.0]), + ... LabeledPoint(2.0, [8.0, 7.0, 3.0]) + ... ]) + >>> model = ChiSqSelector(numTopFeatures=1).fit(data) >>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0})) SparseVector(1, {}) - >>> model.transform(DenseVector([8.0, 9.0, 5.0])) - DenseVector([8.0]) - >>> model = ChiSqSelector().setSelectorType("percentile").setPercentile(0.34).fit( - ... sc.parallelize(data)) + >>> model.transform(DenseVector([7.0, 9.0, 5.0])) + DenseVector([7.0]) + >>> model = ChiSqSelector(selectorType="fpr", fpr=0.2).fit(data) >>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0})) SparseVector(1, {}) - >>> model.transform(DenseVector([8.0, 9.0, 5.0])) - DenseVector([8.0]) - >>> data = [ - ... LabeledPoint(0.0, SparseVector(4, {0: 8.0, 1: 7.0})), - ... LabeledPoint(1.0, SparseVector(4, {1: 9.0, 2: 6.0, 3: 4.0})), - ... LabeledPoint(1.0, [0.0, 9.0, 8.0, 4.0]), - ... LabeledPoint(2.0, [8.0, 9.0, 5.0, 9.0]) - ... ] - >>> model = ChiSqSelector().setSelectorType("fpr").setAlpha(0.1).fit(sc.parallelize(data)) - >>> model.transform(DenseVector([1.0,2.0,3.0,4.0])) - DenseVector([4.0]) + >>> model.transform(DenseVector([7.0, 9.0, 5.0])) + DenseVector([7.0]) + >>> model = ChiSqSelector(selectorType="percentile", percentile=0.34).fit(data) + >>> model.transform(DenseVector([7.0, 9.0, 5.0])) + DenseVector([7.0]) .. versionadded:: 1.4.0 """ - def __init__(self, numTopFeatures=50, selectorType="kbest", percentile=0.1, alpha=0.05): + def __init__(self, numTopFeatures=50, selectorType="numTopFeatures", percentile=0.1, fpr=0.05): self.numTopFeatures = numTopFeatures self.selectorType = selectorType self.percentile = percentile - self.alpha = alpha + self.fpr = fpr @since('2.1.0') def setNumTopFeatures(self, numTopFeatures): """ set numTopFeature for feature selection by number of top features. - Only applicable when selectorType = "kbest". + Only applicable when selectorType = "numTopFeatures". """ self.numTopFeatures = int(numTopFeatures) return self @@ -334,19 +330,19 @@ class ChiSqSelector(object): return self @since('2.1.0') - def setAlpha(self, alpha): + def setFpr(self, fpr): """ - set alpha [0.0, 1.0] for feature selection by FPR. + set FPR [0.0, 1.0] for feature selection by FPR. Only applicable when selectorType = "fpr". """ - self.alpha = float(alpha) + self.fpr = float(fpr) return self @since('2.1.0') def setSelectorType(self, selectorType): """ set the selector type of the ChisqSelector. - Supported options: "kbest" (default), "percentile" and "fpr". + Supported options: "numTopFeatures" (default), "percentile", "fpr". """ self.selectorType = str(selectorType) return self @@ -362,7 +358,7 @@ class ChiSqSelector(object): Apply feature discretizer before using this function. """ jmodel = callMLlibFunc("fitChiSqSelector", self.selectorType, self.numTopFeatures, - self.percentile, self.alpha, data) + self.percentile, self.fpr, data) return ChiSqSelectorModel(jmodel) |