about | summary | refs | log | tree | commit | diff
path: root/python
diff options
context:
space:
mode:
author: Joseph K. Bradley <joseph@databricks.com> 2016-11-01 17:00:00 -0700
committer: Joseph K. Bradley <joseph@databricks.com> 2016-11-01 17:00:00 -0700
commit: 91c33a0ca5c8287f710076ed7681e5aa13ca068f (patch)
tree: ea3e24b067e3b7ba1f340f0ed7906c80a64a36bd /python
parent: b929537b6eb0f8f34497c3dbceea8045bf5dffdb (diff)
download: spark-91c33a0ca5c8287f710076ed7681e5aa13ca068f.tar.gz
spark-91c33a0ca5c8287f710076ed7681e5aa13ca068f.tar.bz2
spark-91c33a0ca5c8287f710076ed7681e5aa13ca068f.zip
[SPARK-18088][ML] Various ChiSqSelector cleanups
## What changes were proposed in this pull request?

- Renamed kbest to numTopFeatures
- Renamed alpha to fpr
- Added missing Since annotations
- Doc cleanups

## How was this patch tested?

Added new standardized unit tests for spark.ml. Improved existing unit test coverage a bit.

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #15647 from jkbradley/chisqselector-follow-ups.
Diffstat (limited to 'python')
-rwxr-xr-x python/pyspark/ml/feature.py 37
-rw-r--r-- python/pyspark/mllib/feature.py 58
2 files changed, 46 insertions, 49 deletions
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 94afe82a36..635cf13045 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -2606,42 +2606,43 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja
selectorType = Param(Params._dummy(), "selectorType",
"The selector type of the ChisqSelector. " +
- "Supported options: kbest (default), percentile and fpr.",
+ "Supported options: numTopFeatures (default), percentile and fpr.",
typeConverter=TypeConverters.toString)
numTopFeatures = \
Param(Params._dummy(), "numTopFeatures",
- "Number of features that selector will select, ordered by statistics value " +
- "descending. If the number of features is < numTopFeatures, then this will select " +
+ "Number of features that selector will select, ordered by ascending p-value. " +
+ "If the number of features is < numTopFeatures, then this will select " +
"all features.", typeConverter=TypeConverters.toInt)
percentile = Param(Params._dummy(), "percentile", "Percentile of features that selector " +
- "will select, ordered by statistics value descending.",
+ "will select, ordered by ascending p-value.",
typeConverter=TypeConverters.toFloat)
- alpha = Param(Params._dummy(), "alpha", "The highest p-value for features to be kept.",
- typeConverter=TypeConverters.toFloat)
+ fpr = Param(Params._dummy(), "fpr", "The highest p-value for features to be kept.",
+ typeConverter=TypeConverters.toFloat)
@keyword_only
def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None,
- labelCol="label", selectorType="kbest", percentile=0.1, alpha=0.05):
+ labelCol="label", selectorType="numTopFeatures", percentile=0.1, fpr=0.05):
"""
__init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, \
- labelCol="label", selectorType="kbest", percentile=0.1, alpha=0.05)
+ labelCol="label", selectorType="numTopFeatures", percentile=0.1, fpr=0.05)
"""
super(ChiSqSelector, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ChiSqSelector", self.uid)
- self._setDefault(numTopFeatures=50, selectorType="kbest", percentile=0.1, alpha=0.05)
+ self._setDefault(numTopFeatures=50, selectorType="numTopFeatures", percentile=0.1,
+ fpr=0.05)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@keyword_only
@since("2.0.0")
def setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None,
- labelCol="labels", selectorType="kbest", percentile=0.1, alpha=0.05):
+ labelCol="labels", selectorType="numTopFeatures", percentile=0.1, fpr=0.05):
"""
setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, \
- labelCol="labels", selectorType="kbest", percentile=0.1, alpha=0.05)
+ labelCol="labels", selectorType="numTopFeatures", percentile=0.1, fpr=0.05)
Sets params for this ChiSqSelector.
"""
kwargs = self.setParams._input_kwargs
@@ -2665,7 +2666,7 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja
def setNumTopFeatures(self, value):
"""
Sets the value of :py:attr:`numTopFeatures`.
- Only applicable when selectorType = "kbest".
+ Only applicable when selectorType = "numTopFeatures".
"""
return self._set(numTopFeatures=value)
@@ -2692,19 +2693,19 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja
return self.getOrDefault(self.percentile)
@since("2.1.0")
- def setAlpha(self, value):
+ def setFpr(self, value):
"""
- Sets the value of :py:attr:`alpha`.
+ Sets the value of :py:attr:`fpr`.
Only applicable when selectorType = "fpr".
"""
- return self._set(alpha=value)
+ return self._set(fpr=value)
@since("2.1.0")
- def getAlpha(self):
+ def getFpr(self):
"""
- Gets the value of alpha or its default value.
+ Gets the value of fpr or its default value.
"""
- return self.getOrDefault(self.alpha)
+ return self.getOrDefault(self.fpr)
def _create_model(self, java_model):
return ChiSqSelectorModel(java_model)
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 50ef7c7901..7eaa2282cb 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -274,52 +274,48 @@ class ChiSqSelectorModel(JavaVectorTransformer):
class ChiSqSelector(object):
"""
Creates a ChiSquared feature selector.
- The selector supports three selection methods: `KBest`, `Percentile` and `FPR`.
- `kbest` chooses the `k` top features according to a chi-squared test.
+ The selector supports different selection methods: `numTopFeatures`, `percentile`, `fpr`.
+ `numTopFeatures` chooses a fixed number of top features according to a chi-squared test.
`percentile` is similar but chooses a fraction of all features instead of a fixed number.
- `fpr` chooses all features whose false positive rate meets some threshold.
- By default, the selection method is `kbest`, the default number of top features is 50.
+ `fpr` chooses all features whose p-value is below a threshold, thus controlling the false
+ positive rate of selection.
+ By default, the selection method is `numTopFeatures`, with the default number of top features
+ set to 50.
- >>> data = [
+ >>> data = sc.parallelize([
... LabeledPoint(0.0, SparseVector(3, {0: 8.0, 1: 7.0})),
... LabeledPoint(1.0, SparseVector(3, {1: 9.0, 2: 6.0})),
... LabeledPoint(1.0, [0.0, 9.0, 8.0]),
- ... LabeledPoint(2.0, [8.0, 9.0, 5.0])
- ... ]
- >>> model = ChiSqSelector().setNumTopFeatures(1).fit(sc.parallelize(data))
+ ... LabeledPoint(2.0, [7.0, 9.0, 5.0]),
+ ... LabeledPoint(2.0, [8.0, 7.0, 3.0])
+ ... ])
+ >>> model = ChiSqSelector(numTopFeatures=1).fit(data)
>>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0}))
SparseVector(1, {})
- >>> model.transform(DenseVector([8.0, 9.0, 5.0]))
- DenseVector([8.0])
- >>> model = ChiSqSelector().setSelectorType("percentile").setPercentile(0.34).fit(
- ... sc.parallelize(data))
+ >>> model.transform(DenseVector([7.0, 9.0, 5.0]))
+ DenseVector([7.0])
+ >>> model = ChiSqSelector(selectorType="fpr", fpr=0.2).fit(data)
>>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0}))
SparseVector(1, {})
- >>> model.transform(DenseVector([8.0, 9.0, 5.0]))
- DenseVector([8.0])
- >>> data = [
- ... LabeledPoint(0.0, SparseVector(4, {0: 8.0, 1: 7.0})),
- ... LabeledPoint(1.0, SparseVector(4, {1: 9.0, 2: 6.0, 3: 4.0})),
- ... LabeledPoint(1.0, [0.0, 9.0, 8.0, 4.0]),
- ... LabeledPoint(2.0, [8.0, 9.0, 5.0, 9.0])
- ... ]
- >>> model = ChiSqSelector().setSelectorType("fpr").setAlpha(0.1).fit(sc.parallelize(data))
- >>> model.transform(DenseVector([1.0,2.0,3.0,4.0]))
- DenseVector([4.0])
+ >>> model.transform(DenseVector([7.0, 9.0, 5.0]))
+ DenseVector([7.0])
+ >>> model = ChiSqSelector(selectorType="percentile", percentile=0.34).fit(data)
+ >>> model.transform(DenseVector([7.0, 9.0, 5.0]))
+ DenseVector([7.0])
.. versionadded:: 1.4.0
"""
- def __init__(self, numTopFeatures=50, selectorType="kbest", percentile=0.1, alpha=0.05):
+ def __init__(self, numTopFeatures=50, selectorType="numTopFeatures", percentile=0.1, fpr=0.05):
self.numTopFeatures = numTopFeatures
self.selectorType = selectorType
self.percentile = percentile
- self.alpha = alpha
+ self.fpr = fpr
@since('2.1.0')
def setNumTopFeatures(self, numTopFeatures):
"""
set numTopFeature for feature selection by number of top features.
- Only applicable when selectorType = "kbest".
+ Only applicable when selectorType = "numTopFeatures".
"""
self.numTopFeatures = int(numTopFeatures)
return self
@@ -334,19 +330,19 @@ class ChiSqSelector(object):
return self
@since('2.1.0')
- def setAlpha(self, alpha):
+ def setFpr(self, fpr):
"""
- set alpha [0.0, 1.0] for feature selection by FPR.
+ set FPR [0.0, 1.0] for feature selection by FPR.
Only applicable when selectorType = "fpr".
"""
- self.alpha = float(alpha)
+ self.fpr = float(fpr)
return self
@since('2.1.0')
def setSelectorType(self, selectorType):
"""
set the selector type of the ChisqSelector.
- Supported options: "kbest" (default), "percentile" and "fpr".
+ Supported options: "numTopFeatures" (default), "percentile", "fpr".
"""
self.selectorType = str(selectorType)
return self
@@ -362,7 +358,7 @@ class ChiSqSelector(object):
Apply feature discretizer before using this function.
"""
jmodel = callMLlibFunc("fitChiSqSelector", self.selectorType, self.numTopFeatures,
- self.percentile, self.alpha, data)
+ self.percentile, self.fpr, data)
return ChiSqSelectorModel(jmodel)