aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/ml/feature.py
diff options
context:
space:
mode:
author: Joseph K. Bradley <joseph@databricks.com>, 2016-11-01 17:00:00 -0700
committer: Joseph K. Bradley <joseph@databricks.com>, 2016-11-01 17:00:00 -0700
commit: 91c33a0ca5c8287f710076ed7681e5aa13ca068f (patch)
tree: ea3e24b067e3b7ba1f340f0ed7906c80a64a36bd /python/pyspark/ml/feature.py
parent: b929537b6eb0f8f34497c3dbceea8045bf5dffdb (diff)
download: spark-91c33a0ca5c8287f710076ed7681e5aa13ca068f.tar.gz
spark-91c33a0ca5c8287f710076ed7681e5aa13ca068f.tar.bz2
spark-91c33a0ca5c8287f710076ed7681e5aa13ca068f.zip
[SPARK-18088][ML] Various ChiSqSelector cleanups
## What changes were proposed in this pull request?

- Renamed kbest to numTopFeatures
- Renamed alpha to fpr
- Added missing Since annotations
- Doc cleanups

## How was this patch tested?

Added new standardized unit tests for spark.ml. Improved existing unit test coverage a bit.

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #15647 from jkbradley/chisqselector-follow-ups.
Diffstat (limited to 'python/pyspark/ml/feature.py')
-rwxr-xr-x python/pyspark/ml/feature.py | 37
1 files changed, 19 insertions, 18 deletions
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 94afe82a36..635cf13045 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -2606,42 +2606,43 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja
selectorType = Param(Params._dummy(), "selectorType",
"The selector type of the ChisqSelector. " +
- "Supported options: kbest (default), percentile and fpr.",
+ "Supported options: numTopFeatures (default), percentile and fpr.",
typeConverter=TypeConverters.toString)
numTopFeatures = \
Param(Params._dummy(), "numTopFeatures",
- "Number of features that selector will select, ordered by statistics value " +
- "descending. If the number of features is < numTopFeatures, then this will select " +
+ "Number of features that selector will select, ordered by ascending p-value. " +
+ "If the number of features is < numTopFeatures, then this will select " +
"all features.", typeConverter=TypeConverters.toInt)
percentile = Param(Params._dummy(), "percentile", "Percentile of features that selector " +
- "will select, ordered by statistics value descending.",
+ "will select, ordered by ascending p-value.",
typeConverter=TypeConverters.toFloat)
- alpha = Param(Params._dummy(), "alpha", "The highest p-value for features to be kept.",
- typeConverter=TypeConverters.toFloat)
+ fpr = Param(Params._dummy(), "fpr", "The highest p-value for features to be kept.",
+ typeConverter=TypeConverters.toFloat)
@keyword_only
def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None,
- labelCol="label", selectorType="kbest", percentile=0.1, alpha=0.05):
+ labelCol="label", selectorType="numTopFeatures", percentile=0.1, fpr=0.05):
"""
__init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, \
- labelCol="label", selectorType="kbest", percentile=0.1, alpha=0.05)
+ labelCol="label", selectorType="numTopFeatures", percentile=0.1, fpr=0.05)
"""
super(ChiSqSelector, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ChiSqSelector", self.uid)
- self._setDefault(numTopFeatures=50, selectorType="kbest", percentile=0.1, alpha=0.05)
+ self._setDefault(numTopFeatures=50, selectorType="numTopFeatures", percentile=0.1,
+ fpr=0.05)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@keyword_only
@since("2.0.0")
def setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None,
- labelCol="labels", selectorType="kbest", percentile=0.1, alpha=0.05):
+ labelCol="labels", selectorType="numTopFeatures", percentile=0.1, fpr=0.05):
"""
setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, \
- labelCol="labels", selectorType="kbest", percentile=0.1, alpha=0.05)
+ labelCol="labels", selectorType="numTopFeatures", percentile=0.1, fpr=0.05)
Sets params for this ChiSqSelector.
"""
kwargs = self.setParams._input_kwargs
@@ -2665,7 +2666,7 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja
def setNumTopFeatures(self, value):
"""
Sets the value of :py:attr:`numTopFeatures`.
- Only applicable when selectorType = "kbest".
+ Only applicable when selectorType = "numTopFeatures".
"""
return self._set(numTopFeatures=value)
@@ -2692,19 +2693,19 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja
return self.getOrDefault(self.percentile)
@since("2.1.0")
- def setAlpha(self, value):
+ def setFpr(self, value):
"""
- Sets the value of :py:attr:`alpha`.
+ Sets the value of :py:attr:`fpr`.
Only applicable when selectorType = "fpr".
"""
- return self._set(alpha=value)
+ return self._set(fpr=value)
@since("2.1.0")
- def getAlpha(self):
+ def getFpr(self):
"""
- Gets the value of alpha or its default value.
+ Gets the value of fpr or its default value.
"""
- return self.getOrDefault(self.alpha)
+ return self.getOrDefault(self.fpr)
def _create_model(self, java_model):
return ChiSqSelectorModel(java_model)