aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorYanbo Liang <ybliang8@gmail.com>2016-09-26 09:45:33 +0100
committerSean Owen <sowen@cloudera.com>2016-09-26 09:45:33 +0100
commitac65139be96dbf87402b9a85729a93afd3c6ff17 (patch)
tree6b9580267acc710567fe5509fc66ba10fa01ec29 /python
parent59d87d24079bc633e63ce032f0a5ddd18a3b02cb (diff)
downloadspark-ac65139be96dbf87402b9a85729a93afd3c6ff17.tar.gz
spark-ac65139be96dbf87402b9a85729a93afd3c6ff17.tar.bz2
spark-ac65139be96dbf87402b9a85729a93afd3c6ff17.zip
[SPARK-17017][FOLLOW-UP][ML] Refactor of ChiSqSelector and add ML Python API.
## What changes were proposed in this pull request?

#14597 modified ```ChiSqSelector``` to support the ```fpr``` selector type; however, it left some issues that need to be addressed:

* We should allow users to set the selector type explicitly rather than switching it implicitly through different setter functions, since the outcome then depends on the order in which the setters are called. For example, if users set both ```numTopFeatures``` and ```percentile```, a ```kbest``` or a ```percentile``` model is trained depending on the order of the calls (whichever was set last is the one trained). This confuses users, so we should allow them to set the selector type explicitly. We handle similar issues elsewhere in the ML code base, such as in ```GeneralizedLinearRegression``` and ```LogisticRegression```.
* Meanwhile, if more than one parameter besides ```alpha``` could be set for the ```fpr``` model, we could not handle it elegantly in the existing framework. The same applies to the ```kbest``` and ```percentile``` models. Setting the selector type explicitly solves this issue as well.
* If users are allowed to set the selector type explicitly, we should handle parameter interactions: for example, if users set ```selectorType = percentile``` and ```alpha = 0.1```, we should notify them that the parameter ```alpha``` will have no effect. We should handle complex parameter interaction checks in ```transformSchema```. (FYI #11620)
* We should use lower-case selector type names to follow the MLlib convention.
* Add the ML Python API.

## How was this patch tested?

Unit tests.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #15214 from yanboliang/spark-17017.
Diffstat (limited to 'python')
-rwxr-xr-xpython/pyspark/ml/feature.py71
-rw-r--r--python/pyspark/mllib/feature.py59
2 files changed, 93 insertions, 37 deletions
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index c45434f1a5..12a13849dc 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -2586,39 +2586,68 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja
.. versionadded:: 2.0.0
"""
+ selectorType = Param(Params._dummy(), "selectorType",
+ "The selector type of the ChisqSelector. " +
+ "Supported options: kbest (default), percentile and fpr.",
+ typeConverter=TypeConverters.toString)
+
numTopFeatures = \
Param(Params._dummy(), "numTopFeatures",
"Number of features that selector will select, ordered by statistics value " +
"descending. If the number of features is < numTopFeatures, then this will select " +
"all features.", typeConverter=TypeConverters.toInt)
+ percentile = Param(Params._dummy(), "percentile", "Percentile of features that selector " +
+ "will select, ordered by statistics value descending.",
+ typeConverter=TypeConverters.toFloat)
+
+ alpha = Param(Params._dummy(), "alpha", "The highest p-value for features to be kept.",
+ typeConverter=TypeConverters.toFloat)
+
@keyword_only
- def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, labelCol="label"):
+ def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None,
+ labelCol="label", selectorType="kbest", percentile=0.1, alpha=0.05):
"""
- __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, labelCol="label")
+ __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, \
+ labelCol="label", selectorType="kbest", percentile=0.1, alpha=0.05)
"""
super(ChiSqSelector, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ChiSqSelector", self.uid)
- self._setDefault(numTopFeatures=50)
+ self._setDefault(numTopFeatures=50, selectorType="kbest", percentile=0.1, alpha=0.05)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@keyword_only
@since("2.0.0")
def setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None,
- labelCol="labels"):
+ labelCol="labels", selectorType="kbest", percentile=0.1, alpha=0.05):
"""
- setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None,\
- labelCol="labels")
+ setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, \
+ labelCol="labels", selectorType="kbest", percentile=0.1, alpha=0.05)
Sets params for this ChiSqSelector.
"""
kwargs = self.setParams._input_kwargs
return self._set(**kwargs)
+ @since("2.1.0")
+ def setSelectorType(self, value):
+ """
+ Sets the value of :py:attr:`selectorType`.
+ """
+ return self._set(selectorType=value)
+
+ @since("2.1.0")
+ def getSelectorType(self):
+ """
+ Gets the value of selectorType or its default value.
+ """
+ return self.getOrDefault(self.selectorType)
+
@since("2.0.0")
def setNumTopFeatures(self, value):
"""
Sets the value of :py:attr:`numTopFeatures`.
+ Only applicable when selectorType = "kbest".
"""
return self._set(numTopFeatures=value)
@@ -2629,6 +2658,36 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja
"""
return self.getOrDefault(self.numTopFeatures)
+ @since("2.1.0")
+ def setPercentile(self, value):
+ """
+ Sets the value of :py:attr:`percentile`.
+ Only applicable when selectorType = "percentile".
+ """
+ return self._set(percentile=value)
+
+ @since("2.1.0")
+ def getPercentile(self):
+ """
+ Gets the value of percentile or its default value.
+ """
+ return self.getOrDefault(self.percentile)
+
+ @since("2.1.0")
+ def setAlpha(self, value):
+ """
+ Sets the value of :py:attr:`alpha`.
+ Only applicable when selectorType = "fpr".
+ """
+ return self._set(alpha=value)
+
+ @since("2.1.0")
+ def getAlpha(self):
+ """
+ Gets the value of alpha or its default value.
+ """
+ return self.getOrDefault(self.alpha)
+
def _create_model(self, java_model):
return ChiSqSelectorModel(java_model)
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 077c11370e..4aea81840a 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -271,22 +271,14 @@ class ChiSqSelectorModel(JavaVectorTransformer):
return JavaVectorTransformer.transform(self, vector)
-class ChiSqSelectorType:
- """
- This class defines the selector types of Chi Square Selector.
- """
- KBest, Percentile, FPR = range(3)
-
-
class ChiSqSelector(object):
"""
Creates a ChiSquared feature selector.
The selector supports three selection methods: `KBest`, `Percentile` and `FPR`.
- `KBest` chooses the `k` top features according to a chi-squared test.
- `Percentile` is similar but chooses a fraction of all features instead of a fixed number.
- `FPR` chooses all features whose false positive rate meets some threshold.
- By default, the selection method is `KBest`, the default number of top features is 50.
- User can use setNumTopFeatures, setPercentile and setAlpha to set different selection methods.
+ `kbest` chooses the `k` top features according to a chi-squared test.
+ `percentile` is similar but chooses a fraction of all features instead of a fixed number.
+ `fpr` chooses all features whose false positive rate meets some threshold.
+ By default, the selection method is `kbest`, the default number of top features is 50.
>>> data = [
... LabeledPoint(0.0, SparseVector(3, {0: 8.0, 1: 7.0})),
@@ -299,7 +291,8 @@ class ChiSqSelector(object):
SparseVector(1, {0: 6.0})
>>> model.transform(DenseVector([8.0, 9.0, 5.0]))
DenseVector([5.0])
- >>> model = ChiSqSelector().setPercentile(0.34).fit(sc.parallelize(data))
+ >>> model = ChiSqSelector().setSelectorType("percentile").setPercentile(0.34).fit(
+ ... sc.parallelize(data))
>>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0}))
SparseVector(1, {0: 6.0})
>>> model.transform(DenseVector([8.0, 9.0, 5.0]))
@@ -310,41 +303,52 @@ class ChiSqSelector(object):
... LabeledPoint(1.0, [0.0, 9.0, 8.0, 4.0]),
... LabeledPoint(2.0, [8.0, 9.0, 5.0, 9.0])
... ]
- >>> model = ChiSqSelector().setAlpha(0.1).fit(sc.parallelize(data))
+ >>> model = ChiSqSelector().setSelectorType("fpr").setAlpha(0.1).fit(sc.parallelize(data))
>>> model.transform(DenseVector([1.0,2.0,3.0,4.0]))
DenseVector([4.0])
.. versionadded:: 1.4.0
"""
- def __init__(self, numTopFeatures=50):
+ def __init__(self, numTopFeatures=50, selectorType="kbest", percentile=0.1, alpha=0.05):
self.numTopFeatures = numTopFeatures
- self.selectorType = ChiSqSelectorType.KBest
+ self.selectorType = selectorType
+ self.percentile = percentile
+ self.alpha = alpha
@since('2.1.0')
def setNumTopFeatures(self, numTopFeatures):
"""
- set numTopFeature for feature selection by number of top features
+ set numTopFeature for feature selection by number of top features.
+ Only applicable when selectorType = "kbest".
"""
self.numTopFeatures = int(numTopFeatures)
- self.selectorType = ChiSqSelectorType.KBest
return self
@since('2.1.0')
def setPercentile(self, percentile):
"""
- set percentile [0.0, 1.0] for feature selection by percentile
+ set percentile [0.0, 1.0] for feature selection by percentile.
+ Only applicable when selectorType = "percentile".
"""
self.percentile = float(percentile)
- self.selectorType = ChiSqSelectorType.Percentile
return self
@since('2.1.0')
def setAlpha(self, alpha):
"""
- set alpha [0.0, 1.0] for feature selection by FPR
+ set alpha [0.0, 1.0] for feature selection by FPR.
+ Only applicable when selectorType = "fpr".
"""
self.alpha = float(alpha)
- self.selectorType = ChiSqSelectorType.FPR
+ return self
+
+ @since('2.1.0')
+ def setSelectorType(self, selectorType):
+ """
+ set the selector type of the ChisqSelector.
+ Supported options: "kbest" (default), "percentile" and "fpr".
+ """
+ self.selectorType = str(selectorType)
return self
@since('1.4.0')
@@ -357,15 +361,8 @@ class ChiSqSelector(object):
treated as categorical for each distinct value.
Apply feature discretizer before using this function.
"""
- if self.selectorType == ChiSqSelectorType.KBest:
- jmodel = callMLlibFunc("fitChiSqSelectorKBest", self.numTopFeatures, data)
- elif self.selectorType == ChiSqSelectorType.Percentile:
- jmodel = callMLlibFunc("fitChiSqSelectorPercentile", self.percentile, data)
- elif self.selectorType == ChiSqSelectorType.FPR:
- jmodel = callMLlibFunc("fitChiSqSelectorFPR", self.alpha, data)
- else:
- raise ValueError("ChiSqSelector type supports KBest(0), Percentile(1) and"
- " FPR(2), the current value is: %s" % self.selectorType)
+ jmodel = callMLlibFunc("fitChiSqSelector", self.selectorType, self.numTopFeatures,
+ self.percentile, self.alpha, data)
return ChiSqSelectorModel(jmodel)