[SPARK-18088][ML] Various ChiSqSelector cleanups

## What changes were proposed in this pull request? - Renamed kbest to numTopFeatures - Renamed alpha to fpr - Added missing Since annotations - Doc cleanups ## How was this patch tested? Added new standardized unit tests for spark.ml. Improved existing unit test coverage a bit. Author: Joseph K. Bradley <joseph@databricks.com> Closes #15647 from jkbradley/chisqselector-follow-ups.
author: Joseph K. Bradley <joseph@databricks.com> 2016-11-01 17:00:00 -0700
committer: Joseph K. Bradley <joseph@databricks.com> 2016-11-01 17:00:00 -0700
commit: 91c33a0ca5c8287f710076ed7681e5aa13ca068f (patch)
tree: ea3e24b067e3b7ba1f340f0ed7906c80a64a36bd /python
parent: b929537b6eb0f8f34497c3dbceea8045bf5dffdb (diff)
download: spark-91c33a0ca5c8287f710076ed7681e5aa13ca068f.tar.gz
spark-91c33a0ca5c8287f710076ed7681e5aa13ca068f.tar.bz2
spark-91c33a0ca5c8287f710076ed7681e5aa13ca068f.zip
2 files changed, 46 insertions, 49 deletions
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 94afe82a36..635cf13045 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -2606,42 +2606,43 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja
 
     selectorType = Param(Params._dummy(), "selectorType",
                          "The selector type of the ChisqSelector. " +
-                         "Supported options: kbest (default), percentile and fpr.",
+                         "Supported options: numTopFeatures (default), percentile and fpr.",
                          typeConverter=TypeConverters.toString)
 
     numTopFeatures = \
         Param(Params._dummy(), "numTopFeatures",
-              "Number of features that selector will select, ordered by statistics value " +
-              "descending. If the number of features is < numTopFeatures, then this will select " +
+              "Number of features that selector will select, ordered by ascending p-value. " +
+              "If the number of features is < numTopFeatures, then this will select " +
               "all features.", typeConverter=TypeConverters.toInt)
 
     percentile = Param(Params._dummy(), "percentile", "Percentile of features that selector " +
-                       "will select, ordered by statistics value descending.",
+                       "will select, ordered by ascending p-value.",
                        typeConverter=TypeConverters.toFloat)
 
-    alpha = Param(Params._dummy(), "alpha", "The highest p-value for features to be kept.",
-                  typeConverter=TypeConverters.toFloat)
+    fpr = Param(Params._dummy(), "fpr", "The highest p-value for features to be kept.",
+                typeConverter=TypeConverters.toFloat)
 
     @keyword_only
     def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None,
-                 labelCol="label", selectorType="kbest", percentile=0.1, alpha=0.05):
+                 labelCol="label", selectorType="numTopFeatures", percentile=0.1, fpr=0.05):
         """
         __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, \
-                 labelCol="label", selectorType="kbest", percentile=0.1, alpha=0.05)
+                 labelCol="label", selectorType="numTopFeatures", percentile=0.1, fpr=0.05)
         """
         super(ChiSqSelector, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ChiSqSelector", self.uid)
-        self._setDefault(numTopFeatures=50, selectorType="kbest", percentile=0.1, alpha=0.05)
+        self._setDefault(numTopFeatures=50, selectorType="numTopFeatures", percentile=0.1,
+                         fpr=0.05)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
     @keyword_only
     @since("2.0.0")
     def setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None,
-                  labelCol="labels", selectorType="kbest", percentile=0.1, alpha=0.05):
+                  labelCol="labels", selectorType="numTopFeatures", percentile=0.1, fpr=0.05):
         """
         setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, \
-                  labelCol="labels", selectorType="kbest", percentile=0.1, alpha=0.05)
+                  labelCol="labels", selectorType="numTopFeatures", percentile=0.1, fpr=0.05)
         Sets params for this ChiSqSelector.
         """
         kwargs = self.setParams._input_kwargs
@@ -2665,7 +2666,7 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja
     def setNumTopFeatures(self, value):
         """
         Sets the value of :py:attr:`numTopFeatures`.
-        Only applicable when selectorType = "kbest".
+        Only applicable when selectorType = "numTopFeatures".
         """
         return self._set(numTopFeatures=value)
 
@@ -2692,19 +2693,19 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja
         return self.getOrDefault(self.percentile)
 
     @since("2.1.0")
-    def setAlpha(self, value):
+    def setFpr(self, value):
         """
-        Sets the value of :py:attr:`alpha`.
+        Sets the value of :py:attr:`fpr`.
         Only applicable when selectorType = "fpr".
         """
-        return self._set(alpha=value)
+        return self._set(fpr=value)
 
     @since("2.1.0")
-    def getAlpha(self):
+    def getFpr(self):
         """
-        Gets the value of alpha or its default value.
+        Gets the value of fpr or its default value.
         """
-        return self.getOrDefault(self.alpha)
+        return self.getOrDefault(self.fpr)
 
     def _create_model(self, java_model):
         return ChiSqSelectorModel(java_model)
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 50ef7c7901..7eaa2282cb 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -274,52 +274,48 @@ class ChiSqSelectorModel(JavaVectorTransformer):
 class ChiSqSelector(object):
     """
     Creates a ChiSquared feature selector.
-    The selector supports three selection methods: `KBest`, `Percentile` and `FPR`.
-    `kbest` chooses the `k` top features according to a chi-squared test.
+    The selector supports different selection methods: `numTopFeatures`, `percentile`, `fpr`.
+    `numTopFeatures` chooses a fixed number of top features according to a chi-squared test.
     `percentile` is similar but chooses a fraction of all features instead of a fixed number.
-    `fpr` chooses all features whose false positive rate meets some threshold.
-    By default, the selection method is `kbest`, the default number of top features is 50.
+    `fpr` chooses all features whose p-value is below a threshold, thus controlling the false
+    positive rate of selection.
+    By default, the selection method is `numTopFeatures`, with the default number of top features
+    set to 50.
 
-    >>> data = [
+    >>> data = sc.parallelize([
     ...     LabeledPoint(0.0, SparseVector(3, {0: 8.0, 1: 7.0})),
     ...     LabeledPoint(1.0, SparseVector(3, {1: 9.0, 2: 6.0})),
     ...     LabeledPoint(1.0, [0.0, 9.0, 8.0]),
-    ...     LabeledPoint(2.0, [8.0, 9.0, 5.0])
-    ... ]
-    >>> model = ChiSqSelector().setNumTopFeatures(1).fit(sc.parallelize(data))
+    ...     LabeledPoint(2.0, [7.0, 9.0, 5.0]),
+    ...     LabeledPoint(2.0, [8.0, 7.0, 3.0])
+    ... ])
+    >>> model = ChiSqSelector(numTopFeatures=1).fit(data)
     >>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0}))
     SparseVector(1, {})
-    >>> model.transform(DenseVector([8.0, 9.0, 5.0]))
-    DenseVector([8.0])
-    >>> model = ChiSqSelector().setSelectorType("percentile").setPercentile(0.34).fit(
-    ...     sc.parallelize(data))
+    >>> model.transform(DenseVector([7.0, 9.0, 5.0]))
+    DenseVector([7.0])
+    >>> model = ChiSqSelector(selectorType="fpr", fpr=0.2).fit(data)
     >>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0}))
     SparseVector(1, {})
-    >>> model.transform(DenseVector([8.0, 9.0, 5.0]))
-    DenseVector([8.0])
-    >>> data = [
-    ...     LabeledPoint(0.0, SparseVector(4, {0: 8.0, 1: 7.0})),
-    ...     LabeledPoint(1.0, SparseVector(4, {1: 9.0, 2: 6.0, 3: 4.0})),
-    ...     LabeledPoint(1.0, [0.0, 9.0, 8.0, 4.0]),
-    ...     LabeledPoint(2.0, [8.0, 9.0, 5.0, 9.0])
-    ... ]
-    >>> model = ChiSqSelector().setSelectorType("fpr").setAlpha(0.1).fit(sc.parallelize(data))
-    >>> model.transform(DenseVector([1.0,2.0,3.0,4.0]))
-    DenseVector([4.0])
+    >>> model.transform(DenseVector([7.0, 9.0, 5.0]))
+    DenseVector([7.0])
+    >>> model = ChiSqSelector(selectorType="percentile", percentile=0.34).fit(data)
+    >>> model.transform(DenseVector([7.0, 9.0, 5.0]))
+    DenseVector([7.0])
 
     .. versionadded:: 1.4.0
     """
-    def __init__(self, numTopFeatures=50, selectorType="kbest", percentile=0.1, alpha=0.05):
+    def __init__(self, numTopFeatures=50, selectorType="numTopFeatures", percentile=0.1, fpr=0.05):
         self.numTopFeatures = numTopFeatures
         self.selectorType = selectorType
         self.percentile = percentile
-        self.alpha = alpha
+        self.fpr = fpr
 
     @since('2.1.0')
     def setNumTopFeatures(self, numTopFeatures):
         """
         set numTopFeature for feature selection by number of top features.
-        Only applicable when selectorType = "kbest".
+        Only applicable when selectorType = "numTopFeatures".
         """
         self.numTopFeatures = int(numTopFeatures)
         return self
@@ -334,19 +330,19 @@ class ChiSqSelector(object):
         return self
 
     @since('2.1.0')
-    def setAlpha(self, alpha):
+    def setFpr(self, fpr):
         """
-        set alpha [0.0, 1.0] for feature selection by FPR.
+        set FPR [0.0, 1.0] for feature selection by FPR.
         Only applicable when selectorType = "fpr".
         """
-        self.alpha = float(alpha)
+        self.fpr = float(fpr)
         return self
 
     @since('2.1.0')
     def setSelectorType(self, selectorType):
         """
         set the selector type of the ChisqSelector.
-        Supported options: "kbest" (default), "percentile" and "fpr".
+        Supported options: "numTopFeatures" (default), "percentile", "fpr".
         """
         self.selectorType = str(selectorType)
         return self
@@ -362,7 +358,7 @@ class ChiSqSelector(object):
                      Apply feature discretizer before using this function.
         """
         jmodel = callMLlibFunc("fitChiSqSelector", self.selectorType, self.numTopFeatures,
-                               self.percentile, self.alpha, data)
+                               self.percentile, self.fpr, data)
         return ChiSqSelectorModel(jmodel)
author	Joseph K. Bradley <joseph@databricks.com>	2016-11-01 17:00:00 -0700
committer	Joseph K. Bradley <joseph@databricks.com>	2016-11-01 17:00:00 -0700
commit	91c33a0ca5c8287f710076ed7681e5aa13ca068f (patch)
tree	ea3e24b067e3b7ba1f340f0ed7906c80a64a36bd /python
parent	b929537b6eb0f8f34497c3dbceea8045bf5dffdb (diff)
download	spark-91c33a0ca5c8287f710076ed7681e5aa13ca068f.tar.gz spark-91c33a0ca5c8287f710076ed7681e5aa13ca068f.tar.bz2 spark-91c33a0ca5c8287f710076ed7681e5aa13ca068f.zip