[SPARK-17645][MLLIB][ML][FOLLOW-UP] document minor change

## What changes were proposed in this pull request?
Add an FDR test case to ml/feature/ChiSqSelectorSuite and improve some comments in the code. This is a follow-up PR for #15212.
## How was this patch tested?
Unit tests.
Author: Peng, Meng <peng.meng@intel.com>
Closes #16434 from mpjlu/fdr_fwe_update.
author: Peng, Meng <peng.meng@intel.com> 2017-01-10 13:09:58 +0000
committer: Sean Owen <sowen@cloudera.com> 2017-01-10 13:09:58 +0000
commit: 32286ba68af03af6b9ff50d5dece050e5417307a (patch)
tree: 85d945c4bc531e91ae05bda2c85559660b6d02c8 /mllib/src
parent: acfc5f354332107cc744fb636e3730f6fc48b2fe (diff)
download: spark-32286ba68af03af6b9ff50d5dece050e5417307a.tar.gz
spark-32286ba68af03af6b9ff50d5dece050e5417307a.tar.bz2
spark-32286ba68af03af6b9ff50d5dece050e5417307a.zip
3 files changed, 84 insertions, 23 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
index 353bd186da..16abc4949d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
@@ -143,13 +143,13 @@ private[feature] trait ChiSqSelectorParams extends Params
  * `fdr`, `fwe`.
  *  - `numTopFeatures` chooses a fixed number of top features according to a chi-squared test.
  *  - `percentile` is similar but chooses a fraction of all features instead of a fixed number.
- *  - `fpr` chooses all features whose p-value is below a threshold, thus controlling the false
+ *  - `fpr` chooses all features whose p-values are below a threshold, thus controlling the false
  *    positive rate of selection.
  *  - `fdr` uses the [Benjamini-Hochberg procedure]
  *    (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure)
  *    to choose all features whose false discovery rate is below a threshold.
- *  - `fwe` chooses all features whose p-values is below a threshold,
- *    thus controlling the family-wise error rate of selection.
+ *  - `fwe` chooses all features whose p-values are below a threshold. The threshold is scaled by
+ *    1/numFeatures, thus controlling the family-wise error rate of selection.
  * By default, the selection method is `numTopFeatures`, with the default number of top features
  * set to 50.
  */
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index 9dea3c3e84..862be6f37e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -175,13 +175,13 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] {
  * `fdr`, `fwe`.
  *  - `numTopFeatures` chooses a fixed number of top features according to a chi-squared test.
  *  - `percentile` is similar but chooses a fraction of all features instead of a fixed number.
- *  - `fpr` chooses all features whose p-value is below a threshold, thus controlling the false
+ *  - `fpr` chooses all features whose p-values are below a threshold, thus controlling the false
  *    positive rate of selection.
  *  - `fdr` uses the [Benjamini-Hochberg procedure]
  *    (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure)
  *    to choose all features whose false discovery rate is below a threshold.
- *  - `fwe` chooses all features whose p-values is below a threshold,
- *    thus controlling the family-wise error rate of selection.
+ *  - `fwe` chooses all features whose p-values are below a threshold. The threshold is scaled by
+ *    1/numFeatures, thus controlling the family-wise error rate of selection.
  * By default, the selection method is `numTopFeatures`, with the default number of top features
  * set to 50.
  */
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala
index f6c68b9314..482e5d5426 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala
@@ -35,22 +35,77 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext
 
     // Toy dataset, including the top feature for a chi-squared test.
     // These data are chosen such that each feature's test has a distinct p-value.
-    /*  To verify the results with R, run:
-      library(stats)
-      x1 <- c(8.0, 0.0, 0.0, 7.0, 8.0)
-      x2 <- c(7.0, 9.0, 9.0, 9.0, 7.0)
-      x3 <- c(0.0, 6.0, 8.0, 5.0, 3.0)
-      y <- c(0.0, 1.0, 1.0, 2.0, 2.0)
-      chisq.test(x1,y)
-      chisq.test(x2,y)
-      chisq.test(x3,y)
+    /*
+     *  Contingency tables
+     *  feature1 = {6.0, 0.0, 8.0}
+     *  class  0 1 2
+     *    6.0||1|0|0|
+     *    0.0||0|3|0|
+     *    8.0||0|0|2|
+     *  degrees of freedom = 4, statistic = 12, pValue = 0.017
+     *
+     *  feature2 = {7.0, 9.0}
+     *  class  0 1 2
+     *    7.0||1|0|0|
+     *    9.0||0|3|2|
+     *  degrees of freedom = 2, statistic = 6, pValue = 0.049
+     *
+     *  feature3 = {0.0, 6.0, 3.0, 8.0}
+     *  class  0 1 2
+     *    0.0||1|0|0|
+     *    6.0||0|1|2|
+     *    3.0||0|1|0|
+     *    8.0||0|1|0|
+     *  degrees of freedom = 6, statistic = 8.66, pValue = 0.193
+     *
+     *  feature4 = {7.0, 0.0, 5.0, 4.0}
+     *  class  0 1 2
+     *    7.0||1|0|0|
+     *    0.0||0|2|0|
+     *    5.0||0|1|1|
+     *    4.0||0|0|1|
+     *  degrees of freedom = 6, statistic = 9.5, pValue = 0.147
+     *
+     *  feature5 = {6.0, 5.0, 4.0, 0.0}
+     *  class  0 1 2
+     *    6.0||1|1|0|
+     *    5.0||0|2|0|
+     *    4.0||0|0|1|
+     *    0.0||0|0|1|
+     *  degrees of freedom = 6, statistic = 8.0, pValue = 0.238
+     *
+     *  feature6 = {0.0, 9.0, 5.0, 4.0}
+     *  class  0 1 2
+     *    0.0||1|0|1|
+     *    9.0||0|1|0|
+     *    5.0||0|1|0|
+     *    4.0||0|1|1|
+     *  degrees of freedom = 6, statistic = 5, pValue = 0.54
+     *
+     *  To verify the results with R, run:
+     *  library(stats)
+     *  x1 <- c(6.0, 0.0, 0.0, 0.0, 8.0, 8.0)
+     *  x2 <- c(7.0, 9.0, 9.0, 9.0, 9.0, 9.0)
+     *  x3 <- c(0.0, 6.0, 3.0, 8.0, 6.0, 6.0)
+     *  x4 <- c(7.0, 0.0, 0.0, 5.0, 5.0, 4.0)
+     *  x5 <- c(6.0, 5.0, 5.0, 6.0, 4.0, 0.0)
+     *  x6 <- c(0.0, 9.0, 5.0, 4.0, 4.0, 0.0)
+     *  y <- c(0.0, 1.0, 1.0, 1.0, 2.0, 2.0)
+     *  chisq.test(x1,y)
+     *  chisq.test(x2,y)
+     *  chisq.test(x3,y)
+     *  chisq.test(x4,y)
+     *  chisq.test(x5,y)
+     *  chisq.test(x6,y)
      */
+
     dataset = spark.createDataFrame(Seq(
-      (0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0))), Vectors.dense(8.0)),
-      (1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0))), Vectors.dense(0.0)),
-      (1.0, Vectors.dense(Array(0.0, 9.0, 8.0)), Vectors.dense(0.0)),
-      (2.0, Vectors.dense(Array(7.0, 9.0, 5.0)), Vectors.dense(7.0)),
-      (2.0, Vectors.dense(Array(8.0, 7.0, 3.0)), Vectors.dense(8.0))
+      (0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0))), Vectors.dense(6.0)),
+      (1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0))), Vectors.dense(0.0)),
+      (1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0))), Vectors.dense(0.0)),
+      (1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0)), Vectors.dense(0.0)),
+      (2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0)), Vectors.dense(8.0)),
+      (2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)), Vectors.dense(8.0))
     )).toDF("label", "features", "topFeature")
   }
 
@@ -69,19 +124,25 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext
 
   test("Test Chi-Square selector: percentile") {
     val selector = new ChiSqSelector()
-      .setOutputCol("filtered").setSelectorType("percentile").setPercentile(0.34)
+      .setOutputCol("filtered").setSelectorType("percentile").setPercentile(0.17)
     ChiSqSelectorSuite.testSelector(selector, dataset)
   }
 
   test("Test Chi-Square selector: fpr") {
     val selector = new ChiSqSelector()
-      .setOutputCol("filtered").setSelectorType("fpr").setFpr(0.2)
+      .setOutputCol("filtered").setSelectorType("fpr").setFpr(0.02)
+    ChiSqSelectorSuite.testSelector(selector, dataset)
+  }
+
+  test("Test Chi-Square selector: fdr") {
+    val selector = new ChiSqSelector()
+      .setOutputCol("filtered").setSelectorType("fdr").setFdr(0.12)
     ChiSqSelectorSuite.testSelector(selector, dataset)
   }
 
   test("Test Chi-Square selector: fwe") {
     val selector = new ChiSqSelector()
-      .setOutputCol("filtered").setSelectorType("fwe").setFwe(0.6)
+      .setOutputCol("filtered").setSelectorType("fwe").setFwe(0.12)
     ChiSqSelectorSuite.testSelector(selector, dataset)
   }
author	Peng, Meng <peng.meng@intel.com>	2017-01-10 13:09:58 +0000
committer	Sean Owen <sowen@cloudera.com>	2017-01-10 13:09:58 +0000
commit	32286ba68af03af6b9ff50d5dece050e5417307a (patch)
tree	85d945c4bc531e91ae05bda2c85559660b6d02c8 /mllib/src
parent	acfc5f354332107cc744fb636e3730f6fc48b2fe (diff)
download	spark-32286ba68af03af6b9ff50d5dece050e5417307a.tar.gz spark-32286ba68af03af6b9ff50d5dece050e5417307a.tar.bz2 spark-32286ba68af03af6b9ff50d5dece050e5417307a.zip