[SPARK-19635][ML] DataFrame-based API for chi square test

## What changes were proposed in this pull request?

Wrapper taking and returning a DataFrame

## How was this patch tested?

Copied unit tests from RDD-based API

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #17110 from jkbradley/df-hypotests.
author: Joseph K. Bradley <joseph@databricks.com> 2017-03-16 17:10:15 -0700
committer: Joseph K. Bradley <joseph@databricks.com> 2017-03-16 17:10:15 -0700
commit: 4c3200546c5c55e671988a957011417ba76a0600 (patch)
tree: 6d5b0aafc2fb302d0829c8da5b039e45646cf332 /mllib/src/main
parent: 2ea214dd05da929840c15891e908384cfa695ca8 (diff)
download: spark-4c3200546c5c55e671988a957011417ba76a0600.tar.gz
spark-4c3200546c5c55e671988a957011417ba76a0600.tar.bz2
spark-4c3200546c5c55e671988a957011417ba76a0600.zip
2 files changed, 87 insertions, 2 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquare.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquare.scala
new file mode 100644
index 0000000000..c3865ce6a9
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/stat/ChiSquare.scala
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.stat
+
+import org.apache.spark.annotation.{Experimental, Since}
+import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
+import org.apache.spark.ml.util.SchemaUtils
+import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
+import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint}
+import org.apache.spark.mllib.stat.{Statistics => OldStatistics}
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.functions.col
+
+
+/**
+ * :: Experimental ::
+ *
+ * Chi-square hypothesis testing for categorical data.
+ *
+ * See <a href="http://en.wikipedia.org/wiki/Chi-squared_test">Wikipedia</a> for more information
+ * on the Chi-squared test.
+ */
+@Experimental
+@Since("2.2.0")
+object ChiSquare {
+
+  /** Row type defining the schema of the single-row DataFrame returned by [[test]]. */
+  private case class ChiSquareResult(
+      pValues: Vector,
+      degreesOfFreedom: Array[Int],
+      statistics: Vector)
+
+  /**
+   * Conduct Pearson's independence test for every feature against the label in the input dataset.
+   * For each feature, the (feature, label) pairs are converted into a contingency matrix for which
+   * the Chi-squared statistic is computed. All label and feature values must be categorical.
+   *
+   * The null hypothesis is that the occurrence of the outcomes is statistically independent.
+   *
+   * @param dataset  DataFrame of categorical labels and categorical features.
+   *                 Real-valued features will be treated as categorical for each distinct value.
+   * @param featuresCol  Name of features column in dataset, of type `Vector` (`VectorUDT`)
+   * @param labelCol  Name of label column in dataset, of any numerical type (cast to double)
+   * @return DataFrame containing the test result for every feature against the label.
+   *         This DataFrame will contain a single Row with the following fields:
+   *          - `pValues: Vector`
+   *          - `degreesOfFreedom: Array[Int]`
+   *          - `statistics: Vector`
+   *         Each of these fields has one value per feature.
+   */
+  @Since("2.2.0")
+  def test(dataset: DataFrame, featuresCol: String, labelCol: String): DataFrame = {
+    val spark = dataset.sparkSession
+    import spark.implicits._
+
+    SchemaUtils.checkColumnType(dataset.schema, featuresCol, new VectorUDT)
+    SchemaUtils.checkNumericType(dataset.schema, labelCol)
+    val rdd = dataset.select(col(labelCol).cast("double"), col(featuresCol)).as[(Double, Vector)]
+      .rdd.map { case (label, features) => OldLabeledPoint(label, OldVectors.fromML(features)) }
+    val testResults = OldStatistics.chiSqTest(rdd)  // delegates to the RDD-based API
+    val pValues: Vector = Vectors.dense(testResults.map(_.pValue))
+    val degreesOfFreedom: Array[Int] = testResults.map(_.degreesOfFreedom)
+    val statistics: Vector = Vectors.dense(testResults.map(_.statistic))
+    spark.createDataFrame(Seq(ChiSquareResult(pValues, degreesOfFreedom, statistics)))
+  }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala
index 9a63b8a5d6..ee51248e53 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala
@@ -41,7 +41,7 @@ import org.apache.spark.rdd.RDD
  *
  * More information on Chi-squared test: http://en.wikipedia.org/wiki/Chi-squared_test
  */
-private[stat] object ChiSqTest extends Logging {
+private[spark] object ChiSqTest extends Logging {
 
   /**
    * @param name String name for the method.
@@ -71,6 +71,11 @@ private[stat] object ChiSqTest extends Logging {
   }
 
   /**
+   * Max number of categories when indexing labels and features
+   */
+  private[spark] val maxCategories: Int = 10000
+
+  /**
    * Conduct Pearson's independence test for each feature against the label across the input RDD.
    * The contingency table is constructed from the raw (feature, label) pairs and used to conduct
    * the independence test.
@@ -78,7 +83,6 @@ private[stat] object ChiSqTest extends Logging {
    */
   def chiSquaredFeatures(data: RDD[LabeledPoint],
       methodName: String = PEARSON.name): Array[ChiSqTestResult] = {
-    val maxCategories = 10000
     val numCols = data.first().features.size
     val results = new Array[ChiSqTestResult](numCols)
     var labels: Map[Double, Int] = null
author	Joseph K. Bradley <joseph@databricks.com>	2017-03-16 17:10:15 -0700
committer	Joseph K. Bradley <joseph@databricks.com>	2017-03-16 17:10:15 -0700
commit	4c3200546c5c55e671988a957011417ba76a0600 (patch)
tree	6d5b0aafc2fb302d0829c8da5b039e45646cf332 /mllib/src/main
parent	2ea214dd05da929840c15891e908384cfa695ca8 (diff)
download	spark-4c3200546c5c55e671988a957011417ba76a0600.tar.gz spark-4c3200546c5c55e671988a957011417ba76a0600.tar.bz2 spark-4c3200546c5c55e671988a957011417ba76a0600.zip