aboutsummaryrefslogtreecommitdiff
path: root/mllib
diff options
context:
space:
mode:
authorDoris Xin <doris.s.xin@gmail.com>2014-08-11 19:22:14 -0700
committerXiangrui Meng <meng@databricks.com>2014-08-11 19:22:14 -0700
commit32638b5e74e02410831b391f555223f90c830498 (patch)
tree5fabdd0a7b2741da6e84160f3121b6908a86f78b /mllib
parent7712e724ad69dd0b83754e938e9799d13a4d43b9 (diff)
downloadspark-32638b5e74e02410831b391f555223f90c830498.tar.gz
spark-32638b5e74e02410831b391f555223f90c830498.tar.bz2
spark-32638b5e74e02410831b391f555223f90c830498.zip
[SPARK-2515][mllib] Chi Squared test
Author: Doris Xin <doris.s.xin@gmail.com> Closes #1733 from dorx/chisquare and squashes the following commits: cafb3a7 [Doris Xin] fixed p-value for extreme case. d286783 [Doris Xin] Merge branch 'master' into chisquare e95e485 [Doris Xin] reviewer comments. 7dde711 [Doris Xin] ChiSqTestResult renaming and changed to Class 80d03e2 [Doris Xin] Reviewer comments. c39eeb5 [Doris Xin] units passed with updated API e90d90a [Doris Xin] Merge branch 'master' into chisquare 7eea80b [Doris Xin] WIP d64c2fb [Doris Xin] Merge branch 'master' into chisquare 5686082 [Doris Xin] facelift bc7eb2e [Doris Xin] unit passed; still need docs and some refactoring 50703a5 [Doris Xin] merge master 4e4e361 [Doris Xin] WIP e6b83f3 [Doris Xin] reviewer comments 3d61582 [Doris Xin] input names 706d436 [Doris Xin] Added API for RDD[Vector] 6598379 [Doris Xin] API and code structure. ff17423 [Doris Xin] WIP
Diffstat (limited to 'mllib')
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala64
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala221
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala88
-rw-r--r--mllib/src/test/scala/org/apache/spark/mllib/stat/HypothesisTestSuite.scala139
4 files changed, 512 insertions, 0 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
index f416a9fbb3..cf8679610e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
@@ -19,7 +19,9 @@ package org.apache.spark.mllib.stat
import org.apache.spark.annotation.Experimental
import org.apache.spark.mllib.linalg.{Matrix, Vector}
+import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.correlation.Correlations
+import org.apache.spark.mllib.stat.test.{ChiSqTest, ChiSqTestResult}
import org.apache.spark.rdd.RDD
/**
@@ -89,4 +91,66 @@ object Statistics {
*/
@Experimental
def corr(x: RDD[Double], y: RDD[Double], method: String): Double = Correlations.corr(x, y, method)
+
+ /**
+ * :: Experimental ::
+ * Conduct Pearson's chi-squared goodness of fit test of the observed data against the
+ * expected distribution.
+ *
+ * Note: the two input Vectors need to have the same size.
+ * `observed` cannot contain negative values.
+ * `expected` cannot contain nonpositive values.
+ *
+ * @param observed Vector containing the observed categorical counts/relative frequencies.
+ * @param expected Vector containing the expected categorical counts/relative frequencies.
+ * `expected` is rescaled if the `expected` sum differs from the `observed` sum.
+ * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
+ * the method used, and the null hypothesis.
+ */
+ @Experimental
+ def chiSqTest(observed: Vector, expected: Vector): ChiSqTestResult = {
+ ChiSqTest.chiSquared(observed, expected)
+ }
+
+ /**
+ * :: Experimental ::
+ * Conduct Pearson's chi-squared goodness of fit test of the observed data against the uniform
+ * distribution, with each category having an expected frequency of `1 / observed.size`.
+ *
+ * Note: `observed` cannot contain negative values.
+ *
+ * @param observed Vector containing the observed categorical counts/relative frequencies.
+ * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
+ * the method used, and the null hypothesis.
+ */
+ @Experimental
+ def chiSqTest(observed: Vector): ChiSqTestResult = ChiSqTest.chiSquared(observed)
+
+ /**
+ * :: Experimental ::
+ * Conduct Pearson's independence test on the input contingency matrix, which cannot contain
+ * negative entries or columns or rows that sum up to 0.
+ *
+ * @param observed The contingency matrix (containing either counts or relative frequencies).
+ * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
+ * the method used, and the null hypothesis.
+ */
+ @Experimental
+ def chiSqTest(observed: Matrix): ChiSqTestResult = ChiSqTest.chiSquaredMatrix(observed)
+
+ /**
+ * :: Experimental ::
+ * Conduct Pearson's independence test for every feature against the label across the input RDD.
+ * For each feature, the (feature, label) pairs are converted into a contingency matrix for which
+ * the chi-squared statistic is computed.
+ *
+ * @param data an `RDD[LabeledPoint]` containing the labeled dataset with categorical features.
+ * Real-valued features will be treated as categorical for each distinct value.
+ * @return an array containing the ChiSquaredTestResult for every feature against the label.
+ * The order of the elements in the returned array reflects the order of input features.
+ */
+ @Experimental
+ def chiSqTest(data: RDD[LabeledPoint]): Array[ChiSqTestResult] = {
+ ChiSqTest.chiSquaredFeatures(data)
+ }
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala
new file mode 100644
index 0000000000..8f67527374
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala
@@ -0,0 +1,221 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.stat.test
+
+import breeze.linalg.{DenseMatrix => BDM}
+import cern.jet.stat.Probability.chiSquareComplemented
+
+import org.apache.spark.Logging
+import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors}
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+
+/**
+ * Conduct the chi-squared test for the input RDDs using the specified method.
+ * Goodness-of-fit test is conducted on two `Vectors`, whereas test of independence is conducted
+ * on an input of type `Matrix` in which independence between columns is assessed.
+ * We also provide a method for computing the chi-squared statistic between each feature and the
+ * label for an input `RDD[LabeledPoint]`, return an `Array[ChiSquaredTestResult]` of size =
+ * number of features in the inpuy RDD.
+ *
+ * Supported methods for goodness of fit: `pearson` (default)
+ * Supported methods for independence: `pearson` (default)
+ *
+ * More information on Chi-squared test: http://en.wikipedia.org/wiki/Chi-squared_test
+ */
+private[stat] object ChiSqTest extends Logging {
+
+ /**
+ * @param name String name for the method.
+ * @param chiSqFunc Function for computing the statistic given the observed and expected counts.
+ */
+ case class Method(name: String, chiSqFunc: (Double, Double) => Double)
+
+ // Pearson's chi-squared test: http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
+ val PEARSON = new Method("pearson", (observed: Double, expected: Double) => {
+ val dev = observed - expected
+ dev * dev / expected
+ })
+
+ // Null hypothesis for the two different types of chi-squared tests to be included in the result.
+ object NullHypothesis extends Enumeration {
+ type NullHypothesis = Value
+ val goodnessOfFit = Value("observed follows the same distribution as expected.")
+ val independence = Value("observations in each column are statistically independent.")
+ }
+
+ // Method identification based on input methodName string
+ private def methodFromString(methodName: String): Method = {
+ methodName match {
+ case PEARSON.name => PEARSON
+ case _ => throw new IllegalArgumentException("Unrecognized method for Chi squared test.")
+ }
+ }
+
+ /**
+ * Conduct Pearson's independence test for each feature against the label across the input RDD.
+ * The contingency table is constructed from the raw (feature, label) pairs and used to conduct
+ * the independence test.
+ * Returns an array containing the ChiSquaredTestResult for every feature against the label.
+ */
+ def chiSquaredFeatures(data: RDD[LabeledPoint],
+ methodName: String = PEARSON.name): Array[ChiSqTestResult] = {
+ val numCols = data.first().features.size
+ val results = new Array[ChiSqTestResult](numCols)
+ var labels: Map[Double, Int] = null
+ // At most 100 columns at a time
+ val batchSize = 100
+ var batch = 0
+ while (batch * batchSize < numCols) {
+ // The following block of code can be cleaned up and made public as
+ // chiSquared(data: RDD[(V1, V2)])
+ val startCol = batch * batchSize
+ val endCol = startCol + math.min(batchSize, numCols - startCol)
+ val pairCounts = data.flatMap { p =>
+ // assume dense vectors
+ p.features.toArray.slice(startCol, endCol).zipWithIndex.map { case (feature, col) =>
+ (col, feature, p.label)
+ }
+ }.countByValue()
+
+ if (labels == null) {
+ // Do this only once for the first column since labels are invariant across features.
+ labels =
+ pairCounts.keys.filter(_._1 == startCol).map(_._3).toArray.distinct.zipWithIndex.toMap
+ }
+ val numLabels = labels.size
+ pairCounts.keys.groupBy(_._1).map { case (col, keys) =>
+ val features = keys.map(_._2).toArray.distinct.zipWithIndex.toMap
+ val numRows = features.size
+ val contingency = new BDM(numRows, numLabels, new Array[Double](numRows * numLabels))
+ keys.foreach { case (_, feature, label) =>
+ val i = features(feature)
+ val j = labels(label)
+ contingency(i, j) += pairCounts((col, feature, label))
+ }
+ results(col) = chiSquaredMatrix(Matrices.fromBreeze(contingency), methodName)
+ }
+ batch += 1
+ }
+ results
+ }
+
+ /*
+ * Pearon's goodness of fit test on the input observed and expected counts/relative frequencies.
+ * Uniform distribution is assumed when `expected` is not passed in.
+ */
+ def chiSquared(observed: Vector,
+ expected: Vector = Vectors.dense(Array[Double]()),
+ methodName: String = PEARSON.name): ChiSqTestResult = {
+
+ // Validate input arguments
+ val method = methodFromString(methodName)
+ if (expected.size != 0 && observed.size != expected.size) {
+ throw new IllegalArgumentException("observed and expected must be of the same size.")
+ }
+ val size = observed.size
+ if (size > 1000) {
+ logWarning("Chi-squared approximation may not be accurate due to low expected frequencies "
+ + s" as a result of a large number of categories: $size.")
+ }
+ val obsArr = observed.toArray
+ val expArr = if (expected.size == 0) Array.tabulate(size)(_ => 1.0 / size) else expected.toArray
+ if (!obsArr.forall(_ >= 0.0)) {
+ throw new IllegalArgumentException("Negative entries disallowed in the observed vector.")
+ }
+ if (expected.size != 0 && ! expArr.forall(_ >= 0.0)) {
+ throw new IllegalArgumentException("Negative entries disallowed in the expected vector.")
+ }
+
+ // Determine the scaling factor for expected
+ val obsSum = obsArr.sum
+ val expSum = if (expected.size == 0.0) 1.0 else expArr.sum
+ val scale = if (math.abs(obsSum - expSum) < 1e-7) 1.0 else obsSum / expSum
+
+ // compute chi-squared statistic
+ val statistic = obsArr.zip(expArr).foldLeft(0.0) { case (stat, (obs, exp)) =>
+ if (exp == 0.0) {
+ if (obs == 0.0) {
+ throw new IllegalArgumentException("Chi-squared statistic undefined for input vectors due"
+ + " to 0.0 values in both observed and expected.")
+ } else {
+ return new ChiSqTestResult(0.0, size - 1, Double.PositiveInfinity, PEARSON.name,
+ NullHypothesis.goodnessOfFit.toString)
+ }
+ }
+ if (scale == 1.0) {
+ stat + method.chiSqFunc(obs, exp)
+ } else {
+ stat + method.chiSqFunc(obs, exp * scale)
+ }
+ }
+ val df = size - 1
+ val pValue = chiSquareComplemented(df, statistic)
+ new ChiSqTestResult(pValue, df, statistic, PEARSON.name, NullHypothesis.goodnessOfFit.toString)
+ }
+
+ /*
+ * Pearon's independence test on the input contingency matrix.
+ * TODO: optimize for SparseMatrix when it becomes supported.
+ */
+ def chiSquaredMatrix(counts: Matrix, methodName:String = PEARSON.name): ChiSqTestResult = {
+ val method = methodFromString(methodName)
+ val numRows = counts.numRows
+ val numCols = counts.numCols
+
+ // get row and column sums
+ val colSums = new Array[Double](numCols)
+ val rowSums = new Array[Double](numRows)
+ val colMajorArr = counts.toArray
+ var i = 0
+ while (i < colMajorArr.size) {
+ val elem = colMajorArr(i)
+ if (elem < 0.0) {
+ throw new IllegalArgumentException("Contingency table cannot contain negative entries.")
+ }
+ colSums(i / numRows) += elem
+ rowSums(i % numRows) += elem
+ i += 1
+ }
+ val total = colSums.sum
+
+ // second pass to collect statistic
+ var statistic = 0.0
+ var j = 0
+ while (j < colMajorArr.size) {
+ val col = j / numRows
+ val colSum = colSums(col)
+ if (colSum == 0.0) {
+ throw new IllegalArgumentException("Chi-squared statistic undefined for input matrix due to"
+ + s"0 sum in column [$col].")
+ }
+ val row = j % numRows
+ val rowSum = rowSums(row)
+ if (rowSum == 0.0) {
+ throw new IllegalArgumentException("Chi-squared statistic undefined for input matrix due to"
+ + s"0 sum in row [$row].")
+ }
+ val expected = colSum * rowSum / total
+ statistic += method.chiSqFunc(colMajorArr(j), expected)
+ j += 1
+ }
+ val df = (numCols - 1) * (numRows - 1)
+ val pValue = chiSquareComplemented(df, statistic)
+ new ChiSqTestResult(pValue, df, statistic, methodName, NullHypothesis.independence.toString)
+ }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala
new file mode 100644
index 0000000000..2f27862133
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.stat.test
+
+import org.apache.spark.annotation.Experimental
+
+/**
+ * :: Experimental ::
+ * Trait for hypothesis test results.
+ * @tparam DF Return type of `degreesOfFreedom`.
+ */
+@Experimental
+trait TestResult[DF] {
+
+ /**
+ * The probability of obtaining a test statistic result at least as extreme as the one that was
+ * actually observed, assuming that the null hypothesis is true.
+ */
+ def pValue: Double
+
+ /**
+ * Returns the degree(s) of freedom of the hypothesis test.
+ * Return type should be Number(e.g. Int, Double) or tuples of Numbers for toString compatibility.
+ */
+ def degreesOfFreedom: DF
+
+ /**
+ * Test statistic.
+ */
+ def statistic: Double
+
+ /**
+ * String explaining the hypothesis test result.
+ * Specific classes implementing this trait should override this method to output test-specific
+ * information.
+ */
+ override def toString: String = {
+
+ // String explaining what the p-value indicates.
+ val pValueExplain = if (pValue <= 0.01) {
+ "Very strong presumption against null hypothesis."
+ } else if (0.01 < pValue && pValue <= 0.05) {
+ "Strong presumption against null hypothesis."
+ } else if (0.05 < pValue && pValue <= 0.01) {
+ "Low presumption against null hypothesis."
+ } else {
+ "No presumption against null hypothesis."
+ }
+
+ s"degrees of freedom = ${degreesOfFreedom.toString} \n" +
+ s"statistic = $statistic \n" +
+ s"pValue = $pValue \n" + pValueExplain
+ }
+}
+
+/**
+ * :: Experimental ::
+ * Object containing the test results for the chi squared hypothesis test.
+ */
+@Experimental
+class ChiSqTestResult(override val pValue: Double,
+ override val degreesOfFreedom: Int,
+ override val statistic: Double,
+ val method: String,
+ val nullHypothesis: String) extends TestResult[Int] {
+
+ override def toString: String = {
+ "Chi squared test summary: \n" +
+ s"method: $method \n" +
+ s"null hypothesis: $nullHypothesis \n" +
+ super.toString
+ }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/HypothesisTestSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/HypothesisTestSuite.scala
new file mode 100644
index 0000000000..5bd0521298
--- /dev/null
+++ b/mllib/src/test/scala/org/apache/spark/mllib/stat/HypothesisTestSuite.scala
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.stat
+
+import org.scalatest.FunSuite
+
+import org.apache.spark.mllib.linalg.{DenseVector, Matrices, Vectors}
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.stat.test.ChiSqTest
+import org.apache.spark.mllib.util.LocalSparkContext
+import org.apache.spark.mllib.util.TestingUtils._
+
+class HypothesisTestSuite extends FunSuite with LocalSparkContext {
+
+ test("chi squared pearson goodness of fit") {
+
+ val observed = new DenseVector(Array[Double](4, 6, 5))
+ val pearson = Statistics.chiSqTest(observed)
+
+ // Results validated against the R command `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))`
+ assert(pearson.statistic === 0.4)
+ assert(pearson.degreesOfFreedom === 2)
+ assert(pearson.pValue ~== 0.8187 relTol 1e-4)
+ assert(pearson.method === ChiSqTest.PEARSON.name)
+ assert(pearson.nullHypothesis === ChiSqTest.NullHypothesis.goodnessOfFit.toString)
+
+ // different expected and observed sum
+ val observed1 = new DenseVector(Array[Double](21, 38, 43, 80))
+ val expected1 = new DenseVector(Array[Double](3, 5, 7, 20))
+ val pearson1 = Statistics.chiSqTest(observed1, expected1)
+
+ // Results validated against the R command
+ // `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))`
+ assert(pearson1.statistic ~== 14.1429 relTol 1e-4)
+ assert(pearson1.degreesOfFreedom === 3)
+ assert(pearson1.pValue ~== 0.002717 relTol 1e-4)
+ assert(pearson1.method === ChiSqTest.PEARSON.name)
+ assert(pearson1.nullHypothesis === ChiSqTest.NullHypothesis.goodnessOfFit.toString)
+
+ // Vectors with different sizes
+ val observed3 = new DenseVector(Array(1.0, 2.0, 3.0))
+ val expected3 = new DenseVector(Array(1.0, 2.0, 3.0, 4.0))
+ intercept[IllegalArgumentException](Statistics.chiSqTest(observed3, expected3))
+
+ // negative counts in observed
+ val negObs = new DenseVector(Array(1.0, 2.0, 3.0, -4.0))
+ intercept[IllegalArgumentException](Statistics.chiSqTest(negObs, expected1))
+
+ // count = 0.0 in expected but not observed
+ val zeroExpected = new DenseVector(Array(1.0, 0.0, 3.0))
+ val inf = Statistics.chiSqTest(observed, zeroExpected)
+ assert(inf.statistic === Double.PositiveInfinity)
+ assert(inf.degreesOfFreedom === 2)
+ assert(inf.pValue === 0.0)
+ assert(inf.method === ChiSqTest.PEARSON.name)
+ assert(inf.nullHypothesis === ChiSqTest.NullHypothesis.goodnessOfFit.toString)
+
+ // 0.0 in expected and observed simultaneously
+ val zeroObserved = new DenseVector(Array(2.0, 0.0, 1.0))
+ intercept[IllegalArgumentException](Statistics.chiSqTest(zeroObserved, zeroExpected))
+ }
+
+ test("chi squared pearson matrix independence") {
+ val data = Array(40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0)
+ // [[40.0, 56.0, 31.0, 30.0],
+ // [24.0, 32.0, 10.0, 15.0],
+ // [29.0, 42.0, 0.0, 12.0]]
+ val chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))
+ // Results validated against R command
+ // `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))`
+ assert(chi.statistic ~== 21.9958 relTol 1e-4)
+ assert(chi.degreesOfFreedom === 6)
+ assert(chi.pValue ~== 0.001213 relTol 1e-4)
+ assert(chi.method === ChiSqTest.PEARSON.name)
+ assert(chi.nullHypothesis === ChiSqTest.NullHypothesis.independence.toString)
+
+ // Negative counts
+ val negCounts = Array(4.0, 5.0, 3.0, -3.0)
+ intercept[IllegalArgumentException](Statistics.chiSqTest(Matrices.dense(2, 2, negCounts)))
+
+ // Row sum = 0.0
+ val rowZero = Array(0.0, 1.0, 0.0, 2.0)
+ intercept[IllegalArgumentException](Statistics.chiSqTest(Matrices.dense(2, 2, rowZero)))
+
+ // Column sum = 0.0
+ val colZero = Array(0.0, 0.0, 2.0, 2.0)
+ // IllegalArgumentException thrown here since it's thrown on driver, not inside a task
+ intercept[IllegalArgumentException](Statistics.chiSqTest(Matrices.dense(2, 2, colZero)))
+ }
+
+ test("chi squared pearson RDD[LabeledPoint]") {
+ // labels: 1.0 (2 / 6), 0.0 (4 / 6)
+ // feature1: 0.5 (1 / 6), 1.5 (2 / 6), 3.5 (3 / 6)
+ // feature2: 10.0 (1 / 6), 20.0 (1 / 6), 30.0 (2 / 6), 40.0 (2 / 6)
+ val data = Array(new LabeledPoint(0.0, Vectors.dense(0.5, 10.0)),
+ new LabeledPoint(0.0, Vectors.dense(1.5, 20.0)),
+ new LabeledPoint(1.0, Vectors.dense(1.5, 30.0)),
+ new LabeledPoint(0.0, Vectors.dense(3.5, 30.0)),
+ new LabeledPoint(0.0, Vectors.dense(3.5, 40.0)),
+ new LabeledPoint(1.0, Vectors.dense(3.5, 40.0)))
+ for (numParts <- List(2, 4, 6, 8)) {
+ val chi = Statistics.chiSqTest(sc.parallelize(data, numParts))
+ val feature1 = chi(0)
+ assert(feature1.statistic === 0.75)
+ assert(feature1.degreesOfFreedom === 2)
+ assert(feature1.pValue ~== 0.6873 relTol 1e-4)
+ assert(feature1.method === ChiSqTest.PEARSON.name)
+ assert(feature1.nullHypothesis === ChiSqTest.NullHypothesis.independence.toString)
+ val feature2 = chi(1)
+ assert(feature2.statistic === 1.5)
+ assert(feature2.degreesOfFreedom === 3)
+ assert(feature2.pValue ~== 0.6823 relTol 1e-4)
+ assert(feature2.method === ChiSqTest.PEARSON.name)
+ assert(feature2.nullHypothesis === ChiSqTest.NullHypothesis.independence.toString)
+ }
+
+ // Test that the right number of results is returned
+ val numCols = 321
+ val sparseData = Array(new LabeledPoint(0.0, Vectors.sparse(numCols, Seq((100, 2.0)))),
+ new LabeledPoint(0.0, Vectors.sparse(numCols, Seq((200, 1.0)))))
+ val chi = Statistics.chiSqTest(sc.parallelize(sparseData))
+ assert(chi.size === numCols)
+ }
+}