aboutsummaryrefslogtreecommitdiff
path: root/mllib/src/main
diff options
context:
space:
mode:
authorDoris Xin <doris.s.xin@gmail.com>2014-08-11 19:22:14 -0700
committerXiangrui Meng <meng@databricks.com>2014-08-11 19:22:14 -0700
commit32638b5e74e02410831b391f555223f90c830498 (patch)
tree5fabdd0a7b2741da6e84160f3121b6908a86f78b /mllib/src/main
parent7712e724ad69dd0b83754e938e9799d13a4d43b9 (diff)
downloadspark-32638b5e74e02410831b391f555223f90c830498.tar.gz
spark-32638b5e74e02410831b391f555223f90c830498.tar.bz2
spark-32638b5e74e02410831b391f555223f90c830498.zip
[SPARK-2515][mllib] Chi Squared test
Author: Doris Xin <doris.s.xin@gmail.com> Closes #1733 from dorx/chisquare and squashes the following commits: cafb3a7 [Doris Xin] fixed p-value for extreme case. d286783 [Doris Xin] Merge branch 'master' into chisquare e95e485 [Doris Xin] reviewer comments. 7dde711 [Doris Xin] ChiSqTestResult renaming and changed to Class 80d03e2 [Doris Xin] Reviewer comments. c39eeb5 [Doris Xin] units passed with updated API e90d90a [Doris Xin] Merge branch 'master' into chisquare 7eea80b [Doris Xin] WIP d64c2fb [Doris Xin] Merge branch 'master' into chisquare 5686082 [Doris Xin] facelift bc7eb2e [Doris Xin] unit passed; still need docs and some refactoring 50703a5 [Doris Xin] merge master 4e4e361 [Doris Xin] WIP e6b83f3 [Doris Xin] reviewer comments 3d61582 [Doris Xin] input names 706d436 [Doris Xin] Added API for RDD[Vector] 6598379 [Doris Xin] API and code structure. ff17423 [Doris Xin] WIP
Diffstat (limited to 'mllib/src/main')
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala64
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala221
-rw-r--r--mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala88
3 files changed, 373 insertions, 0 deletions
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
index f416a9fbb3..cf8679610e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
@@ -19,7 +19,9 @@ package org.apache.spark.mllib.stat
import org.apache.spark.annotation.Experimental
import org.apache.spark.mllib.linalg.{Matrix, Vector}
+import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.correlation.Correlations
+import org.apache.spark.mllib.stat.test.{ChiSqTest, ChiSqTestResult}
import org.apache.spark.rdd.RDD
/**
@@ -89,4 +91,66 @@ object Statistics {
*/
@Experimental
def corr(x: RDD[Double], y: RDD[Double], method: String): Double = Correlations.corr(x, y, method)
+
+ /**
+ * :: Experimental ::
+ * Conduct Pearson's chi-squared goodness of fit test of the observed data against the
+ * expected distribution.
+ *
+ * Note: the two input Vectors need to have the same size.
+ * `observed` cannot contain negative values.
+ * `expected` cannot contain nonpositive values.
+ *
+ * @param observed Vector containing the observed categorical counts/relative frequencies.
+ * @param expected Vector containing the expected categorical counts/relative frequencies.
+ * `expected` is rescaled if the `expected` sum differs from the `observed` sum.
+ * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
+ * the method used, and the null hypothesis.
+ */
+ @Experimental
+ def chiSqTest(observed: Vector, expected: Vector): ChiSqTestResult = {
+ ChiSqTest.chiSquared(observed, expected)
+ }
+
+ /**
+ * :: Experimental ::
+ * Conduct Pearson's chi-squared goodness of fit test of the observed data against the uniform
+ * distribution, with each category having an expected frequency of `1 / observed.size`.
+ *
+ * Note: `observed` cannot contain negative values.
+ *
+ * @param observed Vector containing the observed categorical counts/relative frequencies.
+ * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
+ * the method used, and the null hypothesis.
+ */
+ @Experimental
+ def chiSqTest(observed: Vector): ChiSqTestResult = ChiSqTest.chiSquared(observed)
+
+ /**
+ * :: Experimental ::
+ * Conduct Pearson's independence test on the input contingency matrix, which cannot contain
+ * negative entries or columns or rows that sum up to 0.
+ *
+ * @param observed The contingency matrix (containing either counts or relative frequencies).
+ * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
+ * the method used, and the null hypothesis.
+ */
+ @Experimental
+ def chiSqTest(observed: Matrix): ChiSqTestResult = ChiSqTest.chiSquaredMatrix(observed)
+
+ /**
+ * :: Experimental ::
+ * Conduct Pearson's independence test for every feature against the label across the input RDD.
+ * For each feature, the (feature, label) pairs are converted into a contingency matrix for which
+ * the chi-squared statistic is computed.
+ *
+ * @param data an `RDD[LabeledPoint]` containing the labeled dataset with categorical features.
+ * Real-valued features will be treated as categorical for each distinct value.
+ * @return an array containing the ChiSquaredTestResult for every feature against the label.
+ * The order of the elements in the returned array reflects the order of input features.
+ */
+ @Experimental
+ def chiSqTest(data: RDD[LabeledPoint]): Array[ChiSqTestResult] = {
+ ChiSqTest.chiSquaredFeatures(data)
+ }
}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala
new file mode 100644
index 0000000000..8f67527374
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala
@@ -0,0 +1,221 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.stat.test
+
+import breeze.linalg.{DenseMatrix => BDM}
+import cern.jet.stat.Probability.chiSquareComplemented
+
+import org.apache.spark.Logging
+import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors}
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
+
+/**
+ * Conduct the chi-squared test for the input RDDs using the specified method.
+ * Goodness-of-fit test is conducted on two `Vectors`, whereas test of independence is conducted
+ * on an input of type `Matrix` in which independence between columns is assessed.
+ * We also provide a method for computing the chi-squared statistic between each feature and the
+ * label for an input `RDD[LabeledPoint]`, return an `Array[ChiSquaredTestResult]` of size =
+ * number of features in the inpuy RDD.
+ *
+ * Supported methods for goodness of fit: `pearson` (default)
+ * Supported methods for independence: `pearson` (default)
+ *
+ * More information on Chi-squared test: http://en.wikipedia.org/wiki/Chi-squared_test
+ */
+private[stat] object ChiSqTest extends Logging {
+
+ /**
+ * @param name String name for the method.
+ * @param chiSqFunc Function for computing the statistic given the observed and expected counts.
+ */
+ case class Method(name: String, chiSqFunc: (Double, Double) => Double)
+
+ // Pearson's chi-squared test: http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
+ val PEARSON = new Method("pearson", (observed: Double, expected: Double) => {
+ val dev = observed - expected
+ dev * dev / expected
+ })
+
+ // Null hypothesis for the two different types of chi-squared tests to be included in the result.
+ object NullHypothesis extends Enumeration {
+ type NullHypothesis = Value
+ val goodnessOfFit = Value("observed follows the same distribution as expected.")
+ val independence = Value("observations in each column are statistically independent.")
+ }
+
+ // Method identification based on input methodName string
+ private def methodFromString(methodName: String): Method = {
+ methodName match {
+ case PEARSON.name => PEARSON
+ case _ => throw new IllegalArgumentException("Unrecognized method for Chi squared test.")
+ }
+ }
+
+ /**
+ * Conduct Pearson's independence test for each feature against the label across the input RDD.
+ * The contingency table is constructed from the raw (feature, label) pairs and used to conduct
+ * the independence test.
+ * Returns an array containing the ChiSquaredTestResult for every feature against the label.
+ */
+ def chiSquaredFeatures(data: RDD[LabeledPoint],
+ methodName: String = PEARSON.name): Array[ChiSqTestResult] = {
+ val numCols = data.first().features.size
+ val results = new Array[ChiSqTestResult](numCols)
+ var labels: Map[Double, Int] = null
+ // At most 100 columns at a time
+ val batchSize = 100
+ var batch = 0
+ while (batch * batchSize < numCols) {
+ // The following block of code can be cleaned up and made public as
+ // chiSquared(data: RDD[(V1, V2)])
+ val startCol = batch * batchSize
+ val endCol = startCol + math.min(batchSize, numCols - startCol)
+ val pairCounts = data.flatMap { p =>
+ // assume dense vectors
+ p.features.toArray.slice(startCol, endCol).zipWithIndex.map { case (feature, col) =>
+ (col, feature, p.label)
+ }
+ }.countByValue()
+
+ if (labels == null) {
+ // Do this only once for the first column since labels are invariant across features.
+ labels =
+ pairCounts.keys.filter(_._1 == startCol).map(_._3).toArray.distinct.zipWithIndex.toMap
+ }
+ val numLabels = labels.size
+ pairCounts.keys.groupBy(_._1).map { case (col, keys) =>
+ val features = keys.map(_._2).toArray.distinct.zipWithIndex.toMap
+ val numRows = features.size
+ val contingency = new BDM(numRows, numLabels, new Array[Double](numRows * numLabels))
+ keys.foreach { case (_, feature, label) =>
+ val i = features(feature)
+ val j = labels(label)
+ contingency(i, j) += pairCounts((col, feature, label))
+ }
+ results(col) = chiSquaredMatrix(Matrices.fromBreeze(contingency), methodName)
+ }
+ batch += 1
+ }
+ results
+ }
+
+ /*
+ * Pearon's goodness of fit test on the input observed and expected counts/relative frequencies.
+ * Uniform distribution is assumed when `expected` is not passed in.
+ */
+ def chiSquared(observed: Vector,
+ expected: Vector = Vectors.dense(Array[Double]()),
+ methodName: String = PEARSON.name): ChiSqTestResult = {
+
+ // Validate input arguments
+ val method = methodFromString(methodName)
+ if (expected.size != 0 && observed.size != expected.size) {
+ throw new IllegalArgumentException("observed and expected must be of the same size.")
+ }
+ val size = observed.size
+ if (size > 1000) {
+ logWarning("Chi-squared approximation may not be accurate due to low expected frequencies "
+ + s" as a result of a large number of categories: $size.")
+ }
+ val obsArr = observed.toArray
+ val expArr = if (expected.size == 0) Array.tabulate(size)(_ => 1.0 / size) else expected.toArray
+ if (!obsArr.forall(_ >= 0.0)) {
+ throw new IllegalArgumentException("Negative entries disallowed in the observed vector.")
+ }
+ if (expected.size != 0 && ! expArr.forall(_ >= 0.0)) {
+ throw new IllegalArgumentException("Negative entries disallowed in the expected vector.")
+ }
+
+ // Determine the scaling factor for expected
+ val obsSum = obsArr.sum
+ val expSum = if (expected.size == 0.0) 1.0 else expArr.sum
+ val scale = if (math.abs(obsSum - expSum) < 1e-7) 1.0 else obsSum / expSum
+
+ // compute chi-squared statistic
+ val statistic = obsArr.zip(expArr).foldLeft(0.0) { case (stat, (obs, exp)) =>
+ if (exp == 0.0) {
+ if (obs == 0.0) {
+ throw new IllegalArgumentException("Chi-squared statistic undefined for input vectors due"
+ + " to 0.0 values in both observed and expected.")
+ } else {
+ return new ChiSqTestResult(0.0, size - 1, Double.PositiveInfinity, PEARSON.name,
+ NullHypothesis.goodnessOfFit.toString)
+ }
+ }
+ if (scale == 1.0) {
+ stat + method.chiSqFunc(obs, exp)
+ } else {
+ stat + method.chiSqFunc(obs, exp * scale)
+ }
+ }
+ val df = size - 1
+ val pValue = chiSquareComplemented(df, statistic)
+ new ChiSqTestResult(pValue, df, statistic, PEARSON.name, NullHypothesis.goodnessOfFit.toString)
+ }
+
+ /*
+ * Pearon's independence test on the input contingency matrix.
+ * TODO: optimize for SparseMatrix when it becomes supported.
+ */
+ def chiSquaredMatrix(counts: Matrix, methodName:String = PEARSON.name): ChiSqTestResult = {
+ val method = methodFromString(methodName)
+ val numRows = counts.numRows
+ val numCols = counts.numCols
+
+ // get row and column sums
+ val colSums = new Array[Double](numCols)
+ val rowSums = new Array[Double](numRows)
+ val colMajorArr = counts.toArray
+ var i = 0
+ while (i < colMajorArr.size) {
+ val elem = colMajorArr(i)
+ if (elem < 0.0) {
+ throw new IllegalArgumentException("Contingency table cannot contain negative entries.")
+ }
+ colSums(i / numRows) += elem
+ rowSums(i % numRows) += elem
+ i += 1
+ }
+ val total = colSums.sum
+
+ // second pass to collect statistic
+ var statistic = 0.0
+ var j = 0
+ while (j < colMajorArr.size) {
+ val col = j / numRows
+ val colSum = colSums(col)
+ if (colSum == 0.0) {
+ throw new IllegalArgumentException("Chi-squared statistic undefined for input matrix due to"
+ + s"0 sum in column [$col].")
+ }
+ val row = j % numRows
+ val rowSum = rowSums(row)
+ if (rowSum == 0.0) {
+ throw new IllegalArgumentException("Chi-squared statistic undefined for input matrix due to"
+ + s"0 sum in row [$row].")
+ }
+ val expected = colSum * rowSum / total
+ statistic += method.chiSqFunc(colMajorArr(j), expected)
+ j += 1
+ }
+ val df = (numCols - 1) * (numRows - 1)
+ val pValue = chiSquareComplemented(df, statistic)
+ new ChiSqTestResult(pValue, df, statistic, methodName, NullHypothesis.independence.toString)
+ }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala
new file mode 100644
index 0000000000..2f27862133
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.stat.test
+
+import org.apache.spark.annotation.Experimental
+
+/**
+ * :: Experimental ::
+ * Trait for hypothesis test results.
+ * @tparam DF Return type of `degreesOfFreedom`.
+ */
+@Experimental
+trait TestResult[DF] {
+
+ /**
+ * The probability of obtaining a test statistic result at least as extreme as the one that was
+ * actually observed, assuming that the null hypothesis is true.
+ */
+ def pValue: Double
+
+ /**
+ * Returns the degree(s) of freedom of the hypothesis test.
+ * Return type should be Number(e.g. Int, Double) or tuples of Numbers for toString compatibility.
+ */
+ def degreesOfFreedom: DF
+
+ /**
+ * Test statistic.
+ */
+ def statistic: Double
+
+ /**
+ * String explaining the hypothesis test result.
+ * Specific classes implementing this trait should override this method to output test-specific
+ * information.
+ */
+ override def toString: String = {
+
+ // String explaining what the p-value indicates.
+ val pValueExplain = if (pValue <= 0.01) {
+ "Very strong presumption against null hypothesis."
+ } else if (0.01 < pValue && pValue <= 0.05) {
+ "Strong presumption against null hypothesis."
+ } else if (0.05 < pValue && pValue <= 0.01) {
+ "Low presumption against null hypothesis."
+ } else {
+ "No presumption against null hypothesis."
+ }
+
+ s"degrees of freedom = ${degreesOfFreedom.toString} \n" +
+ s"statistic = $statistic \n" +
+ s"pValue = $pValue \n" + pValueExplain
+ }
+}
+
+/**
+ * :: Experimental ::
+ * Object containing the test results for the chi squared hypothesis test.
+ */
+@Experimental
+class ChiSqTestResult(override val pValue: Double,
+ override val degreesOfFreedom: Int,
+ override val statistic: Double,
+ val method: String,
+ val nullHypothesis: String) extends TestResult[Int] {
+
+ override def toString: String = {
+ "Chi squared test summary: \n" +
+ s"method: $method \n" +
+ s"null hypothesis: $nullHypothesis \n" +
+ super.toString
+ }
+}