aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
author Davies Liu <davies@databricks.com> 2014-11-04 21:35:52 -0800
committer Xiangrui Meng <meng@databricks.com> 2014-11-04 21:36:05 -0800
commit f225b3cc18698b2ee8a94c8ffa0b6aca2fce7cf9 (patch)
tree 0daca2a2f01192eeca8e834d697b0193c1394ae9 /python
parent e5c7869f20139832ad9e636eaeb5e77da7297456 (diff)
download spark-f225b3cc18698b2ee8a94c8ffa0b6aca2fce7cf9.tar.gz
spark-f225b3cc18698b2ee8a94c8ffa0b6aca2fce7cf9.tar.bz2
spark-f225b3cc18698b2ee8a94c8ffa0b6aca2fce7cf9.zip
[SPARK-3964] [MLlib] [PySpark] add Hypothesis test Python API
``` pyspark.mllib.stat.Statistics.chiSqTest(observed, expected=None) :: Experimental :: If `observed` is Vector, conduct Pearson's chi-squared goodness of fit test of the observed data against the expected distribution, or against the uniform distribution (by default), with each category having an expected frequency of `1 / len(observed)`. (Note: `observed` cannot contain negative values) If `observed` is matrix, conduct Pearson's independence test on the input contingency matrix, which cannot contain negative entries or columns or rows that sum up to 0. If `observed` is an RDD of LabeledPoint, conduct Pearson's independence test for every feature against the label across the input RDD. For each feature, the (feature, label) pairs are converted into a contingency matrix for which the chi-squared statistic is computed. All label and feature values must be categorical. :param observed: it could be a vector containing the observed categorical counts/relative frequencies, or the contingency matrix (containing either counts or relative frequencies), or an RDD of LabeledPoint containing the labeled dataset with categorical features. Real-valued features will be treated as categorical for each distinct value. :param expected: Vector containing the expected categorical counts/relative frequencies. `expected` is rescaled if the `expected` sum differs from the `observed` sum. :return: ChiSqTestResult object containing the test statistic, degrees of freedom, p-value, the method used, and the null hypothesis. ``` Author: Davies Liu <davies@databricks.com> Closes #3091 from davies/his and squashes the following commits: 145d16c [Davies Liu] address comments 0ab0764 [Davies Liu] fix float 5097d54 [Davies Liu] add Hypothesis test Python API (cherry picked from commit c8abddc5164d8cf11cdede6ab3d5d1ea08028708) Signed-off-by: Xiangrui Meng <meng@databricks.com>
Diffstat (limited to 'python')
-rw-r--r-- python/pyspark/mllib/common.py 7
-rw-r--r-- python/pyspark/mllib/linalg.py 13
-rw-r--r-- python/pyspark/mllib/stat.py 137
3 files changed, 153 insertions, 4 deletions
diff --git a/python/pyspark/mllib/common.py b/python/pyspark/mllib/common.py
index dbe5f698b7..c6149fe391 100644
--- a/python/pyspark/mllib/common.py
+++ b/python/pyspark/mllib/common.py
@@ -98,8 +98,13 @@ def _java2py(sc, r):
jrdd = sc._jvm.SerDe.javaToPython(r)
return RDD(jrdd, sc)
- elif isinstance(r, (JavaArray, JavaList)) or clsName in _picklable_classes:
+ if clsName in _picklable_classes:
r = sc._jvm.SerDe.dumps(r)
+ elif isinstance(r, (JavaArray, JavaList)):
+ try:
+ r = sc._jvm.SerDe.dumps(r)
+ except Py4JJavaError:
+ pass # not picklable
if isinstance(r, bytearray):
r = PickleSerializer().loads(str(r))
diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
index c0c3dff31e..e35202dca0 100644
--- a/python/pyspark/mllib/linalg.py
+++ b/python/pyspark/mllib/linalg.py
@@ -33,7 +33,7 @@ from pyspark.sql import UserDefinedType, StructField, StructType, ArrayType, Dou
IntegerType, ByteType, Row
-__all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors']
+__all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors', 'DenseMatrix', 'Matrices']
if sys.version_info[:2] == (2, 7):
@@ -578,6 +578,8 @@ class DenseMatrix(Matrix):
def __init__(self, numRows, numCols, values):
Matrix.__init__(self, numRows, numCols)
assert len(values) == numRows * numCols
+ if not isinstance(values, array.array):
+ values = array.array('d', values)
self.values = values
def __reduce__(self):
@@ -596,6 +598,15 @@ class DenseMatrix(Matrix):
return np.reshape(self.values, (self.numRows, self.numCols), order='F')
+class Matrices(object):
+ @staticmethod
+ def dense(numRows, numCols, values):
+ """
+ Create a DenseMatrix
+ """
+ return DenseMatrix(numRows, numCols, values)
+
+
def _test():
import doctest
(failure_count, test_count) = doctest.testmod(optionflags=doctest.ELLIPSIS)
diff --git a/python/pyspark/mllib/stat.py b/python/pyspark/mllib/stat.py
index 15f0652f83..0700f8a8e5 100644
--- a/python/pyspark/mllib/stat.py
+++ b/python/pyspark/mllib/stat.py
@@ -19,11 +19,12 @@
Python package for statistical functions in MLlib.
"""
+from pyspark import RDD
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
-from pyspark.mllib.linalg import _convert_to_vector
+from pyspark.mllib.linalg import Matrix, _convert_to_vector
-__all__ = ['MultivariateStatisticalSummary', 'Statistics']
+__all__ = ['MultivariateStatisticalSummary', 'ChiSqTestResult', 'Statistics']
class MultivariateStatisticalSummary(JavaModelWrapper):
@@ -51,6 +52,54 @@ class MultivariateStatisticalSummary(JavaModelWrapper):
return self.call("min").toArray()
+class ChiSqTestResult(JavaModelWrapper):
+ """
+ :: Experimental ::
+
+ Object containing the test results for the chi-squared hypothesis test.
+ """
+ @property
+ def method(self):
+ """
+ Name of the test method
+ """
+ return self._java_model.method()
+
+ @property
+ def pValue(self):
+ """
+ The probability of obtaining a test statistic result at least as
+ extreme as the one that was actually observed, assuming that the
+ null hypothesis is true.
+ """
+ return self._java_model.pValue()
+
+ @property
+ def degreesOfFreedom(self):
+ """
+ Returns the degree(s) of freedom of the hypothesis test.
+ Return type should be Number(e.g. Int, Double) or tuples of Numbers.
+ """
+ return self._java_model.degreesOfFreedom()
+
+ @property
+ def statistic(self):
+ """
+ Test statistic.
+ """
+ return self._java_model.statistic()
+
+ @property
+ def nullHypothesis(self):
+ """
+ Null hypothesis of the test.
+ """
+ return self._java_model.nullHypothesis()
+
+ def __str__(self):
+ return self._java_model.toString()
+
+
class Statistics(object):
@staticmethod
@@ -135,6 +184,90 @@ class Statistics(object):
else:
return callMLlibFunc("corr", x.map(float), y.map(float), method)
+ @staticmethod
+ def chiSqTest(observed, expected=None):
+ """
+ :: Experimental ::
+
+ If `observed` is Vector, conduct Pearson's chi-squared goodness
+ of fit test of the observed data against the expected distribution,
+ or against the uniform distribution (by default), with each category
+ having an expected frequency of `1 / len(observed)`.
+ (Note: `observed` cannot contain negative values)
+
+ If `observed` is matrix, conduct Pearson's independence test on the
+ input contingency matrix, which cannot contain negative entries or
+ columns or rows that sum up to 0.
+
+ If `observed` is an RDD of LabeledPoint, conduct Pearson's independence
+ test for every feature against the label across the input RDD.
+ For each feature, the (feature, label) pairs are converted into a
+ contingency matrix for which the chi-squared statistic is computed.
+ All label and feature values must be categorical.
+
+ :param observed: it could be a vector containing the observed categorical
+ counts/relative frequencies, or the contingency matrix
+ (containing either counts or relative frequencies),
+ or an RDD of LabeledPoint containing the labeled dataset
+ with categorical features. Real-valued features will be
+ treated as categorical for each distinct value.
+ :param expected: Vector containing the expected categorical counts/relative
+ frequencies. `expected` is rescaled if the `expected` sum
+ differs from the `observed` sum.
+ :return: ChiSqTestResult object containing the test statistic, degrees
+ of freedom, p-value, the method used, and the null hypothesis.
+
+ >>> from pyspark.mllib.linalg import Vectors, Matrices
+ >>> observed = Vectors.dense([4, 6, 5])
+ >>> pearson = Statistics.chiSqTest(observed)
+ >>> print pearson.statistic
+ 0.4
+ >>> pearson.degreesOfFreedom
+ 2
+ >>> print round(pearson.pValue, 4)
+ 0.8187
+ >>> pearson.method
+ u'pearson'
+ >>> pearson.nullHypothesis
+ u'observed follows the same distribution as expected.'
+
+ >>> observed = Vectors.dense([21, 38, 43, 80])
+ >>> expected = Vectors.dense([3, 5, 7, 20])
+ >>> pearson = Statistics.chiSqTest(observed, expected)
+ >>> print round(pearson.pValue, 4)
+ 0.0027
+
+ >>> data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
+ >>> chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))
+ >>> print round(chi.statistic, 4)
+ 21.9958
+
+ >>> from pyspark.mllib.regression import LabeledPoint
+ >>> data = [LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
+ ... LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
+ ... LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
+ ... LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
+ ... LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
+ ... LabeledPoint(1.0, Vectors.dense([3.5, 40.0])),]
+ >>> rdd = sc.parallelize(data, 4)
+ >>> chi = Statistics.chiSqTest(rdd)
+ >>> print chi[0].statistic
+ 0.75
+ >>> print chi[1].statistic
+ 1.5
+ """
+ if isinstance(observed, RDD):
+ jmodels = callMLlibFunc("chiSqTest", observed)
+ return [ChiSqTestResult(m) for m in jmodels]
+
+ if isinstance(observed, Matrix):
+ jmodel = callMLlibFunc("chiSqTest", observed)
+ else:
+ if expected and len(expected) != len(observed):
+ raise ValueError("`expected` should have same length with `observed`")
+ jmodel = callMLlibFunc("chiSqTest", _convert_to_vector(observed), expected)
+ return ChiSqTestResult(jmodel)
+
def _test():
import doctest