diff options
author | MechCoder <manojkumarsivaraj334@gmail.com> | 2015-07-20 09:00:01 -0700 |
---|---|---|
committer | Xiangrui Meng <meng@databricks.com> | 2015-07-20 09:00:01 -0700 |
commit | d0b4e93f7e92ea59058cc457a5586a4d9a596d71 (patch) | |
tree | 9af6a3d90845d2cc36fa573125e8ed62770b7804 /python | |
parent | 3f7de7db4cf7c5e2824cb91087c5e9d4beb0f738 (diff) | |
download | spark-d0b4e93f7e92ea59058cc457a5586a4d9a596d71.tar.gz spark-d0b4e93f7e92ea59058cc457a5586a4d9a596d71.tar.bz2 spark-d0b4e93f7e92ea59058cc457a5586a4d9a596d71.zip |
[SPARK-8996] [MLLIB] [PYSPARK] Python API for Kolmogorov-Smirnov Test
Python API for the KS-test
Statistics.kolmogorovSmirnovTest(data, distName, *params)
I'm not quite sure how to support the callable function since it is not serializable.
Author: MechCoder <manojkumarsivaraj334@gmail.com>
Closes #7430 from MechCoder/spark-8996 and squashes the following commits:
2dd009d [MechCoder] minor
021d233 [MechCoder] Remove one wrapper and other minor stuff
49d07ab [MechCoder] [SPARK-8996] [MLlib] Python API for Kolmogorov-Smirnov Test
Diffstat (limited to 'python')
-rw-r--r-- | python/pyspark/mllib/stat/_statistics.py | 67 | ||||
-rw-r--r-- | python/pyspark/mllib/stat/test.py | 37 | ||||
-rw-r--r-- | python/pyspark/mllib/tests.py | 19 |
3 files changed, 110 insertions, 13 deletions
diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py index b475be4b4d..36c8f48a4a 100644 --- a/python/pyspark/mllib/stat/_statistics.py +++ b/python/pyspark/mllib/stat/_statistics.py @@ -15,11 +15,15 @@ # limitations under the License. # +import sys +if sys.version >= '3': + basestring = str + from pyspark.rdd import RDD, ignore_unicode_prefix from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper from pyspark.mllib.linalg import Matrix, _convert_to_vector from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.stat.test import ChiSqTestResult +from pyspark.mllib.stat.test import ChiSqTestResult, KolmogorovSmirnovTestResult __all__ = ['MultivariateStatisticalSummary', 'Statistics'] @@ -238,6 +242,67 @@ class Statistics(object): jmodel = callMLlibFunc("chiSqTest", _convert_to_vector(observed), expected) return ChiSqTestResult(jmodel) + @staticmethod + @ignore_unicode_prefix + def kolmogorovSmirnovTest(data, distName="norm", *params): + """ + .. note:: Experimental + + Performs the Kolmogorov-Smirnov (KS) test for data sampled from + a continuous distribution. It tests the null hypothesis that + the data is generated from a particular distribution. + + The given data is sorted and the Empirical Cumulative + Distribution Function (ECDF) is calculated + which for a given point is the number of points having a CDF + value less than it divided by the total number of points. + + Since the data is sorted, this is a step function + that rises by (1 / length of data) for every ordered point. + + The KS statistic gives us the maximum distance between the + ECDF and the CDF. Intuitively if this statistic is large, the + probability that the null hypothesis is true becomes small. + For specific details of the implementation, please have a look + at the Scala documentation. + + :param data: RDD, samples from the data + :param distName: string, currently only "norm" is supported. 
+ (Normal distribution) to calculate the + theoretical distribution of the data. + :param params: additional values which need to be provided for + a certain distribution. + If not provided, the default values are used. + :return: KolmogorovSmirnovTestResult object containing the test + statistic, degrees of freedom, p-value, + the method used, and the null hypothesis. + + >>> kstest = Statistics.kolmogorovSmirnovTest + >>> data = sc.parallelize([-1.0, 0.0, 1.0]) + >>> ksmodel = kstest(data, "norm") + >>> print(round(ksmodel.pValue, 3)) + 1.0 + >>> print(round(ksmodel.statistic, 3)) + 0.175 + >>> ksmodel.nullHypothesis + u'Sample follows theoretical distribution' + + >>> data = sc.parallelize([2.0, 3.0, 4.0]) + >>> ksmodel = kstest(data, "norm", 3.0, 1.0) + >>> print(round(ksmodel.pValue, 3)) + 1.0 + >>> print(round(ksmodel.statistic, 3)) + 0.175 + """ + if not isinstance(data, RDD): + raise TypeError("data should be an RDD, got %s." % type(data)) + if not isinstance(distName, basestring): + raise TypeError("distName should be a string, got %s." % type(distName)) + + params = [float(param) for param in params] + return KolmogorovSmirnovTestResult( + callMLlibFunc("kolmogorovSmirnovTest", data, distName, params)) + def _test(): import doctest diff --git a/python/pyspark/mllib/stat/test.py b/python/pyspark/mllib/stat/test.py index 762506e952..0abe104049 100644 --- a/python/pyspark/mllib/stat/test.py +++ b/python/pyspark/mllib/stat/test.py @@ -15,24 +15,16 @@ # limitations under the License. # -from pyspark.mllib.common import JavaModelWrapper +from pyspark.mllib.common import inherit_doc, JavaModelWrapper -__all__ = ["ChiSqTestResult"] +__all__ = ["ChiSqTestResult", "KolmogorovSmirnovTestResult"] -class ChiSqTestResult(JavaModelWrapper): +class TestResult(JavaModelWrapper): """ - .. note:: Experimental - - Object containing the test results for the chi-squared hypothesis test. + Base class for all test results. 
""" - @property - def method(self): - """ - Name of the test method - """ - return self._java_model.method() @property def pValue(self): @@ -67,3 +59,24 @@ class ChiSqTestResult(JavaModelWrapper): def __str__(self): return self._java_model.toString() + + +@inherit_doc +class ChiSqTestResult(TestResult): + """ + Contains test results for the chi-squared hypothesis test. + """ + + @property + def method(self): + """ + Name of the test method + """ + return self._java_model.method() + + +@inherit_doc +class KolmogorovSmirnovTestResult(TestResult): + """ + Contains test results for the Kolmogorov-Smirnov test. + """ diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index f2eab5b18f..3f5a02af12 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -869,6 +869,25 @@ class ChiSqTestTests(MLlibTestCase): self.assertIsNotNone(chi[1000]) +class KolmogorovSmirnovTest(MLlibTestCase): + + def test_R_implementation_equivalence(self): + data = self.sc.parallelize([ + 1.1626852897838, -0.585924465893051, 1.78546500331661, -1.33259371048501, + -0.446566766553219, 0.569606122374976, -2.88971761441412, -0.869018343326555, + -0.461702683149641, -0.555540910137444, -0.0201353678515895, -0.150382224136063, + -0.628126755843964, 1.32322085193283, -1.52135057001199, -0.437427868856691, + 0.970577579543399, 0.0282226444247749, -0.0857821886527593, 0.389214404984942 + ]) + model = Statistics.kolmogorovSmirnovTest(data, "norm") + self.assertAlmostEqual(model.statistic, 0.189, 3) + self.assertAlmostEqual(model.pValue, 0.422, 3) + + model = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1) + self.assertAlmostEqual(model.statistic, 0.189, 3) + self.assertAlmostEqual(model.pValue, 0.422, 3) + + class SerDeTest(MLlibTestCase): def test_to_java_object_rdd(self): # SPARK-6660 data = RandomRDDs.uniformRDD(self.sc, 10, 5, seed=0) |