[SPARK-8996] [MLLIB] [PYSPARK] Python API for Kolmogorov-Smirnov Test

Python API for the KS test: Statistics.kolmogorovSmirnovTest(data, distName, *params). I'm not quite sure how to support a callable CDF function, since it is not serializable. Author: MechCoder <manojkumarsivaraj334@gmail.com> Closes #7430 from MechCoder/spark-8996 and squashes the following commits: 2dd009d [MechCoder] minor 021d233 [MechCoder] Remove one wrapper and other minor stuff 49d07ab [MechCoder] [SPARK-8996] [MLlib] Python API for Kolmogorov-Smirnov Test
author: MechCoder <manojkumarsivaraj334@gmail.com> 2015-07-20 09:00:01 -0700
committer: Xiangrui Meng <meng@databricks.com> 2015-07-20 09:00:01 -0700
commit: d0b4e93f7e92ea59058cc457a5586a4d9a596d71 (patch)
tree: 9af6a3d90845d2cc36fa573125e8ed62770b7804 /python
parent: 3f7de7db4cf7c5e2824cb91087c5e9d4beb0f738 (diff)
download: spark-d0b4e93f7e92ea59058cc457a5586a4d9a596d71.tar.gz
spark-d0b4e93f7e92ea59058cc457a5586a4d9a596d71.tar.bz2
spark-d0b4e93f7e92ea59058cc457a5586a4d9a596d71.zip
3 files changed, 110 insertions, 13 deletions
diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py
index b475be4b4d..36c8f48a4a 100644
--- a/python/pyspark/mllib/stat/_statistics.py
+++ b/python/pyspark/mllib/stat/_statistics.py
@@ -15,11 +15,15 @@
 # limitations under the License.
 #
 
+import sys
+if sys.version >= '3':
+    basestring = str
+
 from pyspark.rdd import RDD, ignore_unicode_prefix
 from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
 from pyspark.mllib.linalg import Matrix, _convert_to_vector
 from pyspark.mllib.regression import LabeledPoint
-from pyspark.mllib.stat.test import ChiSqTestResult
+from pyspark.mllib.stat.test import ChiSqTestResult, KolmogorovSmirnovTestResult
 
 
 __all__ = ['MultivariateStatisticalSummary', 'Statistics']
@@ -238,6 +242,67 @@ class Statistics(object):
             jmodel = callMLlibFunc("chiSqTest", _convert_to_vector(observed), expected)
         return ChiSqTestResult(jmodel)
 
+    @staticmethod
+    @ignore_unicode_prefix
+    def kolmogorovSmirnovTest(data, distName="norm", *params):
+        """
+        .. note:: Experimental
+
+        Performs the Kolmogorov-Smirnov (KS) test for data sampled from
+        a continuous distribution. It tests the null hypothesis that
+        the data is generated from a particular distribution.
+
+        The given data is sorted and the Empirical Cumulative
+        Distribution Function (ECDF) is calculated
+        which for a given point is the number of points having a CDF
+        value less than it, divided by the total number of points.
+
+        Since the data is sorted, this is a step function
+        that rises by (1 / length of data) for every ordered point.
+
+        The KS statistic gives us the maximum distance between the
+        ECDF and the CDF. Intuitively if this statistic is large, the
+        probability that the null hypothesis is true becomes small.
+        For specific details of the implementation, please have a look
+        at the Scala documentation.
+
+        :param data: RDD, samples from the data
+        :param distName: string, name of the theoretical distribution
+                         to test the data against. Currently only
+                         "norm" (Normal distribution) is supported.
+        :param params: additional values which need to be provided for
+                       a certain distribution.
+                       If not provided, the default values are used.
+        :return: KolmogorovSmirnovTestResult object containing the test
+                 statistic, degrees of freedom, p-value,
+                 and the null hypothesis.
+
+        >>> kstest = Statistics.kolmogorovSmirnovTest
+        >>> data = sc.parallelize([-1.0, 0.0, 1.0])
+        >>> ksmodel = kstest(data, "norm")
+        >>> print(round(ksmodel.pValue, 3))
+        1.0
+        >>> print(round(ksmodel.statistic, 3))
+        0.175
+        >>> ksmodel.nullHypothesis
+        u'Sample follows theoretical distribution'
+
+        >>> data = sc.parallelize([2.0, 3.0, 4.0])
+        >>> ksmodel = kstest(data, "norm", 3.0, 1.0)
+        >>> print(round(ksmodel.pValue, 3))
+        1.0
+        >>> print(round(ksmodel.statistic, 3))
+        0.175
+        """
+        if not isinstance(data, RDD):
+            raise TypeError("data should be an RDD, got %s." % type(data))
+        if not isinstance(distName, basestring):
+            raise TypeError("distName should be a string, got %s." % type(distName))
+
+        params = [float(param) for param in params]
+        return KolmogorovSmirnovTestResult(
+            callMLlibFunc("kolmogorovSmirnovTest", data, distName, params))
+
 
 def _test():
     import doctest
diff --git a/python/pyspark/mllib/stat/test.py b/python/pyspark/mllib/stat/test.py
index 762506e952..0abe104049 100644
--- a/python/pyspark/mllib/stat/test.py
+++ b/python/pyspark/mllib/stat/test.py
@@ -15,24 +15,16 @@
 # limitations under the License.
 #
 
-from pyspark.mllib.common import JavaModelWrapper
+from pyspark.mllib.common import inherit_doc, JavaModelWrapper
 
 
-__all__ = ["ChiSqTestResult"]
+__all__ = ["ChiSqTestResult", "KolmogorovSmirnovTestResult"]
 
 
-class ChiSqTestResult(JavaModelWrapper):
+class TestResult(JavaModelWrapper):
     """
-    .. note:: Experimental
-
-    Object containing the test results for the chi-squared hypothesis test.
+    Base class for all test results.
     """
-    @property
-    def method(self):
-        """
-        Name of the test method
-        """
-        return self._java_model.method()
 
     @property
     def pValue(self):
@@ -67,3 +59,24 @@ class ChiSqTestResult(JavaModelWrapper):
 
     def __str__(self):
         return self._java_model.toString()
+
+
+@inherit_doc
+class ChiSqTestResult(TestResult):
+    """
+    Contains test results for the chi-squared hypothesis test.
+    """
+
+    @property
+    def method(self):
+        """
+        Name of the test method
+        """
+        return self._java_model.method()
+
+
+@inherit_doc
+class KolmogorovSmirnovTestResult(TestResult):
+    """
+    Contains test results for the Kolmogorov-Smirnov test.
+    """
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index f2eab5b18f..3f5a02af12 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -869,6 +869,25 @@ class ChiSqTestTests(MLlibTestCase):
         self.assertIsNotNone(chi[1000])
 
 
+class KolmogorovSmirnovTest(MLlibTestCase):
+
+    def test_R_implementation_equivalence(self):
+        data = self.sc.parallelize([
+            1.1626852897838, -0.585924465893051, 1.78546500331661, -1.33259371048501,
+            -0.446566766553219, 0.569606122374976, -2.88971761441412, -0.869018343326555,
+            -0.461702683149641, -0.555540910137444, -0.0201353678515895, -0.150382224136063,
+            -0.628126755843964, 1.32322085193283, -1.52135057001199, -0.437427868856691,
+            0.970577579543399, 0.0282226444247749, -0.0857821886527593, 0.389214404984942
+        ])
+        model = Statistics.kolmogorovSmirnovTest(data, "norm")
+        self.assertAlmostEqual(model.statistic, 0.189, 3)
+        self.assertAlmostEqual(model.pValue, 0.422, 3)
+
+        model = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
+        self.assertAlmostEqual(model.statistic, 0.189, 3)
+        self.assertAlmostEqual(model.pValue, 0.422, 3)
+
+
 class SerDeTest(MLlibTestCase):
     def test_to_java_object_rdd(self):  # SPARK-6660
         data = RandomRDDs.uniformRDD(self.sc, 10, 5, seed=0)
author	MechCoder <manojkumarsivaraj334@gmail.com>	2015-07-20 09:00:01 -0700
committer	Xiangrui Meng <meng@databricks.com>	2015-07-20 09:00:01 -0700
commit	d0b4e93f7e92ea59058cc457a5586a4d9a596d71 (patch)
tree	9af6a3d90845d2cc36fa573125e8ed62770b7804 /python
parent	3f7de7db4cf7c5e2824cb91087c5e9d4beb0f738 (diff)
download	spark-d0b4e93f7e92ea59058cc457a5586a4d9a596d71.tar.gz spark-d0b4e93f7e92ea59058cc457a5586a4d9a596d71.tar.bz2 spark-d0b4e93f7e92ea59058cc457a5586a4d9a596d71.zip