aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
authorMechCoder <manojkumarsivaraj334@gmail.com>2015-07-20 09:00:01 -0700
committerXiangrui Meng <meng@databricks.com>2015-07-20 09:00:01 -0700
commitd0b4e93f7e92ea59058cc457a5586a4d9a596d71 (patch)
tree9af6a3d90845d2cc36fa573125e8ed62770b7804 /python
parent3f7de7db4cf7c5e2824cb91087c5e9d4beb0f738 (diff)
downloadspark-d0b4e93f7e92ea59058cc457a5586a4d9a596d71.tar.gz
spark-d0b4e93f7e92ea59058cc457a5586a4d9a596d71.tar.bz2
spark-d0b4e93f7e92ea59058cc457a5586a4d9a596d71.zip
[SPARK-8996] [MLLIB] [PYSPARK] Python API for Kolmogorov-Smirnov Test
Python API for the KS-test Statistics.kolmogorovSmirnovTest(data, distName, *params) I'm not quite sure how to support the callable function since it is not serializable. Author: MechCoder <manojkumarsivaraj334@gmail.com> Closes #7430 from MechCoder/spark-8996 and squashes the following commits: 2dd009d [MechCoder] minor 021d233 [MechCoder] Remove one wrapper and other minor stuff 49d07ab [MechCoder] [SPARK-8996] [MLlib] Python API for Kolmogorov-Smirnov Test
Diffstat (limited to 'python')
-rw-r--r--python/pyspark/mllib/stat/_statistics.py67
-rw-r--r--python/pyspark/mllib/stat/test.py37
-rw-r--r--python/pyspark/mllib/tests.py19
3 files changed, 110 insertions, 13 deletions
diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py
index b475be4b4d..36c8f48a4a 100644
--- a/python/pyspark/mllib/stat/_statistics.py
+++ b/python/pyspark/mllib/stat/_statistics.py
@@ -15,11 +15,15 @@
# limitations under the License.
#
+import sys
+if sys.version >= '3':
+ basestring = str
+
from pyspark.rdd import RDD, ignore_unicode_prefix
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
from pyspark.mllib.linalg import Matrix, _convert_to_vector
from pyspark.mllib.regression import LabeledPoint
-from pyspark.mllib.stat.test import ChiSqTestResult
+from pyspark.mllib.stat.test import ChiSqTestResult, KolmogorovSmirnovTestResult
__all__ = ['MultivariateStatisticalSummary', 'Statistics']
@@ -238,6 +242,67 @@ class Statistics(object):
jmodel = callMLlibFunc("chiSqTest", _convert_to_vector(observed), expected)
return ChiSqTestResult(jmodel)
+ @staticmethod
+ @ignore_unicode_prefix
+ def kolmogorovSmirnovTest(data, distName="norm", *params):
+ """
+ .. note:: Experimental
+
+ Performs the Kolmogorov-Smirnov (KS) test for data sampled from
+ a continuous distribution. It tests the null hypothesis that
+ the data is generated from a particular distribution.
+
+ The given data is sorted and the Empirical Cumulative
+ Distribution Function (ECDF) is calculated,
+ which for a given point is the number of points having a
+ value less than it, divided by the total number of points.
+
+ Since the data is sorted, this is a step function
+ that rises by (1 / length of data) for every ordered point.
+
+ The KS statistic gives us the maximum distance between the
+ ECDF and the CDF. Intuitively if this statistic is large, the
+ probability that the null hypothesis is true becomes small.
+ For specific details of the implementation, please have a look
+ at the Scala documentation.
+
+ :param data: RDD, samples from the data
+ :param distName: string, name of the theoretical distribution
+ to test the data against. Currently only
+ "norm" (Normal distribution) is supported.
+ :param params: additional values which need to be provided for
+ a certain distribution.
+ If not provided, the default values are used.
+ :return: KolmogorovSmirnovTestResult object containing the test
+ statistic, degrees of freedom, p-value,
+ and the null hypothesis.
+
+ >>> kstest = Statistics.kolmogorovSmirnovTest
+ >>> data = sc.parallelize([-1.0, 0.0, 1.0])
+ >>> ksmodel = kstest(data, "norm")
+ >>> print(round(ksmodel.pValue, 3))
+ 1.0
+ >>> print(round(ksmodel.statistic, 3))
+ 0.175
+ >>> ksmodel.nullHypothesis
+ u'Sample follows theoretical distribution'
+
+ >>> data = sc.parallelize([2.0, 3.0, 4.0])
+ >>> ksmodel = kstest(data, "norm", 3.0, 1.0)
+ >>> print(round(ksmodel.pValue, 3))
+ 1.0
+ >>> print(round(ksmodel.statistic, 3))
+ 0.175
+ """
+ if not isinstance(data, RDD):
+ raise TypeError("data should be an RDD, got %s." % type(data))
+ if not isinstance(distName, basestring):
+ raise TypeError("distName should be a string, got %s." % type(distName))
+
+ params = [float(param) for param in params]
+ return KolmogorovSmirnovTestResult(
+ callMLlibFunc("kolmogorovSmirnovTest", data, distName, params))
+
def _test():
import doctest
diff --git a/python/pyspark/mllib/stat/test.py b/python/pyspark/mllib/stat/test.py
index 762506e952..0abe104049 100644
--- a/python/pyspark/mllib/stat/test.py
+++ b/python/pyspark/mllib/stat/test.py
@@ -15,24 +15,16 @@
# limitations under the License.
#
-from pyspark.mllib.common import JavaModelWrapper
+from pyspark.mllib.common import inherit_doc, JavaModelWrapper
-__all__ = ["ChiSqTestResult"]
+__all__ = ["ChiSqTestResult", "KolmogorovSmirnovTestResult"]
-class ChiSqTestResult(JavaModelWrapper):
+class TestResult(JavaModelWrapper):
"""
- .. note:: Experimental
-
- Object containing the test results for the chi-squared hypothesis test.
+ Base class for all test results.
"""
- @property
- def method(self):
- """
- Name of the test method
- """
- return self._java_model.method()
@property
def pValue(self):
@@ -67,3 +59,24 @@ class ChiSqTestResult(JavaModelWrapper):
def __str__(self):
return self._java_model.toString()
+
+
+@inherit_doc
+class ChiSqTestResult(TestResult):
+ """
+ Contains test results for the chi-squared hypothesis test.
+ """
+
+ @property
+ def method(self):
+ """
+ Name of the test method
+ """
+ return self._java_model.method()
+
+
+@inherit_doc
+class KolmogorovSmirnovTestResult(TestResult):
+ """
+ Contains test results for the Kolmogorov-Smirnov test.
+ """
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index f2eab5b18f..3f5a02af12 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -869,6 +869,25 @@ class ChiSqTestTests(MLlibTestCase):
self.assertIsNotNone(chi[1000])
+class KolmogorovSmirnovTest(MLlibTestCase):
+
+ def test_R_implementation_equivalence(self):
+ data = self.sc.parallelize([
+ 1.1626852897838, -0.585924465893051, 1.78546500331661, -1.33259371048501,
+ -0.446566766553219, 0.569606122374976, -2.88971761441412, -0.869018343326555,
+ -0.461702683149641, -0.555540910137444, -0.0201353678515895, -0.150382224136063,
+ -0.628126755843964, 1.32322085193283, -1.52135057001199, -0.437427868856691,
+ 0.970577579543399, 0.0282226444247749, -0.0857821886527593, 0.389214404984942
+ ])
+ model = Statistics.kolmogorovSmirnovTest(data, "norm")
+ self.assertAlmostEqual(model.statistic, 0.189, 3)
+ self.assertAlmostEqual(model.pValue, 0.422, 3)
+
+ model = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
+ self.assertAlmostEqual(model.statistic, 0.189, 3)
+ self.assertAlmostEqual(model.pValue, 0.422, 3)
+
+
class SerDeTest(MLlibTestCase):
def test_to_java_object_rdd(self): # SPARK-6660
data = RandomRDDs.uniformRDD(self.sc, 10, 5, seed=0)