aboutsummaryrefslogtreecommitdiff
path: root/python
diff options
context:
space:
mode:
author: Xiangrui Meng <meng@databricks.com> 2015-01-29 10:11:44 -0800
committer: Xiangrui Meng <meng@databricks.com> 2015-01-29 10:11:44 -0800
commit: a3dc6184862345c459d1fba475b1c9210038a913 (patch)
tree: 706a2dcab1d4c0decf5fe6269fcabf56e8d17b2f /python
parent: 5ad78f62056f2560cd371ee964111a646806d0ff (diff)
downloadspark-a3dc6184862345c459d1fba475b1c9210038a913.tar.gz
spark-a3dc6184862345c459d1fba475b1c9210038a913.tar.bz2
spark-a3dc6184862345c459d1fba475b1c9210038a913.zip
[SPARK-5477] refactor stat.py
There is only a single `stat.py` file for the `mllib.stat` package. We recently added `MultivariateGaussian` under `mllib.stat.distribution` in Scala/Java. It would be nice to refactor `stat.py` and make it easy to expand.

Note that `ChiSqTestResult` is moved from `mllib.stat` to `mllib.stat.test`. The latter (`mllib.stat.test`) is the location used in Scala/Java. `ChiSqTestResult` is only used in the return value of `Statistics.chiSqTest`, so this should be an okay change.

davies

Author: Xiangrui Meng <meng@databricks.com>

Closes #4266 from mengxr/py-stat-refactor and squashes the following commits:

1a5e1db [Xiangrui Meng] refactor stat.py
Diffstat (limited to 'python')
-rw-r--r-- python/pyspark/mllib/stat/__init__.py | 24
-rw-r--r-- python/pyspark/mllib/stat/_statistics.py (renamed from python/pyspark/mllib/stat.py) | 55
-rw-r--r-- python/pyspark/mllib/stat/test.py | 69
-rwxr-xr-x python/run-tests | 2
4 files changed, 96 insertions, 54 deletions
diff --git a/python/pyspark/mllib/stat/__init__.py b/python/pyspark/mllib/stat/__init__.py
new file mode 100644
index 0000000000..799d260c09
--- /dev/null
+++ b/python/pyspark/mllib/stat/__init__.py
@@ -0,0 +1,24 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Python package for statistical functions in MLlib.
+"""
+
+from pyspark.mllib.stat._statistics import *
+
+__all__ = ["Statistics", "MultivariateStatisticalSummary"]
diff --git a/python/pyspark/mllib/stat.py b/python/pyspark/mllib/stat/_statistics.py
index c8af777a8b..218ac148ca 100644
--- a/python/pyspark/mllib/stat.py
+++ b/python/pyspark/mllib/stat/_statistics.py
@@ -15,17 +15,14 @@
# limitations under the License.
#
-"""
-Python package for statistical functions in MLlib.
-"""
-
from pyspark import RDD
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
from pyspark.mllib.linalg import Matrix, _convert_to_vector
from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.stat.test import ChiSqTestResult
-__all__ = ['MultivariateStatisticalSummary', 'ChiSqTestResult', 'Statistics']
+__all__ = ['MultivariateStatisticalSummary', 'Statistics']
class MultivariateStatisticalSummary(JavaModelWrapper):
@@ -53,54 +50,6 @@ class MultivariateStatisticalSummary(JavaModelWrapper):
return self.call("min").toArray()
-class ChiSqTestResult(JavaModelWrapper):
- """
- .. note:: Experimental
-
- Object containing the test results for the chi-squared hypothesis test.
- """
- @property
- def method(self):
- """
- Name of the test method
- """
- return self._java_model.method()
-
- @property
- def pValue(self):
- """
- The probability of obtaining a test statistic result at least as
- extreme as the one that was actually observed, assuming that the
- null hypothesis is true.
- """
- return self._java_model.pValue()
-
- @property
- def degreesOfFreedom(self):
- """
- Returns the degree(s) of freedom of the hypothesis test.
- Return type should be Number(e.g. Int, Double) or tuples of Numbers.
- """
- return self._java_model.degreesOfFreedom()
-
- @property
- def statistic(self):
- """
- Test statistic.
- """
- return self._java_model.statistic()
-
- @property
- def nullHypothesis(self):
- """
- Null hypothesis of the test.
- """
- return self._java_model.nullHypothesis()
-
- def __str__(self):
- return self._java_model.toString()
-
-
class Statistics(object):
@staticmethod
diff --git a/python/pyspark/mllib/stat/test.py b/python/pyspark/mllib/stat/test.py
new file mode 100644
index 0000000000..762506e952
--- /dev/null
+++ b/python/pyspark/mllib/stat/test.py
@@ -0,0 +1,69 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from pyspark.mllib.common import JavaModelWrapper
+
+
+__all__ = ["ChiSqTestResult"]
+
+
+class ChiSqTestResult(JavaModelWrapper):
+ """
+ .. note:: Experimental
+
+ Object containing the test results for the chi-squared hypothesis test.
+ """
+ @property
+ def method(self):
+ """
+ Name of the test method
+ """
+ return self._java_model.method()
+
+ @property
+ def pValue(self):
+ """
+ The probability of obtaining a test statistic result at least as
+ extreme as the one that was actually observed, assuming that the
+ null hypothesis is true.
+ """
+ return self._java_model.pValue()
+
+ @property
+ def degreesOfFreedom(self):
+ """
+ Returns the degree(s) of freedom of the hypothesis test.
+ Return type should be Number (e.g. Int, Double) or tuples of Numbers.
+ """
+ return self._java_model.degreesOfFreedom()
+
+ @property
+ def statistic(self):
+ """
+ Test statistic.
+ """
+ return self._java_model.statistic()
+
+ @property
+ def nullHypothesis(self):
+ """
+ Null hypothesis of the test.
+ """
+ return self._java_model.nullHypothesis()
+
+ def __str__(self):
+ return self._java_model.toString()
diff --git a/python/run-tests b/python/run-tests
index 84cb89b1a9..e91f1a875d 100755
--- a/python/run-tests
+++ b/python/run-tests
@@ -76,7 +76,7 @@ function run_mllib_tests() {
run_test "pyspark/mllib/rand.py"
run_test "pyspark/mllib/recommendation.py"
run_test "pyspark/mllib/regression.py"
- run_test "pyspark/mllib/stat.py"
+ run_test "pyspark/mllib/stat/_statistics.py"
run_test "pyspark/mllib/tree.py"
run_test "pyspark/mllib/util.py"
run_test "pyspark/mllib/tests.py"