about summary refs log tree commit diff
path: root/python/pyspark/mllib/stat/_statistics.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/pyspark/mllib/stat/_statistics.py')
-rw-r--r-- python/pyspark/mllib/stat/_statistics.py 25
1 files changed, 13 insertions, 12 deletions
diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py
index 1d83e9d483..b475be4b4d 100644
--- a/python/pyspark/mllib/stat/_statistics.py
+++ b/python/pyspark/mllib/stat/_statistics.py
@@ -15,7 +15,7 @@
# limitations under the License.
#
-from pyspark import RDD
+from pyspark.rdd import RDD, ignore_unicode_prefix
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
from pyspark.mllib.linalg import Matrix, _convert_to_vector
from pyspark.mllib.regression import LabeledPoint
@@ -38,7 +38,7 @@ class MultivariateStatisticalSummary(JavaModelWrapper):
return self.call("variance").toArray()
def count(self):
- return self.call("count")
+ return int(self.call("count"))
def numNonzeros(self):
return self.call("numNonzeros").toArray()
@@ -78,7 +78,7 @@ class Statistics(object):
>>> cStats.variance()
array([ 4., 13., 0., 25.])
>>> cStats.count()
- 3L
+ 3
>>> cStats.numNonzeros()
array([ 3., 2., 0., 3.])
>>> cStats.max()
@@ -124,20 +124,20 @@ class Statistics(object):
>>> rdd = sc.parallelize([Vectors.dense([1, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]),
... Vectors.dense([6, 7, 0, 8]), Vectors.dense([9, 0, 0, 1])])
>>> pearsonCorr = Statistics.corr(rdd)
- >>> print str(pearsonCorr).replace('nan', 'NaN')
+ >>> print(str(pearsonCorr).replace('nan', 'NaN'))
[[ 1. 0.05564149 NaN 0.40047142]
[ 0.05564149 1. NaN 0.91359586]
[ NaN NaN 1. NaN]
[ 0.40047142 0.91359586 NaN 1. ]]
>>> spearmanCorr = Statistics.corr(rdd, method="spearman")
- >>> print str(spearmanCorr).replace('nan', 'NaN')
+ >>> print(str(spearmanCorr).replace('nan', 'NaN'))
[[ 1. 0.10540926 NaN 0.4 ]
[ 0.10540926 1. NaN 0.9486833 ]
[ NaN NaN 1. NaN]
[ 0.4 0.9486833 NaN 1. ]]
>>> try:
... Statistics.corr(rdd, "spearman")
- ... print "Method name as second argument without 'method=' shouldn't be allowed."
+ ... print("Method name as second argument without 'method=' shouldn't be allowed.")
... except TypeError:
... pass
"""
@@ -153,6 +153,7 @@ class Statistics(object):
return callMLlibFunc("corr", x.map(float), y.map(float), method)
@staticmethod
+ @ignore_unicode_prefix
def chiSqTest(observed, expected=None):
"""
.. note:: Experimental
@@ -188,11 +189,11 @@ class Statistics(object):
>>> from pyspark.mllib.linalg import Vectors, Matrices
>>> observed = Vectors.dense([4, 6, 5])
>>> pearson = Statistics.chiSqTest(observed)
- >>> print pearson.statistic
+ >>> print(pearson.statistic)
0.4
>>> pearson.degreesOfFreedom
2
- >>> print round(pearson.pValue, 4)
+ >>> print(round(pearson.pValue, 4))
0.8187
>>> pearson.method
u'pearson'
@@ -202,12 +203,12 @@ class Statistics(object):
>>> observed = Vectors.dense([21, 38, 43, 80])
>>> expected = Vectors.dense([3, 5, 7, 20])
>>> pearson = Statistics.chiSqTest(observed, expected)
- >>> print round(pearson.pValue, 4)
+ >>> print(round(pearson.pValue, 4))
0.0027
>>> data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
>>> chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))
- >>> print round(chi.statistic, 4)
+ >>> print(round(chi.statistic, 4))
21.9958
>>> data = [LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
@@ -218,9 +219,9 @@ class Statistics(object):
... LabeledPoint(1.0, Vectors.dense([3.5, 40.0])),]
>>> rdd = sc.parallelize(data, 4)
>>> chi = Statistics.chiSqTest(rdd)
- >>> print chi[0].statistic
+ >>> print(chi[0].statistic)
0.75
- >>> print chi[1].statistic
+ >>> print(chi[1].statistic)
1.5
"""
if isinstance(observed, RDD):