about | summary | refs | log | tree | commit | diff
path: root/python/pyspark/statcounter.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/pyspark/statcounter.py')
-rw-r--r-- python/pyspark/statcounter.py | 25
1 files changed, 22 insertions, 3 deletions
diff --git a/python/pyspark/statcounter.py b/python/pyspark/statcounter.py
index 8e1cbd4ad9..080325061a 100644
--- a/python/pyspark/statcounter.py
+++ b/python/pyspark/statcounter.py
@@ -26,7 +26,9 @@ class StatCounter(object):
self.n = 0L # Running count of our values
self.mu = 0.0 # Running mean of our values
self.m2 = 0.0 # Running variance numerator (sum of (x - mean)^2)
-
+ self.maxValue = float("-inf")
+ self.minValue = float("inf")
+
for v in values:
self.merge(v)
@@ -36,6 +38,11 @@ class StatCounter(object):
self.n += 1
self.mu += delta / self.n
self.m2 += delta * (value - self.mu)
+ if self.maxValue < value:
+ self.maxValue = value
+ if self.minValue > value:
+ self.minValue = value
+
return self
# Merge another StatCounter into this one, adding up the internal statistics.
@@ -49,7 +56,10 @@ class StatCounter(object):
if self.n == 0:
self.mu = other.mu
self.m2 = other.m2
- self.n = other.n
+ self.n = other.n
+ self.maxValue = other.maxValue
+ self.minValue = other.minValue
+
elif other.n != 0:
delta = other.mu - self.mu
if other.n * 10 < self.n:
@@ -58,6 +68,9 @@ class StatCounter(object):
self.mu = other.mu - (delta * self.n) / (self.n + other.n)
else:
self.mu = (self.mu * self.n + other.mu * other.n) / (self.n + other.n)
+
+ self.maxValue = max(self.maxValue, other.maxValue)
+ self.minValue = min(self.minValue, other.minValue)
self.m2 += other.m2 + (delta * delta * self.n * other.n) / (self.n + other.n)
self.n += other.n
@@ -76,6 +89,12 @@ class StatCounter(object):
def sum(self):
return self.n * self.mu
+ def min(self):
+ return self.minValue
+
+ def max(self):
+ return self.maxValue
+
# Return the variance of the values.
def variance(self):
if self.n == 0:
@@ -105,5 +124,5 @@ class StatCounter(object):
return math.sqrt(self.sampleVariance())
def __repr__(self):
- return "(count: %s, mean: %s, stdev: %s)" % (self.count(), self.mean(), self.stdev())
+ return "(count: %s, mean: %s, stdev: %s, max: %s, min: %s)" % (self.count(), self.mean(), self.stdev(), self.max(), self.min())