author      Dan McClary <dan.mcclary@gmail.com>    2014-03-18 00:45:47 -0700
committer   Matei Zaharia <matei@databricks.com>   2014-03-18 00:45:47 -0700
commit      e3681f26fae7e87321ac991f5a0fb7517415803a (patch)
tree        ecb920edc1484d1a3d7bda193546f1c71984e0a4 /python
parent      087eedca32fd87bfe1629588091bd307d45e4a7c (diff)
SPARK-1246: add min and max to StatCounter
Here's the addition of min and max to statscounter.py and min and max methods to rdd.py.

Author: Dan McClary <dan.mcclary@gmail.com>

Closes #144 from dwmclary/SPARK-1246-add-min-max-to-stat-counter and squashes the following commits:

fd3fd4b [Dan McClary] fixed error, updated test
82cde0e [Dan McClary] flipped incorrectly assigned inf values in StatCounter
5d96799 [Dan McClary] added max and min to StatCounter repr for pyspark
21dd366 [Dan McClary] added max and min to StatCounter output, updated doc
1a97558 [Dan McClary] added max and min to StatCounter output, updated doc
a5c13b0 [Dan McClary] Added min and max to Scala and Java RDD, added min and max to StatCounter
ed67136 [Dan McClary] broke min/max out into separate transaction, added to rdd.py
1e7056d [Dan McClary] added underscore to getBucket
37a7dea [Dan McClary] cleaned up boundaries for histogram -- uses real min/max when buckets are derived
29981f2 [Dan McClary] fixed indentation on doctest comment
eaf89d9 [Dan McClary] added correct doctest for histogram
4916016 [Dan McClary] added histogram method, added max and min to statscounter
Diffstat (limited to 'python')
-rw-r--r--  python/pyspark/rdd.py          19
-rw-r--r--  python/pyspark/statcounter.py  25
2 files changed, 41 insertions(+), 3 deletions(-)
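
For reference, a minimal usage sketch of the new methods follows. It assumes a pyspark shell or script with a SparkContext already bound to sc (as in the doctests below) and uses the pre-existing RDD.stats() helper, which returns a StatCounter:

    # Minimal usage sketch; assumes a SparkContext already bound to sc.
    rdd = sc.parallelize([1.0, 5.0, 43.0, 10.0])

    rdd.min()    # 1.0  (implemented as rdd.reduce(min))
    rdd.max()    # 43.0 (implemented as rdd.reduce(max))

    # The existing stats() helper returns a StatCounter, whose repr now
    # reports max and min alongside count, mean and stdev.
    rdd.stats()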
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index f3b432ff24..ae09dbff02 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -571,7 +571,26 @@ class RDD(object):
         return reduce(op, vals, zeroValue)

     # TODO: aggregate
+
+
+    def max(self):
+        """
+        Find the maximum item in this RDD.
+
+        >>> sc.parallelize([1.0, 5.0, 43.0, 10.0]).max()
+        43.0
+        """
+        return self.reduce(max)
+
+    def min(self):
+        """
+        Find the minimum item in this RDD.
+
+        >>> sc.parallelize([1.0, 5.0, 43.0, 10.0]).min()
+        1.0
+        """
+        return self.reduce(min)

     def sum(self):
         """
         Add up the elements in this RDD.
diff --git a/python/pyspark/statcounter.py b/python/pyspark/statcounter.py
index 8e1cbd4ad9..080325061a 100644
--- a/python/pyspark/statcounter.py
+++ b/python/pyspark/statcounter.py
@@ -26,7 +26,9 @@ class StatCounter(object):
         self.n = 0L    # Running count of our values
         self.mu = 0.0  # Running mean of our values
         self.m2 = 0.0  # Running variance numerator (sum of (x - mean)^2)
-
+        self.maxValue = float("-inf")
+        self.minValue = float("inf")
+
         for v in values:
             self.merge(v)
@@ -36,6 +38,11 @@ class StatCounter(object):
         self.n += 1
         self.mu += delta / self.n
         self.m2 += delta * (value - self.mu)
+        if self.maxValue < value:
+            self.maxValue = value
+        if self.minValue > value:
+            self.minValue = value
+
         return self

     # Merge another StatCounter into this one, adding up the internal statistics.
@@ -49,7 +56,10 @@ class StatCounter(object):
             if self.n == 0:
                 self.mu = other.mu
                 self.m2 = other.m2
-                self.n = other.n
+                self.n = other.n
+                self.maxValue = other.maxValue
+                self.minValue = other.minValue
+
             elif other.n != 0:
                 delta = other.mu - self.mu
                 if other.n * 10 < self.n:
@@ -58,6 +68,9 @@ class StatCounter(object):
                     self.mu = other.mu - (delta * self.n) / (self.n + other.n)
                 else:
                     self.mu = (self.mu * self.n + other.mu * other.n) / (self.n + other.n)
+
+                self.maxValue = max(self.maxValue, other.maxValue)
+                self.minValue = min(self.minValue, other.minValue)

                 self.m2 += other.m2 + (delta * delta * self.n * other.n) / (self.n + other.n)
                 self.n += other.n
@@ -76,6 +89,12 @@ class StatCounter(object):
     def sum(self):
         return self.n * self.mu

+    def min(self):
+        return self.minValue
+
+    def max(self):
+        return self.maxValue
+
     # Return the variance of the values.
     def variance(self):
         if self.n == 0:
@@ -105,5 +124,5 @@ class StatCounter(object):
         return math.sqrt(self.sampleVariance())

     def __repr__(self):
-        return "(count: %s, mean: %s, stdev: %s)" % (self.count(), self.mean(), self.stdev())
+        return "(count: %s, mean: %s, stdev: %s, max: %s, min: %s)" % (self.count(), self.mean(), self.stdev(), self.max(), self.min())