aboutsummaryrefslogtreecommitdiff
path: root/python/pyspark/statcounter.py
diff options
context:
space:
mode:
author: Nicholas Chammas <nicholas.chammas@gmail.com> 2014-07-21 22:30:53 -0700
committer: Reynold Xin <rxin@apache.org> 2014-07-21 22:30:53 -0700
commit: 5d16d5bbfd242c16ee0d6952c48dcd90651f8ae2 (patch)
tree: 61352aa954fb1fb2001586c8795a959421eb3c6f /python/pyspark/statcounter.py
parent: c3462c65684885299cf037d56c88bd53c08c6348 (diff)
download: spark-5d16d5bbfd242c16ee0d6952c48dcd90651f8ae2.tar.gz
spark-5d16d5bbfd242c16ee0d6952c48dcd90651f8ae2.tar.bz2
spark-5d16d5bbfd242c16ee0d6952c48dcd90651f8ae2.zip
[SPARK-2470] PEP8 fixes to PySpark
This pull request aims to resolve all outstanding PEP8 violations in PySpark. Author: Nicholas Chammas <nicholas.chammas@gmail.com> Author: nchammas <nicholas.chammas@gmail.com> Closes #1505 from nchammas/master and squashes the following commits: 98171af [Nicholas Chammas] [SPARK-2470] revert PEP 8 fixes to cloudpickle cba7768 [Nicholas Chammas] [SPARK-2470] wrap expression list in parentheses e178dbe [Nicholas Chammas] [SPARK-2470] style - change position of line break 9127d2b [Nicholas Chammas] [SPARK-2470] wrap expression lists in parentheses 22132a4 [Nicholas Chammas] [SPARK-2470] wrap conditionals in parentheses 24639bc [Nicholas Chammas] [SPARK-2470] fix whitespace for doctest 7d557b7 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to tests.py 8f8e4c0 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to storagelevel.py b3b96cf [Nicholas Chammas] [SPARK-2470] PEP8 fixes to statcounter.py d644477 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to worker.py aa3a7b6 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to sql.py 1916859 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to shell.py 95d1d95 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to serializers.py a0fec2e [Nicholas Chammas] [SPARK-2470] PEP8 fixes to mllib c85e1e5 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to join.py d14f2f1 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to __init__.py 81fcb20 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to resultiterable.py 1bde265 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to java_gateway.py 7fc849c [Nicholas Chammas] [SPARK-2470] PEP8 fixes to daemon.py ca2d28b [Nicholas Chammas] [SPARK-2470] PEP8 fixes to context.py f4e0039 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to conf.py a6d5e4b [Nicholas Chammas] [SPARK-2470] PEP8 fixes to cloudpickle.py f0a7ebf [Nicholas Chammas] [SPARK-2470] PEP8 fixes to rddsampler.py 4dd148f [nchammas] Merge pull request #5 from apache/master f7e4581 [Nicholas Chammas] unrelated pep8 fix a36eed0 [Nicholas Chammas] name ec2 instances and security groups consistently 
de7292a [nchammas] Merge pull request #4 from apache/master 2e4fe00 [nchammas] Merge pull request #3 from apache/master 89fde08 [nchammas] Merge pull request #2 from apache/master 69f6e22 [Nicholas Chammas] PEP8 fixes 2627247 [Nicholas Chammas] broke up lines before they hit 100 chars 6544b7e [Nicholas Chammas] [SPARK-2065] give launched instances names 69da6cf [nchammas] Merge pull request #1 from apache/master
Diffstat (limited to 'python/pyspark/statcounter.py')
-rw-r--r-- python/pyspark/statcounter.py 25
1 files changed, 13 insertions, 12 deletions
diff --git a/python/pyspark/statcounter.py b/python/pyspark/statcounter.py
index 080325061a..e287bd3da1 100644
--- a/python/pyspark/statcounter.py
+++ b/python/pyspark/statcounter.py
@@ -20,18 +20,19 @@
import copy
import math
+
class StatCounter(object):
-
+
def __init__(self, values=[]):
self.n = 0L # Running count of our values
self.mu = 0.0 # Running mean of our values
self.m2 = 0.0 # Running variance numerator (sum of (x - mean)^2)
self.maxValue = float("-inf")
self.minValue = float("inf")
-
+
for v in values:
self.merge(v)
-
+
# Add a value into this StatCounter, updating the internal statistics.
def merge(self, value):
delta = value - self.mu
@@ -42,7 +43,7 @@ class StatCounter(object):
self.maxValue = value
if self.minValue > value:
self.minValue = value
-
+
return self
# Merge another StatCounter into this one, adding up the internal statistics.
@@ -50,7 +51,7 @@ class StatCounter(object):
if not isinstance(other, StatCounter):
raise Exception("Can only merge Statcounters!")
- if other is self: # reference equality holds
+ if other is self: # reference equality holds
self.merge(copy.deepcopy(other)) # Avoid overwriting fields in a weird order
else:
if self.n == 0:
@@ -59,8 +60,8 @@ class StatCounter(object):
self.n = other.n
self.maxValue = other.maxValue
self.minValue = other.minValue
-
- elif other.n != 0:
+
+ elif other.n != 0:
delta = other.mu - self.mu
if other.n * 10 < self.n:
self.mu = self.mu + (delta * other.n) / (self.n + other.n)
@@ -68,10 +69,10 @@ class StatCounter(object):
self.mu = other.mu - (delta * self.n) / (self.n + other.n)
else:
self.mu = (self.mu * self.n + other.mu * other.n) / (self.n + other.n)
-
+
self.maxValue = max(self.maxValue, other.maxValue)
self.minValue = min(self.minValue, other.minValue)
-
+
self.m2 += other.m2 + (delta * delta * self.n * other.n) / (self.n + other.n)
self.n += other.n
return self
@@ -94,7 +95,7 @@ class StatCounter(object):
def max(self):
return self.maxValue
-
+
# Return the variance of the values.
def variance(self):
if self.n == 0:
@@ -124,5 +125,5 @@ class StatCounter(object):
return math.sqrt(self.sampleVariance())
def __repr__(self):
- return "(count: %s, mean: %s, stdev: %s, max: %s, min: %s)" % (self.count(), self.mean(), self.stdev(), self.max(), self.min())
-
+ return ("(count: %s, mean: %s, stdev: %s, max: %s, min: %s)" %
+ (self.count(), self.mean(), self.stdev(), self.max(), self.min()))