[SPARK-9966] [STREAMING] Handle a couple of corner cases in PIDRateEstimator

1. The rate estimator should not estimate any rate when there are no records in the batch, as there is no data to estimate the rate. In the current state, it estimates and sets the rate to zero. That is incorrect. 2. The rate estimator should never set the rate to zero under any circumstances. Otherwise the system will stop receiving data, and stop generating useful estimates (see reason 1). So the fix is to define a parameter that sets a lower bound on the estimated rate, so that the system always receives some data. Author: Tathagata Das <tathagata.das1565@gmail.com> Closes #8199 from tdas/SPARK-9966 and squashes the following commits: 829f793 [Tathagata Das] Fixed unit test and added comments 3a994db [Tathagata Das] Added min rate and updated tests in PIDRateEstimator
author: Tathagata Das <tathagata.das1565@gmail.com> 2015-08-14 15:10:01 -0700
committer: Tathagata Das <tathagata.das1565@gmail.com> 2015-08-14 15:10:01 -0700
commit: f3bfb711c1742d0915e43bda8230b4d1d22b4190 (patch)
tree: c6339008ed647f134b8fcca88e78afab81e865c0 /streaming/src/main
parent: 1150a19b188a075166899fdb1e107b2ba1e505d8 (diff)
download: spark-f3bfb711c1742d0915e43bda8230b4d1d22b4190.tar.gz
spark-f3bfb711c1742d0915e43bda8230b4d1d22b4190.tar.bz2
spark-f3bfb711c1742d0915e43bda8230b4d1d22b4190.zip
2 files changed, 36 insertions, 14 deletions
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala
index 6ae56a68ad..84a3ca9d74 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala
@@ -17,6 +17,8 @@
 
 package org.apache.spark.streaming.scheduler.rate
 
+import org.apache.spark.Logging
+
 /**
  * Implements a proportional-integral-derivative (PID) controller which acts on
  * the speed of ingestion of elements into Spark Streaming. A PID controller works
@@ -26,7 +28,7 @@ package org.apache.spark.streaming.scheduler.rate
  *
  * @see https://en.wikipedia.org/wiki/PID_controller
  *
- * @param batchDurationMillis the batch duration, in milliseconds
+ * @param batchIntervalMillis the batch duration, in milliseconds
  * @param proportional how much the correction should depend on the current
  *        error. This term usually provides the bulk of correction and should be positive or zero.
  *        A value too large would make the controller overshoot the setpoint, while a small value
@@ -39,13 +41,17 @@ package org.apache.spark.streaming.scheduler.rate
  *        of future errors, based on current rate of change. This value should be positive or 0.
  *        This term is not used very often, as it impacts stability of the system. The default
  *        value is 0.
+ * @param minRate what is the minimum rate that can be estimated.
+ *        This must be greater than zero, so that the system always receives some data for rate
+ *        estimation to work.
  */
 private[streaming] class PIDRateEstimator(
     batchIntervalMillis: Long,
-    proportional: Double = 1D,
-    integral: Double = .2D,
-    derivative: Double = 0D)
-  extends RateEstimator {
+    proportional: Double,
+    integral: Double,
+    derivative: Double,
+    minRate: Double
+  ) extends RateEstimator with Logging {
 
   private var firstRun: Boolean = true
   private var latestTime: Long = -1L
@@ -64,16 +70,23 @@ private[streaming] class PIDRateEstimator(
   require(
     derivative >= 0,
     s"Derivative term $derivative in PIDRateEstimator should be >= 0.")
+  require(
+    minRate > 0,
+    s"Minimum rate in PIDRateEstimator should be > 0")
 
+  logInfo(s"Created PIDRateEstimator with proportional = $proportional, integral = $integral, " +
+    s"derivative = $derivative, min rate = $minRate")
 
-  def compute(time: Long, // in milliseconds
+  def compute(
+      time: Long, // in milliseconds
       numElements: Long,
       processingDelay: Long, // in milliseconds
       schedulingDelay: Long // in milliseconds
     ): Option[Double] = {
-
+    logTrace(s"\ntime = $time, # records = $numElements, " +
+      s"processing time = $processingDelay, scheduling delay = $schedulingDelay")
     this.synchronized {
-      if (time > latestTime && processingDelay > 0 && batchIntervalMillis > 0) {
+      if (time > latestTime && numElements > 0 && processingDelay > 0) {
 
         // in seconds, should be close to batchDuration
         val delaySinceUpdate = (time - latestTime).toDouble / 1000
@@ -104,21 +117,30 @@ private[streaming] class PIDRateEstimator(
 
         val newRate = (latestRate - proportional * error -
                                     integral * historicalError -
-                                    derivative * dError).max(0.0)
+                                    derivative * dError).max(minRate)
+        logTrace(s"""
+            | latestRate = $latestRate, error = $error
+            | latestError = $latestError, historicalError = $historicalError
+            | delaySinceUpdate = $delaySinceUpdate, dError = $dError
+            """.stripMargin)
+
         latestTime = time
         if (firstRun) {
           latestRate = processingRate
           latestError = 0D
           firstRun = false
-
+          logTrace("First run, rate estimation skipped")
           None
         } else {
           latestRate = newRate
           latestError = error
-
+          logTrace(s"New rate = $newRate")
           Some(newRate)
         }
-      } else None
+      } else {
+        logTrace("Rate estimation skipped")
+        None
+      }
     }
   }
 }
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala
index 17ccebc1ed..d7210f64fc 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/RateEstimator.scala
@@ -18,7 +18,6 @@
 package org.apache.spark.streaming.scheduler.rate
 
 import org.apache.spark.SparkConf
-import org.apache.spark.SparkException
 import org.apache.spark.streaming.Duration
 
 /**
@@ -61,7 +60,8 @@ object RateEstimator {
         val proportional = conf.getDouble("spark.streaming.backpressure.pid.proportional", 1.0)
         val integral = conf.getDouble("spark.streaming.backpressure.pid.integral", 0.2)
         val derived = conf.getDouble("spark.streaming.backpressure.pid.derived", 0.0)
-        new PIDRateEstimator(batchInterval.milliseconds, proportional, integral, derived)
+        val minRate = conf.getDouble("spark.streaming.backpressure.pid.minRate", 100)
+        new PIDRateEstimator(batchInterval.milliseconds, proportional, integral, derived, minRate)
 
       case estimator =>
         throw new IllegalArgumentException(s"Unkown rate estimator: $estimator")
author	Tathagata Das <tathagata.das1565@gmail.com>	2015-08-14 15:10:01 -0700
committer	Tathagata Das <tathagata.das1565@gmail.com>	2015-08-14 15:10:01 -0700
commit	f3bfb711c1742d0915e43bda8230b4d1d22b4190 (patch)
tree	c6339008ed647f134b8fcca88e78afab81e865c0 /streaming/src/main
parent	1150a19b188a075166899fdb1e107b2ba1e505d8 (diff)
download	spark-f3bfb711c1742d0915e43bda8230b4d1d22b4190.tar.gz spark-f3bfb711c1742d0915e43bda8230b4d1d22b4190.tar.bz2 spark-f3bfb711c1742d0915e43bda8230b4d1d22b4190.zip