Improve the performance of analyses tracking aliases

Plenty of optimizations to make aliasing analysis faster. I cannot say how much faster: an analysis that used to not terminate now takes one second.

The lessons are not new:
  - avoid boxing (of course)
  - avoid allocating in tight loops
  - use arrays, avoid hash sets / maps
  - specialize for common cases (see SmallBitSet in this commit)
  - avoid megamorphic calls
  - understand the details of the problem and find those twists that make it run faster

More explanations in the code.
author: Lukas Rytz <lukas.rytz@gmail.com> 2015-09-09 17:14:17 +0200
committer: Lukas Rytz <lukas.rytz@gmail.com> 2015-09-17 22:05:05 +0200
commit: 72e1d0567ec011da5c519378c8883a492e67a25b (patch)
tree: 22c3aa1a447f725333ff8ae3725ad13cf7c53f50
parent: 382015824cfb3d1af2eb0dc3065b305e93185577 (diff)
download: scala-72e1d0567ec011da5c519378c8883a492e67a25b.tar.gz
scala-72e1d0567ec011da5c519378c8883a492e67a25b.tar.bz2
scala-72e1d0567ec011da5c519378c8883a492e67a25b.zip
4 files changed, 796 insertions, 115 deletions
diff --git a/src/compiler/scala/tools/nsc/backend/jvm/analysis/AliasingFrame.scala b/src/compiler/scala/tools/nsc/backend/jvm/analysis/AliasingFrame.scala
index 6ae6eddfef..9e5fbfcc0e 100644
--- a/src/compiler/scala/tools/nsc/backend/jvm/analysis/AliasingFrame.scala
+++ b/src/compiler/scala/tools/nsc/backend/jvm/analysis/AliasingFrame.scala
@@ -3,17 +3,22 @@ package backend.jvm
 package analysis
 
 import scala.annotation.switch
-import scala.collection.{mutable, immutable}
+import scala.collection.mutable
 import scala.tools.asm.Opcodes
 import scala.tools.asm.tree._
 import scala.tools.asm.tree.analysis.{Analyzer, Value, Frame, Interpreter}
 import opt.BytecodeUtils._
+import AliasSet.SmallBitSet
 
-object AliasingFrame {
-  private var _idCounter: Long = 0l
-  private def nextId = { _idCounter += 1; _idCounter }
-}
-
+/**
+ * A subclass of Frame that tracks aliasing of values stored in local variables and on the stack.
+ *
+ * Note: an analysis tracking aliases is roughly 5x slower than a usual analysis (assuming a simple
+ * value domain with a fast merge function). For example, nullness analysis is roughly 5x slower
+ * than a BasicValue analysis.
+ *
+ * See the doc of package object `analysis` for some notes on the performance of alias analysis.
+ */
 class AliasingFrame[V <: Value](nLocals: Int, nStack: Int) extends Frame[V](nLocals, nStack) {
   import Opcodes._
 
@@ -24,50 +29,66 @@ class AliasingFrame[V <: Value](nLocals: Int, nStack: Int) extends Frame[V](nLoc
   }
 
   /**
-   * For each slot (entry in the `values` array of the frame), an id that uniquely represents
-   * the object stored in it. If two values have the same id, they are aliases of the same
-   * object.
-   */
-  private val aliasIds: Array[Long] = Array.fill(nLocals + nStack)(AliasingFrame.nextId)
-
-  /**
-   * The object alias id of for a value index.
-   */
-  def aliasId(entry: Int) = aliasIds(entry)
-
-  /**
-   * Returns the indices of the values array which are aliases of the object `id`.
+   * For every value the set of values that are aliases of it.
+   *
+   * Invariants:
+   *  - If `aliases(i) == null` then i has no aliases. This is equivalent to having
+   *    `aliases(i) == SingletonSet(i)`.
+   *  - If `aliases(i) != null` then `aliases(i) contains i`.
+   *  - If `aliases(i) contains j` then `aliases(i) eq aliases(j)`, i.e., they are references to the
+   *    same (mutable) AliasSet.
    */
-  def valuesWithAliasId(id: Long): Set[Int] = immutable.BitSet.empty ++ aliasIds.indices.iterator.filter(i => aliasId(i) == id)
+  val aliases: Array[AliasSet] = new Array[AliasSet](getLocals + getMaxStackSize)
 
   /**
    * The set of aliased values for a given entry in the `values` array.
    */
-  def aliasesOf(entry: Int): Set[Int] = valuesWithAliasId(aliasIds(entry))
+  def aliasesOf(entry: Int): AliasSet = {
+    if (aliases(entry) != null) aliases(entry)
+    else {
+      val init = new AliasSet(new AliasSet.SmallBitSet(entry, -1, -1, -1), 1)
+      aliases(entry) = init
+      init
+    }
+  }
 
   /**
-   * Define a new alias. For example, given
-   *   var a = this       // this, a have the same aliasId
-   * then an assignment
+   * Define a new alias. For example, an assignment
    *   b = a
-   * will set the same the aliasId for `b`.
+   * adds b to the set of aliases of a.
    */
   private def newAlias(assignee: Int, source: Int): Unit = {
-    aliasIds(assignee) = aliasIds(source)
+    removeAlias(assignee)
+    val sourceAliases = aliasesOf(source)
+    sourceAliases += assignee
+    aliases(assignee) = sourceAliases
   }
 
   /**
-   * An assignment
+   * Remove an alias. For example, an assignment
    *   a = someUnknownValue()
-   * sets a fresh alias id for `a`.
-   * A stack value is also removed from its alias set when being consumed.
+   * removes a from its former alias set.
+   * As another example, stack values are removed from their alias sets when being consumed.
    */
   private def removeAlias(assignee: Int): Unit = {
-    aliasIds(assignee) = AliasingFrame.nextId
+    if (aliases(assignee) != null) {
+      aliases(assignee) -= assignee
+      aliases(assignee) = null
+    }
+  }
+
+  /**
+   * Define the alias set for a given value.
+   */
+  private def setAliasSet(assignee: Int, set: AliasSet): Unit = {
+    if (aliases(assignee) != null) {
+      aliases(assignee) -= assignee
+    }
+    aliases(assignee) = set
   }
 
   override def execute(insn: AbstractInsnNode, interpreter: Interpreter[V]): Unit = {
-    // Make the extendsion methods easier to use (otherwise we have to repeat `this`.stackTop)
+    // Make the extension methods easier to use (otherwise we have to repeat `this`.stackTop)
     def stackTop: Int = this.stackTop
     def peekStack(n: Int): V = this.peekStack(n)
 
@@ -166,14 +187,34 @@ class AliasingFrame[V <: Value](nLocals: Int, nStack: Int) extends Frame[V](nLoc
         }
 
       case SWAP =>
+        // could be written more elegantly with higher-order combinators, but thinking of performance
         val top = stackTop
-        val idTop = aliasIds(top)
-        aliasIds(top)     = aliasIds(top - 1)
-        aliasIds(top - 1) = idTop
+
+        def moveNextToTop(): Unit = {
+          val nextAliases = aliases(top - 1)
+          aliases(top) = nextAliases
+          nextAliases -= (top - 1)
+          nextAliases += top
+        }
+
+        if (aliases(top) != null) {
+          val topAliases = aliases(top)
+          if (aliases(top - 1) != null) moveNextToTop()
+          else aliases(top) = null
+          // move top to next
+          aliases(top - 1) = topAliases
+          topAliases -= top
+          topAliases += (top - 1)
+        } else {
+          if (aliases(top - 1) != null) {
+            moveNextToTop()
+            aliases(top - 1) = null
+          }
+        }
 
       case opcode =>
         if (opcode == ASTORE) {
-          // Not a separate case because we need to remove the consumed stack value from alias sets after.
+          // not a separate case: we re-use the code below that removes the consumed stack value from alias sets
           val stackTopBefore = stackTop - produced + consumed
           val local = insn.asInstanceOf[VarInsnNode].`var`
           newAlias(assignee = local, source = stackTopBefore)
@@ -198,9 +239,6 @@ class AliasingFrame[V <: Value](nLocals: Int, nStack: Int) extends Frame[V](nLoc
         val firstConsumed = stackTop - produced + 1                 // firstConsumed = 3
         for (i <- 0 until consumed)
           removeAlias(firstConsumed + i)                            // remove aliases for 3 and 4
-
-        // We don't need to set the aliases ids for the produced values: the aliasIds array already
-        // contains fresh ids for non-used stack values (ensured by removeAlias).
     }
   }
 
@@ -232,30 +270,124 @@ class AliasingFrame[V <: Value](nLocals: Int, nStack: Int) extends Frame[V](nLoc
    *   x = a
    *   y = b     // (x, a) and (y, b)
    * }
-   * [...]       // (x, a)
+   * [...]       // (x, a) -- merge of ((x, y, a)) and ((x, a), (y, b))
    */
   override def merge(other: Frame[_ <: V], interpreter: Interpreter[V]): Boolean = {
+    // merge is the main performance hot spot of a data flow analysis.
+
+    // in nullness analysis, super.merge (which actually merges the nullness values) takes 20% of
+    // the overall analysis time.
     val valuesChanged = super.merge(other, interpreter)
+
+    // in nullness analysis, merging the alias sets takes ~55% of the analysis time. therefore, this
+    // code has been heavily optimized. most of the time is spent in the `hasNext` method of the
+    // andNotIterator, see its comment.
+
     var aliasesChanged = false
     val aliasingOther = other.asInstanceOf[AliasingFrame[_]]
-    for (i <- aliasIds.indices) {
-      val thisAliases = aliasesOf(i)
-      val thisNotOther = thisAliases diff (thisAliases intersect aliasingOther.aliasesOf(i))
-      if (thisNotOther.nonEmpty) {
-        aliasesChanged = true
-        thisNotOther foreach removeAlias
+
+    val numValues = getLocals + getStackSize
+    // assume (a, b) are aliases both in this frame, and the other frame. when merging the alias set
+    // for a, we already see that a and b will be aliases in the final result. so we can skip over
+    // merging the alias set for b. in this case, while merging the sets for a, knownOk(b) will be
+    // set to `true`.
+    val knownOk = new Array[Boolean](numValues)
+    var i = 0
+    while (i < numValues) {
+      if (!knownOk(i)) {
+        val thisAliases = this.aliases(i)
+        val otherAliases = aliasingOther.aliases(i)
+        if (thisAliases != null && otherAliases != null) {
+          // The iterator yields elements that are in `thisAliases` but not in `otherAliases`.
+          // As a side-effect, for every index `i` that is in both alias sets, the iterator sets
+          // `knownOk(i) = true`: the alias sets for these values don't need to be merged again.
+          val thisNotOtherIt = AliasSet.andNotIterator(thisAliases, otherAliases, knownOk)
+          if (thisNotOtherIt.hasNext) {
+            aliasesChanged = true
+            val newSet = AliasSet.empty
+            while (thisNotOtherIt.hasNext) {
+              val next = thisNotOtherIt.next()
+              newSet += next
+              setAliasSet(next, newSet)
+            }
+          }
+        }
       }
+      i += 1
     }
+
     valuesChanged || aliasesChanged
   }
 
+  private def min(s: SmallBitSet) = {
+    var r = s.a
+    if (             s.b < r) r = s.b
+    if (s.c != -1 && s.c < r) r = s.c
+    if (s.d != -1 && s.d < r) r = s.d
+    r
+  }
+
   override def init(src: Frame[_ <: V]): Frame[V] = {
-    super.init(src)
-    compat.Platform.arraycopy(src.asInstanceOf[AliasingFrame[_]].aliasIds, 0, aliasIds, 0, aliasIds.length)
+    super.init(src) // very quick (just an arraycopy)
+    System.arraycopy(src.asInstanceOf[AliasingFrame[_]].aliases, 0, aliases, 0, aliases.length) // also quick
+
+    val newSets = mutable.HashMap.empty[AliasSet, AliasSet]
+
+    // the rest of this method (cloning alias sets) is the second performance hotspot (next to
+    // AliasingFrame.merge). for nullness, it takes ~20% of the analysis time.
+    // the difficulty here is that we have to clone the alias sets correctly. if two values a, b are
+    // aliases, then aliases(a) eq aliases(b). we need to make sure to use the same clone for the
+    // two values.
+
+    var i = 0
+    while (i < aliases.length) {
+      val set = aliases(i)
+      if (set != null) {
+        // size cannot be 0 - alias sets are always at least singletons.
+        // for sets of size 1-4, don't use the `newSets` map - lookup / update is slow
+        if (set.size == 1) {
+          aliases(i) = null
+        } else if (set.size <= 4) {
+          val small = set.set.asInstanceOf[AliasSet.SmallBitSet]
+          val firstOfSet = i == min(small)
+          if (firstOfSet) {
+            val newSet = set.clone()
+            aliases(small.a) = newSet
+            aliases(small.b) = newSet
+            if (small.c != -1) aliases(small.c) = newSet
+            if (small.d != -1) aliases(small.d) = newSet
+          }
+        } else {
+          // the actual hot spot is the hash map operations here: this is where almost all of the 20%
+          // mentioned above is spent.
+          // i also benchmarked an alternative implementation: keep an array of booleans for indexes
+          // that already contain the cloned set. iterate through all elements of the cloned set and
+          // assign the cloned set. this approach is 50% slower than using a hash map.
+          if (newSets contains set) aliases(i) = newSets(set)
+          else {
+            val newSet = set.clone()
+            newSets(set) = newSet
+            aliases(i) = newSet
+          }
+        }
+      }
+      i += 1
+    }
     this
   }
 }
 
+object AliasingFrame {
+//  val start1 = AliasingFrame.timer1.start()
+//  AliasingFrame.timer1.stop(start1)
+  import scala.reflect.internal.util.Statistics._
+  val timer1 = newTimer("t1", "jvm")
+  val timer2 = newTimer("t2", "jvm")
+  val timer3 = newTimer("t3", "jvm")
+  val timers = List(timer1, timer2, timer3)
+  def reset(): Unit = for (t <- timers) { t.nanos = 0; t.timings = 0 }
+}
+
 /**
  * An analyzer that uses AliasingFrames instead of bare Frames. This can be used when an analysis
  * needs to track aliases, but doesn't require a more specific Frame subclass.
@@ -264,3 +396,269 @@ class AliasingAnalyzer[V <: Value](interpreter: Interpreter[V]) extends Analyzer
   override def newFrame(nLocals: Int, nStack: Int): AliasingFrame[V] = new AliasingFrame(nLocals, nStack)
   override def newFrame(src: Frame[_ <: V]): AliasingFrame[V] = new AliasingFrame(src)
 }
+
+/**
+ * An iterator over Int (required to prevent boxing the result of next).
+ */
+abstract class IntIterator extends Iterator[Int] {
+  def hasNext: Boolean
+  def next(): Int
+}
+
+/**
+ * An efficient mutable bit set.
+ *
+ * @param set  Either a SmallBitSet or an Array[Long]
+ * @param size The size of the set, useful for performance of certain operations
+ */
+class AliasSet(var set: Object /*SmallBitSet | Array[Long]*/, var size: Int) {
+  import AliasSet._
+
+  override def toString: String = set.toString
+
+  /**
+   * An iterator for the elements of this bit set. Note that only one iterator can be used at a
+   * time. Also make sure not to change the underlying AliasSet during iteration.
+   */
+  def iterator: IntIterator = andNotIterator(this, empty, null)
+
+  def +=(value: Int): Unit = this.set match {
+    case s: SmallBitSet => (size: @switch) match {
+      case 0 =>                                                     s.a = value; size = 1
+      case 1 => if (value != s.a)                                 { s.b = value; size = 2 }
+      case 2 => if (value != s.a && value != s.b)                 { s.c = value; size = 3 }
+      case 3 => if (value != s.a && value != s.b && value != s.c) { s.d = value; size = 4 }
+      case 4 =>
+        if (value != s.a && value != s.b && value != s.c && value != s.d) {
+          this.set = bsEmpty
+          this.size = 0
+          bsAdd(this, s.a)
+          bsAdd(this, s.b)
+          bsAdd(this, s.c)
+          bsAdd(this, s.d)
+          bsAdd(this, value)
+        }
+    }
+    case bits: Array[Long] =>
+      bsAdd(this, value)
+  }
+
+  def -=(value: Int): Unit = this.set match {
+    case s: SmallBitSet => (size: @switch) match {
+      case 0 =>
+      case 1 =>
+        if      (value == s.a) { s.a = -1; size = 0 }
+      case 2 =>
+        if      (value == s.a) { s.a = s.b; s.b = -1; size = 1 }
+        else if (value == s.b) {            s.b = -1; size = 1 }
+      case 3 =>
+        if      (value == s.a) { s.a = s.b; s.b = s.c; s.c = -1; size = 2 }
+        else if (value == s.b) {            s.b = s.c; s.c = -1; size = 2 }
+        else if (value == s.c) {                       s.c = -1; size = 2 }
+      case 4 =>
+        if      (value == s.a) { s.a = s.b; s.b = s.c; s.c = s.d; s.d = -1; size = 3 }
+        else if (value == s.b) {            s.b = s.c; s.c = s.d; s.d = -1; size = 3 }
+        else if (value == s.c) {                       s.c = s.d; s.d = -1; size = 3 }
+        else if (value == s.d) {                                  s.d = -1; size = 3 }
+    }
+    case bits: Array[Long] =>
+      bsRemove(this, value)
+      if (this.size == 4)
+        this.set = bsToSmall(this.set.asInstanceOf[Array[Long]])
+  }
+
+  override def clone(): AliasSet = {
+    val resSet = this.set match {
+      case s: SmallBitSet => new SmallBitSet(s.a, s.b, s.c, s.d)
+      case bits: Array[Long] => bits.clone()
+    }
+    new AliasSet(resSet, this.size)
+  }
+}
+
+object AliasSet {
+  def empty = new AliasSet(new SmallBitSet(-1, -1, -1, -1), 0)
+
+  final class SmallBitSet(var a: Int, var b: Int, var c: Int, var d: Int) {
+    override def toString = s"($a, $b, $c, $d)"
+  }
+
+  def bsEmpty: Array[Long] = new Array[Long](1)
+
+  private def bsEnsureCapacity(set: Array[Long], index: Int): Array[Long] = {
+    if (index < set.length) set
+    else {
+      var newLength = set.length
+      while (index >= newLength) newLength *= 2
+      val newSet = new Array[Long](newLength)
+      Array.copy(set, 0, newSet, 0, set.length)
+      newSet
+    }
+  }
+
+  def bsAdd(set: AliasSet, bit: Int): Unit = {
+    val bits = set.set.asInstanceOf[Array[Long]]
+    val index = bit >> 6
+    val resSet = bsEnsureCapacity(bits, index)
+    val before = resSet(index)
+    val result = before | (1l << bit)
+    if (result != before) {
+      resSet(index) = result
+      set.set = resSet
+      set.size += 1
+    }
+  }
+
+  def bsRemove(set: AliasSet, bit: Int): Unit = {
+    val bits = set.set.asInstanceOf[Array[Long]]
+    val index = bit >> 6
+    if (index < bits.length) {
+      val before = bits(index)
+      val result = before & ~(1l << bit)
+      if (result != before) {
+        bits(index) = result
+        set.size -= 1
+      }
+    }
+  }
+
+  def bsContains(set: Array[Long], bit: Int): Boolean = {
+    val index = bit >> 6
+    bit >= 0 && index < set.length && (set(index) & (1L << bit)) != 0L
+  }
+
+//  var sizesHist: Array[Int] = new Array[Int](1000)
+
+  /**
+   * Convert a bit array to a SmallBitSet. Requires the bit array to contain exactly four bits.
+   */
+  def bsToSmall(bits: Array[Long]): SmallBitSet = {
+    var a = -1
+    var b = -1
+    var c = -1
+    var i = 0
+    val end = bits.length * 64
+    while (i < end) {
+      if (bsContains(bits, i)) {
+        if (a == -1) a = i
+        else if (b == -1) b = i
+        else if (c == -1) c = i
+        else return new SmallBitSet(a, b, c, i)
+      }
+      i += 1
+    }
+    null
+  }
+
+  /**
+   * An iterator that yields the elements that are in one bit set and not in another (&~).
+   */
+  private class AndNotIt(setA: AliasSet, setB: AliasSet, thisAndOther: Array[Boolean]) extends IntIterator {
+    // values in the first bit set
+    private var a, b, c, d = -1
+    private var xs: Array[Long] = null
+
+    // values in the second bit set
+    private var notA, notB, notC, notD = -1
+    private var notXs: Array[Long] = null
+
+    // holds the next value of `a`, `b`, `c` or `d` that should be returned. assigned in hasNext (via checkABCD)
+    private var abcdNext = -1
+
+    // counts through elements in the `xs` bit set
+    private var i = 0
+    // true if the current value of `i` should be returned by this iterator
+    private var iValid = false
+
+    setA.set match {
+      case s: SmallBitSet => a = s.a; b = s.b; c = s.c; d = s.d
+      case bits: Array[Long] => xs = bits
+    }
+
+    setB.set match {
+      case s: SmallBitSet => notA = s.a; notB = s.b; notC = s.c; notD = s.d
+      case bits: Array[Long] => notXs = bits
+    }
+
+    // for each value that exists both in this AND (&) the other bit set, `thisAndOther` is set to true.
+    // hacky side-effect, used for performance of AliasingFrame.merge.
+    private def setThisAndOther(x: Int) = if (thisAndOther != null) thisAndOther(x) = true
+
+    private def checkABCD(x: Int, num: Int): Boolean = {
+      // assert(x == a && num == 1 || x == b && num == 2 || ...)
+      x != -1 && {
+        val otherHasA = x == notA || x == notB  || x == notC || x == notD || (notXs != null && bsContains(notXs, x))
+        if (otherHasA) setThisAndOther(x)
+        else abcdNext = x
+        (num: @switch) match {
+          case 1 => a = -1
+          case 2 => b = -1
+          case 3 => c = -1
+          case 4 => d = -1
+        }
+        !otherHasA
+      }
+    }
+
+    // main performance hot spot
+    private def checkXs = {
+      (xs != null) && {
+        val end = xs.length * 64
+
+        while (i < end && {
+          val index = i >> 6
+          if (xs(index) == 0l) { // boom. for nullness, this saves 35% of the overall analysis time.
+            i = ((index + 1) << 6) - 1 // -1 required because i is incremented in the loop body
+            true
+          } else {
+            val mask = 1l << i
+            // if (mask > xs(index)) we could also advance i to the next value, but that didn't pay off in benchmarks
+            val thisHasI = (xs(index) & mask) != 0l
+            !thisHasI || {
+              val otherHasI = i == notA || i == notB || i == notC || i == notD || (notXs != null && index < notXs.length && (notXs(index) & mask) != 0l)
+              if (otherHasI) setThisAndOther(i)
+              otherHasI
+            }
+          }
+        }) i += 1
+
+        iValid = i < end
+        iValid
+      }
+    }
+
+    // this is the main hot spot of alias analysis. for nullness, 38% of the overall analysis time
+    // is spent here. within hasNext, almost the entire time is spent in `checkXs`.
+    //
+    def hasNext: Boolean = iValid || abcdNext != -1 || checkABCD(a, 1) || checkABCD(b, 2) || checkABCD(c, 3) || checkABCD(d, 4) || checkXs
+
+    def next(): Int = {
+      if (hasNext) {
+        if (abcdNext != -1) {
+          val r = abcdNext; abcdNext = -1; r
+        } else {
+          val r = i; i += 1; iValid = false; r
+        }
+      } else Iterator.empty.next()
+    }
+  }
+
+//  The number of bits in a bit array. Useful for debugging.
+//  def bsSize(bits: Array[Long]) = {
+//    var r = 0
+//    var i = 0
+//    while (i < bits.length) {
+//      r += java.lang.Long.bitCount(bits(i))
+//      i += 1
+//    }
+//    r
+//  }
+
+  /**
+   * An iterator returning the elements in a that are not also in b (a &~ b).
+   *
+   * If `thisAndOther` is non-null, the iterator sets thisAndOther(i) to true for every value that
+   * is both in a and b (&).
+   */
+  def andNotIterator(a: AliasSet, b: AliasSet, thisAndOther: Array[Boolean]): IntIterator = new AndNotIt(a, b, thisAndOther)
+}
diff --git a/src/compiler/scala/tools/nsc/backend/jvm/analysis/NullnessAnalyzer.scala b/src/compiler/scala/tools/nsc/backend/jvm/analysis/NullnessAnalyzer.scala
index 31710dcbee..f6d249db7b 100644
--- a/src/compiler/scala/tools/nsc/backend/jvm/analysis/NullnessAnalyzer.scala
+++ b/src/compiler/scala/tools/nsc/backend/jvm/analysis/NullnessAnalyzer.scala
@@ -7,66 +7,12 @@ import java.util
 import scala.annotation.switch
 import scala.tools.asm.{Type, Opcodes}
 import scala.tools.asm.tree.{MethodInsnNode, LdcInsnNode, AbstractInsnNode}
-import scala.tools.asm.tree.analysis.{Frame, Analyzer, Interpreter, Value}
+import scala.tools.asm.tree.analysis._
 import scala.tools.nsc.backend.jvm.opt.BytecodeUtils
 import BytecodeUtils._
 
 /**
- * Some notes on the ASM ananlyzer framework.
- *
- * Value
- *  - Abstract, needs to be implemented for each analysis.
- *  - Represents the desired information about local variables and stack values, for example:
- *    - Is this value known to be null / not null?
- *    - What are the instructions that could potentially have produced this value?
- *
- * Interpreter
- *  - Abstract, needs to be implemented for each analysis. Sometimes one can subclass an existing
- *    interpreter, e.g., SourceInterpreter or BasicInterpreter.
- *  - Multiple abstract methods that receive an instruction and the instruction's input values, and
- *    return a value representing the result of that instruction.
- *    - Note: due to control flow, the interpreter can be invoked multiple times for the same
- *      instruction, until reaching a fixed point.
- *  - Abstract `merge` function that computes the least upper bound of two values. Used by
- *    Frame.merge (see below).
- *
- * Frame
- *  - Can be used directly for many analyses, no subclass required.
- *  - Every frame has an array of values: one for each local variable and for each stack slot.
- *    - A `top` index stores the index of the current stack top
- *    - NOTE: for a size-2 local variable at index i, the local variable at i+1 is set to an empty
- *      value. However, for a size-2 value at index i on the stack, the value at i+1 holds the next
- *      stack value.
- *  - Defines the `execute(instruction)` method.
- *    - executing mutates the state of the frame according to the effect of the instruction
- *      - pop consumed values from the stack
- *      - pass them to the interpreter together with the instruction
- *      - if applicable, push the resulting value on the stack
- *  - Defines the `merge(otherFrame)` method
- *    - called by the analyzer when multiple control flow paths lead to an instruction
- *      - the frame at the branching instruction is merged into the current frame of the
- *        instruction (held by the analyzer)
- *      - mutates the values of the current frame, merges all values using interpreter.merge.
- *
- * Analyzer
- *   - Stores a frame for each instruction
- *   - `merge` function takes an instruction and a frame, merges the existing frame for that instr
- *     (from the frames array) with the new frame passed as argument.
- *     if the frame changed, puts the instruction on the work queue (fixpiont).
- *   - initial frame: initialized for first instr by calling interpreter.new[...]Value
- *     for each slot (locals and params), stored in frames[firstInstr] by calling `merge`
- *   - work queue of instructions (`queue` array, `top` index for next instruction to analyze)
- *   - analyze(method): simulate control flow. while work queue non-empty:
- *     - copy the state of `frames[instr]` into a local frame `current`
- *     - call `current.execute(instr, interpreter)`, mutating the `current` frame
- *     - if it's a branching instruction
- *       - for all potential destination instructions
- *         - merge the destination instruction frame with the `current` frame
- *           (this enqueues the destination instr if its frame changed)
- *       - invoke `newControlFlowEdge` (see below)
- *   - the analyzer also tracks active exception handlers at each instruction
- *   - the empty method `newControlFlowEdge` can be overridden to track control flow if required
- *
+ * See the package object `analysis` for details on the ASM analysis framework.
  *
  * Some notes on nullness analysis.
  *
@@ -219,8 +165,10 @@ class NullnessFrame(nLocals: Int, nStack: Int) extends AliasingFrame[NullnessVal
   override def execute(insn: AbstractInsnNode, interpreter: Interpreter[NullnessValue]): Unit = {
     import Opcodes._
 
-    // get the object id of the object that is known to be not-null after this operation
-    val nullCheckedAliasId: Long = (insn.getOpcode: @switch) match {
+    // get the alias set of the object that is known to be not-null after this operation.
+    // alias sets are mutable / mutated, so after super.execute, this set contains the remaining
+    // aliases of the value that becomes not-null.
+    val nullCheckedAliases: AliasSet = (insn.getOpcode: @switch) match {
       case IALOAD |
            LALOAD |
            FALOAD |
@@ -229,7 +177,7 @@ class NullnessFrame(nLocals: Int, nStack: Int) extends AliasingFrame[NullnessVal
            BALOAD |
            CALOAD |
            SALOAD =>
-        aliasId(this.stackTop - 1)
+        aliasesOf(this.stackTop - 1)
 
       case IASTORE |
            FASTORE |
@@ -239,35 +187,36 @@ class NullnessFrame(nLocals: Int, nStack: Int) extends AliasingFrame[NullnessVal
            SASTORE |
            LASTORE |
            DASTORE =>
-        aliasId(this.stackTop - 2)
+        aliasesOf(this.stackTop - 2)
 
       case GETFIELD =>
-        aliasId(this.stackTop)
+        aliasesOf(this.stackTop)
 
       case PUTFIELD =>
-        aliasId(this.stackTop - 1)
+        aliasesOf(this.stackTop - 1)
 
       case INVOKEVIRTUAL |
            INVOKESPECIAL |
            INVOKEINTERFACE =>
         val desc = insn.asInstanceOf[MethodInsnNode].desc
         val numArgs = Type.getArgumentTypes(desc).length
-        aliasId(this.stackTop - numArgs)
+        aliasesOf(this.stackTop - numArgs)
 
       case ARRAYLENGTH |
            MONITORENTER |
            MONITOREXIT =>
-        aliasId(this.stackTop)
+        aliasesOf(this.stackTop)
 
       case _ =>
-        -1
+        null
     }
 
     super.execute(insn, interpreter)
 
-    if (nullCheckedAliasId != -1) {
-      for (i <- valuesWithAliasId(nullCheckedAliasId))
-        this.setValue(i, NotNullValue)
+    if (nullCheckedAliases != null) {
+      val it = nullCheckedAliases.iterator
+      while (it.hasNext)
+        this.setValue(it.next(), NotNullValue)
     }
   }
 }
diff --git a/src/compiler/scala/tools/nsc/backend/jvm/analysis/ProdConsAnalyzer.scala b/src/compiler/scala/tools/nsc/backend/jvm/analysis/ProdConsAnalyzer.scala
index 1c24acba03..f2d8dc910a 100644
--- a/src/compiler/scala/tools/nsc/backend/jvm/analysis/ProdConsAnalyzer.scala
+++ b/src/compiler/scala/tools/nsc/backend/jvm/analysis/ProdConsAnalyzer.scala
@@ -55,6 +55,11 @@ import scala.collection.convert.decorateAsScala._
  *
  * If ever needed, we could introduce a mode where primitive conversions (l2i) are considered as
  * copying operations.
+ *
+ * Note on performance: the data flow analysis (SourceValue / SourceInterpreter, provided by ASM)
+ * is roughly 2-3x slower than a simple analysis (like BasicValue). The reason is that the merge
+ * function (merging producer sets) is more complex than merging simple basic values.
+ * See also the doc comment in the package object `analysis`.
  */
 class ProdConsAnalyzer(methodNode: MethodNode, classInternalName: InternalName) {
 
diff --git a/src/compiler/scala/tools/nsc/backend/jvm/analysis/package.scala b/src/compiler/scala/tools/nsc/backend/jvm/analysis/package.scala
new file mode 100644
index 0000000000..402357c55b
--- /dev/null
+++ b/src/compiler/scala/tools/nsc/backend/jvm/analysis/package.scala
@@ -0,0 +1,329 @@
+package scala.tools.nsc.backend.jvm
+
+/**
+ * Summary on the ASM analyzer framework
+ * -------------------------------------
+ *
+ * Value
+ *  - Abstract, needs to be implemented for each analysis.
+ *  - Represents the desired information about local variables and stack values, for example:
+ *    - Is this value known to be null / not null?
+ *    - What are the instructions that could potentially have produced this value?
+ *
+ * Interpreter
+ *  - Abstract, needs to be implemented for each analysis. Sometimes one can subclass an existing
+ *    interpreter, e.g., SourceInterpreter or BasicInterpreter.
+ *  - Multiple abstract methods that receive an instruction and the instruction's input values, and
+ *    return a value representing the result of that instruction.
+ *    - Note: due to control flow, the interpreter can be invoked multiple times for the same
+ *      instruction, until reaching a fixed point.
+ *  - Abstract `merge` function that computes the least upper bound of two values. Used by
+ *    Frame.merge (see below).
+ *
+ * Frame
+ *  - Can be used directly for many analyses, no subclass required.
+ *  - Every frame has an array of values: one for each local variable and for each stack slot.
+ *    - A `top` index stores the index of the current stack top
+ *    - NOTE: for a size-2 local variable at index i, the local variable at i+1 is set to an empty
+ *      value. However, for a size-2 value at index i on the stack, the value at i+1 holds the next
+ *      stack value.
+ *  - Defines the `execute(instruction)` method.
+ *    - executing mutates the state of the frame according to the effect of the instruction
+ *      - pop consumed values from the stack
+ *      - pass them to the interpreter together with the instruction
+ *      - if applicable, push the resulting value on the stack
+ *  - Defines the `merge(otherFrame)` method
+ *    - called by the analyzer when multiple control flow paths lead to an instruction
+ *      - the frame at the branching instruction is merged into the current frame of the
+ *        instruction (held by the analyzer)
+ *      - mutates the values of the current frame, merges all values using interpreter.merge.
+ *
+ * Analyzer
+ *   - Stores a frame for each instruction
+ *   - `merge` function takes an instruction and a frame, merges the existing frame for that instr
+ *     (from the frames array) with the new frame passed as argument.
+ *     if the frame changed, puts the instruction on the work queue (fixpoint).
+ *   - initial frame: initialized for first instr by calling interpreter.new[...]Value
+ *     for each slot (locals and params), stored in frames[firstInstr] by calling `merge`
+ *   - work queue of instructions (`queue` array, `top` index for next instruction to analyze)
+ *   - analyze(method): simulate control flow. while work queue non-empty:
+ *     - copy the state of `frames[instr]` into a local frame `current`
+ *     - call `current.execute(instr, interpreter)`, mutating the `current` frame
+ *     - if it's a branching instruction
+ *       - for all potential destination instructions
+ *         - merge the destination instruction frame with the `current` frame
+ *           (this enqueues the destination instr if its frame changed)
+ *       - invoke `newControlFlowEdge` (see below)
+ *   - the analyzer also tracks active exception handlers at each instruction
+ *   - the empty method `newControlFlowEdge` can be overridden to track control flow if required
+ *
+ *
+ * Lessons learnt while benchmarking the alias tracking analysis
+ * -------------------------------------------------------------
+ *
+ * Profiling
+ *  - Use YourKit for finding hotspots (CPU profiling). When it comes to drilling down into the details
+ *    of a hotspot, don't pay too much attention to the percentages / time counts.
+ *  - Should also try other profilers.
+ *  - Use timers. When a method showed up as a hotspot, i added a timer around that method, and a
+ *    second one within the method to measure specific parts. The timers slow things down, but the
+ *    relative numbers show what parts of a method are slow.
+ *
+ * ASM analyzer insights
+ *  - The time for running an analysis depends on the number of locals and the number of instructions.
+ *    Reducing the number of locals helps speeding up the analysis: there are fewer values to
+ *    merge when merging two frames.
+ *    See also https://github.com/scala/scala-dev/issues/47
+ *  - The common hot spot of an ASM analysis is Frame.merge, for example in producers / consumers.
+ *  - For nullness analysis the time is spent as follows
+ *    - 20% merging nullness values. this is as expected: for example, the same absolute amount of
+ *      time is spent in merging BasicValues when running a BasicInterpreter.
+ *    - 50% merging alias sets. i tried to optimize what i could out of this.
+ *    - 20% is spent creating new frames from existing ones, see comment on AliasingFrame.init.
+ *  - The implementation of Frame.merge (the main hot spot) contains a megamorphic callsite to
+ *    `interpreter.merge`. This can be observed easily by running a test program that either runs
+ *    a BasicValue analysis only, versus a program that first runs a nullness analysis and then
+ *    a BasicValue. In an example, the time for the BasicValue analysis goes from 519ms to 1963ms,
+ *    a 3.8x slowdown.
+ *  - I added counters to the Frame.merge methods for nullness and BasicValue analysis. In the
+ *    examples I benchmarked, the number of merge invocations was always exactly the same.
+ *    It would probably be possible to come up with an example where alias set merging forces
+ *    additional analysis rounds until reaching the fixpoint, but I did not observe such cases.
+ *
+ * To benchmark an analysis, instead of benchmarking analysis while it runs in the compiler
+ * backend, one can easily run it from a separate program (or the repl). The bytecode to analyze
+ * can simply be parsed from a classfile. See example at the end of this comment.
+ *
+ *
+ * Nullness Analysis in Miguel's Optimizer
+ * ---------------------------------------
+ *
+ * Miguel implemented alias tracking for nullness analysis differently [1]. Remember that every
+ * frame has an array of values. Miguel's idea was to represent aliasing using reference equality
+ * in the values array: if two entries in the array point to the same value object, the two entries
+ * are aliases in the frame of the given instruction.
+ *
+ * While this idea seems elegant at first sight, Miguel's implementation does not merge frames
+ * correctly when it comes to aliasing. Assume in frame 1, values (a, b, c) are aliases, while in
+ * frame 2 (a, b) are aliases. When merging the second into the first, we have to make sure that
+ * c is removed as an alias of (a, b).
+ *
+ * It would be possible to implement correct alias set merging in Miguel's approach. However, frame
+ * merging is the main hot spot of analysis. The computational complexity of implementing alias set
+ * merging by traversing the values array and comparing references is too high. The concrete
+ * alias set representation that is used in the current implementation (see class AliasingFrame)
+ * makes alias set merging more efficient.
+ *
+ * [1] https://github.com/scala-opt/scala/blob/opt/rebase/src/compiler/scala/tools/nsc/backend/bcode/NullnessPropagator.java
+ *
+ *
+ * Complexity and scaling of analysis
+ * ----------------------------------
+ *
+ * The time complexity of a data flow analysis depends on:
+ *
+ *   - The size of the method. The complexity factor is linear (assuming the number of locals and
+ *     branching instructions remains constant). The main analysis loop runs through all
+ *     instructions of a method once. Instructions are only re-enqueued if a control flow merge
+ *     changes the frame at some instruction.
+ *
+ *   - The branching instructions. When a second (third, ..) control flow edge arrives at an
+ *     instruction, the existing frame at the instruction is merged with the one computed on the
+ *     new branch. If the merge function changes the existing frame, the instruction is enqueued
+ *     for another analysis. This results in a merge operation for the successors of the
+ *     instruction.
+ *
+ *   - The number of local variables. The hot spot of analysis is frame merging. The merge function
+ *     iterates through the values in the frame (locals and stack values) and merges them.
+ *
+ * I measured the running time of an analysis for two examples:
+ *   - Keep the number of locals and branching instructions constant, increase the number of
+ *     instructions. The running time grows linearly with the method size.
+ *   - Increase the size and number of locals in a method. The method size and number of locals
+ *     grow at the same pace. Here, the running time increase is polynomial. It looks like the
+ *     complexity is #instructions * #locals^2 (see below).
+ *
+ * I measured nullness analysis (which tracks aliases) and a SimpleValue analysis. Nullness runs
+ * roughly 5x slower (because of alias tracking) at every problem size - this factor doesn't change.
+ *
+ * The numbers below are for nullness. Note that the last column is constant, i.e., the running
+ * time is proportional to #ins * #loc^2. Therefore we use this factor when limiting the maximal
+ * method size for running an analysis.
+ *
+ *   #insns    #locals    time (ms)       time / (#ins * #loc^2) * 10^6
+ *   1305      156        34              1.07
+ *   2610      311        165             0.65
+ *   3915      466        490             0.57
+ *   5220      621        1200            0.59
+ *   6525      776        2220            0.56
+ *   7830      931        3830            0.56
+ *   9135      1086       6570            0.60
+ *   10440     1241       9700            0.60
+ *   11745     1396       13800           0.60
+ *
+ * As a second experiment, nullness analysis was run with varying #insns but constant #locals.
+ * The last column shows linear complexity with respect to the method size (linearOffset = 2279):
+ *
+ *   #insns     #locals     time (ms)    (time + linearOffset) / #insns
+ *   5220       621         1090         0.645
+ *   6224       621         1690         0.637
+ *   7226       621         2280         0.630
+ *   8228       621         2870         0.625
+ *   9230       621         3530         0.629
+ *   10232      621         4130         0.626
+ *   11234      621         4770         0.627
+ *   12236      621         5520         0.637
+ *   13238      621         6170         0.638
+ *
+ *
+ * When running a BasicValue analysis, the complexity observation is the same (time is proportional
+ * to #ins * #loc^2).
+ *
+ *
+ * Measuring analysis execution time
+ * ---------------------------------
+ *
+ * See code below.
+ */
+
+/*
+object Test {
+  val overwrite: Option[String] = null
+
+  @noinline def serialize(o: AnyRef): String = null
+
+  @noinline def deserialize(string: String): AnyRef = null
+
+  @inline def checkRoundTrip[T <: AnyRef](instance: T)(f: T => AnyRef) {
+    val result = serialize(instance)
+    val reconstituted = deserialize(result).asInstanceOf[T]
+    assert(f(instance) == f(reconstituted), (f(instance), f(reconstituted)))
+  }
+
+  @inline def check[T <: AnyRef](instance: => T)(prevResult: String, f: T => AnyRef = (x: T) => x) {
+    // pattern match to introduce a lot of control flow, i.e., a lot of frame merges
+    overwrite match {
+      case Some(f) =>
+      case None =>
+        checkRoundTrip(instance)(f)
+        assert(f(deserialize(prevResult).asInstanceOf[T]) == f(instance), instance)
+        assert(prevResult == "res", instance)
+    }
+  }
+
+  // @inline def fun[T <: AnyRef](instance: => T) = (x: T) => x
+
+  def testMain(): Unit = {
+    // every call to check creates quite a number of locals, and also quite a number of aliases
+    // of the same value (x1). First of all, the default argument call is expanded as below. Then
+    // method check is inlined, and within the body of check, checkRoundTrip and assert have
+    // already been inlined as well.
+
+    // {
+    //   val x1 = () => ""
+    //   val x2 = fun(x1())  // the compiler optimizes this: instead of passing `() => x1()`, it just passes x1
+    //   check(x1())("", x2) // same here for x1
+    // }
+
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("") // 5
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("") // 10
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("") // 15
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("") // 20
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("") // 25
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("") // 30
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("") // 35
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("") // 40
+    // check("")("")
+    // check("")("")
+    // check("")("")
+    // check("")("")
+    // check("")("") // 45
+    // check("")("")
+    // check("")("")
+    // check("")("")
+    // check("")("")
+    // check("")("") // 50
+    // check("")("")
+    // check("")("")
+    // check("")("")
+    // check("")("")
+    // check("")("") // 55
+
+    // 1000 bytecode instructions, 0 locals
+    // println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10));
+  }
+
+  def timed[T](f: => T): T = {
+    val start = System.nanoTime()
+    val r = f
+    val nanos = System.nanoTime() - start
+    println(s"took ${nanos/1000000}ms")
+    r
+  }
+
+  def main(args: Array[String]): Unit = {
+    import scala.tools.nsc.backend.jvm._
+    val cn = AsmUtils.readClass("/Users/luc/scala/scala/sandbox/Test$.class")
+    import scala.collection.convert.decorateAsScala._
+    val m = cn.methods.iterator.asScala.find(_.name == "testMain").head
+
+    println(s"${m.instructions.size} instructions - ${m.maxLocals} locals")
+
+    val a = new analysis.NullnessAnalyzer
+    a.analyze(cn.name, m) // warm up
+
+    analysis.AliasingFrame.reset()
+    timed(a.analyze(cn.name, m))
+    analysis.AliasingFrame.timers foreach println
+
+    println("---")
+
+    // NOTE: if we don't run nullness analysis above (comment it out), then the BasicValue
+    // analysis runs 3.5x faster. Most likely because the call to Interpreter.merge inside
+    // Frame.merge is no longer megamorphic.
+
+    import scala.tools.asm.tree.analysis._
+    val ba = new Analyzer(new BasicInterpreter)
+    ba.analyze(cn.name, m) // warm up
+
+    timed(ba.analyze(cn.name, m))
+
+    println("---")
+
+    timed(a.analyze(cn.name, m))
+  }
+}
+*/
+package object analysis
author	Lukas Rytz <lukas.rytz@gmail.com>	2015-09-09 17:14:17 +0200
committer	Lukas Rytz <lukas.rytz@gmail.com>	2015-09-17 22:05:05 +0200
commit	72e1d0567ec011da5c519378c8883a492e67a25b (patch)
tree	22c3aa1a447f725333ff8ae3725ad13cf7c53f50
parent	382015824cfb3d1af2eb0dc3065b305e93185577 (diff)
download	scala-72e1d0567ec011da5c519378c8883a492e67a25b.tar.gz scala-72e1d0567ec011da5c519378c8883a492e67a25b.tar.bz2 scala-72e1d0567ec011da5c519378c8883a492e67a25b.zip