summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLukas Rytz <lukas.rytz@gmail.com>2015-09-09 17:14:17 +0200
committerLukas Rytz <lukas.rytz@gmail.com>2015-09-17 22:05:05 +0200
commit72e1d0567ec011da5c519378c8883a492e67a25b (patch)
tree22c3aa1a447f725333ff8ae3725ad13cf7c53f50
parent382015824cfb3d1af2eb0dc3065b305e93185577 (diff)
downloadscala-72e1d0567ec011da5c519378c8883a492e67a25b.tar.gz
scala-72e1d0567ec011da5c519378c8883a492e67a25b.tar.bz2
scala-72e1d0567ec011da5c519378c8883a492e67a25b.zip
Improve the performance of analyses tracking aliases
Plenty of optimizations to make aliasing analysis faster. I cannot say how much faster: an analysis that used to not terminate now takes one second. The lessons are not new: - avoid boxing (of course) - avoid allocating in tight loops - use arrays, avoid hash sets / maps - specialize for common cases (see SmallBitSet in this commit) - avoid megamorphic calls - understand the details of the problem and find those twists that make it run faster More explanations in the code.
-rw-r--r--src/compiler/scala/tools/nsc/backend/jvm/analysis/AliasingFrame.scala492
-rw-r--r--src/compiler/scala/tools/nsc/backend/jvm/analysis/NullnessAnalyzer.scala85
-rw-r--r--src/compiler/scala/tools/nsc/backend/jvm/analysis/ProdConsAnalyzer.scala5
-rw-r--r--src/compiler/scala/tools/nsc/backend/jvm/analysis/package.scala329
4 files changed, 796 insertions, 115 deletions
diff --git a/src/compiler/scala/tools/nsc/backend/jvm/analysis/AliasingFrame.scala b/src/compiler/scala/tools/nsc/backend/jvm/analysis/AliasingFrame.scala
index 6ae6eddfef..9e5fbfcc0e 100644
--- a/src/compiler/scala/tools/nsc/backend/jvm/analysis/AliasingFrame.scala
+++ b/src/compiler/scala/tools/nsc/backend/jvm/analysis/AliasingFrame.scala
@@ -3,17 +3,22 @@ package backend.jvm
package analysis
import scala.annotation.switch
-import scala.collection.{mutable, immutable}
+import scala.collection.mutable
import scala.tools.asm.Opcodes
import scala.tools.asm.tree._
import scala.tools.asm.tree.analysis.{Analyzer, Value, Frame, Interpreter}
import opt.BytecodeUtils._
+import AliasSet.SmallBitSet
-object AliasingFrame {
- private var _idCounter: Long = 0l
- private def nextId = { _idCounter += 1; _idCounter }
-}
-
+/**
+ * A subclass of Frame that tracks aliasing of values stored in local variables and on the stack.
+ *
+ * Note: an analysis tracking aliases is roughly 5x slower than a usual analysis (assuming a simple
+ * value domain with a fast merge function). For example, nullness analysis is roughly 5x slower
+ * than a BasicValue analysis.
+ *
+ * See the doc of package object `analysis` for some notes on the performance of alias analysis.
+ */
class AliasingFrame[V <: Value](nLocals: Int, nStack: Int) extends Frame[V](nLocals, nStack) {
import Opcodes._
@@ -24,50 +29,66 @@ class AliasingFrame[V <: Value](nLocals: Int, nStack: Int) extends Frame[V](nLoc
}
/**
- * For each slot (entry in the `values` array of the frame), an id that uniquely represents
- * the object stored in it. If two values have the same id, they are aliases of the same
- * object.
- */
- private val aliasIds: Array[Long] = Array.fill(nLocals + nStack)(AliasingFrame.nextId)
-
- /**
- * The object alias id of for a value index.
- */
- def aliasId(entry: Int) = aliasIds(entry)
-
- /**
- * Returns the indices of the values array which are aliases of the object `id`.
+ * For every value the set of values that are aliases of it.
+ *
+ * Invariants:
+ * - If `aliases(i) == null` then i has no aliases. This is equivalent to having
+ * `aliases(i) == SingletonSet(i)`.
+ * - If `aliases(i) != null` then `aliases(i) contains i`.
+ * - If `aliases(i) contains j` then `aliases(i) eq aliases(j)`, i.e., they are references to the
+ * same (mutable) AliasSet.
*/
- def valuesWithAliasId(id: Long): Set[Int] = immutable.BitSet.empty ++ aliasIds.indices.iterator.filter(i => aliasId(i) == id)
+ val aliases: Array[AliasSet] = new Array[AliasSet](getLocals + getMaxStackSize)
/**
* The set of aliased values for a given entry in the `values` array.
*/
- def aliasesOf(entry: Int): Set[Int] = valuesWithAliasId(aliasIds(entry))
+ def aliasesOf(entry: Int): AliasSet = {
+ if (aliases(entry) != null) aliases(entry)
+ else {
+ val init = new AliasSet(new AliasSet.SmallBitSet(entry, -1, -1, -1), 1)
+ aliases(entry) = init
+ init
+ }
+ }
/**
- * Define a new alias. For example, given
- * var a = this // this, a have the same aliasId
- * then an assignment
+ * Define a new alias. For example, an assignment
* b = a
- * will set the same the aliasId for `b`.
+ * adds b to the set of aliases of a.
*/
private def newAlias(assignee: Int, source: Int): Unit = {
- aliasIds(assignee) = aliasIds(source)
+ removeAlias(assignee)
+ val sourceAliases = aliasesOf(source)
+ sourceAliases += assignee
+ aliases(assignee) = sourceAliases
}
/**
- * An assignment
+ * Remove an alias. For example, an assignment
* a = someUnknownValue()
- * sets a fresh alias id for `a`.
- * A stack value is also removed from its alias set when being consumed.
+ * removes a from its former alias set.
+ * As another example, stack values are removed from their alias sets when being consumed.
*/
private def removeAlias(assignee: Int): Unit = {
- aliasIds(assignee) = AliasingFrame.nextId
+ if (aliases(assignee) != null) {
+ aliases(assignee) -= assignee
+ aliases(assignee) = null
+ }
+ }
+
+ /**
+ * Define the alias set for a given value.
+ */
+ private def setAliasSet(assignee: Int, set: AliasSet): Unit = {
+ if (aliases(assignee) != null) {
+ aliases(assignee) -= assignee
+ }
+ aliases(assignee) = set
}
override def execute(insn: AbstractInsnNode, interpreter: Interpreter[V]): Unit = {
- // Make the extendsion methods easier to use (otherwise we have to repeat `this`.stackTop)
+ // Make the extension methods easier to use (otherwise we have to repeat `this`.stackTop)
def stackTop: Int = this.stackTop
def peekStack(n: Int): V = this.peekStack(n)
@@ -166,14 +187,34 @@ class AliasingFrame[V <: Value](nLocals: Int, nStack: Int) extends Frame[V](nLoc
}
case SWAP =>
+ // could be written more elegantly with higher-order combinators, but thinking of performance
val top = stackTop
- val idTop = aliasIds(top)
- aliasIds(top) = aliasIds(top - 1)
- aliasIds(top - 1) = idTop
+
+ def moveNextToTop(): Unit = {
+ val nextAliases = aliases(top - 1)
+ aliases(top) = nextAliases
+ nextAliases -= (top - 1)
+ nextAliases += top
+ }
+
+ if (aliases(top) != null) {
+ val topAliases = aliases(top)
+ if (aliases(top - 1) != null) moveNextToTop()
+ else aliases(top) = null
+ // move top to next
+ aliases(top - 1) = topAliases
+ topAliases -= top
+ topAliases += (top - 1)
+ } else {
+ if (aliases(top - 1) != null) {
+ moveNextToTop()
+ aliases(top - 1) = null
+ }
+ }
case opcode =>
if (opcode == ASTORE) {
- // Not a separate case because we need to remove the consumed stack value from alias sets after.
+ // not a separate case: we re-use the code below that removes the consumed stack value from alias sets
val stackTopBefore = stackTop - produced + consumed
val local = insn.asInstanceOf[VarInsnNode].`var`
newAlias(assignee = local, source = stackTopBefore)
@@ -198,9 +239,6 @@ class AliasingFrame[V <: Value](nLocals: Int, nStack: Int) extends Frame[V](nLoc
val firstConsumed = stackTop - produced + 1 // firstConsumed = 3
for (i <- 0 until consumed)
removeAlias(firstConsumed + i) // remove aliases for 3 and 4
-
- // We don't need to set the aliases ids for the produced values: the aliasIds array already
- // contains fresh ids for non-used stack values (ensured by removeAlias).
}
}
@@ -232,30 +270,124 @@ class AliasingFrame[V <: Value](nLocals: Int, nStack: Int) extends Frame[V](nLoc
* x = a
* y = b // (x, a) and (y, b)
* }
- * [...] // (x, a)
+ * [...] // (x, a) -- merge of ((x, y, a)) and ((x, a), (y, b))
*/
override def merge(other: Frame[_ <: V], interpreter: Interpreter[V]): Boolean = {
+ // merge is the main performance hot spot of a data flow analysis.
+
+ // in nullness analysis, super.merge (which actually merges the nullness values) takes 20% of
+ // the overall analysis time.
val valuesChanged = super.merge(other, interpreter)
+
+ // in nullness analysis, merging the alias sets takes ~55% of the analysis time. therefore, this
+ // code has been heavily optimized. most of the time is spent in the `hasNext` method of the
+ // andNotIterator, see its comment.
+
var aliasesChanged = false
val aliasingOther = other.asInstanceOf[AliasingFrame[_]]
- for (i <- aliasIds.indices) {
- val thisAliases = aliasesOf(i)
- val thisNotOther = thisAliases diff (thisAliases intersect aliasingOther.aliasesOf(i))
- if (thisNotOther.nonEmpty) {
- aliasesChanged = true
- thisNotOther foreach removeAlias
+
+ val numValues = getLocals + getStackSize
+ // assume (a, b) are aliases both in this frame, and the other frame. when merging the alias set
+ // for a, we already see that a and b will be aliases in the final result. so we can skip over
+ // merging the alias set for b. in this case, while merging the sets for a, knownOk(b) will be
+ // set to `true`.
+ val knownOk = new Array[Boolean](numValues)
+ var i = 0
+ while (i < numValues) {
+ if (!knownOk(i)) {
+ val thisAliases = this.aliases(i)
+ val otherAliases = aliasingOther.aliases(i)
+ if (thisAliases != null && otherAliases != null) {
+ // The iterator yields elements that are in `thisAliases` but not in `otherAliases`.
+ // As a side-effect, for every index `i` that is in both alias sets, the iterator sets
+ // `knownOk(i) = true`: the alias sets for these values don't need to be merged again.
+ val thisNotOtherIt = AliasSet.andNotIterator(thisAliases, otherAliases, knownOk)
+ if (thisNotOtherIt.hasNext) {
+ aliasesChanged = true
+ val newSet = AliasSet.empty
+ while (thisNotOtherIt.hasNext) {
+ val next = thisNotOtherIt.next()
+ newSet += next
+ setAliasSet(next, newSet)
+ }
+ }
+ }
}
+ i += 1
}
+
valuesChanged || aliasesChanged
}
+ private def min(s: SmallBitSet) = {
+ var r = s.a
+ if ( s.b < r) r = s.b
+ if (s.c != -1 && s.c < r) r = s.c
+ if (s.d != -1 && s.d < r) r = s.d
+ r
+ }
+
override def init(src: Frame[_ <: V]): Frame[V] = {
- super.init(src)
- compat.Platform.arraycopy(src.asInstanceOf[AliasingFrame[_]].aliasIds, 0, aliasIds, 0, aliasIds.length)
+ super.init(src) // very quick (just an arraycopy)
+ System.arraycopy(src.asInstanceOf[AliasingFrame[_]].aliases, 0, aliases, 0, aliases.length) // also quick
+
+ val newSets = mutable.HashMap.empty[AliasSet, AliasSet]
+
+ // the rest of this method (cloning alias sets) is the second performance˙hotspot (next to
+ // AliasingFrame.merge). for nullness, it takes ~20% of the analysis time.
+ // the difficulty here is that we have to clone the alias sets correctly. if two values a, b are
+ // aliases, then aliases(a) eq aliases(b). we need to make sure to use the same clone for the
+ // two values.
+
+ var i = 0
+ while (i < aliases.length) {
+ val set = aliases(i)
+ if (set != null) {
+ // size cannot be 0 - alias sets are always at least singletons.
+ // for sets of size 1-4, don't use the `newSets` map - lookup / update is slow
+ if (set.size == 1) {
+ aliases(i) = null
+ } else if (set.size <= 4) {
+ val small = set.set.asInstanceOf[AliasSet.SmallBitSet]
+ val firstOfSet = i == min(small)
+ if (firstOfSet) {
+ val newSet = set.clone()
+ aliases(small.a) = newSet
+ aliases(small.b) = newSet
+ if (small.c != -1) aliases(small.c) = newSet
+ if (small.d != -1) aliases(small.d) = newSet
+ }
+ } else {
+ // the actual hot spot is the hash map operations here: this is where almost all of the 20%
+ // mentioned above is spent.
+ // i also benchmarked an alternative implementation: keep an array of booleans for indexes
+ // that already contain the cloned set. iterate through all elements of the cloned set and
+ // assign the cloned set. this approach is 50% slower than using a hash map.
+ if (newSets contains set) aliases(i) = newSets(set)
+ else {
+ val newSet = set.clone()
+ newSets(set) = newSet
+ aliases(i) = newSet
+ }
+ }
+ }
+ i += 1
+ }
this
}
}
+object AliasingFrame {
+// val start1 = AliasingFrame.timer1.start()
+// AliasingFrame.timer1.stop(start1)
+ import scala.reflect.internal.util.Statistics._
+ val timer1 = newTimer("t1", "jvm")
+ val timer2 = newTimer("t2", "jvm")
+ val timer3 = newTimer("t3", "jvm")
+ val timers = List(timer1, timer2, timer3)
+ def reset(): Unit = for (t <- timers) { t.nanos = 0; t.timings = 0 }
+}
+
/**
* An analyzer that uses AliasingFrames instead of bare Frames. This can be used when an analysis
* needs to track aliases, but doesn't require a more specific Frame subclass.
@@ -264,3 +396,269 @@ class AliasingAnalyzer[V <: Value](interpreter: Interpreter[V]) extends Analyzer
override def newFrame(nLocals: Int, nStack: Int): AliasingFrame[V] = new AliasingFrame(nLocals, nStack)
override def newFrame(src: Frame[_ <: V]): AliasingFrame[V] = new AliasingFrame(src)
}
+
+/**
+ * An iterator over Int (required to prevent boxing the result of next).
+ */
+abstract class IntIterator extends Iterator[Int] {
+ def hasNext: Boolean
+ def next(): Int
+}
+
+/**
+ * An efficient mutable bit set.
+ *
+ * @param set Either a SmallBitSet or an Array[Long]
+ * @param size The size of the set, useful for performance of certain operations
+ */
+class AliasSet(var set: Object /*SmallBitSet | Array[Long]*/, var size: Int) {
+ import AliasSet._
+
+ override def toString: String = set.toString
+
+ /**
+ * An iterator for the elements of this bit set. Note that only one iterator can be used at a
+ * time. Also make sure not to change the underlying AliasSet during iteration.
+ */
+ def iterator: IntIterator = andNotIterator(this, empty, null)
+
+ def +=(value: Int): Unit = this.set match {
+ case s: SmallBitSet => (size: @switch) match {
+ case 0 => s.a = value; size = 1
+ case 1 => if (value != s.a) { s.b = value; size = 2 }
+ case 2 => if (value != s.a && value != s.b) { s.c = value; size = 3 }
+ case 3 => if (value != s.a && value != s.b && value != s.c) { s.d = value; size = 4 }
+ case 4 =>
+ if (value != s.a && value != s.b && value != s.c && value != s.d) {
+ this.set = bsEmpty
+ this.size = 0
+ bsAdd(this, s.a)
+ bsAdd(this, s.b)
+ bsAdd(this, s.c)
+ bsAdd(this, s.d)
+ bsAdd(this, value)
+ }
+ }
+ case bits: Array[Long] =>
+ bsAdd(this, value)
+ }
+
+ def -=(value: Int): Unit = this.set match {
+ case s: SmallBitSet => (size: @switch) match {
+ case 0 =>
+ case 1 =>
+ if (value == s.a) { s.a = -1; size = 0 }
+ case 2 =>
+ if (value == s.a) { s.a = s.b; s.b = -1; size = 1 }
+ else if (value == s.b) { s.b = -1; size = 1 }
+ case 3 =>
+ if (value == s.a) { s.a = s.b; s.b = s.c; s.c = -1; size = 2 }
+ else if (value == s.b) { s.b = s.c; s.c = -1; size = 2 }
+ else if (value == s.c) { s.c = -1; size = 2 }
+ case 4 =>
+ if (value == s.a) { s.a = s.b; s.b = s.c; s.c = s.d; s.d = -1; size = 3 }
+ else if (value == s.b) { s.b = s.c; s.c = s.d; s.d = -1; size = 3 }
+ else if (value == s.c) { s.c = s.d; s.d = -1; size = 3 }
+ else if (value == s.d) { s.d = -1; size = 3 }
+ }
+ case bits: Array[Long] =>
+ bsRemove(this, value)
+ if (this.size == 4)
+ this.set = bsToSmall(this.set.asInstanceOf[Array[Long]])
+ }
+
+ override def clone(): AliasSet = {
+ val resSet = this.set match {
+ case s: SmallBitSet => new SmallBitSet(s.a, s.b, s.c, s.d)
+ case bits: Array[Long] => bits.clone()
+ }
+ new AliasSet(resSet, this.size)
+ }
+}
+
+object AliasSet {
+ def empty = new AliasSet(new SmallBitSet(-1, -1, -1, -1), 0)
+
+ final class SmallBitSet(var a: Int, var b: Int, var c: Int, var d: Int) {
+ override def toString = s"($a, $b, $c, $d)"
+ }
+
+ def bsEmpty: Array[Long] = new Array[Long](1)
+
+ private def bsEnsureCapacity(set: Array[Long], index: Int): Array[Long] = {
+ if (index < set.length) set
+ else {
+ var newLength = set.length
+ while (index >= newLength) newLength *= 2
+ val newSet = new Array[Long](newLength)
+ Array.copy(set, 0, newSet, 0, set.length)
+ newSet
+ }
+ }
+
+ def bsAdd(set: AliasSet, bit: Int): Unit = {
+ val bits = set.set.asInstanceOf[Array[Long]]
+ val index = bit >> 6
+ val resSet = bsEnsureCapacity(bits, index)
+ val before = resSet(index)
+ val result = before | (1l << bit)
+ if (result != before) {
+ resSet(index) = result
+ set.set = resSet
+ set.size += 1
+ }
+ }
+
+ def bsRemove(set: AliasSet, bit: Int): Unit = {
+ val bits = set.set.asInstanceOf[Array[Long]]
+ val index = bit >> 6
+ if (index < bits.length) {
+ val before = bits(index)
+ val result = before & ~(1l << bit)
+ if (result != before) {
+ bits(index) = result
+ set.size -= 1
+ }
+ }
+ }
+
+ def bsContains(set: Array[Long], bit: Int): Boolean = {
+ val index = bit >> 6
+ bit >= 0 && index < set.length && (set(index) & (1L << bit)) != 0L
+ }
+
+// var sizesHist: Array[Int] = new Array[Int](1000)
+
+ /**
+ * Convert a bit array to a SmallBitSet. Requires the bit array to contain exactly four bits.
+ */
+ def bsToSmall(bits: Array[Long]): SmallBitSet = {
+ var a = -1
+ var b = -1
+ var c = -1
+ var i = 0
+ val end = bits.length * 64
+ while (i < end) {
+ if (bsContains(bits, i)) {
+ if (a == -1) a = i
+ else if (b == -1) b = i
+ else if (c == -1) c = i
+ else return new SmallBitSet(a, b, c, i)
+ }
+ i += 1
+ }
+ null
+ }
+
+ /**
+ * An iterator that yields the elements that are in one bit set and not in another (&~).
+ */
+ private class AndNotIt(setA: AliasSet, setB: AliasSet, thisAndOther: Array[Boolean]) extends IntIterator {
+ // values in the first bit set
+ private var a, b, c, d = -1
+ private var xs: Array[Long] = null
+
+ // values in the second bit set
+ private var notA, notB, notC, notD = -1
+ private var notXs: Array[Long] = null
+
+ // holds the next value of `x`, `y` or `z` that should be returned. assigned in hasNext
+ private var abcdNext = -1
+
+ // counts through elements in the `xs` bit set
+ private var i = 0
+ // true if the current value of `i` should be returned by this iterator
+ private var iValid = false
+
+ setA.set match {
+ case s: SmallBitSet => a = s.a; b = s.b; c = s.c; d = s.d
+ case bits: Array[Long] => xs = bits
+ }
+
+ setB.set match {
+ case s: SmallBitSet => notA = s.a; notB = s.b; notC = s.c; notD = s.d
+ case bits: Array[Long] => notXs = bits
+ }
+
+ // for each value that exists both in this AND (&) the other bit, `thisAndOther` is set to true.
+ // hacky side-effect, used for performance of AliasingFrame.merge.
+ private def setThisAndOther(x: Int) = if (thisAndOther != null) thisAndOther(x) = true
+
+ private def checkABCD(x: Int, num: Int): Boolean = {
+ // assert(x == a && num == 1 || x == b && num == 2 || ...)
+ x != -1 && {
+ val otherHasA = x == notA || x == notB || x == notC || x == notD || (notXs != null && bsContains(notXs, x))
+ if (otherHasA) setThisAndOther(x)
+ else abcdNext = x
+ (num: @switch) match {
+ case 1 => a = -1
+ case 2 => b = -1
+ case 3 => c = -1
+ case 4 => d = -1
+ }
+ !otherHasA
+ }
+ }
+
+ // main performance hot spot
+ private def checkXs = {
+ (xs != null) && {
+ val end = xs.length * 64
+
+ while (i < end && {
+ val index = i >> 6
+ if (xs(index) == 0l) { // boom. for nullness, this saves 35% of the overall analysis time.
+ i = ((index + 1) << 6) - 1 // -1 required because i is incremented in the loop body
+ true
+ } else {
+ val mask = 1l << i
+ // if (mask > xs(index)) we could also advance i to the next value, but that didn't pay off in benchmarks
+ val thisHasI = (xs(index) & mask) != 0l
+ !thisHasI || {
+ val otherHasI = i == notA || i == notB || i == notC || i == notD || (notXs != null && index < notXs.length && (notXs(index) & mask) != 0l)
+ if (otherHasI) setThisAndOther(i)
+ otherHasI
+ }
+ }
+ }) i += 1
+
+ iValid = i < end
+ iValid
+ }
+ }
+
+ // this is the main hot spot of alias analysis. for nullness, 38% of the overall analysis time
+ // is spent here. within hasNext, almost the entire time is spent in `checkXs`.
+ //
+ def hasNext: Boolean = iValid || abcdNext != -1 || checkABCD(a, 1) || checkABCD(b, 2) || checkABCD(c, 3) || checkABCD(d, 4) || checkXs
+
+ def next(): Int = {
+ if (hasNext) {
+ if (abcdNext != -1) {
+ val r = abcdNext; abcdNext = -1; r
+ } else {
+ val r = i; i += 1; iValid = false; r
+ }
+ } else Iterator.empty.next()
+ }
+ }
+
+// The number of bits in a bit array. Useful for debugging.
+// def bsSize(bits: Array[Long]) = {
+// var r = 0
+// var i = 0
+// while (i < bits.length) {
+// r += java.lang.Long.bitCount(bits(i))
+// i += 1
+// }
+// r
+// }
+
+ /**
+ * An iterator returning the elements in a that are not also in b (a &~ b).
+ *
+ * If `thisAndOther` is non-null, the iterator sets thisAndOther(i) to true for every value that
+ * is both in a and b (&).
+ */
+ def andNotIterator(a: AliasSet, b: AliasSet, thisAndOther: Array[Boolean]): IntIterator = new AndNotIt(a, b, thisAndOther)
+}
diff --git a/src/compiler/scala/tools/nsc/backend/jvm/analysis/NullnessAnalyzer.scala b/src/compiler/scala/tools/nsc/backend/jvm/analysis/NullnessAnalyzer.scala
index 31710dcbee..f6d249db7b 100644
--- a/src/compiler/scala/tools/nsc/backend/jvm/analysis/NullnessAnalyzer.scala
+++ b/src/compiler/scala/tools/nsc/backend/jvm/analysis/NullnessAnalyzer.scala
@@ -7,66 +7,12 @@ import java.util
import scala.annotation.switch
import scala.tools.asm.{Type, Opcodes}
import scala.tools.asm.tree.{MethodInsnNode, LdcInsnNode, AbstractInsnNode}
-import scala.tools.asm.tree.analysis.{Frame, Analyzer, Interpreter, Value}
+import scala.tools.asm.tree.analysis._
import scala.tools.nsc.backend.jvm.opt.BytecodeUtils
import BytecodeUtils._
/**
- * Some notes on the ASM ananlyzer framework.
- *
- * Value
- * - Abstract, needs to be implemented for each analysis.
- * - Represents the desired information about local variables and stack values, for example:
- * - Is this value known to be null / not null?
- * - What are the instructions that could potentially have produced this value?
- *
- * Interpreter
- * - Abstract, needs to be implemented for each analysis. Sometimes one can subclass an existing
- * interpreter, e.g., SourceInterpreter or BasicInterpreter.
- * - Multiple abstract methods that receive an instruction and the instruction's input values, and
- * return a value representing the result of that instruction.
- * - Note: due to control flow, the interpreter can be invoked multiple times for the same
- * instruction, until reaching a fixed point.
- * - Abstract `merge` function that computes the least upper bound of two values. Used by
- * Frame.merge (see below).
- *
- * Frame
- * - Can be used directly for many analyses, no subclass required.
- * - Every frame has an array of values: one for each local variable and for each stack slot.
- * - A `top` index stores the index of the current stack top
- * - NOTE: for a size-2 local variable at index i, the local variable at i+1 is set to an empty
- * value. However, for a size-2 value at index i on the stack, the value at i+1 holds the next
- * stack value.
- * - Defines the `execute(instruction)` method.
- * - executing mutates the state of the frame according to the effect of the instruction
- * - pop consumed values from the stack
- * - pass them to the interpreter together with the instruction
- * - if applicable, push the resulting value on the stack
- * - Defines the `merge(otherFrame)` method
- * - called by the analyzer when multiple control flow paths lead to an instruction
- * - the frame at the branching instruction is merged into the current frame of the
- * instruction (held by the analyzer)
- * - mutates the values of the current frame, merges all values using interpreter.merge.
- *
- * Analyzer
- * - Stores a frame for each instruction
- * - `merge` function takes an instruction and a frame, merges the existing frame for that instr
- * (from the frames array) with the new frame passed as argument.
- * if the frame changed, puts the instruction on the work queue (fixpiont).
- * - initial frame: initialized for first instr by calling interpreter.new[...]Value
- * for each slot (locals and params), stored in frames[firstInstr] by calling `merge`
- * - work queue of instructions (`queue` array, `top` index for next instruction to analyze)
- * - analyze(method): simulate control flow. while work queue non-empty:
- * - copy the state of `frames[instr]` into a local frame `current`
- * - call `current.execute(instr, interpreter)`, mutating the `current` frame
- * - if it's a branching instruction
- * - for all potential destination instructions
- * - merge the destination instruction frame with the `current` frame
- * (this enqueues the destination instr if its frame changed)
- * - invoke `newControlFlowEdge` (see below)
- * - the analyzer also tracks active exception handlers at each instruction
- * - the empty method `newControlFlowEdge` can be overridden to track control flow if required
- *
+ * See the package object `analysis` for details on the ASM analysis framework.
*
* Some notes on nullness analysis.
*
@@ -219,8 +165,10 @@ class NullnessFrame(nLocals: Int, nStack: Int) extends AliasingFrame[NullnessVal
override def execute(insn: AbstractInsnNode, interpreter: Interpreter[NullnessValue]): Unit = {
import Opcodes._
- // get the object id of the object that is known to be not-null after this operation
- val nullCheckedAliasId: Long = (insn.getOpcode: @switch) match {
+ // get the alias set the object that is known to be not-null after this operation.
+ // alias sets are mutable / mutated, so after super.execute, this set contains the remaining
+ // aliases of the value that becomes not-null.
+ val nullCheckedAliases: AliasSet = (insn.getOpcode: @switch) match {
case IALOAD |
LALOAD |
FALOAD |
@@ -229,7 +177,7 @@ class NullnessFrame(nLocals: Int, nStack: Int) extends AliasingFrame[NullnessVal
BALOAD |
CALOAD |
SALOAD =>
- aliasId(this.stackTop - 1)
+ aliasesOf(this.stackTop - 1)
case IASTORE |
FASTORE |
@@ -239,35 +187,36 @@ class NullnessFrame(nLocals: Int, nStack: Int) extends AliasingFrame[NullnessVal
SASTORE |
LASTORE |
DASTORE =>
- aliasId(this.stackTop - 2)
+ aliasesOf(this.stackTop - 2)
case GETFIELD =>
- aliasId(this.stackTop)
+ aliasesOf(this.stackTop)
case PUTFIELD =>
- aliasId(this.stackTop - 1)
+ aliasesOf(this.stackTop - 1)
case INVOKEVIRTUAL |
INVOKESPECIAL |
INVOKEINTERFACE =>
val desc = insn.asInstanceOf[MethodInsnNode].desc
val numArgs = Type.getArgumentTypes(desc).length
- aliasId(this.stackTop - numArgs)
+ aliasesOf(this.stackTop - numArgs)
case ARRAYLENGTH |
MONITORENTER |
MONITOREXIT =>
- aliasId(this.stackTop)
+ aliasesOf(this.stackTop)
case _ =>
- -1
+ null
}
super.execute(insn, interpreter)
- if (nullCheckedAliasId != -1) {
- for (i <- valuesWithAliasId(nullCheckedAliasId))
- this.setValue(i, NotNullValue)
+ if (nullCheckedAliases != null) {
+ val it = nullCheckedAliases.iterator
+ while (it.hasNext)
+ this.setValue(it.next(), NotNullValue)
}
}
}
diff --git a/src/compiler/scala/tools/nsc/backend/jvm/analysis/ProdConsAnalyzer.scala b/src/compiler/scala/tools/nsc/backend/jvm/analysis/ProdConsAnalyzer.scala
index 1c24acba03..f2d8dc910a 100644
--- a/src/compiler/scala/tools/nsc/backend/jvm/analysis/ProdConsAnalyzer.scala
+++ b/src/compiler/scala/tools/nsc/backend/jvm/analysis/ProdConsAnalyzer.scala
@@ -55,6 +55,11 @@ import scala.collection.convert.decorateAsScala._
*
* If ever needed, we could introduce a mode where primitive conversions (l2i) are considered as
* copying operations.
+ *
+ * Note on performance: thee data flow analysis (SourceValue / SourceInterpreter, provided by ASM)
+ * is roughly 2-3x slower than a simple analysis (like BasicValue). The reason is that the merge
+ * function (merging producer sets) is more complex than merging simple basic values.
+ * See also the doc comment in the package object `analysis`.
*/
class ProdConsAnalyzer(methodNode: MethodNode, classInternalName: InternalName) {
diff --git a/src/compiler/scala/tools/nsc/backend/jvm/analysis/package.scala b/src/compiler/scala/tools/nsc/backend/jvm/analysis/package.scala
new file mode 100644
index 0000000000..402357c55b
--- /dev/null
+++ b/src/compiler/scala/tools/nsc/backend/jvm/analysis/package.scala
@@ -0,0 +1,329 @@
+package scala.tools.nsc.backend.jvm
+
+/**
+ * Summary on the ASM ananlyzer framework
+ * --------------------------------------
+ *
+ * Value
+ * - Abstract, needs to be implemented for each analysis.
+ * - Represents the desired information about local variables and stack values, for example:
+ * - Is this value known to be null / not null?
+ * - What are the instructions that could potentially have produced this value?
+ *
+ * Interpreter
+ * - Abstract, needs to be implemented for each analysis. Sometimes one can subclass an existing
+ * interpreter, e.g., SourceInterpreter or BasicInterpreter.
+ * - Multiple abstract methods that receive an instruction and the instruction's input values, and
+ * return a value representing the result of that instruction.
+ * - Note: due to control flow, the interpreter can be invoked multiple times for the same
+ * instruction, until reaching a fixed point.
+ * - Abstract `merge` function that computes the least upper bound of two values. Used by
+ * Frame.merge (see below).
+ *
+ * Frame
+ * - Can be used directly for many analyses, no subclass required.
+ * - Every frame has an array of values: one for each local variable and for each stack slot.
+ * - A `top` index stores the index of the current stack top
+ * - NOTE: for a size-2 local variable at index i, the local variable at i+1 is set to an empty
+ * value. However, for a size-2 value at index i on the stack, the value at i+1 holds the next
+ * stack value.
+ * - Defines the `execute(instruction)` method.
+ * - executing mutates the state of the frame according to the effect of the instruction
+ * - pop consumed values from the stack
+ * - pass them to the interpreter together with the instruction
+ * - if applicable, push the resulting value on the stack
+ * - Defines the `merge(otherFrame)` method
+ * - called by the analyzer when multiple control flow paths lead to an instruction
+ * - the frame at the branching instruction is merged into the current frame of the
+ * instruction (held by the analyzer)
+ * - mutates the values of the current frame, merges all values using interpreter.merge.
+ *
+ * Analyzer
+ * - Stores a frame for each instruction
+ * - `merge` function takes an instruction and a frame, merges the existing frame for that instr
+ * (from the frames array) with the new frame passed as argument.
+ * if the frame changed, puts the instruction on the work queue (fixpiont).
+ * - initial frame: initialized for first instr by calling interpreter.new[...]Value
+ * for each slot (locals and params), stored in frames[firstInstr] by calling `merge`
+ * - work queue of instructions (`queue` array, `top` index for next instruction to analyze)
+ * - analyze(method): simulate control flow. while work queue non-empty:
+ * - copy the state of `frames[instr]` into a local frame `current`
+ * - call `current.execute(instr, interpreter)`, mutating the `current` frame
+ * - if it's a branching instruction
+ * - for all potential destination instructions
+ * - merge the destination instruction frame with the `current` frame
+ * (this enqueues the destination instr if its frame changed)
+ * - invoke `newControlFlowEdge` (see below)
+ * - the analyzer also tracks active exception handlers at each instruction
+ * - the empty method `newControlFlowEdge` can be overridden to track control flow if required
+ *
+ *
+ * Lessons learnt while benchmarking the alias tracking analysis
+ * -------------------------------------------------------------
+ *
+ * Profiling
+ * - Use YourKit for finding hotspots (cpu profiling). when it comes to drilling down into the details
+ * of a hotspot, don't pay too much attention to the percentages / time counts.
+ * - Should also try other profilers.
+ * - Use timers. When a method showed up as a hotspot, i added a timer around that method, and a
+ * second one within the method to measure specific parts. The timers slow things down, but the
+ * relative numbers show what parts of a method are slow.
+ *
+ * ASM analyzer insights
+ * - The time for running an analysis depends on the number of locals and the number of instructions.
+ * Reducing the number of locals helps speeding up the analysis: there are less values to
+ * merge when merging to frames.
+ * See also https://github.com/scala/scala-dev/issues/47
+ * - The common hot spot of an ASM analysis is Frame.merge, for example in producers / consumers.
+ * - For nullness analysis the time is spent as follows
+ * - 20% merging nullness values. this is as expected: for example, the same absolute amount of
+ * time is spent in merging BasicValues when running a BasicInterpreter.
+ * - 50% merging alias sets. i tried to optimize what i could out of this.
+ * - 20% is spent creating new frames from existing ones, see comment on AliasingFrame.init.
+ * - The implementation of Frame.merge (the main hot spot) contains a megamorphic callsite to
+ * `interpreter.merge`. This can be observed easily by running a test program that either runs
+ * a BasicValue analysis only, versus a program that first runs a nullness analysis and then
+ * a BasicValue. In an example, the time for the BasicValue analysis goes from 519ms to 1963ms,
+ * a 3.8x slowdown.
+ * - I added counters to the Frame.merge methods for nullness and BasicValue analysis. In the
+ * examples I benchmarked, the number of merge invocations was always exactly the same.
+ * It would probably be possible to come up with an example where alias set merging forces
+ * additional analysis rounds until reaching the fixpoint, but I did not observe such cases.
+ *
+ * To benchmark an analysis, instead of benchmarking analysis while it runs in the compiler
+ * backend, one can easily run it from a separate program (or the repl). The bytecode to analyze
+ * can simply be parsed from a classfile. See example at the end of this comment.
+ *
+ *
+ * Nullness Analysis in Miguel's Optimizer
+ * ---------------------------------------
+ *
+ * Miguel implemented alias tracking for nullness analysis differently [1]. Remember that every
+ * frame has an array of values. Miguel's idea was to represent aliasing using reference equality
+ * in the values array: if two entries in the array point to the same value object, the two entries
+ * are aliases in the frame of the given instruction.
+ *
+ * While this idea seems elegant at first sight, Miguel's implementation does not merge frames
+ * correctly when it comes to aliasing. Assume in frame 1, values (a, b, c) are aliases, while in
+ * frame 2 (a, b) are aliases. When merging the second into the first, we have to make sure that
+ * c is removed as an alias of (a, b).
+ *
+ * It would be possible to implement correct alias set merging in Miguel's approach. However, frame
+ * merging is the main hot spot of analysis. The computational complexity of implementing alias set
+ * merging by traversing the values array and comparing references is too high. The concrete
+ * alias set representation that is used in the current implementation (see class AliasingFrame)
+ * makes alias set merging more efficient.
+ *
+ * [1] https://github.com/scala-opt/scala/blob/opt/rebase/src/compiler/scala/tools/nsc/backend/bcode/NullnessPropagator.java
+ *
+ *
+ * Complexity and scaling of analysis
+ * ----------------------------------
+ *
+ * The time complexity of a data flow analysis depends on:
+ *
+ * - The size of the method. The complexity factor is linear (assuming the number of locals and
+ * branching instructions remains constant). The main analysis loop runs through all
+ * instructions of a method once. Instructions are only re-enqueued if a control flow merge
+ * changes the frame at some instruction.
+ *
+ * - The branching instructions. When a second (third, ..) control flow edge arrives at an
+ * instruction, the existing frame at the instruction is merged with the one computed on the
+ * new branch. If the merge function changes the existing frame, the instruction is enqueued
+ * for another analysis. This results in a merge operation for the successors of the
+ * instruction.
+ *
+ * - The number of local variables. The hot spot of analysis is frame merging. The merge function
+ * iterates through the values in the frame (locals and stack values) and merges them.
+ *
+ * I measured the running time of an analysis for two examples:
+ * - Keep the number of locals and branching instructions constant, increase the number of
+ * instructions. The running time grows linearly with the method size.
+ * - Increase the size and number of locals in a method. The method size and number of locals
+ * grow in the same pace. Here, the running time increase is polynomial. It looks like the
+ * complexity is be #instructions * #locals^2 (see below).
+ *
+ * I measured nullness analysis (which tracks aliases) and a SimpleValue analysis. Nullness runs
+ * roughly 5x slower (because of alias tracking) at every problem size - this factor doesn't change.
+ *
+ * The numbers below are for nullness. Note that the the last column is constant, i.e., the running
+ * time is proportional to #ins * #loc^2. Therefore we use this factor when limiting the maximal
+ * method size for running an analysis.
+ *
+ * #insns #locals time (ms) time / #ins * #loc^2 * 10^6
+ * 1305 156 34 1.07
+ * 2610 311 165 0.65
+ * 3915 466 490 0.57
+ * 5220 621 1200 0.59
+ * 6525 776 2220 0.56
+ * 7830 931 3830 0.56
+ * 9135 1086 6570 0.60
+ * 10440 1241 9700 0.60
+ * 11745 1396 13800 0.60
+ *
+ * As a second experiment, nullness analysis was run with varying #insns but constant #locals.
+ * The last column shows linear complexity with respect to the method size (linearOffset = 2279):
+ *
+ * #insns #locals time (ms) (time + linearOffset) / #insns
+ * 5220 621 1090 0.645
+ * 6224 621 1690 0.637
+ * 7226 621 2280 0.630
+ * 8228 621 2870 0.625
+ * 9230 621 3530 0.629
+ * 10232 621 4130 0.626
+ * 11234 621 4770 0.627
+ * 12236 621 5520 0.637
+ * 13238 621 6170 0.638
+ *
+ *
+ * When running a BasicValue analysis, the complexity observation is the same (time is proportional
+ * to #ins * #loc^2).
+ *
+ *
+ * Measuring analysis execution time
+ * ---------------------------------
+ *
+ * See code below.
+ */
+
+/*
+object Test {
+ val overwrite: Option[String] = null
+
+ @noinline def serialize(o: AnyRef): String = null
+
+ @noinline def deserialize(string: String): AnyRef = null
+
+ @inline def checkRoundTrip[T <: AnyRef](instance: T)(f: T => AnyRef) {
+ val result = serialize(instance)
+ val reconstituted = deserialize(result).asInstanceOf[T]
+ assert(f(instance) == f(reconstituted), (f(instance), f(reconstituted)))
+ }
+
+ @inline def check[T <: AnyRef](instance: => T)(prevResult: String, f: T => AnyRef = (x: T) => x) {
+ // pattern match to introduce a lot of control flow, i.e., a lot of frame merges
+ overwrite match {
+ case Some(f) =>
+ case None =>
+ checkRoundTrip(instance)(f)
+ assert(f(deserialize(prevResult).asInstanceOf[T]) == f(instance), instance)
+ assert(prevResult == "res", instance)
+ }
+ }
+
+ // @inline def fun[T <: AnyRef](instance: => T) = (x: T) => x
+
+ def testMain(): Unit = {
+ // every call to check creates quite a number of locals, and also quite a number of aliases
+ // of the same value (x1). First of all, the default argument call is expanded as below. Then
+ // method check is inlined, and within the body of check, checkRoundTrip and assert have
+ // already been inlined as well.
+
+ // {
+ // val x1 = () => ""
+ // val x2 = fun(x1()) // the compiler optimizes this: instead of passing `() => x1()`, it just passes x1
+ // check(x1())("", x2) // same here for x1
+ // }
+
+ check("")("")
+ check("")("")
+ check("")("")
+ check("")("")
+ check("")("") // 5
+ check("")("")
+ check("")("")
+ check("")("")
+ check("")("")
+ check("")("") // 10
+ check("")("")
+ check("")("")
+ check("")("")
+ check("")("")
+ check("")("") // 15
+ check("")("")
+ check("")("")
+ check("")("")
+ check("")("")
+ check("")("") // 20
+ check("")("")
+ check("")("")
+ check("")("")
+ check("")("")
+ check("")("") // 25
+ check("")("")
+ check("")("")
+ check("")("")
+ check("")("")
+ check("")("") // 30
+ check("")("")
+ check("")("")
+ check("")("")
+ check("")("")
+ check("")("") // 35
+ check("")("")
+ check("")("")
+ check("")("")
+ check("")("")
+ check("")("") // 40
+ // check("")("")
+ // check("")("")
+ // check("")("")
+ // check("")("")
+ // check("")("") // 45
+ // check("")("")
+ // check("")("")
+ // check("")("")
+ // check("")("")
+ // check("")("") // 50
+ // check("")("")
+ // check("")("")
+ // check("")("")
+ // check("")("")
+ // check("")("") // 55
+
+ // 1000 bytecode instructions, 0 locals
+ // println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10));
+ }
+
+ def timed[T](f: => T): T = {
+ val start = System.nanoTime()
+ val r = f
+ val nanos = System.nanoTime() - start
+ println(s"took ${nanos/1000000}ms")
+ r
+ }
+
+ def main(args: Array[String]): Unit = {
+ import scala.tools.nsc.backend.jvm._
+ val cn = AsmUtils.readClass("/Users/luc/scala/scala/sandbox/Test$.class")
+ import scala.collection.convert.decorateAsScala._
+ val m = cn.methods.iterator.asScala.find(_.name == "testMain").head
+
+ println(s"${m.instructions.size} instructions - ${m.maxLocals} locals")
+
+ val a = new analysis.NullnessAnalyzer
+ a.analyze(cn.name, m) // warm up
+
+ analysis.AliasingFrame.reset()
+ timed(a.analyze(cn.name, m))
+ analysis.AliasingFrame.timers foreach println
+
+ println("---")
+
+ // NOTE: if we don't run nullness analysis above (comment it out), then the BasicValue
+ // analysis runs 3.5x faster. Most likely because the call to Interpreter.merge inside
+ // Frame.merge is no longer megamorphic.
+
+ import scala.tools.asm.tree.analysis._
+ val ba = new Analyzer(new BasicInterpreter)
+ ba.analyze(cn.name, m) // warm up
+
+ timed(ba.analyze(cn.name, m))
+
+ println("---")
+
+ timed(a.analyze(cn.name, m))
+ }
+}
+*/
+package object analysis