diff options
author | Lukas Rytz <lukas.rytz@gmail.com> | 2015-09-09 17:14:17 +0200 |
---|---|---|
committer | Lukas Rytz <lukas.rytz@gmail.com> | 2015-09-17 22:05:05 +0200 |
commit | 72e1d0567ec011da5c519378c8883a492e67a25b (patch) | |
tree | 22c3aa1a447f725333ff8ae3725ad13cf7c53f50 | |
parent | 382015824cfb3d1af2eb0dc3065b305e93185577 (diff) | |
download | scala-72e1d0567ec011da5c519378c8883a492e67a25b.tar.gz scala-72e1d0567ec011da5c519378c8883a492e67a25b.tar.bz2 scala-72e1d0567ec011da5c519378c8883a492e67a25b.zip |
Improve the performance of analyses tracking aliases
Plenty of optimizations to make aliasing analysis faster. I cannot
say how much faster: an analysis that used to not terminate now takes
one second.
The lessons are not new:
- avoid boxing (of course)
- avoid allocating in tight loops
- use arrays, avoid hash sets / maps
- specialize for common cases (see SmallBitSet in this commit)
- avoid megamorphic calls
- understand the details of the problem and find those twists that
make it run faster
More explanations in the code.
4 files changed, 796 insertions, 115 deletions
diff --git a/src/compiler/scala/tools/nsc/backend/jvm/analysis/AliasingFrame.scala b/src/compiler/scala/tools/nsc/backend/jvm/analysis/AliasingFrame.scala index 6ae6eddfef..9e5fbfcc0e 100644 --- a/src/compiler/scala/tools/nsc/backend/jvm/analysis/AliasingFrame.scala +++ b/src/compiler/scala/tools/nsc/backend/jvm/analysis/AliasingFrame.scala @@ -3,17 +3,22 @@ package backend.jvm package analysis import scala.annotation.switch -import scala.collection.{mutable, immutable} +import scala.collection.mutable import scala.tools.asm.Opcodes import scala.tools.asm.tree._ import scala.tools.asm.tree.analysis.{Analyzer, Value, Frame, Interpreter} import opt.BytecodeUtils._ +import AliasSet.SmallBitSet -object AliasingFrame { - private var _idCounter: Long = 0l - private def nextId = { _idCounter += 1; _idCounter } -} - +/** + * A subclass of Frame that tracks aliasing of values stored in local variables and on the stack. + * + * Note: an analysis tracking aliases is roughly 5x slower than a usual analysis (assuming a simple + * value domain with a fast merge function). For example, nullness analysis is roughly 5x slower + * than a BasicValue analysis. + * + * See the doc of package object `analysis` for some notes on the performance of alias analysis. + */ class AliasingFrame[V <: Value](nLocals: Int, nStack: Int) extends Frame[V](nLocals, nStack) { import Opcodes._ @@ -24,50 +29,66 @@ class AliasingFrame[V <: Value](nLocals: Int, nStack: Int) extends Frame[V](nLoc } /** - * For each slot (entry in the `values` array of the frame), an id that uniquely represents - * the object stored in it. If two values have the same id, they are aliases of the same - * object. - */ - private val aliasIds: Array[Long] = Array.fill(nLocals + nStack)(AliasingFrame.nextId) - - /** - * The object alias id of for a value index. - */ - def aliasId(entry: Int) = aliasIds(entry) - - /** - * Returns the indices of the values array which are aliases of the object `id`. + * For every value the set of values that are aliases of it. + * + * Invariants: + * - If `aliases(i) == null` then i has no aliases. This is equivalent to having + * `aliases(i) == SingletonSet(i)`. + * - If `aliases(i) != null` then `aliases(i) contains i`. + * - If `aliases(i) contains j` then `aliases(i) eq aliases(j)`, i.e., they are references to the + * same (mutable) AliasSet. */ - def valuesWithAliasId(id: Long): Set[Int] = immutable.BitSet.empty ++ aliasIds.indices.iterator.filter(i => aliasId(i) == id) + val aliases: Array[AliasSet] = new Array[AliasSet](getLocals + getMaxStackSize) /** * The set of aliased values for a given entry in the `values` array. */ - def aliasesOf(entry: Int): Set[Int] = valuesWithAliasId(aliasIds(entry)) + def aliasesOf(entry: Int): AliasSet = { + if (aliases(entry) != null) aliases(entry) + else { + val init = new AliasSet(new AliasSet.SmallBitSet(entry, -1, -1, -1), 1) + aliases(entry) = init + init + } + } /** - * Define a new alias. For example, given - * var a = this // this, a have the same aliasId - * then an assignment + * Define a new alias. For example, an assignment * b = a - * will set the same the aliasId for `b`. + * adds b to the set of aliases of a. */ private def newAlias(assignee: Int, source: Int): Unit = { - aliasIds(assignee) = aliasIds(source) + removeAlias(assignee) + val sourceAliases = aliasesOf(source) + sourceAliases += assignee + aliases(assignee) = sourceAliases } /** - * An assignment + * Remove an alias. For example, an assignment * a = someUnknownValue() - * sets a fresh alias id for `a`. - * A stack value is also removed from its alias set when being consumed. + * removes a from its former alias set. + * As another example, stack values are removed from their alias sets when being consumed. */ private def removeAlias(assignee: Int): Unit = { - aliasIds(assignee) = AliasingFrame.nextId + if (aliases(assignee) != null) { + aliases(assignee) -= assignee + aliases(assignee) = null + } + } + + /** + * Define the alias set for a given value. + */ + private def setAliasSet(assignee: Int, set: AliasSet): Unit = { + if (aliases(assignee) != null) { + aliases(assignee) -= assignee + } + aliases(assignee) = set } override def execute(insn: AbstractInsnNode, interpreter: Interpreter[V]): Unit = { - // Make the extendsion methods easier to use (otherwise we have to repeat `this`.stackTop) + // Make the extension methods easier to use (otherwise we have to repeat `this`.stackTop) def stackTop: Int = this.stackTop def peekStack(n: Int): V = this.peekStack(n) @@ -166,14 +187,34 @@ class AliasingFrame[V <: Value](nLocals: Int, nStack: Int) extends Frame[V](nLoc } case SWAP => + // could be written more elegantly with higher-order combinators, but thinking of performance val top = stackTop - val idTop = aliasIds(top) - aliasIds(top) = aliasIds(top - 1) - aliasIds(top - 1) = idTop + + def moveNextToTop(): Unit = { + val nextAliases = aliases(top - 1) + aliases(top) = nextAliases + nextAliases -= (top - 1) + nextAliases += top + } + + if (aliases(top) != null) { + val topAliases = aliases(top) + if (aliases(top - 1) != null) moveNextToTop() + else aliases(top) = null + // move top to next + aliases(top - 1) = topAliases + topAliases -= top + topAliases += (top - 1) + } else { + if (aliases(top - 1) != null) { + moveNextToTop() + aliases(top - 1) = null + } + } case opcode => if (opcode == ASTORE) { - // Not a separate case because we need to remove the consumed stack value from alias sets after. + // not a separate case: we re-use the code below that removes the consumed stack value from alias sets val stackTopBefore = stackTop - produced + consumed val local = insn.asInstanceOf[VarInsnNode].`var` newAlias(assignee = local, source = stackTopBefore) @@ -198,9 +239,6 @@ class AliasingFrame[V <: Value](nLocals: Int, nStack: Int) extends Frame[V](nLoc val firstConsumed = stackTop - produced + 1 // firstConsumed = 3 for (i <- 0 until consumed) removeAlias(firstConsumed + i) // remove aliases for 3 and 4 - - // We don't need to set the aliases ids for the produced values: the aliasIds array already - // contains fresh ids for non-used stack values (ensured by removeAlias). } } @@ -232,30 +270,124 @@ class AliasingFrame[V <: Value](nLocals: Int, nStack: Int) extends Frame[V](nLoc * x = a * y = b // (x, a) and (y, b) * } - * [...] // (x, a) + * [...] // (x, a) -- merge of ((x, y, a)) and ((x, a), (y, b)) */ override def merge(other: Frame[_ <: V], interpreter: Interpreter[V]): Boolean = { + // merge is the main performance hot spot of a data flow analysis. + + // in nullness analysis, super.merge (which actually merges the nullness values) takes 20% of + // the overall analysis time. val valuesChanged = super.merge(other, interpreter) + + // in nullness analysis, merging the alias sets takes ~55% of the analysis time. therefore, this + // code has been heavily optimized. most of the time is spent in the `hasNext` method of the + // andNotIterator, see its comment. + var aliasesChanged = false val aliasingOther = other.asInstanceOf[AliasingFrame[_]] - for (i <- aliasIds.indices) { - val thisAliases = aliasesOf(i) - val thisNotOther = thisAliases diff (thisAliases intersect aliasingOther.aliasesOf(i)) - if (thisNotOther.nonEmpty) { - aliasesChanged = true - thisNotOther foreach removeAlias + + val numValues = getLocals + getStackSize + // assume (a, b) are aliases both in this frame, and the other frame. when merging the alias set + // for a, we already see that a and b will be aliases in the final result. so we can skip over + // merging the alias set for b. in this case, while merging the sets for a, knownOk(b) will be + // set to `true`. + val knownOk = new Array[Boolean](numValues) + var i = 0 + while (i < numValues) { + if (!knownOk(i)) { + val thisAliases = this.aliases(i) + val otherAliases = aliasingOther.aliases(i) + if (thisAliases != null && otherAliases != null) { + // The iterator yields elements that are in `thisAliases` but not in `otherAliases`. + // As a side-effect, for every index `i` that is in both alias sets, the iterator sets + // `knownOk(i) = true`: the alias sets for these values don't need to be merged again. + val thisNotOtherIt = AliasSet.andNotIterator(thisAliases, otherAliases, knownOk) + if (thisNotOtherIt.hasNext) { + aliasesChanged = true + val newSet = AliasSet.empty + while (thisNotOtherIt.hasNext) { + val next = thisNotOtherIt.next() + newSet += next + setAliasSet(next, newSet) + } + } + } } + i += 1 } + valuesChanged || aliasesChanged } + private def min(s: SmallBitSet) = { + var r = s.a + if ( s.b < r) r = s.b + if (s.c != -1 && s.c < r) r = s.c + if (s.d != -1 && s.d < r) r = s.d + r + } + override def init(src: Frame[_ <: V]): Frame[V] = { - super.init(src) - compat.Platform.arraycopy(src.asInstanceOf[AliasingFrame[_]].aliasIds, 0, aliasIds, 0, aliasIds.length) + super.init(src) // very quick (just an arraycopy) + System.arraycopy(src.asInstanceOf[AliasingFrame[_]].aliases, 0, aliases, 0, aliases.length) // also quick + + val newSets = mutable.HashMap.empty[AliasSet, AliasSet] + + // the rest of this method (cloning alias sets) is the second performance˙hotspot (next to + // AliasingFrame.merge). for nullness, it takes ~20% of the analysis time. + // the difficulty here is that we have to clone the alias sets correctly. if two values a, b are + // aliases, then aliases(a) eq aliases(b). we need to make sure to use the same clone for the + // two values. + + var i = 0 + while (i < aliases.length) { + val set = aliases(i) + if (set != null) { + // size cannot be 0 - alias sets are always at least singletons. + // for sets of size 1-4, don't use the `newSets` map - lookup / update is slow + if (set.size == 1) { + aliases(i) = null + } else if (set.size <= 4) { + val small = set.set.asInstanceOf[AliasSet.SmallBitSet] + val firstOfSet = i == min(small) + if (firstOfSet) { + val newSet = set.clone() + aliases(small.a) = newSet + aliases(small.b) = newSet + if (small.c != -1) aliases(small.c) = newSet + if (small.d != -1) aliases(small.d) = newSet + } + } else { + // the actual hot spot is the hash map operations here: this is where almost all of the 20% + // mentioned above is spent. + // i also benchmarked an alternative implementation: keep an array of booleans for indexes + // that already contain the cloned set. iterate through all elements of the cloned set and + // assign the cloned set. this approach is 50% slower than using a hash map. + if (newSets contains set) aliases(i) = newSets(set) + else { + val newSet = set.clone() + newSets(set) = newSet + aliases(i) = newSet + } + } + } + i += 1 + } this } } +object AliasingFrame { +// val start1 = AliasingFrame.timer1.start() +// AliasingFrame.timer1.stop(start1) + import scala.reflect.internal.util.Statistics._ + val timer1 = newTimer("t1", "jvm") + val timer2 = newTimer("t2", "jvm") + val timer3 = newTimer("t3", "jvm") + val timers = List(timer1, timer2, timer3) + def reset(): Unit = for (t <- timers) { t.nanos = 0; t.timings = 0 } +} + /** * An analyzer that uses AliasingFrames instead of bare Frames. This can be used when an analysis * needs to track aliases, but doesn't require a more specific Frame subclass. @@ -264,3 +396,269 @@ class AliasingAnalyzer[V <: Value](interpreter: Interpreter[V]) extends Analyzer override def newFrame(nLocals: Int, nStack: Int): AliasingFrame[V] = new AliasingFrame(nLocals, nStack) override def newFrame(src: Frame[_ <: V]): AliasingFrame[V] = new AliasingFrame(src) } + +/** + * An iterator over Int (required to prevent boxing the result of next). + */ +abstract class IntIterator extends Iterator[Int] { + def hasNext: Boolean + def next(): Int +} + +/** + * An efficient mutable bit set. + * + * @param set Either a SmallBitSet or an Array[Long] + * @param size The size of the set, useful for performance of certain operations + */ +class AliasSet(var set: Object /*SmallBitSet | Array[Long]*/, var size: Int) { + import AliasSet._ + + override def toString: String = set.toString + + /** + * An iterator for the elements of this bit set. Note that only one iterator can be used at a + * time. Also make sure not to change the underlying AliasSet during iteration. + */ + def iterator: IntIterator = andNotIterator(this, empty, null) + + def +=(value: Int): Unit = this.set match { + case s: SmallBitSet => (size: @switch) match { + case 0 => s.a = value; size = 1 + case 1 => if (value != s.a) { s.b = value; size = 2 } + case 2 => if (value != s.a && value != s.b) { s.c = value; size = 3 } + case 3 => if (value != s.a && value != s.b && value != s.c) { s.d = value; size = 4 } + case 4 => + if (value != s.a && value != s.b && value != s.c && value != s.d) { + this.set = bsEmpty + this.size = 0 + bsAdd(this, s.a) + bsAdd(this, s.b) + bsAdd(this, s.c) + bsAdd(this, s.d) + bsAdd(this, value) + } + } + case bits: Array[Long] => + bsAdd(this, value) + } + + def -=(value: Int): Unit = this.set match { + case s: SmallBitSet => (size: @switch) match { + case 0 => + case 1 => + if (value == s.a) { s.a = -1; size = 0 } + case 2 => + if (value == s.a) { s.a = s.b; s.b = -1; size = 1 } + else if (value == s.b) { s.b = -1; size = 1 } + case 3 => + if (value == s.a) { s.a = s.b; s.b = s.c; s.c = -1; size = 2 } + else if (value == s.b) { s.b = s.c; s.c = -1; size = 2 } + else if (value == s.c) { s.c = -1; size = 2 } + case 4 => + if (value == s.a) { s.a = s.b; s.b = s.c; s.c = s.d; s.d = -1; size = 3 } + else if (value == s.b) { s.b = s.c; s.c = s.d; s.d = -1; size = 3 } + else if (value == s.c) { s.c = s.d; s.d = -1; size = 3 } + else if (value == s.d) { s.d = -1; size = 3 } + } + case bits: Array[Long] => + bsRemove(this, value) + if (this.size == 4) + this.set = bsToSmall(this.set.asInstanceOf[Array[Long]]) + } + + override def clone(): AliasSet = { + val resSet = this.set match { + case s: SmallBitSet => new SmallBitSet(s.a, s.b, s.c, s.d) + case bits: Array[Long] => bits.clone() + } + new AliasSet(resSet, this.size) + } +} + +object AliasSet { + def empty = new AliasSet(new SmallBitSet(-1, -1, -1, -1), 0) + + final class SmallBitSet(var a: Int, var b: Int, var c: Int, var d: Int) { + override def toString = s"($a, $b, $c, $d)" + } + + def bsEmpty: Array[Long] = new Array[Long](1) + + private def bsEnsureCapacity(set: Array[Long], index: Int): Array[Long] = { + if (index < set.length) set + else { + var newLength = set.length + while (index >= newLength) newLength *= 2 + val newSet = new Array[Long](newLength) + Array.copy(set, 0, newSet, 0, set.length) + newSet + } + } + + def bsAdd(set: AliasSet, bit: Int): Unit = { + val bits = set.set.asInstanceOf[Array[Long]] + val index = bit >> 6 + val resSet = bsEnsureCapacity(bits, index) + val before = resSet(index) + val result = before | (1l << bit) + if (result != before) { + resSet(index) = result + set.set = resSet + set.size += 1 + } + } + + def bsRemove(set: AliasSet, bit: Int): Unit = { + val bits = set.set.asInstanceOf[Array[Long]] + val index = bit >> 6 + if (index < bits.length) { + val before = bits(index) + val result = before & ~(1l << bit) + if (result != before) { + bits(index) = result + set.size -= 1 + } + } + } + + def bsContains(set: Array[Long], bit: Int): Boolean = { + val index = bit >> 6 + bit >= 0 && index < set.length && (set(index) & (1L << bit)) != 0L + } + +// var sizesHist: Array[Int] = new Array[Int](1000) + + /** + * Convert a bit array to a SmallBitSet. Requires the bit array to contain exactly four bits. + */ + def bsToSmall(bits: Array[Long]): SmallBitSet = { + var a = -1 + var b = -1 + var c = -1 + var i = 0 + val end = bits.length * 64 + while (i < end) { + if (bsContains(bits, i)) { + if (a == -1) a = i + else if (b == -1) b = i + else if (c == -1) c = i + else return new SmallBitSet(a, b, c, i) + } + i += 1 + } + null + } + + /** + * An iterator that yields the elements that are in one bit set and not in another (&~). + */ + private class AndNotIt(setA: AliasSet, setB: AliasSet, thisAndOther: Array[Boolean]) extends IntIterator { + // values in the first bit set + private var a, b, c, d = -1 + private var xs: Array[Long] = null + + // values in the second bit set + private var notA, notB, notC, notD = -1 + private var notXs: Array[Long] = null + + // holds the next value of `x`, `y` or `z` that should be returned. assigned in hasNext + private var abcdNext = -1 + + // counts through elements in the `xs` bit set + private var i = 0 + // true if the current value of `i` should be returned by this iterator + private var iValid = false + + setA.set match { + case s: SmallBitSet => a = s.a; b = s.b; c = s.c; d = s.d + case bits: Array[Long] => xs = bits + } + + setB.set match { + case s: SmallBitSet => notA = s.a; notB = s.b; notC = s.c; notD = s.d + case bits: Array[Long] => notXs = bits + } + + // for each value that exists both in this AND (&) the other bit, `thisAndOther` is set to true. + // hacky side-effect, used for performance of AliasingFrame.merge. + private def setThisAndOther(x: Int) = if (thisAndOther != null) thisAndOther(x) = true + + private def checkABCD(x: Int, num: Int): Boolean = { + // assert(x == a && num == 1 || x == b && num == 2 || ...) + x != -1 && { + val otherHasA = x == notA || x == notB || x == notC || x == notD || (notXs != null && bsContains(notXs, x)) + if (otherHasA) setThisAndOther(x) + else abcdNext = x + (num: @switch) match { + case 1 => a = -1 + case 2 => b = -1 + case 3 => c = -1 + case 4 => d = -1 + } + !otherHasA + } + } + + // main performance hot spot + private def checkXs = { + (xs != null) && { + val end = xs.length * 64 + + while (i < end && { + val index = i >> 6 + if (xs(index) == 0l) { // boom. for nullness, this saves 35% of the overall analysis time. + i = ((index + 1) << 6) - 1 // -1 required because i is incremented in the loop body + true + } else { + val mask = 1l << i + // if (mask > xs(index)) we could also advance i to the next value, but that didn't pay off in benchmarks + val thisHasI = (xs(index) & mask) != 0l + !thisHasI || { + val otherHasI = i == notA || i == notB || i == notC || i == notD || (notXs != null && index < notXs.length && (notXs(index) & mask) != 0l) + if (otherHasI) setThisAndOther(i) + otherHasI + } + } + }) i += 1 + + iValid = i < end + iValid + } + } + + // this is the main hot spot of alias analysis. for nullness, 38% of the overall analysis time + // is spent here. within hasNext, almost the entire time is spent in `checkXs`. + // + def hasNext: Boolean = iValid || abcdNext != -1 || checkABCD(a, 1) || checkABCD(b, 2) || checkABCD(c, 3) || checkABCD(d, 4) || checkXs + + def next(): Int = { + if (hasNext) { + if (abcdNext != -1) { + val r = abcdNext; abcdNext = -1; r + } else { + val r = i; i += 1; iValid = false; r + } + } else Iterator.empty.next() + } + } + +// The number of bits in a bit array. Useful for debugging. +// def bsSize(bits: Array[Long]) = { +// var r = 0 +// var i = 0 +// while (i < bits.length) { +// r += java.lang.Long.bitCount(bits(i)) +// i += 1 +// } +// r +// } + + /** + * An iterator returning the elements in a that are not also in b (a &~ b). + * + * If `thisAndOther` is non-null, the iterator sets thisAndOther(i) to true for every value that + * is both in a and b (&). + */ + def andNotIterator(a: AliasSet, b: AliasSet, thisAndOther: Array[Boolean]): IntIterator = new AndNotIt(a, b, thisAndOther) +} diff --git a/src/compiler/scala/tools/nsc/backend/jvm/analysis/NullnessAnalyzer.scala b/src/compiler/scala/tools/nsc/backend/jvm/analysis/NullnessAnalyzer.scala index 31710dcbee..f6d249db7b 100644 --- a/src/compiler/scala/tools/nsc/backend/jvm/analysis/NullnessAnalyzer.scala +++ b/src/compiler/scala/tools/nsc/backend/jvm/analysis/NullnessAnalyzer.scala @@ -7,66 +7,12 @@ import java.util import scala.annotation.switch import scala.tools.asm.{Type, Opcodes} import scala.tools.asm.tree.{MethodInsnNode, LdcInsnNode, AbstractInsnNode} -import scala.tools.asm.tree.analysis.{Frame, Analyzer, Interpreter, Value} +import scala.tools.asm.tree.analysis._ import scala.tools.nsc.backend.jvm.opt.BytecodeUtils import BytecodeUtils._ /** - * Some notes on the ASM ananlyzer framework. - * - * Value - * - Abstract, needs to be implemented for each analysis. - * - Represents the desired information about local variables and stack values, for example: - * - Is this value known to be null / not null? - * - What are the instructions that could potentially have produced this value? - * - * Interpreter - * - Abstract, needs to be implemented for each analysis. Sometimes one can subclass an existing - * interpreter, e.g., SourceInterpreter or BasicInterpreter. - * - Multiple abstract methods that receive an instruction and the instruction's input values, and - * return a value representing the result of that instruction. - * - Note: due to control flow, the interpreter can be invoked multiple times for the same - * instruction, until reaching a fixed point. - * - Abstract `merge` function that computes the least upper bound of two values. Used by - * Frame.merge (see below). - * - * Frame - * - Can be used directly for many analyses, no subclass required. - * - Every frame has an array of values: one for each local variable and for each stack slot. - * - A `top` index stores the index of the current stack top - * - NOTE: for a size-2 local variable at index i, the local variable at i+1 is set to an empty - * value. However, for a size-2 value at index i on the stack, the value at i+1 holds the next - * stack value. - * - Defines the `execute(instruction)` method. - * - executing mutates the state of the frame according to the effect of the instruction - * - pop consumed values from the stack - * - pass them to the interpreter together with the instruction - * - if applicable, push the resulting value on the stack - * - Defines the `merge(otherFrame)` method - * - called by the analyzer when multiple control flow paths lead to an instruction - * - the frame at the branching instruction is merged into the current frame of the - * instruction (held by the analyzer) - * - mutates the values of the current frame, merges all values using interpreter.merge. - * - * Analyzer - * - Stores a frame for each instruction - * - `merge` function takes an instruction and a frame, merges the existing frame for that instr - * (from the frames array) with the new frame passed as argument. - * if the frame changed, puts the instruction on the work queue (fixpiont). - * - initial frame: initialized for first instr by calling interpreter.new[...]Value - * for each slot (locals and params), stored in frames[firstInstr] by calling `merge` - * - work queue of instructions (`queue` array, `top` index for next instruction to analyze) - * - analyze(method): simulate control flow. while work queue non-empty: - * - copy the state of `frames[instr]` into a local frame `current` - * - call `current.execute(instr, interpreter)`, mutating the `current` frame - * - if it's a branching instruction - * - for all potential destination instructions - * - merge the destination instruction frame with the `current` frame - * (this enqueues the destination instr if its frame changed) - * - invoke `newControlFlowEdge` (see below) - * - the analyzer also tracks active exception handlers at each instruction - * - the empty method `newControlFlowEdge` can be overridden to track control flow if required - * + * See the package object `analysis` for details on the ASM analysis framework. * * Some notes on nullness analysis. * @@ -219,8 +165,10 @@ class NullnessFrame(nLocals: Int, nStack: Int) extends AliasingFrame[NullnessVal override def execute(insn: AbstractInsnNode, interpreter: Interpreter[NullnessValue]): Unit = { import Opcodes._ - // get the object id of the object that is known to be not-null after this operation - val nullCheckedAliasId: Long = (insn.getOpcode: @switch) match { + // get the alias set the object that is known to be not-null after this operation. + // alias sets are mutable / mutated, so after super.execute, this set contains the remaining + // aliases of the value that becomes not-null. + val nullCheckedAliases: AliasSet = (insn.getOpcode: @switch) match { case IALOAD | LALOAD | FALOAD | @@ -229,7 +177,7 @@ class NullnessFrame(nLocals: Int, nStack: Int) extends AliasingFrame[NullnessVal BALOAD | CALOAD | SALOAD => - aliasId(this.stackTop - 1) + aliasesOf(this.stackTop - 1) case IASTORE | FASTORE | @@ -239,35 +187,36 @@ class NullnessFrame(nLocals: Int, nStack: Int) extends AliasingFrame[NullnessVal SASTORE | LASTORE | DASTORE => - aliasId(this.stackTop - 2) + aliasesOf(this.stackTop - 2) case GETFIELD => - aliasId(this.stackTop) + aliasesOf(this.stackTop) case PUTFIELD => - aliasId(this.stackTop - 1) + aliasesOf(this.stackTop - 1) case INVOKEVIRTUAL | INVOKESPECIAL | INVOKEINTERFACE => val desc = insn.asInstanceOf[MethodInsnNode].desc val numArgs = Type.getArgumentTypes(desc).length - aliasId(this.stackTop - numArgs) + aliasesOf(this.stackTop - numArgs) case ARRAYLENGTH | MONITORENTER | MONITOREXIT => - aliasId(this.stackTop) + aliasesOf(this.stackTop) case _ => - -1 + null } super.execute(insn, interpreter) - if (nullCheckedAliasId != -1) { - for (i <- valuesWithAliasId(nullCheckedAliasId)) - this.setValue(i, NotNullValue) + if (nullCheckedAliases != null) { + val it = nullCheckedAliases.iterator + while (it.hasNext) + this.setValue(it.next(), NotNullValue) } } } diff --git a/src/compiler/scala/tools/nsc/backend/jvm/analysis/ProdConsAnalyzer.scala b/src/compiler/scala/tools/nsc/backend/jvm/analysis/ProdConsAnalyzer.scala index 1c24acba03..f2d8dc910a 100644 --- a/src/compiler/scala/tools/nsc/backend/jvm/analysis/ProdConsAnalyzer.scala +++ b/src/compiler/scala/tools/nsc/backend/jvm/analysis/ProdConsAnalyzer.scala @@ -55,6 +55,11 @@ import scala.collection.convert.decorateAsScala._ * * If ever needed, we could introduce a mode where primitive conversions (l2i) are considered as * copying operations. + * + * Note on performance: thee data flow analysis (SourceValue / SourceInterpreter, provided by ASM) + * is roughly 2-3x slower than a simple analysis (like BasicValue). The reason is that the merge + * function (merging producer sets) is more complex than merging simple basic values. + * See also the doc comment in the package object `analysis`. */ class ProdConsAnalyzer(methodNode: MethodNode, classInternalName: InternalName) { diff --git a/src/compiler/scala/tools/nsc/backend/jvm/analysis/package.scala b/src/compiler/scala/tools/nsc/backend/jvm/analysis/package.scala new file mode 100644 index 0000000000..402357c55b --- /dev/null +++ b/src/compiler/scala/tools/nsc/backend/jvm/analysis/package.scala @@ -0,0 +1,329 @@ +package scala.tools.nsc.backend.jvm + +/** + * Summary on the ASM ananlyzer framework + * -------------------------------------- + * + * Value + * - Abstract, needs to be implemented for each analysis. + * - Represents the desired information about local variables and stack values, for example: + * - Is this value known to be null / not null? + * - What are the instructions that could potentially have produced this value? + * + * Interpreter + * - Abstract, needs to be implemented for each analysis. Sometimes one can subclass an existing + * interpreter, e.g., SourceInterpreter or BasicInterpreter. + * - Multiple abstract methods that receive an instruction and the instruction's input values, and + * return a value representing the result of that instruction. + * - Note: due to control flow, the interpreter can be invoked multiple times for the same + * instruction, until reaching a fixed point. + * - Abstract `merge` function that computes the least upper bound of two values. Used by + * Frame.merge (see below). + * + * Frame + * - Can be used directly for many analyses, no subclass required. + * - Every frame has an array of values: one for each local variable and for each stack slot. + * - A `top` index stores the index of the current stack top + * - NOTE: for a size-2 local variable at index i, the local variable at i+1 is set to an empty + * value. However, for a size-2 value at index i on the stack, the value at i+1 holds the next + * stack value. + * - Defines the `execute(instruction)` method. + * - executing mutates the state of the frame according to the effect of the instruction + * - pop consumed values from the stack + * - pass them to the interpreter together with the instruction + * - if applicable, push the resulting value on the stack + * - Defines the `merge(otherFrame)` method + * - called by the analyzer when multiple control flow paths lead to an instruction + * - the frame at the branching instruction is merged into the current frame of the + * instruction (held by the analyzer) + * - mutates the values of the current frame, merges all values using interpreter.merge. + * + * Analyzer + * - Stores a frame for each instruction + * - `merge` function takes an instruction and a frame, merges the existing frame for that instr + * (from the frames array) with the new frame passed as argument. + * if the frame changed, puts the instruction on the work queue (fixpiont). + * - initial frame: initialized for first instr by calling interpreter.new[...]Value + * for each slot (locals and params), stored in frames[firstInstr] by calling `merge` + * - work queue of instructions (`queue` array, `top` index for next instruction to analyze) + * - analyze(method): simulate control flow. while work queue non-empty: + * - copy the state of `frames[instr]` into a local frame `current` + * - call `current.execute(instr, interpreter)`, mutating the `current` frame + * - if it's a branching instruction + * - for all potential destination instructions + * - merge the destination instruction frame with the `current` frame + * (this enqueues the destination instr if its frame changed) + * - invoke `newControlFlowEdge` (see below) + * - the analyzer also tracks active exception handlers at each instruction + * - the empty method `newControlFlowEdge` can be overridden to track control flow if required + * + * + * Lessons learnt while benchmarking the alias tracking analysis + * ------------------------------------------------------------- + * + * Profiling + * - Use YourKit for finding hotspots (cpu profiling). when it comes to drilling down into the details + * of a hotspot, don't pay too much attention to the percentages / time counts. + * - Should also try other profilers. + * - Use timers. When a method showed up as a hotspot, i added a timer around that method, and a + * second one within the method to measure specific parts. The timers slow things down, but the + * relative numbers show what parts of a method are slow. + * + * ASM analyzer insights + * - The time for running an analysis depends on the number of locals and the number of instructions. + * Reducing the number of locals helps speeding up the analysis: there are less values to + * merge when merging to frames. + * See also https://github.com/scala/scala-dev/issues/47 + * - The common hot spot of an ASM analysis is Frame.merge, for example in producers / consumers. + * - For nullness analysis the time is spent as follows + * - 20% merging nullness values. this is as expected: for example, the same absolute amount of + * time is spent in merging BasicValues when running a BasicInterpreter. + * - 50% merging alias sets. i tried to optimize what i could out of this. + * - 20% is spent creating new frames from existing ones, see comment on AliasingFrame.init. + * - The implementation of Frame.merge (the main hot spot) contains a megamorphic callsite to + * `interpreter.merge`. This can be observed easily by running a test program that either runs + * a BasicValue analysis only, versus a program that first runs a nullness analysis and then + * a BasicValue. In an example, the time for the BasicValue analysis goes from 519ms to 1963ms, + * a 3.8x slowdown. + * - I added counters to the Frame.merge methods for nullness and BasicValue analysis. In the + * examples I benchmarked, the number of merge invocations was always exactly the same. + * It would probably be possible to come up with an example where alias set merging forces + * additional analysis rounds until reaching the fixpoint, but I did not observe such cases. + * + * To benchmark an analysis, instead of benchmarking analysis while it runs in the compiler + * backend, one can easily run it from a separate program (or the repl). The bytecode to analyze + * can simply be parsed from a classfile. See example at the end of this comment. + * + * + * Nullness Analysis in Miguel's Optimizer + * --------------------------------------- + * + * Miguel implemented alias tracking for nullness analysis differently [1]. Remember that every + * frame has an array of values. Miguel's idea was to represent aliasing using reference equality + * in the values array: if two entries in the array point to the same value object, the two entries + * are aliases in the frame of the given instruction. + * + * While this idea seems elegant at first sight, Miguel's implementation does not merge frames + * correctly when it comes to aliasing. Assume in frame 1, values (a, b, c) are aliases, while in + * frame 2 (a, b) are aliases. When merging the second into the first, we have to make sure that + * c is removed as an alias of (a, b). + * + * It would be possible to implement correct alias set merging in Miguel's approach. However, frame + * merging is the main hot spot of analysis. The computational complexity of implementing alias set + * merging by traversing the values array and comparing references is too high. The concrete + * alias set representation that is used in the current implementation (see class AliasingFrame) + * makes alias set merging more efficient. + * + * [1] https://github.com/scala-opt/scala/blob/opt/rebase/src/compiler/scala/tools/nsc/backend/bcode/NullnessPropagator.java + * + * + * Complexity and scaling of analysis + * ---------------------------------- + * + * The time complexity of a data flow analysis depends on: + * + * - The size of the method. The complexity factor is linear (assuming the number of locals and + * branching instructions remains constant). The main analysis loop runs through all + * instructions of a method once. Instructions are only re-enqueued if a control flow merge + * changes the frame at some instruction. + * + * - The branching instructions. When a second (third, ..) control flow edge arrives at an + * instruction, the existing frame at the instruction is merged with the one computed on the + * new branch. If the merge function changes the existing frame, the instruction is enqueued + * for another analysis. This results in a merge operation for the successors of the + * instruction. + * + * - The number of local variables. The hot spot of analysis is frame merging. The merge function + * iterates through the values in the frame (locals and stack values) and merges them. + * + * I measured the running time of an analysis for two examples: + * - Keep the number of locals and branching instructions constant, increase the number of + * instructions. The running time grows linearly with the method size. + * - Increase the size and number of locals in a method. The method size and number of locals + * grow in the same pace. Here, the running time increase is polynomial. It looks like the + * complexity is be #instructions * #locals^2 (see below). + * + * I measured nullness analysis (which tracks aliases) and a SimpleValue analysis. Nullness runs + * roughly 5x slower (because of alias tracking) at every problem size - this factor doesn't change. + * + * The numbers below are for nullness. Note that the the last column is constant, i.e., the running + * time is proportional to #ins * #loc^2. Therefore we use this factor when limiting the maximal + * method size for running an analysis. + * + * #insns #locals time (ms) time / #ins * #loc^2 * 10^6 + * 1305 156 34 1.07 + * 2610 311 165 0.65 + * 3915 466 490 0.57 + * 5220 621 1200 0.59 + * 6525 776 2220 0.56 + * 7830 931 3830 0.56 + * 9135 1086 6570 0.60 + * 10440 1241 9700 0.60 + * 11745 1396 13800 0.60 + * + * As a second experiment, nullness analysis was run with varying #insns but constant #locals. + * The last column shows linear complexity with respect to the method size (linearOffset = 2279): + * + * #insns #locals time (ms) (time + linearOffset) / #insns + * 5220 621 1090 0.645 + * 6224 621 1690 0.637 + * 7226 621 2280 0.630 + * 8228 621 2870 0.625 + * 9230 621 3530 0.629 + * 10232 621 4130 0.626 + * 11234 621 4770 0.627 + * 12236 621 5520 0.637 + * 13238 621 6170 0.638 + * + * + * When running a BasicValue analysis, the complexity observation is the same (time is proportional + * to #ins * #loc^2). + * + * + * Measuring analysis execution time + * --------------------------------- + * + * See code below. + */ + +/* +object Test { + val overwrite: Option[String] = null + + @noinline def serialize(o: AnyRef): String = null + + @noinline def deserialize(string: String): AnyRef = null + + @inline def checkRoundTrip[T <: AnyRef](instance: T)(f: T => AnyRef) { + val result = serialize(instance) + val reconstituted = deserialize(result).asInstanceOf[T] + assert(f(instance) == f(reconstituted), (f(instance), f(reconstituted))) + } + + @inline def check[T <: AnyRef](instance: => T)(prevResult: String, f: T => AnyRef = (x: T) => x) { + // pattern match to introduce a lot of control flow, i.e., a lot of frame merges + overwrite match { + case Some(f) => + case None => + checkRoundTrip(instance)(f) + assert(f(deserialize(prevResult).asInstanceOf[T]) == f(instance), instance) + assert(prevResult == "res", instance) + } + } + + // @inline def fun[T <: AnyRef](instance: => T) = (x: T) => x + + def testMain(): Unit = { + // every call to check creates quite a number of locals, and also quite a number of aliases + // of the same value (x1). First of all, the default argument call is expanded as below. Then + // method check is inlined, and within the body of check, checkRoundTrip and assert have + // already been inlined as well. + + // { + // val x1 = () => "" + // val x2 = fun(x1()) // the compiler optimizes this: instead of passing `() => x1()`, it just passes x1 + // check(x1())("", x2) // same here for x1 + // } + + check("")("") + check("")("") + check("")("") + check("")("") + check("")("") // 5 + check("")("") + check("")("") + check("")("") + check("")("") + check("")("") // 10 + check("")("") + check("")("") + check("")("") + check("")("") + check("")("") // 15 + check("")("") + check("")("") + check("")("") + check("")("") + check("")("") // 20 + check("")("") + check("")("") + check("")("") + check("")("") + check("")("") // 25 + check("")("") + check("")("") + check("")("") + check("")("") + check("")("") // 30 + check("")("") + check("")("") + check("")("") + check("")("") + check("")("") // 35 + check("")("") + check("")("") + check("")("") + check("")("") + check("")("") // 40 + // check("")("") + // check("")("") + // check("")("") + // check("")("") + // check("")("") // 45 + // check("")("") + // check("")("") + // check("")("") + // check("")("") + // check("")("") // 50 + // check("")("") + // check("")("") + // check("")("") + // check("")("") + // check("")("") // 55 + + // 1000 bytecode instructions, 0 locals + // println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); + } + + def timed[T](f: => T): T = { + val start = System.nanoTime() + val r = f + val nanos = System.nanoTime() - start + println(s"took ${nanos/1000000}ms") + r + } + + def main(args: Array[String]): Unit = { + import scala.tools.nsc.backend.jvm._ + val cn = AsmUtils.readClass("/Users/luc/scala/scala/sandbox/Test$.class") + import scala.collection.convert.decorateAsScala._ + val m = cn.methods.iterator.asScala.find(_.name == "testMain").head + + println(s"${m.instructions.size} instructions - ${m.maxLocals} locals") + + val a = new analysis.NullnessAnalyzer + a.analyze(cn.name, m) // warm up + + analysis.AliasingFrame.reset() + timed(a.analyze(cn.name, m)) + analysis.AliasingFrame.timers foreach println + + println("---") + + // NOTE: if we don't run nullness analysis above (comment it out), then the BasicValue + // analysis runs 3.5x faster. Most likely because the call to Interpreter.merge inside + // Frame.merge is no longer megamorphic. + + import scala.tools.asm.tree.analysis._ + val ba = new Analyzer(new BasicInterpreter) + ba.analyze(cn.name, m) // warm up + + timed(ba.analyze(cn.name, m)) + + println("---") + + timed(a.analyze(cn.name, m)) + } +} +*/ +package object analysis |