1 files changed, 374 insertions, 0 deletions
diff --git a/src/compiler/scala/tools/nsc/backend/jvm/analysis/package.scala b/src/compiler/scala/tools/nsc/backend/jvm/analysis/package.scala
new file mode 100644
index 0000000000..f1ac703532
--- /dev/null
+++ b/src/compiler/scala/tools/nsc/backend/jvm/analysis/package.scala
@@ -0,0 +1,374 @@
+package scala.tools.nsc.backend.jvm
+
+/**
+ * Summary on the ASM analyzer framework
+ * --------------------------------------
+ *
+ * Value
+ *  - Abstract, needs to be implemented for each analysis.
+ *  - Represents the desired information about local variables and stack values, for example:
+ *    - Is this value known to be null / not null?
+ *    - What are the instructions that could potentially have produced this value?
+ *
+ * Interpreter
+ *  - Abstract, needs to be implemented for each analysis. Sometimes one can subclass an existing
+ *    interpreter, e.g., SourceInterpreter or BasicInterpreter.
+ *  - Multiple abstract methods that receive an instruction and the instruction's input values, and
+ *    return a value representing the result of that instruction.
+ *    - Note: due to control flow, the interpreter can be invoked multiple times for the same
+ *      instruction, until reaching a fixed point.
+ *  - Abstract `merge` function that computes the least upper bound of two values. Used by
+ *    Frame.merge (see below).
+ *
+ * Frame
+ *  - Can be used directly for many analyses, no subclass required.
+ *  - Every frame has an array of values: one for each local variable and for each stack slot.
+ *    - A `top` index stores the index of the current stack top
+ *    - NOTE: for a size-2 local variable at index i, the local variable at i+1 is set to an empty
+ *      value. However, for a size-2 value at index i on the stack, the value at i+1 holds the next
+ *      stack value. IMPORTANT: this is only the case in ASM's analysis framework, not in bytecode.
+ *      See comment below.
+ *  - Defines the `execute(instruction)` method.
+ *    - executing mutates the state of the frame according to the effect of the instruction
+ *      - pop consumed values from the stack
+ *      - pass them to the interpreter together with the instruction
+ *      - if applicable, push the resulting value on the stack
+ *  - Defines the `merge(otherFrame)` method
+ *    - called by the analyzer when multiple control flow paths lead to an instruction
+ *      - the frame at the branching instruction is merged into the current frame of the
+ *        instruction (held by the analyzer)
+ *      - mutates the values of the current frame, merges all values using interpreter.merge.
+ *
+ * Analyzer
+ *   - Stores a frame for each instruction
+ *   - `merge` function takes an instruction and a frame, merges the existing frame for that instr
+ *     (from the frames array) with the new frame passed as argument.
+ *     if the frame changed, puts the instruction on the work queue (fixpiont).
+ *   - initial frame: initialized for first instr by calling interpreter.new[...]Value
+ *     for each slot (locals and params), stored in frames[firstInstr] by calling `merge`
+ *   - work queue of instructions (`queue` array, `top` index for next instruction to analyze)
+ *   - analyze(method): simulate control flow. while work queue non-empty:
+ *     - copy the state of `frames[instr]` into a local frame `current`
+ *     - call `current.execute(instr, interpreter)`, mutating the `current` frame
+ *     - if it's a branching instruction
+ *       - for all potential destination instructions
+ *         - merge the destination instruction frame with the `current` frame
+ *           (this enqueues the destination instr if its frame changed)
+ *       - invoke `newControlFlowEdge` (see below)
+ *   - the analyzer also tracks active exception handlers at each instruction
+ *   - the empty method `newControlFlowEdge` can be overridden to track control flow if required
+ *
+ *
+ * MaxLocals and MaxStack
+ * ----------------------
+ *
+ * At the JVM level, long and double values occupy two slots, both as local variables and on the
+ * stack, as specified in the JVM spec 2.6.2:
+ *   "At any point in time, an operand stack has an associated depth, where a value of type long or
+ *    double contributes two units to the depth and a value of any other type contributes one unit."
+ *
+ * For example, a method
+ *   class A { def f(a: Long, b: Long) = a + b }
+ * has MAXSTACK=4 in the classfile. This value is computed by the ClassWriter / MethodWriter when
+ * generating the classfile (we always pass COMPUTE_MAXS to the ClassWriter).
+ *
+ * For running an ASM Analyzer, long and double values occupy two local variable slots, but only
+ * a single slot on the call stack, as shown by the following snippet:
+ *
+ *   import scala.tools.nsc.backend.jvm._
+ *   import scala.tools.nsc.backend.jvm.opt.BytecodeUtils._
+ *   import scala.collection.convert.decorateAsScala._
+ *   import scala.tools.asm.tree.analysis._
+ *
+ *   val cn = AsmUtils.readClass("/Users/luc/scala/scala/sandbox/A.class")
+ *   val m = cn.methods.iterator.asScala.find(_.name == "f").head
+ *
+ *   // the value is read from the classfile, so it's 4
+ *   println(s"maxLocals: ${m.maxLocals}, maxStack: ${m.maxStack}") // maxLocals: 5, maxStack: 4
+ *
+ *   // we can safely set it to 2 for running the analyzer.
+ *   m.maxStack = 2
+ *
+ *   val a = new Analyzer(new BasicInterpreter)
+ *   a.analyze(cn.name, m)
+ *   val addInsn = m.instructions.iterator.asScala.find(_.getOpcode == 97).get // LADD Opcode
+ *   val addFrame = a.frameAt(addInsn, m)
+ *
+ *   addFrame.getStackSize // 2: the two long values only take one slot each
+ *   addFrame.getLocals    // 5: this takes one slot, the two long parameters take 2 slots each
+ *
+ *
+ * While running the optimizer, we need to make sure that the `maxStack` value of a method is
+ * large enough for running an ASM analyzer. We don't need to worry if the value is incorrect in
+ * the JVM perspective: the value will be re-computed and overwritten in the ClassWriter.
+ *
+ *
+ * Lessons learnt while benchmarking the alias tracking analysis
+ * -------------------------------------------------------------
+ *
+ * Profiling
+ *  - Use YourKit for finding hotspots (cpu profiling). when it comes to drilling down into the details
+ *    of a hotspot, don't pay too much attention to the percentages / time counts.
+ *  - Should also try other profilers.
+ *  - Use timers. When a method showed up as a hotspot, i added a timer around that method, and a
+ *    second one within the method to measure specific parts. The timers slow things down, but the
+ *    relative numbers show what parts of a method are slow.
+ *
+ * ASM analyzer insights
+ *  - The time for running an analysis depends on the number of locals and the number of instructions.
+ *    Reducing the number of locals helps speeding up the analysis: there are less values to
+ *    merge when merging to frames.
+ *    See also https://github.com/scala/scala-dev/issues/47
+ *  - The common hot spot of an ASM analysis is Frame.merge, for example in producers / consumers.
+ *  - For nullness analysis the time is spent as follows
+ *    - 20% merging nullness values. this is as expected: for example, the same absolute amount of
+ *      time is spent in merging BasicValues when running a BasicInterpreter.
+ *    - 50% merging alias sets. i tried to optimize what i could out of this.
+ *    - 20% is spent creating new frames from existing ones, see comment on AliasingFrame.init.
+ *  - The implementation of Frame.merge (the main hot spot) contains a megamorphic callsite to
+ *    `interpreter.merge`. This can be observed easily by running a test program that either runs
+ *    a BasicValue analysis only, versus a program that first runs a nullness analysis and then
+ *    a BasicValue. In an example, the time for the BasicValue analysis goes from 519ms to 1963ms,
+ *    a 3.8x slowdown.
+ *  - I added counters to the Frame.merge methods for nullness and BasicValue analysis. In the
+ *    examples I benchmarked, the number of merge invocations was always exactly the same.
+ *    It would probably be possible to come up with an example where alias set merging forces
+ *    additional analysis rounds until reaching the fixpoint, but I did not observe such cases.
+ *
+ * To benchmark an analysis, instead of benchmarking analysis while it runs in the compiler
+ * backend, one can easily run it from a separate program (or the repl). The bytecode to analyze
+ * can simply be parsed from a classfile. See example at the end of this comment.
+ *
+ *
+ * Nullness Analysis in Miguel's Optimizer
+ * ---------------------------------------
+ *
+ * Miguel implemented alias tracking for nullness analysis differently [1]. Remember that every
+ * frame has an array of values. Miguel's idea was to represent aliasing using reference equality
+ * in the values array: if two entries in the array point to the same value object, the two entries
+ * are aliases in the frame of the given instruction.
+ *
+ * While this idea seems elegant at first sight, Miguel's implementation does not merge frames
+ * correctly when it comes to aliasing. Assume in frame 1, values (a, b, c) are aliases, while in
+ * frame 2 (a, b) are aliases. When merging the second into the first, we have to make sure that
+ * c is removed as an alias of (a, b).
+ *
+ * It would be possible to implement correct alias set merging in Miguel's approach. However, frame
+ * merging is the main hot spot of analysis. The computational complexity of implementing alias set
+ * merging by traversing the values array and comparing references is too high. The concrete
+ * alias set representation that is used in the current implementation (see class AliasingFrame)
+ * makes alias set merging more efficient.
+ *
+ * [1] https://github.com/scala-opt/scala/blob/opt/rebase/src/compiler/scala/tools/nsc/backend/bcode/NullnessPropagator.java
+ *
+ *
+ * Complexity and scaling of analysis
+ * ----------------------------------
+ *
+ * The time complexity of a data flow analysis depends on:
+ *
+ *   - The size of the method. The complexity factor is linear (assuming the number of locals and
+ *     branching instructions remains constant). The main analysis loop runs through all
+ *     instructions of a method once. Instructions are only re-enqueued if a control flow merge
+ *     changes the frame at some instruction.
+ *
+ *   - The branching instructions. When a second (third, ..) control flow edge arrives at an
+ *     instruction, the existing frame at the instruction is merged with the one computed on the
+ *     new branch. If the merge function changes the existing frame, the instruction is enqueued
+ *     for another analysis. This results in a merge operation for the successors of the
+ *     instruction.
+ *
+ *   - The number of local variables. The hot spot of analysis is frame merging. The merge function
+ *     iterates through the values in the frame (locals and stack values) and merges them.
+ *
+ * I measured the running time of an analysis for two examples:
+ *   - Keep the number of locals and branching instructions constant, increase the number of
+ *     instructions. The running time grows linearly with the method size.
+ *   - Increase the size and number of locals in a method. The method size and number of locals
+ *     grow in the same pace. Here, the running time increase is polynomial. It looks like the
+ *     complexity is be #instructions * #locals^2 (see below).
+ *
+ * I measured nullness analysis (which tracks aliases) and a SimpleValue analysis. Nullness runs
+ * roughly 5x slower (because of alias tracking) at every problem size - this factor doesn't change.
+ *
+ * The numbers below are for nullness. Note that the the last column is constant, i.e., the running
+ * time is proportional to #ins * #loc^2. Therefore we use this factor when limiting the maximal
+ * method size for running an analysis.
+ *
+ *   #insns    #locals    time (ms)       time / #ins * #loc^2 * 10^6
+ *   1305      156        34              1.07
+ *   2610      311        165             0.65
+ *   3915      466        490             0.57
+ *   5220      621        1200            0.59
+ *   6525      776        2220            0.56
+ *   7830      931        3830            0.56
+ *   9135      1086       6570            0.60
+ *   10440     1241       9700            0.60
+ *   11745     1396       13800           0.60
+ *
+ * As a second experiment, nullness analysis was run with varying #insns but constant #locals.
+ * The last column shows linear complexity with respect to the method size (linearOffset = 2279):
+ *
+ *   #insns     #locals     time (ms)    (time + linearOffset) / #insns
+ *   5220       621         1090         0.645
+ *   6224       621         1690         0.637
+ *   7226       621         2280         0.630
+ *   8228       621         2870         0.625
+ *   9230       621         3530         0.629
+ *   10232      621         4130         0.626
+ *   11234      621         4770         0.627
+ *   12236      621         5520         0.637
+ *   13238      621         6170         0.638
+ *
+ *
+ * When running a BasicValue analysis, the complexity observation is the same (time is proportional
+ * to #ins * #loc^2).
+ *
+ *
+ * Measuring analysis execution time
+ * ---------------------------------
+ *
+ * See code below.
+ */
+
+/*
+object Test {
+  val overwrite: Option[String] = null
+
+  @noinline def serialize(o: AnyRef): String = null
+
+  @noinline def deserialize(string: String): AnyRef = null
+
+  @inline def checkRoundTrip[T <: AnyRef](instance: T)(f: T => AnyRef) {
+    val result = serialize(instance)
+    val reconstituted = deserialize(result).asInstanceOf[T]
+    assert(f(instance) == f(reconstituted), (f(instance), f(reconstituted)))
+  }
+
+  @inline def check[T <: AnyRef](instance: => T)(prevResult: String, f: T => AnyRef = (x: T) => x) {
+    // pattern match to introduce a lot of control flow, i.e., a lot of frame merges
+    overwrite match {
+      case Some(f) =>
+      case None =>
+        checkRoundTrip(instance)(f)
+        assert(f(deserialize(prevResult).asInstanceOf[T]) == f(instance), instance)
+        assert(prevResult == "res", instance)
+    }
+  }
+
+  // @inline def fun[T <: AnyRef](instance: => T) = (x: T) => x
+
+  def testMain(): Unit = {
+    // every call to check creates quite a number of locals, and also quite a number of aliases
+    // of the same value (x1). First of all, the default argument call is expanded as below. Then
+    // method check is inlined, and within the body of check, checkRoundTrip and assert have
+    // already been inlined as well.
+
+    // {
+    //   val x1 = () => ""
+    //   val x2 = fun(x1())  // the compiler optimizes this: instead of passing `() => x1()`, it just passes x1
+    //   check(x1())("", x2) // same here for x1
+    // }
+
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("") // 5
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("") // 10
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("") // 15
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("") // 20
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("") // 25
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("") // 30
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("") // 35
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("")
+    check("")("") // 40
+    // check("")("")
+    // check("")("")
+    // check("")("")
+    // check("")("")
+    // check("")("") // 45
+    // check("")("")
+    // check("")("")
+    // check("")("")
+    // check("")("")
+    // check("")("") // 50
+    // check("")("")
+    // check("")("")
+    // check("")("")
+    // check("")("")
+    // check("")("") // 55
+
+    // 1000 bytecode instructions, 0 locals
+    // println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10)); println((1,2,3,4,5,6,7,8,9,10));
+  }
+
+  def timed[T](f: => T): T = {
+    val start = System.nanoTime()
+    val r = f
+    val nanos = System.nanoTime() - start
+    println(s"took ${nanos/1000000}ms")
+    r
+  }
+
+  def main(args: Array[String]): Unit = {
+    import scala.tools.nsc.backend.jvm._
+    val cn = AsmUtils.readClass("/Users/luc/scala/scala/sandbox/Test$.class")
+    import scala.collection.convert.decorateAsScala._
+    val m = cn.methods.iterator.asScala.find(_.name == "testMain").head
+
+    println(s"${m.instructions.size} instructions - ${m.maxLocals} locals")
+
+    val a = new analysis.NullnessAnalyzer
+    a.analyze(cn.name, m) // warm up
+
+    analysis.AliasingFrame.reset()
+    timed(a.analyze(cn.name, m))
+    analysis.AliasingFrame.timers foreach println
+
+    println("---")
+
+    // NOTE: if we don't run nullness analysis above (comment it out), then the BasicValue
+    // analysis runs 3.5x faster. Most likely because the call to Interpreter.merge inside
+    // Frame.merge is no longer megamorphic.
+
+    import scala.tools.asm.tree.analysis._
+    val ba = new Analyzer(new BasicInterpreter)
+    ba.analyze(cn.name, m) // warm up
+
+    timed(ba.analyze(cn.name, m))
+
+    println("---")
+
+    timed(a.analyze(cn.name, m))
+  }
+}
+*/
+package object analysis