Added updated ForkJoinPool, w/ necessary updates to Scala Actors.

This commit includes Doug Lea's updates to ForkJoinPool, as of January 25th, 2012. Details of the changes and performance improvements are available at: http://markmail.org/message/323vxzn6irkk5yrg. The ForkJoinPool used in this commit comes from the most recent JSR166y. Also included are minimal changes to the parts of the Scala Actors library which interface with the ForkJoinPool, as the ForkJoinPool's interface has changed (prior to the release of Java 7) since we last updated it for the Scala 2.8 release. Of note: this is part of the planned overhaul of scala.concurrent, and corresponds to ticket SI-5523. For testing, this was built on JDK 1.6, and passes all tests on both JDK 1.5 and 1.6. A new forkjoin.jar is necessary prior to applying these changes. Using this source, the new jar can be built by running: `ant newforkjoin forkjoin.done`. This creates a new forkjoin.jar in build/libs/. It must replace lib/forkjoin.jar.
author: Heather Miller <heather.miller@epfl.ch> 2012-02-25 17:36:08 +0100
committer: Heather Miller <heather.miller@epfl.ch> 2012-02-25 17:36:08 +0100
commit: 76e9da2ca4c31daec2b04848c3c2dbad6ecd426e (patch)
tree: d8f27ad3952d43c2049805cb8805ea3479431dc8
parent: 0c2f493804db6b594d7ec68e49e76c75a316230b (diff)
download: scala-76e9da2ca4c31daec2b04848c3c2dbad6ecd426e.tar.gz
scala-76e9da2ca4c31daec2b04848c3c2dbad6ecd426e.tar.bz2
scala-76e9da2ca4c31daec2b04848c3c2dbad6ecd426e.zip
11 files changed, 4810 insertions, 3442 deletions
diff --git a/src/actors/scala/actors/scheduler/DrainableForkJoinPool.scala b/src/actors/scala/actors/scheduler/DrainableForkJoinPool.scala
index 257fe92a91..15ce60566a 100644
--- a/src/actors/scala/actors/scheduler/DrainableForkJoinPool.scala
+++ b/src/actors/scala/actors/scheduler/DrainableForkJoinPool.scala
@@ -4,9 +4,9 @@ package scheduler
 import java.util.Collection
 import scala.concurrent.forkjoin.{ForkJoinPool, ForkJoinTask}
 
-private class DrainableForkJoinPool extends ForkJoinPool {
+private class DrainableForkJoinPool(parallelism: Int, maxPoolSize: Int) extends ForkJoinPool(parallelism, ForkJoinPool.defaultForkJoinWorkerThreadFactory, null, true) {
 
-  override def drainTasksTo(c: Collection[ForkJoinTask[_]]): Int =
+  override def drainTasksTo(c: Collection[ _ >: ForkJoinTask[_]]): Int =
     super.drainTasksTo(c)
 
 }
diff --git a/src/actors/scala/actors/scheduler/ForkJoinScheduler.scala b/src/actors/scala/actors/scheduler/ForkJoinScheduler.scala
index ba0f88c668..ce67ffd037 100644
--- a/src/actors/scala/actors/scheduler/ForkJoinScheduler.scala
+++ b/src/actors/scala/actors/scheduler/ForkJoinScheduler.scala
@@ -38,13 +38,8 @@ class ForkJoinScheduler(val initCoreSize: Int, val maxSize: Int, daemon: Boolean
   }
 
   private def makeNewPool(): DrainableForkJoinPool = {
-    val p = new DrainableForkJoinPool()
-    // enable locally FIFO scheduling mode
-    p.setAsyncMode(true)
-    p.setParallelism(initCoreSize)
-    p.setMaximumPoolSize(maxSize)
+    val p = new DrainableForkJoinPool(initCoreSize, maxSize)
     Debug.info(this+": parallelism "+p.getParallelism())
-    Debug.info(this+": max pool size "+p.getMaximumPoolSize())
     p
   }
 
@@ -144,7 +139,7 @@ class ForkJoinScheduler(val initCoreSize: Int, val maxSize: Int, daemon: Boolean
     ForkJoinPool.managedBlock(new ForkJoinPool.ManagedBlocker {
       def block = blocker.block()
       def isReleasable() = blocker.isReleasable
-    }, true)
+    })
   }
 
   /** Suspends the scheduler. All threads that were in use by the
diff --git a/src/forkjoin/scala/concurrent/forkjoin/ForkJoinPool.java b/src/forkjoin/scala/concurrent/forkjoin/ForkJoinPool.java
index 3fad92cbf1..e9389e9acb 100644
--- a/src/forkjoin/scala/concurrent/forkjoin/ForkJoinPool.java
+++ b/src/forkjoin/scala/concurrent/forkjoin/ForkJoinPool.java
@@ -1,669 +1,2324 @@
 /*
+
  * Written by Doug Lea with assistance from members of JCP JSR-166
  * Expert Group and released to the public domain, as explained at
- * http://creativecommons.org/licenses/publicdomain
+ * http://creativecommons.org/publicdomain/zero/1.0/
  */
 
 package scala.concurrent.forkjoin;
-import java.util.*;
-import java.util.concurrent.*;
-import java.util.concurrent.locks.*;
-import java.util.concurrent.atomic.*;
-import sun.misc.Unsafe;
-import java.lang.reflect.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Random;
+//import java.util.concurrent.AbstractExecutorService;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.RejectedExecutionException;
+//import java.util.concurrent.RunnableFuture;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.locks.AbstractQueuedSynchronizer;
+import java.util.concurrent.locks.Condition;
+
+interface RunnableFuture<T> extends Runnable {
+  //TR placeholder for java.util.concurrent.RunnableFuture
+}
 
 /**
- * An {@link ExecutorService} for running {@link ForkJoinTask}s.  A
- * ForkJoinPool provides the entry point for submissions from
- * non-ForkJoinTasks, as well as management and monitoring operations.
- * Normally a single ForkJoinPool is used for a large number of
- * submitted tasks. Otherwise, use would not usually outweigh the
- * construction and bookkeeping overhead of creating a large set of
- * threads.
+ * An {@link ExecutorService} for running {@link ForkJoinTask}s.
+ * A {@code ForkJoinPool} provides the entry point for submissions
+ * from non-{@code ForkJoinTask} clients, as well as management and
+ * monitoring operations.
  *
- * <p>ForkJoinPools differ from other kinds of Executors mainly in
- * that they provide <em>work-stealing</em>: all threads in the pool
- * attempt to find and execute subtasks created by other active tasks
- * (eventually blocking if none exist). This makes them efficient when
- * most tasks spawn other subtasks (as do most ForkJoinTasks), as well
- * as the mixed execution of some plain Runnable- or Callable- based
- * activities along with ForkJoinTasks. When setting
- * <tt>setAsyncMode</tt>, a ForkJoinPools may also be appropriate for
- * use with fine-grained tasks that are never joined. Otherwise, other
- * ExecutorService implementations are typically more appropriate
- * choices.
+ * <p>A {@code ForkJoinPool} differs from other kinds of {@link
+ * ExecutorService} mainly by virtue of employing
+ * <em>work-stealing</em>: all threads in the pool attempt to find and
+ * execute tasks submitted to the pool and/or created by other active
+ * tasks (eventually blocking waiting for work if none exist). This
+ * enables efficient processing when most tasks spawn other subtasks
+ * (as do most {@code ForkJoinTask}s), as well as when many small
+ * tasks are submitted to the pool from external clients.  Especially
+ * when setting <em>asyncMode</em> to true in constructors, {@code
+ * ForkJoinPool}s may also be appropriate for use with event-style
+ * tasks that are never joined.
  *
- * <p>A ForkJoinPool may be constructed with a given parallelism level
- * (target pool size), which it attempts to maintain by dynamically
- * adding, suspending, or resuming threads, even if some tasks are
- * waiting to join others. However, no such adjustments are performed
- * in the face of blocked IO or other unmanaged synchronization. The
- * nested <code>ManagedBlocker</code> interface enables extension of
- * the kinds of synchronization accommodated.  The target parallelism
- * level may also be changed dynamically (<code>setParallelism</code>)
- * and thread construction can be limited using methods
- * <code>setMaximumPoolSize</code> and/or
- * <code>setMaintainsParallelism</code>.
+ * <p>A {@code ForkJoinPool} is constructed with a given target
+ * parallelism level; by default, equal to the number of available
+ * processors. The pool attempts to maintain enough active (or
+ * available) threads by dynamically adding, suspending, or resuming
+ * internal worker threads, even if some tasks are stalled waiting to
+ * join others. However, no such adjustments are guaranteed in the
+ * face of blocked IO or other unmanaged synchronization. The nested
+ * {@link ManagedBlocker} interface enables extension of the kinds of
+ * synchronization accommodated.
  *
  * <p>In addition to execution and lifecycle control methods, this
  * class provides status check methods (for example
- * <code>getStealCount</code>) that are intended to aid in developing,
+ * {@link #getStealCount}) that are intended to aid in developing,
  * tuning, and monitoring fork/join applications. Also, method
- * <code>toString</code> returns indications of pool state in a
+ * {@link #toString} returns indications of pool state in a
  * convenient form for informal monitoring.
  *
+ * <p> As is the case with other ExecutorServices, there are three
+ * main task execution methods summarized in the following table.
+ * These are designed to be used primarily by clients not already
+ * engaged in fork/join computations in the current pool.  The main
+ * forms of these methods accept instances of {@code ForkJoinTask},
+ * but overloaded forms also allow mixed execution of plain {@code
+ * Runnable}- or {@code Callable}- based activities as well.  However,
+ * tasks that are already executing in a pool should normally instead
+ * use the within-computation forms listed in the table unless using
+ * async event-style tasks that are not usually joined, in which case
+ * there is little difference among choice of methods.
+ *
+ * <table BORDER CELLPADDING=3 CELLSPACING=1>
+ *  <tr>
+ *    <td></td>
+ *    <td ALIGN=CENTER> <b>Call from non-fork/join clients</b></td>
+ *    <td ALIGN=CENTER> <b>Call from within fork/join computations</b></td>
+ *  </tr>
+ *  <tr>
+ *    <td> <b>Arrange async execution</b></td>
+ *    <td> {@link #execute(ForkJoinTask)}</td>
+ *    <td> {@link ForkJoinTask#fork}</td>
+ *  </tr>
+ *  <tr>
+ *    <td> <b>Await and obtain result</b></td>
+ *    <td> {@link #invoke(ForkJoinTask)}</td>
+ *    <td> {@link ForkJoinTask#invoke}</td>
+ *  </tr>
+ *  <tr>
+ *    <td> <b>Arrange exec and obtain Future</b></td>
+ *    <td> {@link #submit(ForkJoinTask)}</td>
+ *    <td> {@link ForkJoinTask#fork} (ForkJoinTasks <em>are</em> Futures)</td>
+ *  </tr>
+ * </table>
+ *
+ * <p><b>Sample Usage.</b> Normally a single {@code ForkJoinPool} is
+ * used for all parallel task execution in a program or subsystem.
+ * Otherwise, use would not usually outweigh the construction and
+ * bookkeeping overhead of creating a large set of threads. For
+ * example, a common pool could be used for the {@code SortTasks}
+ * illustrated in {@link RecursiveAction}. Because {@code
+ * ForkJoinPool} uses threads in {@linkplain java.lang.Thread#isDaemon
+ * daemon} mode, there is typically no need to explicitly {@link
+ * #shutdown} such a pool upon program exit.
+ *
+ *  <pre> {@code
+ * static final ForkJoinPool mainPool = new ForkJoinPool();
+ * ...
+ * public void sort(long[] array) {
+ *   mainPool.invoke(new SortTask(array, 0, array.length));
+ * }}</pre>
+ *
  * <p><b>Implementation notes</b>: This implementation restricts the
  * maximum number of running threads to 32767. Attempts to create
- * pools with greater than the maximum result in
- * IllegalArgumentExceptions.
+ * pools with greater than the maximum number result in
+ * {@code IllegalArgumentException}.
+ *
+ * <p>This implementation rejects submitted tasks (that is, by throwing
+ * {@link RejectedExecutionException}) only when the pool is shut down
+ * or internal resources have been exhausted.
+ *
+ * @since 1.7
+ * @author Doug Lea
  */
 public class ForkJoinPool /*extends AbstractExecutorService*/ {
 
     /*
-     * See the extended comments interspersed below for design,
-     * rationale, and walkthroughs.
+     * Implementation Overview
+     *
+     * This class and its nested classes provide the main
+     * functionality and control for a set of worker threads:
+     * Submissions from non-FJ threads enter into submission queues.
+     * Workers take these tasks and typically split them into subtasks
+     * that may be stolen by other workers.  Preference rules give
+     * first priority to processing tasks from their own queues (LIFO
+     * or FIFO, depending on mode), then to randomized FIFO steals of
+     * tasks in other queues.
+     *
+     * WorkQueues
+     * ==========
+     *
+     * Most operations occur within work-stealing queues (in nested
+     * class WorkQueue).  These are special forms of Deques that
+     * support only three of the four possible end-operations -- push,
+     * pop, and poll (aka steal), under the further constraints that
+     * push and pop are called only from the owning thread (or, as
+     * extended here, under a lock), while poll may be called from
+     * other threads.  (If you are unfamiliar with them, you probably
+     * want to read Herlihy and Shavit's book "The Art of
+     * Multiprocessor programming", chapter 16 describing these in
+     * more detail before proceeding.)  The main work-stealing queue
+     * design is roughly similar to those in the papers "Dynamic
+     * Circular Work-Stealing Deque" by Chase and Lev, SPAA 2005
+     * (http://research.sun.com/scalable/pubs/index.html) and
+     * "Idempotent work stealing" by Michael, Saraswat, and Vechev,
+     * PPoPP 2009 (http://portal.acm.org/citation.cfm?id=1504186).
+     * The main differences ultimately stem from GC requirements that
+     * we null out taken slots as soon as we can, to maintain as small
+     * a footprint as possible even in programs generating huge
+     * numbers of tasks. To accomplish this, we shift the CAS
+     * arbitrating pop vs poll (steal) from being on the indices
+     * ("base" and "top") to the slots themselves.  So, both a
+     * successful pop and poll mainly entail a CAS of a slot from
+     * non-null to null.  Because we rely on CASes of references, we
+     * do not need tag bits on base or top.  They are simple ints as
+     * used in any circular array-based queue (see for example
+     * ArrayDeque).  Updates to the indices must still be ordered in a
+     * way that guarantees that top == base means the queue is empty,
+     * but otherwise may err on the side of possibly making the queue
+     * appear nonempty when a push, pop, or poll have not fully
+     * committed. Note that this means that the poll operation,
+     * considered individually, is not wait-free. One thief cannot
+     * successfully continue until another in-progress one (or, if
+     * previously empty, a push) completes.  However, in the
+     * aggregate, we ensure at least probabilistic non-blockingness.
+     * If an attempted steal fails, a thief always chooses a different
+     * random victim target to try next. So, in order for one thief to
+     * progress, it suffices for any in-progress poll or new push on
+     * any empty queue to complete. (This is why we normally use
+     * method pollAt and its variants that try once at the apparent
+     * base index, else consider alternative actions, rather than
+     * method poll.)
+     *
+     * This approach also enables support of a user mode in which local
+     * task processing is in FIFO, not LIFO order, simply by using
+     * poll rather than pop.  This can be useful in message-passing
+     * frameworks in which tasks are never joined.  However neither
+     * mode considers affinities, loads, cache localities, etc, so
+     * rarely provide the best possible performance on a given
+     * machine, but portably provide good throughput by averaging over
+     * these factors.  (Further, even if we did try to use such
+     * information, we do not usually have a basis for exploiting it.
+     * For example, some sets of tasks profit from cache affinities,
+     * but others are harmed by cache pollution effects.)
+     *
+     * WorkQueues are also used in a similar way for tasks submitted
+     * to the pool. We cannot mix these tasks in the same queues used
+     * for work-stealing (this would contaminate lifo/fifo
+     * processing). Instead, we loosely associate submission queues
+     * with submitting threads, using a form of hashing.  The
+     * ThreadLocal Submitter class contains a value initially used as
+     * a hash code for choosing existing queues, but may be randomly
+     * repositioned upon contention with other submitters.  In
+     * essence, submitters act like workers except that they never
+     * take tasks, and they are multiplexed on to a finite number of
+     * shared work queues. However, classes are set up so that future
+     * extensions could allow submitters to optionally help perform
+     * tasks as well. Insertion of tasks in shared mode requires a
+     * lock (mainly to protect in the case of resizing) but we use
+     * only a simple spinlock (using bits in field runState), because
+     * submitters encountering a busy queue move on to try or create
+     * other queues -- they block only when creating and registering
+     * new queues.
+     *
+     * Management
+     * ==========
+     *
+     * The main throughput advantages of work-stealing stem from
+     * decentralized control -- workers mostly take tasks from
+     * themselves or each other. We cannot negate this in the
+     * implementation of other management responsibilities. The main
+     * tactic for avoiding bottlenecks is packing nearly all
+     * essentially atomic control state into two volatile variables
+     * that are by far most often read (not written) as status and
+     * consistency checks.
+     *
+     * Field "ctl" contains 64 bits holding all the information needed
+     * to atomically decide to add, inactivate, enqueue (on an event
+     * queue), dequeue, and/or re-activate workers.  To enable this
+     * packing, we restrict maximum parallelism to (1<<15)-1 (which is
+     * far in excess of normal operating range) to allow ids, counts,
+     * and their negations (used for thresholding) to fit into 16bit
+     * fields.
+     *
+     * Field "runState" contains 32 bits needed to register and
+     * deregister WorkQueues, as well as to enable shutdown. It is
+     * only modified under a lock (normally briefly held, but
+     * occasionally protecting allocations and resizings) but even
+     * when locked remains available to check consistency.
+     *
+     * Recording WorkQueues.  WorkQueues are recorded in the
+     * "workQueues" array that is created upon pool construction and
+     * expanded if necessary.  Updates to the array while recording
+     * new workers and unrecording terminated ones are protected from
+     * each other by a lock but the array is otherwise concurrently
+     * readable, and accessed directly.  To simplify index-based
+     * operations, the array size is always a power of two, and all
+     * readers must tolerate null slots. Shared (submission) queues
+     * are at even indices, worker queues at odd indices. Grouping
+     * them together in this way simplifies and speeds up task
+     * scanning.
+     *
+     * All worker thread creation is on-demand, triggered by task
+     * submissions, replacement of terminated workers, and/or
+     * compensation for blocked workers. However, all other support
+     * code is set up to work with other policies.  To ensure that we
+     * do not hold on to worker references that would prevent GC, ALL
+     * accesses to workQueues are via indices into the workQueues
+     * array (which is one source of some of the messy code
+     * constructions here). In essence, the workQueues array serves as
+     * a weak reference mechanism. Thus for example the wait queue
+     * field of ctl stores indices, not references.  Access to the
+     * workQueues in associated methods (for example signalWork) must
+     * both index-check and null-check the IDs. All such accesses
+     * ignore bad IDs by returning out early from what they are doing,
+     * since this can only be associated with termination, in which
+     * case it is OK to give up.  All uses of the workQueues array
+     * also check that it is non-null (even if previously
+     * non-null). This allows nulling during termination, which is
+     * currently not necessary, but remains an option for
+     * resource-revocation-based shutdown schemes. It also helps
+     * reduce JIT issuance of uncommon-trap code, which tends to
+     * unnecessarily complicate control flow in some methods.
+     *
+     * Event Queuing. Unlike HPC work-stealing frameworks, we cannot
+     * let workers spin indefinitely scanning for tasks when none can
+     * be found immediately, and we cannot start/resume workers unless
+     * there appear to be tasks available.  On the other hand, we must
+     * quickly prod them into action when new tasks are submitted or
+     * generated. In many usages, ramp-up time to activate workers is
+     * the main limiting factor in overall performance (this is
+     * compounded at program start-up by JIT compilation and
+     * allocation). So we try to streamline this as much as possible.
+     * We park/unpark workers after placing in an event wait queue
+     * when they cannot find work. This "queue" is actually a simple
+     * Treiber stack, headed by the "id" field of ctl, plus a 15bit
+     * counter value (that reflects the number of times a worker has
+     * been inactivated) to avoid ABA effects (we need only as many
+     * version numbers as worker threads). Successors are held in
+     * field WorkQueue.nextWait.  Queuing deals with several intrinsic
+     * races, mainly that a task-producing thread can miss seeing (and
+     * signalling) another thread that gave up looking for work but
+     * has not yet entered the wait queue. We solve this by requiring
+     * a full sweep of all workers (via repeated calls to method
+     * scan()) both before and after a newly waiting worker is added
+     * to the wait queue. During a rescan, the worker might release
+     * some other queued worker rather than itself, which has the same
+     * net effect. Because enqueued workers may actually be rescanning
+     * rather than waiting, we set and clear the "parker" field of
+     * WorkQueues to reduce unnecessary calls to unpark.  (This
+     * requires a secondary recheck to avoid missed signals.)  Note
+     * the unusual conventions about Thread.interrupts surrounding
+     * parking and other blocking: Because interrupts are used solely
+     * to alert threads to check termination, which is checked anyway
+     * upon blocking, we clear status (using Thread.interrupted)
+     * before any call to park, so that park does not immediately
+     * return due to status being set via some other unrelated call to
+     * interrupt in user code.
+     *
+     * Signalling.  We create or wake up workers only when there
+     * appears to be at least one task they might be able to find and
+     * execute.  When a submission is added or another worker adds a
+     * task to a queue that previously had fewer than two tasks, they
+     * signal waiting workers (or trigger creation of new ones if
+     * fewer than the given parallelism level -- see signalWork).
+     * These primary signals are buttressed by signals during rescans;
+     * together these cover the signals needed in cases when more
+     * tasks are pushed but untaken, and improve performance compared
+     * to having one thread wake up all workers.
+     *
+     * Trimming workers. To release resources after periods of lack of
+     * use, a worker starting to wait when the pool is quiescent will
+     * time out and terminate if the pool has remained quiescent for
+     * SHRINK_RATE nanosecs. This will slowly propagate, eventually
+     * terminating all workers after long periods of non-use.
+     *
+     * Shutdown and Termination. A call to shutdownNow atomically sets
+     * a runState bit and then (non-atomically) sets each worker's
+     * runState status, cancels all unprocessed tasks, and wakes up
+     * all waiting workers.  Detecting whether termination should
+     * commence after a non-abrupt shutdown() call requires more work
+     * and bookkeeping. We need consensus about quiescence (i.e., that
+     * there is no more work). The active count provides a primary
+     * indication but non-abrupt shutdown still requires a rechecking
+     * scan for any workers that are inactive but not queued.
+     *
+     * Joining Tasks
+     * =============
+     *
+     * Any of several actions may be taken when one worker is waiting
+     * to join a task stolen (or always held) by another.  Because we
+     * are multiplexing many tasks on to a pool of workers, we can't
+     * just let them block (as in Thread.join).  We also cannot just
+     * reassign the joiner's run-time stack with another and replace
+     * it later, which would be a form of "continuation", that even if
+     * possible is not necessarily a good idea since we sometimes need
+     * both an unblocked task and its continuation to progress.
+     * Instead we combine two tactics:
+     *
+     *   Helping: Arranging for the joiner to execute some task that it
+     *      would be running if the steal had not occurred.
+     *
+     *   Compensating: Unless there are already enough live threads,
+     *      method tryCompensate() may create or re-activate a spare
+     *      thread to compensate for blocked joiners until they unblock.
+     *
+     * A third form (implemented in tryRemoveAndExec and
+     * tryPollForAndExec) amounts to helping a hypothetical
+     * compensator: If we can readily tell that a possible action of a
+     * compensator is to steal and execute the task being joined, the
+     * joining thread can do so directly, without the need for a
+     * compensation thread (although at the expense of larger run-time
+     * stacks, but the tradeoff is typically worthwhile).
+     *
+     * The ManagedBlocker extension API can't use helping so relies
+     * only on compensation in method awaitBlocker.
+     *
+     * The algorithm in tryHelpStealer entails a form of "linear"
+     * helping: Each worker records (in field currentSteal) the most
+     * recent task it stole from some other worker. Plus, it records
+     * (in field currentJoin) the task it is currently actively
+     * joining. Method tryHelpStealer uses these markers to try to
+     * find a worker to help (i.e., steal back a task from and execute
+     * it) that could hasten completion of the actively joined task.
+     * In essence, the joiner executes a task that would be on its own
+     * local deque had the to-be-joined task not been stolen. This may
+     * be seen as a conservative variant of the approach in Wagner &
+     * Calder "Leapfrogging: a portable technique for implementing
+     * efficient futures" SIGPLAN Notices, 1993
+     * (http://portal.acm.org/citation.cfm?id=155354). It differs in
+     * that: (1) We only maintain dependency links across workers upon
+     * steals, rather than use per-task bookkeeping.  This sometimes
+     * requires a linear scan of workQueues array to locate stealers,
+     * but often doesn't because stealers leave hints (that may become
+     * stale/wrong) of where to locate them.  A stealHint is only a
+     * hint because a worker might have had multiple steals and the
+     * hint records only one of them (usually the most current).
+     * Hinting isolates cost to when it is needed, rather than adding
+     * to per-task overhead.  (2) It is "shallow", ignoring nesting
+     * and potentially cyclic mutual steals.  (3) It is intentionally
+     * racy: field currentJoin is updated only while actively joining,
+     * which means that we miss links in the chain during long-lived
+     * tasks, GC stalls etc (which is OK since blocking in such cases
+     * is usually a good idea).  (4) We bound the number of attempts
+     * to find work (see MAX_HELP) and fall back to suspending the
+     * worker and if necessary replacing it with another.
+     *
+     * It is impossible to keep exactly the target parallelism number
+     * of threads running at any given time.  Determining the
+     * existence of conservatively safe helping targets, the
+     * availability of already-created spares, and the apparent need
+     * to create new spares are all racy, so we rely on multiple
+     * retries of each.  Compensation in the apparent absence of
+     * helping opportunities is challenging to control on JVMs, where
+     * GC and other activities can stall progress of tasks that in
+     * turn stall out many other dependent tasks, without us being
+     * able to determine whether they will ever require compensation.
+     * Even though work-stealing otherwise encounters little
+     * degradation in the presence of more threads than cores,
+     * aggressively adding new threads in such cases entails risk of
+     * unwanted positive feedback control loops in which more threads
+     * cause more dependent stalls (as well as delayed progress of
+     * unblocked threads to the point that we know they are available)
+     * leading to more situations requiring more threads, and so
+     * on. This aspect of control can be seen as an (analytically
+     * intractable) game with an opponent that may choose the worst
+     * (for us) active thread to stall at any time.  We take several
+     * precautions to bound losses (and thus bound gains), mainly in
+     * methods tryCompensate and awaitJoin: (1) We only try
+     * compensation after attempting enough helping steps (measured
+     * via counting and timing) that we have already consumed the
+     * estimated cost of creating and activating a new thread.  (2) We
+     * allow up to 50% of threads to be blocked before initially
+     * adding any others, and unless completely saturated, check that
+     * some work is available for a new worker before adding. Also, we
+     * create up to only 50% more threads until entering a mode that
+     * only adds a thread if all others are possibly blocked.  All
+     * together, this means that we might be half as fast to react,
+     * and create half as many threads as possible in the ideal case,
+     * but present vastly fewer anomalies in all other cases compared
+     * to both more aggressive and more conservative alternatives.
+     *
+     * Style notes: There is a lot of representation-level coupling
+     * among classes ForkJoinPool, ForkJoinWorkerThread, and
+     * ForkJoinTask.  The fields of WorkQueue maintain data structures
+     * managed by ForkJoinPool, so are directly accessed.  There is
+     * little point trying to reduce this, since any associated future
+     * changes in representations will need to be accompanied by
+     * algorithmic changes anyway. Several methods intrinsically
+     * sprawl because they must accumulate sets of consistent reads of
+     * volatiles held in local variables.  Methods signalWork() and
+     * scan() are the main bottlenecks, so are especially heavily
+     * micro-optimized/mangled.  There are lots of inline assignments
+     * (of form "while ((local = field) != 0)") which are usually the
+     * simplest way to ensure the required read orderings (which are
+     * sometimes critical). This leads to a "C"-like style of listing
+     * declarations of these locals at the heads of methods or blocks.
+     * There are several occurrences of the unusual "do {} while
+     * (!cas...)"  which is the simplest way to force an update of a
+     * CAS'ed variable. There are also other coding oddities that help
+     * some methods perform reasonably even when interpreted (not
+     * compiled).
+     *
+     * The order of declarations in this file is:
+     * (1) Static utility functions
+     * (2) Nested (static) classes
+     * (3) Static fields
+     * (4) Fields, along with constants used when unpacking some of them
+     * (5) Internal control methods
+     * (6) Callbacks and other support for ForkJoinTask methods
+     * (7) Exported methods
+     * (8) Static block initializing statics in minimally dependent order
      */
 
-    /** Mask for packing and unpacking shorts */
-    private static final int  shortMask = 0xffff;
-
-    /** Max pool size -- must be a power of two minus 1 */
-    private static final int MAX_THREADS =  0x7FFF;
+    // Static utilities
 
-    // placeholder for java.util.concurrent.RunnableFuture
-    interface RunnableFuture<T> extends Runnable {
+    /**
+     * If a security manager is installed, makes sure the caller has
+     * permission to modify threads; otherwise does nothing.
+     */
+    private static void checkPermission() {
+        SecurityManager security = System.getSecurityManager();
+        if (security != null)   // no security manager: nothing to check
+            security.checkPermission(modifyThreadPermission);
     }
 
+    // Nested classes
+
     /**
-     * Factory for creating new ForkJoinWorkerThreads.  A
-     * ForkJoinWorkerThreadFactory must be defined and used for
-     * ForkJoinWorkerThread subclasses that extend base functionality
-     * or initialize threads with different contexts.
+     * Factory for creating new {@link ForkJoinWorkerThread}s.
+     * A {@code ForkJoinWorkerThreadFactory} must be defined and used
+     * for {@code ForkJoinWorkerThread} subclasses that extend base
+     * functionality or initialize threads with different contexts.
      */
     public static interface ForkJoinWorkerThreadFactory {
         /**
          * Returns a new worker thread operating in the given pool.
          *
          * @param pool the pool this thread works in
-         * @throws NullPointerException if pool is null;
+         * @throws NullPointerException if the pool is null
          */
         public ForkJoinWorkerThread newThread(ForkJoinPool pool);
     }
 
     /**
-     * Default ForkJoinWorkerThreadFactory implementation, creates a
+     * Default ForkJoinWorkerThreadFactory implementation; creates a
      * new ForkJoinWorkerThread.
      */
-    static class  DefaultForkJoinWorkerThreadFactory
+    static class DefaultForkJoinWorkerThreadFactory
         implements ForkJoinWorkerThreadFactory {
         public ForkJoinWorkerThread newThread(ForkJoinPool pool) {
-            try {
-                return new ForkJoinWorkerThread(pool);
-            } catch (OutOfMemoryError oom)  {
-                return null;
-            }
+            return new ForkJoinWorkerThread(pool);
         }
     }
 
     /**
-     * Creates a new ForkJoinWorkerThread. This factory is used unless
-     * overridden in ForkJoinPool constructors.
+     * A simple non-reentrant lock used for exclusion when managing
+     * queues and workers. We use a custom lock so that we can readily
+     * probe lock state in constructions that check among alternative
+     * actions. The lock is normally only very briefly held, and
+     * sometimes treated as a spinlock, but other usages block to
+     * reduce overall contention in those cases where locked code
+     * bodies perform allocation/resizing.
      */
-    public static final ForkJoinWorkerThreadFactory
-        defaultForkJoinWorkerThreadFactory =
-        new DefaultForkJoinWorkerThreadFactory();
-
-    /**
-     * Permission required for callers of methods that may start or
-     * kill threads.
-     */
-    private static final RuntimePermission modifyThreadPermission =
-        new RuntimePermission("modifyThread");
+    static final class Mutex extends AbstractQueuedSynchronizer {
+        public final boolean tryAcquire(int ignore) {
+            return compareAndSetState(0, 1); // state 0 = free, 1 = held
+        }
+        public final boolean tryRelease(int ignore) {
+            setState(0);                     // unconditional; caller is not checked
+            return true;
+        }
+        public final void lock() { acquire(0); }     // argument is ignored by tryAcquire
+        public final void unlock() { release(0); }   // argument is ignored by tryRelease
+        public final boolean isHeldExclusively() { return getState() == 1; }
+        public final Condition newCondition() { return new ConditionObject(); }
+    }
 
     /**
-     * If there is a security manager, makes sure caller has
-     * permission to modify threads.
+     * Class for artificial tasks that are used to replace the target
+     * of local joins if they are removed from an interior queue slot
+     * in WorkQueue.tryRemoveAndExec. We don't need the proxy to
+     * actually do anything beyond having a unique identity.
      */
-    private static void checkPermission() {
-        SecurityManager security = System.getSecurityManager();
-        if (security != null)
-            security.checkPermission(modifyThreadPermission);
+    static final class EmptyTask extends ForkJoinTask<Void> {
+        EmptyTask() { status = ForkJoinTask.NORMAL; } // force done
+        public final Void getRawResult() { return null; }   // never carries a result
+        public final void setRawResult(Void x) {}           // nothing to record
+        public final boolean exec() { return true; }        // no work to perform
     }
 
     /**
-     * Generator for assigning sequence numbers as pool names.
-     */
-    private static final AtomicInteger poolNumberGenerator =
-        new AtomicInteger();
+     * Queues supporting work-stealing as well as external task
+     * submission. See above for main rationale and algorithms.
+     * Implementation relies heavily on "Unsafe" intrinsics
+     * and selective use of "volatile":
+     *
+     * Field "base" is the index (mod array.length) of the least valid
+     * queue slot, which is always the next position to steal (poll)
+     * from if nonempty. Reads and writes require volatile orderings
+     * but not CAS, because updates are only performed after slot
+     * CASes.
+     *
+     * Field "top" is the index (mod array.length) of the next queue
+     * slot to push to or pop from. It is written only by owner thread
+     * for push, or under lock for trySharedPush, and accessed by
+     * other threads only after reading (volatile) base.  Both top and
+     * base are allowed to wrap around on overflow, but (top - base)
+     * (or more commonly -(base - top) to force volatile read of base
+     * before top) still estimates size.
+     *
+     * The array slots are read and written using the emulation of
+     * volatiles/atomics provided by Unsafe. Insertions must in
+     * general use putOrderedObject as a form of releasing store to
+     * ensure that all writes to the task object are ordered before
+     * its publication in the queue. (Although we can avoid one case
+     * of this when locked in trySharedPush.) All removals entail a
+     * CAS to null.  The array is always a power of two. To ensure
+     * safety of Unsafe array operations, all accesses perform
+     * explicit null checks and implicit bounds checks via
+     * power-of-two masking.
+     *
+     * In addition to basic queuing support, this class contains
+     * fields described elsewhere to control execution. It turns out
+     * to work better memory-layout-wise to include them in this
+     * class rather than a separate class.
+     *
+     * Performance on most platforms is very sensitive to placement of
+     * instances of both WorkQueues and their arrays -- we absolutely
+     * do not want multiple WorkQueue instances or multiple queue
+     * arrays sharing cache lines. (It would be best for queue objects
+     * and their arrays to share, but there is nothing available to
+     * help arrange that).  Unfortunately, because they are recorded
+     * in a common array, WorkQueue instances are often moved to be
+     * adjacent by garbage collectors. To reduce impact, we use field
+     * padding that works OK on common platforms; this effectively
+     * trades off slightly slower average field access for the sake of
+     * avoiding really bad worst-case access. (Until better JVM
+     * support is in place, this padding is dependent on transient
+     * properties of JVM field layout rules.)  We also take care in
+     * allocating, sizing and resizing the array. Non-shared queue
+     * arrays are initialized (via method growArray) by workers before
+     * use. Others are allocated on first use.
+     */
+    static final class WorkQueue {
+        /**
+         * Capacity of work-stealing queue array upon initialization.
+         * Must be a power of two; at least 4, but should be larger to
+         * reduce or eliminate cacheline sharing among queues.
+         * Currently, it is much larger, as a partial workaround for
+         * the fact that JVMs often place arrays in locations that
+         * share GC bookkeeping (especially cardmarks) such that
+         * per-write accesses encounter serious memory contention.
+         */
+        static final int INITIAL_QUEUE_CAPACITY = 1 << 13;
 
-    /**
-     * Array holding all worker threads in the pool. Initialized upon
-     * first use. Array size must be a power of two.  Updates and
-     * replacements are protected by workerLock, but it is always kept
-     * in a consistent enough state to be randomly accessed without
-     * locking by workers performing work-stealing.
-     */
-    public volatile ForkJoinWorkerThread[] workers;
+        /**
+         * Maximum size for queue arrays. Must be a power of two less
+         * than or equal to 1 << (31 - width of array entry) to ensure
+         * lack of wraparound of index calculations, but defined to a
+         * value a bit less than this to help users trap runaway
+         * programs before saturating systems.
+         */
+        static final int MAXIMUM_QUEUE_CAPACITY = 1 << 26; // 64M
+
+        volatile long totalSteals; // cumulative number of steals
+        int seed;                  // for random scanning; initialize nonzero
+        volatile int eventCount;   // encoded inactivation count; < 0 if inactive
+        int nextWait;              // encoded record of next event waiter
+        int rescans;               // remaining scans until block
+        int nsteals;               // top-level task executions since last idle
+        final int mode;            // lifo, fifo, or shared
+        int poolIndex;             // index of this queue in pool (or 0)
+        int stealHint;             // index of most recent known stealer
+        volatile int runState;     // 1: locked, -1: terminate; else 0
+        volatile int base;         // index of next slot for poll
+        int top;                   // index of next slot for push
+        ForkJoinTask<?>[] array;   // the elements (initially unallocated)
+        final ForkJoinPool pool;   // the containing pool (may be null)
+        final ForkJoinWorkerThread owner; // owning thread or null if shared
+        volatile Thread parker;    // == owner during call to park; else null
+        ForkJoinTask<?> currentJoin;  // task being joined in awaitJoin
+        ForkJoinTask<?> currentSteal; // current non-local task being executed
+        // Heuristic padding to ameliorate unfortunate memory placements
+        Object p00, p01, p02, p03, p04, p05, p06, p07;
+        Object p08, p09, p0a, p0b, p0c, p0d, p0e;
+
+        WorkQueue(ForkJoinPool pool, ForkJoinWorkerThread owner, int mode) {
+            this.mode = mode;      // lifo, fifo, or shared
+            this.pool = pool;      // may be null
+            this.owner = owner;    // null if this queue is shared
+            // Place indices in the center of array (that is not yet allocated)
+            base = top = INITIAL_QUEUE_CAPACITY >>> 1;
+        }
 
-    /**
-     * Lock protecting access to workers.
-     */
-    private final ReentrantLock workerLock;
+        /**
+         * Returns the approximate number of tasks in the queue (top - base, clamped at zero).
+         */
+        final int queueSize() {
+            int n = base - top;       // non-owner callers must read base first
+            return (n >= 0) ? 0 : -n; // ignore transient negative
+        }
 
-    /**
-     * Condition for awaitTermination.
-     */
-    private final Condition termination;
+        /**
+         * Provides a more accurate estimate of whether this queue has
+         * any tasks than does queueSize, by checking whether a
+         * near-empty queue has at least one unclaimed task.
+         */
+        final boolean isEmpty() {
+            ForkJoinTask<?>[] a; int m, s;
+            int n = base - (s = top);                 // base is read before top
+            return (n >= 0 ||                         // no apparent tasks
+                    (n == -1 &&                       // exactly one apparent task, and...
+                     ((a = array) == null ||          // ...no array allocated, or
+                      (m = a.length - 1) < 0 ||       // ...zero-length array, or
+                      U.getObjectVolatile             // ...slot at top - 1 is already null
+                      (a, ((m & (s - 1)) << ASHIFT) + ABASE) == null)));
+        }
+
+        /**
+         * Pushes a task. Call only by owner in unshared queues.
+         *
+         * @param task the task. Caller must ensure non-null.
+         * @throws RejectedExecutionException if array cannot be resized
+         */
+        final void push(ForkJoinTask<?> task) {
+            ForkJoinTask<?>[] a; ForkJoinPool p;
+            int s = top, m, n;
+            if ((a = array) != null) {    // ignore if queue removed
+                U.putOrderedObject
+                    (a, (((m = a.length - 1) & s) << ASHIFT) + ABASE, task);
+                if ((n = (top = s + 1) - base) <= 2) { // at most two tasks now queued
+                    if ((p = pool) != null)
+                        p.signalWork();
+                }
+                else if (n >= m)          // size has reached length - 1: grow now
+                    growArray(true);
+            }
+        }
+
+        /**
+         * Pushes a task if lock is free and array is either big
+         * enough or can be resized to be big enough.
+         *
+         * @param task the task. Caller must ensure non-null.
+         * @return true if submitted
+         */
+        final boolean trySharedPush(ForkJoinTask<?> task) {
+            boolean submitted = false;
+            if (runState == 0 && U.compareAndSwapInt(this, RUNSTATE, 0, 1)) { // lock: 0 -> 1
+                ForkJoinTask<?>[] a = array;
+                int s = top;
+                try {
+                    if ((a != null && a.length > s + 1 - base) ||
+                        (a = growArray(false)) != null) { // must presize
+                        int j = (((a.length - 1) & s) << ASHIFT) + ABASE;
+                        U.putObject(a, (long)j, task);    // don't need "ordered"
+                        top = s + 1;
+                        submitted = true;
+                    }
+                } finally {
+                    runState = 0;                         // unlock
+                }
+            }
+            return submitted;
+        }
+
+        /**
+         * Takes next task, if one exists, in LIFO order.  Call only
+         * by owner in unshared queues. (We do not have a shared
+         * version of this method because it is never needed.)
+         */
+        final ForkJoinTask<?> pop() {
+            ForkJoinTask<?> t; int m;
+            ForkJoinTask<?>[] a = array;
+            if (a != null && (m = a.length - 1) >= 0) {
+                for (int s; (s = top - 1) - base >= 0;) {   // while top - 1 is not below base
+                    int j = ((m & s) << ASHIFT) + ABASE;
+                    if ((t = (ForkJoinTask<?>)U.getObjectVolatile(a, j)) == null)
+                        break;                              // slot already emptied
+                    if (U.compareAndSwapObject(a, j, t, null)) {
+                        top = s;                            // claimed: retract top
+                        return t;
+                    }
+                }
+            }
+            return null;
+        }
+
+        /**
+         * Takes a task in FIFO order if b is base of queue and a task
+         * can be claimed without contention. Specialized versions
+         * appear in ForkJoinPool methods scan and tryHelpStealer.
+         */
+        final ForkJoinTask<?> pollAt(int b) {
+            ForkJoinTask<?> t; ForkJoinTask<?>[] a;
+            if ((a = array) != null) {
+                int j = (((a.length - 1) & b) << ASHIFT) + ABASE;
+                if ((t = (ForkJoinTask<?>)U.getObjectVolatile(a, j)) != null &&
+                    base == b &&                          // b is still the current base
+                    U.compareAndSwapObject(a, j, t, null)) {
+                    base = b + 1;                         // advance past the claimed slot
+                    return t;
+                }
+            }
+            return null;                                  // no array, empty slot, or lost race
+        }
+
+        /**
+         * Takes next task, if one exists, in FIFO order.
+         */
+        final ForkJoinTask<?> poll() {
+            ForkJoinTask<?>[] a; int b; ForkJoinTask<?> t;
+            while ((b = base) - top < 0 && (a = array) != null) { // apparently nonempty
+                int j = (((a.length - 1) & b) << ASHIFT) + ABASE;
+                t = (ForkJoinTask<?>)U.getObjectVolatile(a, j);
+                if (t != null) {
+                    if (base == b &&
+                        U.compareAndSwapObject(a, j, t, null)) {
+                        base = b + 1;
+                        return t;
+                    }
+                }
+                else if (base == b) {     // slot is null but base has not moved
+                    if (b + 1 == top)     // the only remaining slot is empty
+                        break;
+                    Thread.yield(); // wait for lagging update
+                }
+            }
+            return null;
+        }
+
+        /**
+         * Takes next task, if one exists, in order specified by mode.
+         */
+        final ForkJoinTask<?> nextLocalTask() {
+            return mode == 0 ? pop() : poll(); // mode 0 pops (LIFO); any other mode polls (FIFO)
+        }
+
+        /**
+         * Returns, without removing it, the next task in mode order, or null if unavailable.
+         */
+        final ForkJoinTask<?> peek() {
+            ForkJoinTask<?>[] a = array; int m;
+            if (a == null || (m = a.length - 1) < 0)
+                return null;
+            int i = mode == 0 ? top - 1 : base;   // mode 0 looks at top - 1, otherwise at base
+            int j = ((i & m) << ASHIFT) + ABASE;
+            return (ForkJoinTask<?>)U.getObjectVolatile(a, j);
+        }
+
+        /**
+         * Pops the given task only if it is at the current top.
+         */
+        final boolean tryUnpush(ForkJoinTask<?> t) {
+            ForkJoinTask<?>[] a; int s;
+            if ((a = array) != null && (s = top) != base &&   // array present and top != base
+                U.compareAndSwapObject
+                (a, (((a.length - 1) & --s) << ASHIFT) + ABASE, t, null)) {
+                top = s;                  // s was pre-decremented to top - 1 above
+                return true;
+            }
+            return false;
+        }
+
+        /**
+         * Polls the given task only if it is at the current base.
+         */
+        final boolean pollFor(ForkJoinTask<?> task) {
+            ForkJoinTask<?>[] a; int b;
+            if ((b = base) - top < 0 && (a = array) != null) {
+                int j = (((a.length - 1) & b) << ASHIFT) + ABASE;
+                if (U.getObjectVolatile(a, j) == task && base == b && // task at base; base unmoved
+                    U.compareAndSwapObject(a, j, task, null)) {
+                    base = b + 1;         // advance past the claimed slot
+                    return true;
+                }
+            }
+            return false;
+        }
+
+        /**
+         * If present, removes from queue and executes the given task, or
+         * any other cancelled task. Returns (true) immediately on any CAS
+         * or consistency check failure so caller can retry.
+         *
+         * @return false if no progress can be made
+         */
+        final boolean tryRemoveAndExec(ForkJoinTask<?> task) {
+            boolean removed = false, empty = true, progress = true;
+            ForkJoinTask<?>[] a; int m, s, b, n;
+            if ((a = array) != null && (m = a.length - 1) >= 0 &&
+                (n = (s = top) - (b = base)) > 0) {
+                for (ForkJoinTask<?> t;;) {           // traverse from s to b
+                    int j = ((--s & m) << ASHIFT) + ABASE;
+                    t = (ForkJoinTask<?>)U.getObjectVolatile(a, j);
+                    if (t == null)                    // inconsistent length
+                        break;
+                    else if (t == task) {
+                        if (s + 1 == top) {           // pop
+                            if (!U.compareAndSwapObject(a, j, task, null))
+                                break;
+                            top = s;
+                            removed = true;
+                        }
+                        else if (base == b)           // replace with proxy
+                            removed = U.compareAndSwapObject(a, j, task,
+                                                             new EmptyTask());
+                        break;
+                    }
+                    else if (t.status >= 0)           // presumably t is not yet done -- confirm in ForkJoinTask
+                        empty = false;
+                    else if (s + 1 == top) {          // pop and throw away
+                        if (U.compareAndSwapObject(a, j, t, null))
+                            top = s;
+                        break;
+                    }
+                    if (--n == 0) {                   // all n slots visited without finding task
+                        if (!empty && base == b)
+                            progress = false;
+                        break;
+                    }
+                }
+            }
+            if (removed)                              // executed only after the traversal has ended
+                task.doExec();
+            return progress;
+        }
+
+        /**
+         * Initializes or doubles the capacity of array. Call either
+         * by owner or with lock held -- it is OK for base, but not
+         * top, to move while resizings are in progress.
+         *
+         * @param rejectOnFailure if true, throw exception if capacity
+         * exceeded (relayed ultimately to user); else return null.
+         */
+        final ForkJoinTask<?>[] growArray(boolean rejectOnFailure) {
+            ForkJoinTask<?>[] oldA = array;
+            int size = oldA != null ? oldA.length << 1 : INITIAL_QUEUE_CAPACITY; // double, or first allocation
+            if (size <= MAXIMUM_QUEUE_CAPACITY) {
+                int oldMask, t, b;
+                ForkJoinTask<?>[] a = array = new ForkJoinTask<?>[size];
+                if (oldA != null && (oldMask = oldA.length - 1) >= 0 &&
+                    (t = top) - (b = base) > 0) {     // old array holds tasks to carry over
+                    int mask = size - 1;
+                    do {
+                        ForkJoinTask<?> x;
+                        int oldj = ((b & oldMask) << ASHIFT) + ABASE;
+                        int j    = ((b &    mask) << ASHIFT) + ABASE;
+                        x = (ForkJoinTask<?>)U.getObjectVolatile(oldA, oldj);
+                        if (x != null &&              // skip slots already emptied or lost to a CAS race
+                            U.compareAndSwapObject(oldA, oldj, x, null))
+                            U.putObjectVolatile(a, j, x);
+                    } while (++b != t);
+                }
+                return a;
+            }
+            else if (!rejectOnFailure)
+                return null;
+            else
+                throw new RejectedExecutionException("Queue capacity exceeded");
+        }
+
+        /**
+         * Removes and cancels all known tasks, ignoring any exceptions.
+         */
+        final void cancelAll() {
+            ForkJoinTask.cancelIgnoringExceptions(currentJoin);
+            ForkJoinTask.cancelIgnoringExceptions(currentSteal);
+            for (ForkJoinTask<?> t; (t = poll()) != null; )   // drain until poll() returns null
+                ForkJoinTask.cancelIgnoringExceptions(t);
+        }
+
+        /**
+         * Computes next value for random probes.  Scans don't require
+         * a very high quality generator, but also not a crummy one.
+         * Marsaglia xor-shift is cheap and works well enough.  Note:
+         * This is manually inlined in its usages in ForkJoinPool to
+         * avoid writes inside busy scan loops.
+         */
+        final int nextSeed() {
+            int r = seed;
+            r ^= r << 13;
+            r ^= r >>> 17;
+            return seed = r ^= r << 5;    // store the new value in seed and return it
+        }
+
+        // Execution methods
+
+        /**
+         * Removes and runs tasks until empty, using local mode
+         * ordering. Normally called only after checking for apparent
+         * non-emptiness.
+         */
+        final void runLocalTasks() {
+            // hoist checks from repeated pop/poll
+            ForkJoinTask<?>[] a; int m;
+            if ((a = array) != null && (m = a.length - 1) >= 0) {
+                if (mode == 0) {                      // same claim loop as pop(), executing each task
+                    for (int s; (s = top - 1) - base >= 0;) {
+                        int j = ((m & s) << ASHIFT) + ABASE;
+                        ForkJoinTask<?> t =
+                            (ForkJoinTask<?>)U.getObjectVolatile(a, j);
+                        if (t != null) {
+                            if (U.compareAndSwapObject(a, j, t, null)) {
+                                top = s;
+                                t.doExec();
+                            }
+                        }
+                        else
+                            break;
+                    }
+                }
+                else {                                // same claim loop as poll(), executing each task
+                    for (int b; (b = base) - top < 0;) {
+                        int j = ((m & b) << ASHIFT) + ABASE;
+                        ForkJoinTask<?> t =
+                            (ForkJoinTask<?>)U.getObjectVolatile(a, j);
+                        if (t != null) {
+                            if (base == b &&
+                                U.compareAndSwapObject(a, j, t, null)) {
+                                base = b + 1;
+                                t.doExec();
+                            }
+                        } else if (base == b) {
+                            if (b + 1 == top)
+                                break;
+                            Thread.yield(); // wait for lagging update
+                        }
+                    }
+                }
+            }
+        }
+
+        /**
+         * Executes a top-level task and any local tasks remaining
+         * after execution.
+         *
+         * @return true unless terminating
+         */
+        final boolean runTask(ForkJoinTask<?> t) {
+            boolean alive = true;
+            if (t != null) {
+                currentSteal = t;
+                t.doExec();
+                if (top != base)        // conservative guard
+                    runLocalTasks();
+                ++nsteals;              // one more top-level execution since last idle
+                currentSteal = null;
+            }
+            else if (runState < 0)      // terminating
+                alive = false;
+            return alive;
+        }
+
+        /**
+         * Executes a non-top-level (stolen) task.
+         */
+        final void runSubtask(ForkJoinTask<?> t) {
+            if (t != null) {
+                ForkJoinTask<?> ps = currentSteal; // remember the previous value
+                currentSteal = t;
+                t.doExec();
+                currentSteal = ps;                 // put the previous value back
+            }
+        }
+
+        /**
+         * Returns true if owned and not known to be blocked.
+         */
+        final boolean isApparentlyUnblocked() {
+            Thread wt; Thread.State s;
+            return (eventCount >= 0 &&                // eventCount < 0 means inactive
+                    (wt = owner) != null &&           // shared queues have no owner
+                    (s = wt.getState()) != Thread.State.BLOCKED &&
+                    s != Thread.State.WAITING &&
+                    s != Thread.State.TIMED_WAITING);
+        }
+
+        /**
+         * If this queue is owned and its owner is not already interrupted,
+         * tries to interrupt it (ignoring exceptions); also unparks any parker.
+         */
+        final void interruptOwner() {
+            Thread wt, p;
+            if ((wt = owner) != null && !wt.isInterrupted()) {
+                try {
+                    wt.interrupt();
+                } catch (SecurityException ignore) {
+                }
+            }
+            if ((p = parker) != null)     // parker is non-null only during a call to park
+                U.unpark(p);
+        }
+
+        // Unsafe mechanics: field offset of runState plus array base/shift for slot addressing
+        private static final sun.misc.Unsafe U;
+        private static final long RUNSTATE;
+        private static final int ABASE;
+        private static final int ASHIFT;
+        static {
+            int s;
+            try {
+                U = getUnsafe();
+                Class<?> k = WorkQueue.class;
+                Class<?> ak = ForkJoinTask[].class;
+                RUNSTATE = U.objectFieldOffset
+                    (k.getDeclaredField("runState"));
+                ABASE = U.arrayBaseOffset(ak);
+                s = U.arrayIndexScale(ak);
+            } catch (Exception e) {
+                throw new Error(e);
+            }
+            if ((s & (s-1)) != 0)         // the shift-based indexing needs a power-of-two scale
+                throw new Error("data type scale not a power of two");
+            ASHIFT = 31 - Integer.numberOfLeadingZeros(s); // log2 of the element scale
+        }
+    }
 
     /**
-     * The uncaught exception handler used when any worker
-     * abrupty terminates
-     */
-    private Thread.UncaughtExceptionHandler ueh;
+     * Per-thread records for threads that submit to pools. Currently
+     * holds only pseudo-random seed / index that is used to choose
+     * submission queues in method doSubmit. In the future, this may
+     * also incorporate a means to implement different task rejection
+     * and resubmission policies.
+     *
+     * Seeds for submitters and workers/workQueues work in basically
+     * the same way but are initialized and updated using slightly
+     * different mechanics. Both are initialized using the same
+     * approach as in class ThreadLocal, where successive values are
+     * unlikely to collide with previous values. This is done during
+     * registration for workers, but requires a separate AtomicInteger
+     * for submitters. Seeds are then randomly modified upon
+     * collisions using xorshifts, which requires a non-zero seed.
+     */
+    static final class Submitter {
+        int seed;                     // set nonzero by the constructor
+        Submitter() {
+            int s = nextSubmitterSeed.getAndAdd(SEED_INCREMENT); // next value from the shared generator
+            seed = (s == 0) ? 1 : s; // ensure non-zero
+        }
+    }
+
+    /** ThreadLocal whose initial value is a newly constructed Submitter */
+    static final class ThreadSubmitter extends ThreadLocal<Submitter> {
+        public Submitter initialValue() { return new Submitter(); }
+    }
+
+    // static fields (initialized in static initializer below)
 
     /**
-     * Creation factory for worker threads.
+     * Creates a new ForkJoinWorkerThread. This factory is used unless
+     * overridden in ForkJoinPool constructors.
      */
-    private final ForkJoinWorkerThreadFactory factory;
+    public static final ForkJoinWorkerThreadFactory
+        defaultForkJoinWorkerThreadFactory;
 
     /**
-     * Head of stack of threads that were created to maintain
-     * parallelism when other threads blocked, but have since
-     * suspended when the parallelism level rose.
+     * Generator for assigning sequence numbers as pool names.
      */
-    private volatile WaitQueueNode spareStack;
+    private static final AtomicInteger poolNumberGenerator;
 
     /**
-     * Sum of per-thread steal counts, updated only when threads are
-     * idle or terminating.
+     * Generator for initial hashes/seeds for submitters. Accessed by
+     * Submitter class constructor.
      */
-    private final AtomicLong stealCount;
+    static final AtomicInteger nextSubmitterSeed;
 
     /**
-     * Queue for external submissions.
+     * Permission required for callers of methods that may start or
+     * kill threads.
      */
-    private final LinkedTransferQueue<ForkJoinTask<?>> submissionQueue;
+    private static final RuntimePermission modifyThreadPermission;
 
     /**
-     * Head of Treiber stack for barrier sync. See below for explanation
+     * Per-thread submission bookkeeping. Shared across all pools
+     * to reduce ThreadLocal pollution and because random motion
+     * to avoid contention in one pool is likely to hold for others.
      */
-    private volatile WaitQueueNode syncStack;
+    private static final ThreadSubmitter submitters;
+
+    // static constants
 
     /**
-     * The count for event barrier
+     * The wakeup interval (in nanoseconds) for a worker waiting for a
+     * task when the pool is quiescent to instead try to shrink the
+     * number of workers.  The exact value does not matter too
+     * much. It must be short enough to release resources during
+     * sustained periods of idleness, but not so short that threads
+     * are continually re-created.
      */
-    private volatile long eventCount;
+    private static final long SHRINK_RATE =
+        4L * 1000L * 1000L * 1000L; // 4 seconds
 
     /**
-     * Pool number, just for assigning useful names to worker threads
+     * The timeout value for attempted shrinkage, includes
+     * some slop to cope with system timer imprecision.
      */
-    private final int poolNumber;
+    private static final long SHRINK_TIMEOUT = SHRINK_RATE - (SHRINK_RATE / 10);
 
     /**
-     * The maximum allowed pool size
+     * The maximum stolen->joining link depth allowed in method
+     * tryHelpStealer.  Must be a power of two. This value also
+     * controls the maximum number of times to try to help join a task
+     * without any apparent progress or change in pool state before
+     * giving up and blocking (see awaitJoin).  Depths for legitimate
+     * chains are unbounded, but we use a fixed constant to avoid
+     * (otherwise unchecked) cycles and to bound staleness of
+     * traversal parameters at the expense of sometimes blocking when
+     * we could be helping.
      */
-    private volatile int maxPoolSize;
+    private static final int MAX_HELP = 32;
 
     /**
-     * The desired parallelism level, updated only under workerLock.
+     * Secondary time-based bound (in nanosecs) for helping attempts
+     * before trying compensated blocking in awaitJoin. Used in
+     * conjunction with MAX_HELP to reduce variance due to different
+     * polling rates associated with different helping options. The
+     * value should roughly approximate the time required to create
+     * and/or activate a worker thread.
      */
-    private volatile int parallelism;
+    private static final long COMPENSATION_DELAY = 100L * 1000L; // 0.1 millisec
 
     /**
-     * True if use local fifo, not default lifo, for local polling
+     * Increment for seed generators. See class ThreadLocal for
+     * explanation.
      */
-    private volatile boolean locallyFifo;
+    private static final int SEED_INCREMENT = 0x61c88647;
 
     /**
-     * Holds number of total (i.e., created and not yet terminated)
-     * and running (i.e., not blocked on joins or other managed sync)
-     * threads, packed into one int to ensure consistent snapshot when
-     * making decisions about creating and suspending spare
-     * threads. Updated only by CAS.  Note: CASes in
-     * updateRunningCount and preJoin running active count is in low
-     * word, so need to be modified if this changes
-     */
-    private volatile int workerCounts;
+     * Bits and masks for control variables
+     *
+     * Field ctl is a long packed with:
+     * AC: Number of active running workers minus target parallelism (16 bits)
+     * TC: Number of total workers minus target parallelism (16 bits)
+     * ST: true if pool is terminating (1 bit)
+     * EC: the wait count of top waiting thread (15 bits)
+     * ID: poolIndex of top of Treiber stack of waiters (16 bits)
+     *
+     * When convenient, we can extract the upper 32 bits of counts and
+     * the lower 32 bits of queue state, u = (int)(ctl >>> 32) and e =
+     * (int)ctl.  The ec field is never accessed alone, but always
+     * together with id and st. The offsets of counts by the target
+     * parallelism and the positionings of fields makes it possible to
+     * perform the most common checks via sign tests of fields: When
+     * ac is negative, there are not enough active workers, when tc is
+     * negative, there are not enough total workers, and when e is
+     * negative, the pool is terminating.  To deal with these possibly
+     * negative fields, we use casts in and out of "short" and/or
+     * signed shifts to maintain signedness.
+     *
+     * When a thread is queued (inactivated), its eventCount field is
+     * set negative, which is the only way to tell if a worker is
+     * prevented from executing tasks, even though it must continue to
+     * scan for them to avoid queuing races. Note however that
+     * eventCount updates lag releases so usage requires care.
+     *
+     * Field runState is an int packed with:
+     * SHUTDOWN: true if shutdown is enabled (1 bit)
+     * SEQ:  a sequence number updated upon (de)registering workers (30 bits)
+     * INIT: set true after workQueues array construction (1 bit)
+     *
+     * The sequence number enables simple consistency checks:
+     * Staleness of read-only operations on the workQueues array can
+     * be checked by comparing runState before vs after the reads.
+     */
+
+    // bit positions/shifts for fields
+    private static final int  AC_SHIFT   = 48;
+    private static final int  TC_SHIFT   = 32;
+    private static final int  ST_SHIFT   = 31;
+    private static final int  EC_SHIFT   = 16;
+
+    // bounds
+    private static final int  SMASK      = 0xffff;  // short bits
+    private static final int  MAX_CAP    = 0x7fff;  // max #workers - 1
+    private static final int  SQMASK     = 0xfffe;  // even short bits
+    private static final int  SHORT_SIGN = 1 << 15;
+    private static final int  INT_SIGN   = 1 << 31;
+
+    // masks
+    private static final long STOP_BIT   = 0x0001L << ST_SHIFT;
+    private static final long AC_MASK    = ((long)SMASK) << AC_SHIFT;
+    private static final long TC_MASK    = ((long)SMASK) << TC_SHIFT;
+
+    // units for incrementing and decrementing
+    private static final long TC_UNIT    = 1L << TC_SHIFT;
+    private static final long AC_UNIT    = 1L << AC_SHIFT;
+
+    // masks and units for dealing with u = (int)(ctl >>> 32)
+    private static final int  UAC_SHIFT  = AC_SHIFT - 32;
+    private static final int  UTC_SHIFT  = TC_SHIFT - 32;
+    private static final int  UAC_MASK   = SMASK << UAC_SHIFT;
+    private static final int  UTC_MASK   = SMASK << UTC_SHIFT;
+    private static final int  UAC_UNIT   = 1 << UAC_SHIFT;
+    private static final int  UTC_UNIT   = 1 << UTC_SHIFT;
+
+    // masks and units for dealing with e = (int)ctl
+    private static final int E_MASK      = 0x7fffffff; // no STOP_BIT
+    private static final int E_SEQ       = 1 << EC_SHIFT;
+
+    // runState bits
+    private static final int SHUTDOWN    = 1 << 31;
+
+    // access mode for WorkQueue
+    static final int LIFO_QUEUE          =  0;
+    static final int FIFO_QUEUE          =  1;
+    static final int SHARED_QUEUE        = -1;
+
+    // Instance fields
 
-    private static int totalCountOf(int s)           { return s >>> 16;  }
-    private static int runningCountOf(int s)         { return s & shortMask; }
-    private static int workerCountsFor(int t, int r) { return (t << 16) + r; }
+    /*
+     * Field layout order in this class tends to matter more than one
+     * would like. Runtime layout order is only loosely related to
+     * declaration order and may differ across JVMs, but the following
+     * empirically works OK on current JVMs.
+     */
+
+    volatile long ctl;                         // main pool control
+    final int parallelism;                     // parallelism level
+    final int localMode;                       // per-worker scheduling mode
+    final int submitMask;                      // submit queue index bound
+    int nextSeed;                              // for initializing worker seeds
+    volatile int runState;                     // shutdown status and seq
+    WorkQueue[] workQueues;                    // main registry
+    final Mutex lock;                          // for registration
+    final Condition termination;               // for awaitTermination
+    final ForkJoinWorkerThreadFactory factory; // factory for new workers
+    final Thread.UncaughtExceptionHandler ueh; // per-worker UEH
+    final AtomicLong stealCount;               // collect counts when terminated
+    final AtomicInteger nextWorkerNumber;      // to create worker name string
+    final String workerNamePrefix;             // to create worker name string
+
+    //  Creating, registering, and deregistering workers
+
+    /**
+     * Tries to create and start a worker
+     */
+    private void addWorker() {
+        Throwable ex = null;
+        ForkJoinWorkerThread wt = null;
+        try {
+            if ((wt = factory.newThread(this)) != null) {
+                wt.start();
+                return;
+            }
+        } catch (Throwable e) {
+            ex = e;
+        }
+        deregisterWorker(wt, ex); // adjust counts etc on failure
+    }
 
     /**
-     * Add delta (which may be negative) to running count.  This must
-     * be called before (with negative arg) and after (with positive)
-     * any managed synchronization (i.e., mainly, joins)
-     * @param delta the number to add
+     * Callback from ForkJoinWorkerThread constructor to assign a
+     * public name. This must be separate from registerWorker because
+     * it is called during the "super" constructor call in
+     * ForkJoinWorkerThread.
      */
-    final void updateRunningCount(int delta) {
-        int s;
-        do;while (!casWorkerCounts(s = workerCounts, s + delta));
+    final String nextWorkerName() {
+        return workerNamePrefix.concat
+            (Integer.toString(nextWorkerNumber.addAndGet(1)));
     }
 
     /**
-     * Add delta (which may be negative) to both total and running
-     * count.  This must be called upon creation and termination of
-     * worker threads.
-     * @param delta the number to add
+     * Callback from ForkJoinWorkerThread constructor to establish its
+     * poolIndex and record its WorkQueue. To avoid scanning bias due
+     * to packing entries in front of the workQueues array, we treat
+     * the array as a simple power-of-two hash table using per-thread
+     * seed as hash, expanding as needed.
+     *
+     * @param w the worker's queue
      */
-    private void updateWorkerCount(int delta) {
-        int d = delta + (delta << 16); // add to both lo and hi parts
-        int s;
-        do;while (!casWorkerCounts(s = workerCounts, s + d));
+    final void registerWorker(WorkQueue w) {
+        Mutex lock = this.lock;
+        lock.lock();
+        try {
+            WorkQueue[] ws = workQueues;
+            if (w != null && ws != null) {          // skip on shutdown/failure
+                int rs, n;
+                while ((n = ws.length) <            // ensure can hold total
+                       (parallelism + (short)(ctl >>> TC_SHIFT) << 1))
+                    workQueues = ws = Arrays.copyOf(ws, n << 1);
+                int m = n - 1;
+                int s = nextSeed += SEED_INCREMENT; // rarely-colliding sequence
+                w.seed = (s == 0) ? 1 : s;          // ensure non-zero seed
+                int r = (s << 1) | 1;               // use odd-numbered indices
+                while (ws[r &= m] != null)          // step by approx half size
+                    r += ((n >>> 1) & SQMASK) + 2;
+                w.eventCount = w.poolIndex = r;     // establish before recording
+                ws[r] = w;                          // also update seq
+                runState = ((rs = runState) & SHUTDOWN) | ((rs + 2) & ~SHUTDOWN);
+            }
+        } finally {
+            lock.unlock();
+        }
     }
 
     /**
-     * Lifecycle control. High word contains runState, low word
-     * contains the number of workers that are (probably) executing
-     * tasks. This value is atomically incremented before a worker
-     * gets a task to run, and decremented when worker has no tasks
-     * and cannot find any. These two fields are bundled together to
-     * support correct termination triggering.  Note: activeCount
-     * CAS'es cheat by assuming active count is in low word, so need
-     * to be modified if this changes
-     */
-    private volatile int runControl;
-
-    // RunState values. Order among values matters
-    private static final int RUNNING     = 0;
-    private static final int SHUTDOWN    = 1;
-    private static final int TERMINATING = 2;
-    private static final int TERMINATED  = 3;
+     * Final callback from terminating worker, as well as upon failure
+     * to construct or start a worker in addWorker.  Removes record of
+     * worker from array, and adjusts counts. If pool is shutting
+     * down, tries to complete termination.
+     *
+     * @param wt the worker thread or null if addWorker failed
+     * @param ex the exception causing failure, or null if none
+     */
+    final void deregisterWorker(ForkJoinWorkerThread wt, Throwable ex) {
+        Mutex lock = this.lock;
+        WorkQueue w = null;
+        if (wt != null && (w = wt.workQueue) != null) {
+            w.runState = -1;                // ensure runState is set
+            stealCount.getAndAdd(w.totalSteals + w.nsteals);
+            int idx = w.poolIndex;
+            lock.lock();
+            try {                           // remove record from array
+                WorkQueue[] ws = workQueues;
+                if (ws != null && idx >= 0 && idx < ws.length && ws[idx] == w)
+                    ws[idx] = null;
+            } finally {
+                lock.unlock();
+            }
+        }
 
-    private static int runStateOf(int c)             { return c >>> 16; }
-    private static int activeCountOf(int c)          { return c & shortMask; }
-    private static int runControlFor(int r, int a)   { return (r << 16) + a; }
+        long c;                             // adjust ctl counts
+        do {} while (!U.compareAndSwapLong
+                     (this, CTL, c = ctl, (((c - AC_UNIT) & AC_MASK) |
+                                           ((c - TC_UNIT) & TC_MASK) |
+                                           (c & ~(AC_MASK|TC_MASK)))));
+
+        if (!tryTerminate(false, false) && w != null) {
+            w.cancelAll();                  // cancel remaining tasks
+            if (w.array != null)            // suppress signal if never ran
+                signalWork();               // wake up or create replacement
+            if (ex == null)                 // help clean refs on way out
+                ForkJoinTask.helpExpungeStaleExceptions();
+        }
 
-    /**
-     * Try incrementing active count; fail on contention. Called by
-     * workers before/during executing tasks.
-     * @return true on success;
-     */
-    final boolean tryIncrementActiveCount() {
-        int c = runControl;
-        return casRunControl(c, c+1);
+        if (ex != null)                     // rethrow
+            U.throwException(ex);
     }
 
+
+    // Submissions
+
     /**
-     * Try decrementing active count; fail on contention.
-     * Possibly trigger termination on success
-     * Called by workers when they can't find tasks.
-     * @return true on success
-     */
-    final boolean tryDecrementActiveCount() {
-        int c = runControl;
-        int nextc = c - 1;
-        if (!casRunControl(c, nextc))
-            return false;
-        if (canTerminateOnShutdown(nextc))
-            terminateOnShutdown();
-        return true;
+     * Unless shutting down, adds the given task to a submission queue
+     * at submitter's current queue index (modulo submission
+     * range). If no queue exists at the index, one is created.  If
+     * the queue is busy, another index is randomly chosen. The
+     * submitMask bounds the effective number of queues to the
+     * (nearest power of two for) parallelism level.
+     *
+     * @param task the task. Caller must ensure non-null.
+     */
+    private void doSubmit(ForkJoinTask<?> task) {
+        Submitter s = submitters.get();
+        for (int r = s.seed, m = submitMask;;) {
+            WorkQueue[] ws; WorkQueue q;
+            int k = r & m & SQMASK;          // use only even indices
+            if (runState < 0 || (ws = workQueues) == null || ws.length <= k)
+                throw new RejectedExecutionException(); // shutting down
+            else if ((q = ws[k]) == null) {  // create new queue
+                WorkQueue nq = new WorkQueue(this, null, SHARED_QUEUE);
+                Mutex lock = this.lock;      // construct outside lock
+                lock.lock();
+                try {                        // recheck under lock
+                    int rs = runState;       // to update seq
+                    if (ws == workQueues && ws[k] == null) {
+                        ws[k] = nq;
+                        runState = ((rs & SHUTDOWN) | ((rs + 2) & ~SHUTDOWN));
+                    }
+                } finally {
+                    lock.unlock();
+                }
+            }
+            else if (q.trySharedPush(task)) {
+                signalWork();
+                return;
+            }
+            else if (m > 1) {                // move to a different index
+                r ^= r << 13;                // same xorshift as WorkQueues
+                r ^= r >>> 17;
+                s.seed = r ^= r << 5;
+            }
+            else
+                Thread.yield();              // yield if no alternatives
+        }
     }
 
+    // Maintaining ctl counts
+
     /**
-     * Return true if argument represents zero active count and
-     * nonzero runstate, which is the triggering condition for
-     * terminating on shutdown.
+     * Increments active count; mainly called upon return from blocking.
      */
-    private static boolean canTerminateOnShutdown(int c) {
-        return ((c & -c) >>> 16) != 0; // i.e. least bit is nonzero runState bit
+    final void incrementActiveCount() {
+        long c;
+        do {} while (!U.compareAndSwapLong(this, CTL, c = ctl, c + AC_UNIT));
     }
 
     /**
-     * Transition run state to at least the given state. Return true
-     * if not already at least given state.
+     * Tries to activate or create a worker if too few are active.
      */
-    private boolean transitionRunStateTo(int state) {
-        for (;;) {
-            int c = runControl;
-            if (runStateOf(c) >= state)
-                return false;
-            if (casRunControl(c, runControlFor(state, activeCountOf(c))))
-                return true;
+    final void signalWork() {
+        long c; int u;
+        while ((u = (int)((c = ctl) >>> 32)) < 0) {     // too few active
+            WorkQueue[] ws = workQueues; int e, i; WorkQueue w; Thread p;
+            if ((e = (int)c) > 0) {                     // at least one waiting
+                if (ws != null && (i = e & SMASK) < ws.length &&
+                    (w = ws[i]) != null && w.eventCount == (e | INT_SIGN)) {
+                    long nc = (((long)(w.nextWait & E_MASK)) |
+                               ((long)(u + UAC_UNIT) << 32));
+                    if (U.compareAndSwapLong(this, CTL, c, nc)) {
+                        w.eventCount = (e + E_SEQ) & E_MASK;
+                        if ((p = w.parker) != null)
+                            U.unpark(p);                // activate and release
+                        break;
+                    }
+                }
+                else
+                    break;
+            }
+            else if (e == 0 && (u & SHORT_SIGN) != 0) { // too few total
+                long nc = (long)(((u + UTC_UNIT) & UTC_MASK) |
+                                 ((u + UAC_UNIT) & UAC_MASK)) << 32;
+                if (U.compareAndSwapLong(this, CTL, c, nc)) {
+                    addWorker();
+                    break;
+                }
+            }
+            else
+                break;
         }
     }
 
-    /**
-     * Controls whether to add spares to maintain parallelism
-     */
-    private volatile boolean maintainsParallelism;
 
-    // Constructors
+    // Scanning for tasks
 
     /**
-     * Creates a ForkJoinPool with a pool size equal to the number of
-     * processors available on the system and using the default
-     * ForkJoinWorkerThreadFactory,
-     * @throws SecurityException if a security manager exists and
-     *         the caller is not permitted to modify threads
-     *         because it does not hold {@link
-     *         java.lang.RuntimePermission}<code>("modifyThread")</code>,
+     * Top-level runloop for workers, called by ForkJoinWorkerThread.run.
      */
-    public ForkJoinPool() {
-        this(Runtime.getRuntime().availableProcessors(),
-             defaultForkJoinWorkerThreadFactory);
+    final void runWorker(WorkQueue w) {
+        w.growArray(false);         // initialize queue array in this thread
+        do {} while (w.runTask(scan(w)));
     }
 
     /**
-     * Creates a ForkJoinPool with the indicated parellelism level
-     * threads, and using the default ForkJoinWorkerThreadFactory,
-     * @param parallelism the number of worker threads
-     * @throws IllegalArgumentException if parallelism less than or
-     * equal to zero
-     * @throws SecurityException if a security manager exists and
-     *         the caller is not permitted to modify threads
-     *         because it does not hold {@link
-     *         java.lang.RuntimePermission}<code>("modifyThread")</code>,
-     */
-    public ForkJoinPool(int parallelism) {
-        this(parallelism, defaultForkJoinWorkerThreadFactory);
+     * Scans for and, if found, returns one task, else possibly
+     * inactivates the worker. This method operates on single reads of
+     * volatile state and is designed to be re-invoked continuously,
+     * in part because it returns upon detecting inconsistencies,
+     * contention, or state changes that indicate possible success on
+     * re-invocation.
+     *
+     * The scan searches for tasks across a random permutation of
+     * queues (starting at a random index and stepping by a random
+     * relative prime, checking each at least once).  The scan
+     * terminates upon either finding a non-empty queue, or completing
+     * the sweep. If the worker is not inactivated, it takes and
+     * returns a task from this queue.  On failure to find a task, we
+     * take one of the following actions, after which the caller will
+     * retry calling this method unless terminated.
+     *
+     * * If pool is terminating, terminate the worker.
+     *
+     * * If not a complete sweep, try to release a waiting worker.  If
+     * the scan terminated because the worker is inactivated, then the
+     * released worker will often be the calling worker, and it can
+     * succeed obtaining a task on the next call. Or maybe it is
+     * another worker, but with same net effect. Releasing in other
+     * cases as well ensures that we have enough workers running.
+     *
+     * * If not already enqueued, try to inactivate and enqueue the
+     * worker on wait queue. Or, if inactivating has caused the pool
+     * to be quiescent, relay to idleAwaitWork to check for
+     * termination and possibly shrink pool.
+     *
+     * * If already inactive, and the caller has run a task since the
+     * last empty scan, return (to allow rescan) unless others are
+     * also inactivated.  Field WorkQueue.rescans counts down on each
+     * scan to ensure eventual inactivation and blocking.
+     *
+     * * If already enqueued and none of the above apply, park
+     * awaiting signal.
+     *
+     * @param w the worker (via its WorkQueue)
+     * @return a task or null if none found
+     */
+    private final ForkJoinTask<?> scan(WorkQueue w) {
+        WorkQueue[] ws;                       // first update random seed
+        int r = w.seed; r ^= r << 13; r ^= r >>> 17; w.seed = r ^= r << 5;
+        int rs = runState, m;                 // volatile read order matters
+        if ((ws = workQueues) != null && (m = ws.length - 1) > 0) {
+            int ec = w.eventCount;            // ec is negative if inactive
+            int step = (r >>> 16) | 1;        // relative prime
+            for (int j = (m + 1) << 2; ; r += step) {
+                WorkQueue q; ForkJoinTask<?> t; ForkJoinTask<?>[] a; int b;
+                if ((q = ws[r & m]) != null && (b = q.base) - q.top < 0 &&
+                    (a = q.array) != null) {  // probably nonempty
+                    int i = (((a.length - 1) & b) << ASHIFT) + ABASE;
+                    t = (ForkJoinTask<?>)U.getObjectVolatile(a, i);
+                    if (q.base == b && ec >= 0 && t != null &&
+                        U.compareAndSwapObject(a, i, t, null)) {
+                        q.base = b + 1;       // specialization of pollAt
+                        return t;
+                    }
+                    else if ((t != null || b + 1 != q.top) &&
+                             (ec < 0 || j <= m)) {
+                        rs = 0;               // mark scan as incomplete
+                        break;                // caller can retry after release
+                    }
+                }
+                if (--j < 0)
+                    break;
+            }
+            long c = ctl; int e = (int)c, a = (int)(c >> AC_SHIFT), nr, ns;
+            if (e < 0)                        // decode ctl on empty scan
+                w.runState = -1;              // pool is terminating
+            else if (rs == 0 || rs != runState) { // incomplete scan
+                WorkQueue v; Thread p;        // try to release a waiter
+                if (e > 0 && a < 0 && w.eventCount == ec &&
+                    (v = ws[e & m]) != null && v.eventCount == (e | INT_SIGN)) {
+                    long nc = ((long)(v.nextWait & E_MASK) |
+                               ((c + AC_UNIT) & (AC_MASK|TC_MASK)));
+                    if (ctl == c && U.compareAndSwapLong(this, CTL, c, nc)) {
+                        v.eventCount = (e + E_SEQ) & E_MASK;
+                        if ((p = v.parker) != null)
+                            U.unpark(p);
+                    }
+                }
+            }
+            else if (ec >= 0) {               // try to enqueue/inactivate
+                long nc = (long)ec | ((c - AC_UNIT) & (AC_MASK|TC_MASK));
+                w.nextWait = e;
+                w.eventCount = ec | INT_SIGN; // mark as inactive
+                if (ctl != c || !U.compareAndSwapLong(this, CTL, c, nc))
+                    w.eventCount = ec;        // unmark on CAS failure
+                else {
+                    if ((ns = w.nsteals) != 0) {
+                        w.nsteals = 0;        // set rescans if ran task
+                        w.rescans = (a > 0) ? 0 : a + parallelism;
+                        w.totalSteals += ns;
+                    }
+                    if (a == 1 - parallelism) // quiescent
+                        idleAwaitWork(w, nc, c);
+                }
+            }
+            else if (w.eventCount < 0) {      // already queued
+                if ((nr = w.rescans) > 0) {   // continue rescanning
+                    int ac = a + parallelism;
+                    if (((w.rescans = (ac < nr) ? ac : nr - 1) & 3) == 0)
+                        Thread.yield();       // yield before block
+                }
+                else {
+                    Thread.interrupted();     // clear status
+                    Thread wt = Thread.currentThread();
+                    U.putObject(wt, PARKBLOCKER, this);
+                    w.parker = wt;            // emulate LockSupport.park
+                    if (w.eventCount < 0)     // recheck
+                        U.park(false, 0L);
+                    w.parker = null;
+                    U.putObject(wt, PARKBLOCKER, null);
+                }
+            }
+        }
+        return null;
     }
 
     /**
-     * Creates a ForkJoinPool with parallelism equal to the number of
-     * processors available on the system and using the given
-     * ForkJoinWorkerThreadFactory,
-     * @param factory the factory for creating new threads
-     * @throws NullPointerException if factory is null
-     * @throws SecurityException if a security manager exists and
-     *         the caller is not permitted to modify threads
-     *         because it does not hold {@link
-     *         java.lang.RuntimePermission}<code>("modifyThread")</code>,
-     */
-    public ForkJoinPool(ForkJoinWorkerThreadFactory factory) {
-        this(Runtime.getRuntime().availableProcessors(), factory);
+     * If inactivating worker w has caused the pool to become
+     * quiescent, checks for pool termination, and, so long as this is
+     * not the only worker, waits for event for up to SHRINK_RATE
+     * nanosecs.  On timeout, if ctl has not changed, terminates the
+     * worker, which will in turn wake up another worker to possibly
+     * repeat this process.
+     *
+     * @param w the calling worker
+     * @param currentCtl the ctl value triggering possible quiescence
+     * @param prevCtl the ctl value to restore if thread is terminated
+     */
+    private void idleAwaitWork(WorkQueue w, long currentCtl, long prevCtl) {
+        if (w.eventCount < 0 && !tryTerminate(false, false) &&
+            (int)prevCtl != 0 && ctl == currentCtl) {
+            Thread wt = Thread.currentThread();
+            Thread.yield();            // yield before block
+            while (ctl == currentCtl) {
+                long startTime = System.nanoTime();
+                Thread.interrupted();  // timed variant of version in scan()
+                U.putObject(wt, PARKBLOCKER, this);
+                w.parker = wt;
+                if (ctl == currentCtl)
+                    U.park(false, SHRINK_RATE);
+                w.parker = null;
+                U.putObject(wt, PARKBLOCKER, null);
+                if (ctl != currentCtl)
+                    break;
+                if (System.nanoTime() - startTime >= SHRINK_TIMEOUT &&
+                    U.compareAndSwapLong(this, CTL, currentCtl, prevCtl)) {
+                    w.eventCount = (w.eventCount + E_SEQ) | E_MASK;
+                    w.runState = -1;   // shrink
+                    break;
+                }
+            }
+        }
     }
 
     /**
-     * Creates a ForkJoinPool with the given parallelism and factory.
+     * Tries to locate and execute tasks for a stealer of the given
+     * task, or in turn one of its stealers. Traces currentSteal ->
+     * currentJoin links looking for a thread working on a descendant
+     * of the given task and with a non-empty queue to steal back and
+     * execute tasks from. The first call to this method upon a
+     * waiting join will often entail scanning/search, (which is OK
+     * because the joiner has nothing better to do), but this method
+     * leaves hints in workers to speed up subsequent calls. The
+     * implementation is very branchy to cope with potential
+     * inconsistencies or loops encountering chains that are stale,
+     * unknown, or so long that they are likely cyclic.  All of these
+     * cases are dealt with by just retrying by caller.
      *
-     * @param parallelism the targeted number of worker threads
-     * @param factory the factory for creating new threads
-     * @throws IllegalArgumentException if parallelism less than or
-     * equal to zero, or greater than implementation limit.
-     * @throws NullPointerException if factory is null
-     * @throws SecurityException if a security manager exists and
-     *         the caller is not permitted to modify threads
-     *         because it does not hold {@link
-     *         java.lang.RuntimePermission}<code>("modifyThread")</code>,
-     */
-    public ForkJoinPool(int parallelism, ForkJoinWorkerThreadFactory factory) {
-        if (parallelism <= 0 || parallelism > MAX_THREADS)
-            throw new IllegalArgumentException();
-        if (factory == null)
-            throw new NullPointerException();
-        checkPermission();
-        this.factory = factory;
-        this.parallelism = parallelism;
-        this.maxPoolSize = MAX_THREADS;
-        this.maintainsParallelism = true;
-        this.poolNumber = poolNumberGenerator.incrementAndGet();
-        this.workerLock = new ReentrantLock();
-        this.termination = workerLock.newCondition();
-        this.stealCount = new AtomicLong();
-        this.submissionQueue = new LinkedTransferQueue<ForkJoinTask<?>>();
-        // worker array and workers are lazily constructed
-    }
+     * @param joiner the joining worker
+     * @param task the task to join
+     * @return true if found or ran a task (and so is immediately retryable)
+     */
+    private boolean tryHelpStealer(WorkQueue joiner, ForkJoinTask<?> task) {
+        WorkQueue[] ws;
+        int m, depth = MAX_HELP;                // remaining chain depth
+        boolean progress = false;               // set once a stealer's queue looks non-empty
+        if ((ws = workQueues) != null && (m = ws.length - 1) > 0 &&
+            task.status >= 0) {                 // nothing to do once status is negative
+            ForkJoinTask<?> subtask = task;     // current target
+            outer: for (WorkQueue j = joiner;;) {
+                WorkQueue stealer = null;       // find stealer of subtask
+                WorkQueue v = ws[j.stealHint & m]; // try hint
+                if (v != null && v.currentSteal == subtask)
+                    stealer = v;
+                else {                          // scan
+                    for (int i = 1; i <= m; i += 2) { // odd-indexed slots only
+                        if ((v = ws[i]) != null && v.currentSteal == subtask &&
+                            v != joiner) {
+                            stealer = v;
+                            j.stealHint = i;    // save hint
+                            break;
+                        }
+                    }
+                    if (stealer == null)
+                        break;                  // no queue reports subtask as its currentSteal
+                }
 
-    /**
-     * Create new worker using factory.
-     * @param index the index to assign worker
-     * @return new worker, or null of factory failed
-     */
-    private ForkJoinWorkerThread createWorker(int index) {
-        Thread.UncaughtExceptionHandler h = ueh;
-        ForkJoinWorkerThread w = factory.newThread(this);
-        if (w != null) {
-            w.poolIndex = index;
-            w.setDaemon(true);
-            w.setAsyncMode(locallyFifo);
-            w.setName("ForkJoinPool-" + poolNumber + "-worker-" + index);
-            if (h != null)
-                w.setUncaughtExceptionHandler(h);
+                for (WorkQueue q = stealer;;) { // try to help stealer
+                    ForkJoinTask[] a; ForkJoinTask<?> t; int b;
+                    if (task.status < 0)
+                        break outer;            // the joined task itself has finished
+                    if ((b = q.base) - q.top < 0 && (a = q.array) != null) {
+                        progress = true;        // reported to caller even if the take below fails
+                        int i = (((a.length - 1) & b) << ASHIFT) + ABASE; // offset of slot b, as passed to U
+                        t = (ForkJoinTask<?>)U.getObjectVolatile(a, i);
+                        if (subtask.status < 0) // must recheck before taking
+                            break outer;
+                        if (t != null &&
+                            q.base == b &&
+                            U.compareAndSwapObject(a, i, t, null)) { // claim the slot by nulling it
+                            q.base = b + 1;     // advance base past the taken slot
+                            joiner.runSubtask(t);
+                        }
+                        else if (q.base == b)
+                            break outer;        // possibly stalled
+                    }
+                    else {                      // descend
+                        ForkJoinTask<?> next = stealer.currentJoin;
+                        if (--depth <= 0 || subtask.status < 0 ||
+                            next == null || next == subtask)
+                            break outer;        // stale, dead-end, or cyclic
+                        subtask = next;         // now look for whoever stole the stealer's join target
+                        j = stealer;            // hints are read from / saved on this queue next round
+                        break;                  // back to the outer loop
+                    }
+                }
+            }
         }
-        return w;
+        return progress;
     }
 
     /**
-     * Return a good size for worker array given pool size.
-     * Currently requires size to be a power of two.
+     * If task is at base of some steal queue, steals and executes it.
+     *
+     * @param joiner the joining worker
+     * @param task the task
      */
-    private static int arraySizeFor(int ps) {
-        return ps <= 1? 1 : (1 << (32 - Integer.numberOfLeadingZeros(ps-1)));
+    private void tryPollForAndExec(WorkQueue joiner, ForkJoinTask<?> task) {
+        WorkQueue[] ws;
+        if ((ws = workQueues) != null) {
+            for (int j = 1; j < ws.length && task.status >= 0; j += 2) { // odd-indexed slots, while status is non-negative
+                WorkQueue q = ws[j];
+                if (q != null && q.pollFor(task)) { // presumably true only if task was taken from q — confirm in WorkQueue
+                    joiner.runSubtask(task);
+                    break;                          // at most one execution attempt
+                }
+            }
+        }
     }
 
-    public static ForkJoinWorkerThread[] copyOfWorkers(ForkJoinWorkerThread[] original, int newLength) {
-        ForkJoinWorkerThread[] copy = new ForkJoinWorkerThread[newLength];
-        System.arraycopy(original, 0, copy, 0, Math.min(newLength, original.length));
-        return copy;
+    /**
+     * Tries to decrement active count (sometimes implicitly) and
+     * possibly release or create a compensating worker in preparation
+     * for blocking. Fails on contention or termination. Otherwise,
+     * adds a new thread if no idle workers are available and either
+     * pool would become completely starved or: (at least half
+     * starved, and fewer than 50% spares exist, and there is at least
+     * one task apparently available). Even though the availability
+     * check requires a full scan, it is worthwhile in reducing false
+     * alarms.
+     *
+     * @param task if non-null, a task being waited for
+     * @param blocker if non-null, a blocker being waited for
+     * @return true if the caller can block, else should recheck and retry
+     */
+    final boolean tryCompensate(ForkJoinTask<?> task, ManagedBlocker blocker) {
+        int pc = parallelism, e;
+        long c = ctl;                    // snapshot; every CAS below is against this value
+        WorkQueue[] ws = workQueues;
+        if ((e = (int)c) >= 0 && ws != null) { // e: low 32 bits of ctl
+            int u, a, ac, hc;
+            int tc = (short)((u = (int)(c >>> 32)) >>> UTC_SHIFT) + pc; // total count (ctl field is offset by -parallelism)
+            boolean replace = false;
+            if ((a = u >> UAC_SHIFT) <= 0) {   // a: active-count field, also stored offset
+                if ((ac = a + pc) <= 1)        // ac: un-offset active count
+                    replace = true;
+                else if ((e > 0 || (task != null &&
+                                    ac <= (hc = pc >>> 1) && tc < pc + hc))) {
+                    WorkQueue w;
+                    for (int j = 0; j < ws.length; ++j) { // full scan for any non-empty queue
+                        if ((w = ws[j]) != null && !w.isEmpty()) {
+                            replace = true;
+                            break;   // in compensation range and tasks available
+                        }
+                    }
+                }
+            }
+            if ((task == null || task.status >= 0) && // recheck need to block
+                (blocker == null || !blocker.isReleasable()) && ctl == c) {
+                if (!replace) {          // no compensation
+                    long nc = ((c - AC_UNIT) & AC_MASK) | (c & ~AC_MASK); // active count - 1, other bits kept
+                    if (U.compareAndSwapLong(this, CTL, c, nc))
+                        return true;
+                }
+                else if (e != 0) {       // release an idle worker
+                    WorkQueue w; Thread p; int i;
+                    if ((i = e & SMASK) < ws.length && (w = ws[i]) != null) {
+                        long nc = ((long)(w.nextWait & E_MASK) |
+                                   (c & (AC_MASK|TC_MASK))); // low bits <- w.nextWait; counts unchanged
+                        if (w.eventCount == (e | INT_SIGN) &&
+                            U.compareAndSwapLong(this, CTL, c, nc)) {
+                            w.eventCount = (e + E_SEQ) & E_MASK;
+                            if ((p = w.parker) != null)
+                                U.unpark(p);
+                            return true;
+                        }
+                    }
+                }
+                else if (tc < MAX_CAP) { // create replacement
+                    long nc = ((c + TC_UNIT) & TC_MASK) | (c & ~TC_MASK); // total count + 1, other bits kept
+                    if (U.compareAndSwapLong(this, CTL, c, nc)) {
+                        addWorker();
+                        return true;
+                    }
+                }
+            }
+        }
+        return false;                    // any failed check or lost CAS: caller rechecks and retries
     }
 
     /**
-     * Create or resize array if necessary to hold newLength.
-     * Call only under exlusion or lock
-     * @return the array
-     */
-    private ForkJoinWorkerThread[] ensureWorkerArrayCapacity(int newLength) {
-        ForkJoinWorkerThread[] ws = workers;
-        if (ws == null)
-            return workers = new ForkJoinWorkerThread[arraySizeFor(newLength)];
-        else if (newLength > ws.length)
-            return workers = copyOfWorkers(ws, arraySizeFor(newLength));
-        else
-            return ws;
+     * Helps and/or blocks until the given task is done.
+     *
+     * @param joiner the joining worker
+     * @param task the task
+     * @return task status on exit
+     */
+    final int awaitJoin(WorkQueue joiner, ForkJoinTask<?> task) {
+        ForkJoinTask<?> prevJoin = joiner.currentJoin; // restored on exit
+        joiner.currentJoin = task;
+        long startTime = 0L;
+        for (int k = 0, s; ; ++k) {
+            if ((joiner.isEmpty() ?                  // try to help
+                 !tryHelpStealer(joiner, task) :
+                 !joiner.tryRemoveAndExec(task))) {
+                if (k == 0) {
+                    startTime = System.nanoTime();   // start of the COMPENSATION_DELAY measurement
+                    tryPollForAndExec(joiner, task); // check uncommon case
+                }
+                else if ((k & (MAX_HELP - 1)) == 0 && // every MAX_HELP-th pass, if MAX_HELP is a power of two
+                         System.nanoTime() - startTime >= COMPENSATION_DELAY &&
+                         tryCompensate(task, null)) {
+                    if (task.trySetSignal() && task.status >= 0) {
+                        synchronized (task) {
+                            if (task.status >= 0) {
+                                try {                // see ForkJoinTask
+                                    task.wait();     //  for explanation
+                                } catch (InterruptedException ie) { // ignored here; loop rechecks status
+                                }
+                            }
+                            else
+                                task.notifyAll();    // status went negative before we could wait
+                        }
+                    }
+                    long c;                          // re-activate
+                    do {} while (!U.compareAndSwapLong
+                                 (this, CTL, c = ctl, c + AC_UNIT));
+                }
+            }
+            if ((s = task.status) < 0) {
+                joiner.currentJoin = prevJoin;
+                return s;                           // only exit from the loop
+            }
+            else if ((k & (MAX_HELP - 1)) == MAX_HELP >>> 1)
+                Thread.yield();                     // for politeness
+        }
     }
 
     /**
-     * Try to shrink workers into smaller array after one or more terminate
+     * Stripped-down variant of awaitJoin used by timed joins. Tries
+     * to help join only while there is continuous progress. (Caller
+     * will then enter a timed wait.)
+     *
+     * @param joiner the joining worker
+     * @param task the task
+     * @return task status on exit
      */
-    private void tryShrinkWorkerArray() {
-        ForkJoinWorkerThread[] ws = workers;
-        if (ws != null) {
-            int len = ws.length;
-            int last = len - 1;
-            while (last >= 0 && ws[last] == null)
-                --last;
-            int newLength = arraySizeFor(last+1);
-            if (newLength < len)
-                workers = copyOfWorkers(ws, newLength);
+    final int helpJoinOnce(WorkQueue joiner, ForkJoinTask<?> task) {
+        int s;
+        while ((s = task.status) >= 0 &&
+               (joiner.isEmpty() ?
+                tryHelpStealer(joiner, task) :
+                joiner.tryRemoveAndExec(task)))
+            ;                       // empty body: all work happens in the loop condition
+        return s;                   // still >= 0 if a helping attempt returned false first
+    }
+
+    /**
+     * Returns a (probably) non-empty steal queue, if one is found
+     * during a random, then cyclic scan, else null.  This method must
+     * be retried by caller if, by the time it tries to use the queue,
+     * it is empty.
+     */
+    private WorkQueue findNonEmptyStealQueue(WorkQueue w) {
+        // Similar to loop in scan(), but ignoring submissions
+        int r = w.seed; r ^= r << 13; r ^= r >>> 17; w.seed = r ^= r << 5; // xorshift
+        int step = (r >>> 16) | 1;              // always odd
+        for (WorkQueue[] ws;;) {
+            int rs = runState, m;               // rs: snapshot compared after a failed scan
+            if ((ws = workQueues) == null || (m = ws.length - 1) < 1)
+                return null;
+            for (int j = (m + 1) << 2; ; r += step) { // probe budget: 4 * table length
+                WorkQueue q = ws[((r << 1) | 1) & m]; // index is always odd
+                if (q != null && !q.isEmpty())
+                    return q;
+                else if (--j < 0) {
+                    if (runState == rs)
+                        return null;            // runState unchanged over the scan: give up
+                    break;                      // runState changed: rescan from the outer loop
+                }
+            }
         }
     }
 
     /**
-     * Initialize workers if necessary
-     */
-    final void ensureWorkerInitialization() {
-        ForkJoinWorkerThread[] ws = workers;
-        if (ws == null) {
-            final ReentrantLock lock = this.workerLock;
-            lock.lock();
-            try {
-                ws = workers;
-                if (ws == null) {
-                    int ps = parallelism;
-                    ws = ensureWorkerArrayCapacity(ps);
-                    for (int i = 0; i < ps; ++i) {
-                        ForkJoinWorkerThread w = createWorker(i);
-                        if (w != null) {
-                            ws[i] = w;
-                            w.start();
-                            updateWorkerCount(1);
-                        }
-                    }
+     * Runs tasks until {@code isQuiescent()}. We piggyback on
+     * active count ctl maintenance, but rather than blocking
+     * when tasks cannot be found, we rescan until all others cannot
+     * find tasks either.
+     */
+    final void helpQuiescePool(WorkQueue w) {
+        for (boolean active = true;;) { // active: whether this caller is currently counted in ctl
+            if (w.base - w.top < 0)
+                w.runLocalTasks();  // exhaust local queue
+            WorkQueue q = findNonEmptyStealQueue(w);
+            if (q != null) {
+                ForkJoinTask<?> t; int b;
+                if (!active) {      // re-establish active count
+                    long c;
+                    active = true;
+                    do {} while (!U.compareAndSwapLong
+                                 (this, CTL, c = ctl, c + AC_UNIT));
+                }
+                if ((b = q.base) - q.top < 0 && (t = q.pollAt(b)) != null)
+                    w.runSubtask(t);
+            }
+            else {
+                long c;
+                if (active) {       // decrement active count without queuing
+                    active = false;
+                    do {} while (!U.compareAndSwapLong
+                                 (this, CTL, c = ctl, c -= AC_UNIT));
+                }
+                else
+                    c = ctl;        // re-increment on exit
+                if ((int)(c >> AC_SHIFT) + parallelism == 0) { // un-offset active count is zero
+                    do {} while (!U.compareAndSwapLong
+                                 (this, CTL, c = ctl, c + AC_UNIT));
+                    break;          // active count restored just above
+                }
-            } finally {
-                lock.unlock();
             }
         }
     }
 
     /**
-     * Worker creation and startup for threads added via setParallelism.
+     * Gets and removes a local or stolen task for the given worker.
+     *
+     * @return a task, if available
+     */
+    final ForkJoinTask<?> nextTaskFor(WorkQueue w) {
+        for (ForkJoinTask<?> t;;) {
+            WorkQueue q; int b;
+            if ((t = w.nextLocalTask()) != null)
+                return t;           // local work is preferred
+            if ((q = findNonEmptyStealQueue(w)) == null)
+                return null;        // nothing local and no non-empty queue found
+            if ((b = q.base) - q.top < 0 && (t = q.pollAt(b)) != null)
+                return t;           // otherwise q emptied or pollAt failed: loop and retry
+        }
+    }
+
+    /**
+     * Returns the approximate (non-atomic) number of idle threads per
+     * active thread to offset steal queue size for method
+     * ForkJoinTask.getSurplusQueuedTaskCount().
      */
-    private void createAndStartAddedWorkers() {
-        resumeAllSpares();  // Allow spares to convert to nonspare
-        int ps = parallelism;
-        ForkJoinWorkerThread[] ws = ensureWorkerArrayCapacity(ps);
-        int len = ws.length;
-        // Sweep through slots, to keep lowest indices most populated
-        int k = 0;
-        while (k < len) {
-            if (ws[k] != null) {
-                ++k;
-                continue;
+    final int idlePerActive() {
+        // Approximate at powers of two for small values; beyond 4, saturate at 8
+        int p = parallelism;
+        int a = p + (int)(ctl >> AC_SHIFT); // un-offset active count
+        return (a > (p >>>= 1) ? 0 :        // more than 1/2 of parallelism active
+                a > (p >>>= 1) ? 1 :        // more than 1/4
+                a > (p >>>= 1) ? 2 :        // more than 1/8
+                a > (p >>>= 1) ? 4 :        // more than 1/16
+                8);
+    }
+
+    //  Termination
+
+    /**
+     * Possibly initiates and/or completes termination.  The caller
+     * triggering termination runs three passes through workQueues:
+     * (0) Setting termination status, followed by wakeups of queued
+     * workers; (1) cancelling all tasks; (2) interrupting lagging
+     * threads (likely in external tasks, but possibly also blocked in
+     * joins).  Each pass repeats previous steps because of potential
+     * lagging thread creation.
+     *
+     * @param now if true, unconditionally terminate, else only
+     * if no work and no active workers
+     * @param enable if true, enable shutdown when next possible
+     * @return true if now terminating or terminated
+     */
+    private boolean tryTerminate(boolean now, boolean enable) {
+        Mutex lock = this.lock;
+        for (long c;;) {
+            if (((c = ctl) & STOP_BIT) != 0) {      // already terminating
+                if ((short)(c >>> TC_SHIFT) == -parallelism) {
+                    lock.lock();                    // don't need try/finally
+                    termination.signalAll();        // signal when 0 workers
+                    lock.unlock();
+                }
+                return true;
             }
-            int s = workerCounts;
-            int tc = totalCountOf(s);
-            int rc = runningCountOf(s);
-            if (rc >= ps || tc >= ps)
-                break;
-            if (casWorkerCounts (s, workerCountsFor(tc+1, rc+1))) {
-                ForkJoinWorkerThread w = createWorker(k);
-                if (w != null) {
-                    ws[k++] = w;
-                    w.start();
+            if (runState >= 0) {                    // not yet enabled
+                if (!enable)
+                    return false;
+                lock.lock();
+                runState |= SHUTDOWN;               // NOTE(review): presumably makes runState negative — confirm SHUTDOWN value
+                lock.unlock();
+            }
+            if (!now) {                             // check if idle & no tasks
+                if ((int)(c >> AC_SHIFT) != -parallelism || // un-offset active count is non-zero
+                    hasQueuedSubmissions())
+                    return false;
+                // Check for unqueued inactive workers. One pass suffices.
+                WorkQueue[] ws = workQueues; WorkQueue w;
+                if (ws != null) {
+                    for (int i = 1; i < ws.length; i += 2) { // odd-indexed slots only
+                        if ((w = ws[i]) != null && w.eventCount >= 0)
+                            return false;
+                    }
                 }
-                else {
-                    updateWorkerCount(-1); // back out on failed creation
-                    break;
+            }
+            if (U.compareAndSwapLong(this, CTL, c, c | STOP_BIT)) { // on CAS failure, loop and re-read ctl
+                for (int pass = 0; pass < 3; ++pass) {
+                    WorkQueue[] ws = workQueues;
+                    if (ws != null) {
+                        WorkQueue w;
+                        int n = ws.length;
+                        for (int i = 0; i < n; ++i) {
+                            if ((w = ws[i]) != null) {
+                                w.runState = -1;    // every pass
+                                if (pass > 0) {
+                                    w.cancelAll();  // passes 1 and 2
+                                    if (pass > 1)
+                                        w.interruptOwner(); // pass 2 only
+                                }
+                            }
+                        }
+                        // Wake up workers parked on event queue
+                        int i, e; long cc; Thread p;
+                        while ((e = (int)(cc = ctl) & E_MASK) != 0 &&
+                               (i = e & SMASK) < n &&
+                               (w = ws[i]) != null) {
+                            long nc = ((long)(w.nextWait & E_MASK) |
+                                       ((cc + AC_UNIT) & AC_MASK) | // active count + 1 for the woken worker
+                                       (cc & (TC_MASK|STOP_BIT)));  // total count and STOP_BIT kept
+                            if (w.eventCount == (e | INT_SIGN) &&
+                                U.compareAndSwapLong(this, CTL, cc, nc)) {
+                                w.eventCount = (e + E_SEQ) & E_MASK;
+                                w.runState = -1;
+                                if ((p = w.parker) != null)
+                                    U.unpark(p);
+                            }
+                        }
+                    }
                 }
             }
         }
     }
 
-    // Execution methods
+    // Exported methods
+
+    // Constructors
+
+    /**
+     * Creates a {@code ForkJoinPool} with parallelism equal to {@link
+     * java.lang.Runtime#availableProcessors}, using the {@linkplain
+     * #defaultForkJoinWorkerThreadFactory default thread factory},
+     * no UncaughtExceptionHandler, and non-async LIFO processing mode.
+     *
+     * @throws SecurityException if a security manager exists and
+     *         the caller is not permitted to modify threads
+     *         because it does not hold {@link
+     *         java.lang.RuntimePermission}{@code ("modifyThread")}
+     */
+    public ForkJoinPool() {
+        this(Runtime.getRuntime().availableProcessors(),
+             defaultForkJoinWorkerThreadFactory, null, false); // null handler, asyncMode off
+    }
+
+    /**
+     * Creates a {@code ForkJoinPool} with the indicated parallelism
+     * level, the {@linkplain
+     * #defaultForkJoinWorkerThreadFactory default thread factory},
+     * no UncaughtExceptionHandler, and non-async LIFO processing mode.
+     *
+     * @param parallelism the parallelism level
+     * @throws IllegalArgumentException if parallelism less than or
+     *         equal to zero, or greater than implementation limit
+     * @throws SecurityException if a security manager exists and
+     *         the caller is not permitted to modify threads
+     *         because it does not hold {@link
+     *         java.lang.RuntimePermission}{@code ("modifyThread")}
+     */
+    public ForkJoinPool(int parallelism) {
+        this(parallelism, defaultForkJoinWorkerThreadFactory, null, false); // null handler, asyncMode off
+    }
 
     /**
-     * Common code for execute, invoke and submit
+     * Creates a {@code ForkJoinPool} with the given parameters.
+     *
+     * @param parallelism the parallelism level. For default value,
+     * use {@link java.lang.Runtime#availableProcessors}.
+     * @param factory the factory for creating new threads. For default value,
+     * use {@link #defaultForkJoinWorkerThreadFactory}.
+     * @param handler the handler for internal worker threads that
+     * terminate due to unrecoverable errors encountered while executing
+     * tasks. For default value, use {@code null}.
+     * @param asyncMode if true,
+     * establishes local first-in-first-out scheduling mode for forked
+     * tasks that are never joined. This mode may be more appropriate
+     * than default locally stack-based mode in applications in which
+     * worker threads only process event-style asynchronous tasks.
+     * For default value, use {@code false}.
+     * @throws IllegalArgumentException if parallelism less than or
+     *         equal to zero, or greater than implementation limit
+     * @throws NullPointerException if the factory is null
+     * @throws SecurityException if a security manager exists and
+     *         the caller is not permitted to modify threads
+     *         because it does not hold {@link
+     *         java.lang.RuntimePermission}{@code ("modifyThread")}
      */
-    private <T> void doSubmit(ForkJoinTask<T> task) {
-        if (isShutdown())
-            throw new RejectedExecutionException();
-        if (workers == null)
-            ensureWorkerInitialization();
-        submissionQueue.offer(task);
-        signalIdleWorkers();
+    public ForkJoinPool(int parallelism,
+                        ForkJoinWorkerThreadFactory factory,
+                        Thread.UncaughtExceptionHandler handler,
+                        boolean asyncMode) {
+        checkPermission();              // runs before any argument validation
+        if (factory == null)
+            throw new NullPointerException();
+        if (parallelism <= 0 || parallelism > MAX_CAP)
+            throw new IllegalArgumentException();
+        this.parallelism = parallelism;
+        this.factory = factory;
+        this.ueh = handler;             // not null-checked; stored as given
+        this.localMode = asyncMode ? FIFO_QUEUE : LIFO_QUEUE;
+        long np = (long)(-parallelism); // offset ctl counts
+        this.ctl = ((np << AC_SHIFT) & AC_MASK) | ((np << TC_SHIFT) & TC_MASK);
+        // Round up to a power of 2 for workQueues size. See Hacker's Delight sec 3.2.
+        int n = parallelism - 1;
+        n |= n >>> 1; n |= n >>> 2; n |= n >>> 4; n |= n >>> 8; n |= n >>> 16;
+        int size = (n + 1) << 1;        // #slots = 2*#workers
+        this.submitMask = size - 1;     // room for max # of submit queues
+        this.workQueues = new WorkQueue[size];
+        this.termination = (this.lock = new Mutex()).newCondition();
+        this.stealCount = new AtomicLong();
+        this.nextWorkerNumber = new AtomicInteger();
+        int pn = poolNumberGenerator.incrementAndGet();
+        StringBuilder sb = new StringBuilder("ForkJoinPool-");
+        sb.append(Integer.toString(pn));
+        sb.append("-worker-");
+        this.workerNamePrefix = sb.toString(); // "ForkJoinPool-<pn>-worker-"
+        lock.lock();
+        this.runState = 1;              // set init flag
+        lock.unlock();
     }
 
+    // Execution methods
+
     /**
-     * Performs the given task; returning its result upon completion
+     * Performs the given task, returning its result upon completion.
+     * If the computation encounters an unchecked Exception or Error,
+     * it is rethrown as the outcome of this invocation.  Rethrown
+     * exceptions behave in the same way as regular exceptions, but,
+     * when possible, contain stack traces (as displayed for example
+     * using {@code ex.printStackTrace()}) of both the current thread
+     * as well as the thread actually encountering the exception;
+     * minimally only the latter.
+     *
      * @param task the task
      * @return the task's result
-     * @throws NullPointerException if task is null
-     * @throws RejectedExecutionException if pool is shut down
+     * @throws NullPointerException if the task is null
+     * @throws RejectedExecutionException if the task cannot be
+     *         scheduled for execution
      */
     public <T> T invoke(ForkJoinTask<T> task) {
+        if (task == null)
+            throw new NullPointerException();
         doSubmit(task);      // doSubmit is defined outside this excerpt
         return task.join();  // result comes from join()
     }
 
     /**
      * Arranges for (asynchronous) execution of the given task.
+     *
      * @param task the task
-     * @throws NullPointerException if task is null
-     * @throws RejectedExecutionException if pool is shut down
+     * @throws NullPointerException if the task is null
+     * @throws RejectedExecutionException if the task cannot be
+     *         scheduled for execution
      */
-    public <T> void execute(ForkJoinTask<T> task) {
+    public void execute(ForkJoinTask<?> task) {
+        if (task == null)
+            throw new NullPointerException();
         doSubmit(task);      // returns without joining the task
     }
 
     // AbstractExecutorService methods
 
+    /**
+     * @throws NullPointerException if the task is null
+     * @throws RejectedExecutionException if the task cannot be
+     *         scheduled for execution
+     */
     public void execute(Runnable task) {
-        doSubmit(new AdaptedRunnable<Void>(task, null));
+        if (task == null)
+            throw new NullPointerException();
+        ForkJoinTask<?> job;
+        if (task instanceof ForkJoinTask<?>) // avoid re-wrap
+            job = (ForkJoinTask<?>) task;
+        else
+            job = new ForkJoinTask.AdaptedRunnableAction(task); // adapter class is declared in ForkJoinTask
+        doSubmit(job);
+    }
+
+    /**
+     * Submits a ForkJoinTask for execution.
+     *
+     * @param task the task to submit
+     * @return the task
+     * @throws NullPointerException if the task is null
+     * @throws RejectedExecutionException if the task cannot be
+     *         scheduled for execution
+     */
+    public <T> ForkJoinTask<T> submit(ForkJoinTask<T> task) {
+        if (task == null)
+            throw new NullPointerException();
+        doSubmit(task);
+        return task;         // the same instance that was passed in
     }
 
+    /**
+     * @throws NullPointerException if the task is null
+     * @throws RejectedExecutionException if the task cannot be
+     *         scheduled for execution
+     */
     public <T> ForkJoinTask<T> submit(Callable<T> task) {
-        ForkJoinTask<T> job = new AdaptedCallable<T>(task);
+        ForkJoinTask<T> job = new ForkJoinTask.AdaptedCallable<T>(task); // NOTE(review): no null check here; presumably the adapter rejects null — confirm
         doSubmit(job);
         return job;          // the adapter, not the caller's Callable
     }
 
+    /**
+     * @throws NullPointerException if the task is null
+     * @throws RejectedExecutionException if the task cannot be
+     *         scheduled for execution
+     */
     public <T> ForkJoinTask<T> submit(Runnable task, T result) {
-        ForkJoinTask<T> job = new AdaptedRunnable<T>(task, result);
+        ForkJoinTask<T> job = new ForkJoinTask.AdaptedRunnable<T>(task, result); // NOTE(review): no null check here; presumably the adapter rejects null — confirm
         doSubmit(job);
         return job;          // the adapter, not the caller's Runnable
     }
 
+    /**
+     * @throws NullPointerException if the task is null
+     * @throws RejectedExecutionException if the task cannot be
+     *         scheduled for execution
+     */
     public ForkJoinTask<?> submit(Runnable task) {
-        ForkJoinTask<Void> job = new AdaptedRunnable<Void>(task, null);
+        if (task == null)
+            throw new NullPointerException();
+        ForkJoinTask<?> job;
+        if (task instanceof ForkJoinTask<?>) // avoid re-wrap
+            job = (ForkJoinTask<?>) task;
+        else
+            job = new ForkJoinTask.AdaptedRunnableAction(task); // same wrapping rule as execute(Runnable)
         doSubmit(job);
         return job;          // either the caller's own ForkJoinTask or the new adapter
     }
 
     /**
-     * Adaptor for Runnables. This implements RunnableFuture
-     * to be compliant with AbstractExecutorService constraints
+     * @throws NullPointerException       {@inheritDoc}
+     * @throws RejectedExecutionException {@inheritDoc}
      */
-    static final class AdaptedRunnable<T> extends ForkJoinTask<T>
-        implements RunnableFuture<T> {
-        final Runnable runnable;
-        final T resultOnCompletion;
-        T result;
-        AdaptedRunnable(Runnable runnable, T result) {
-            if (runnable == null) throw new NullPointerException();
-            this.runnable = runnable;
-            this.resultOnCompletion = result;
-        }
-        public T getRawResult() { return result; }
-        public void setRawResult(T v) { result = v; }
-        public boolean exec() {
-            runnable.run();
-            result = resultOnCompletion;
-            return true;
-        }
-        public void run() { invoke(); }
-    }
-
-    /**
-     * Adaptor for Callables
-     */
-    static final class AdaptedCallable<T> extends ForkJoinTask<T>
-        implements RunnableFuture<T> {
-        final Callable<T> callable;
-        T result;
-        AdaptedCallable(Callable<T> callable) {
-            if (callable == null) throw new NullPointerException();
-            this.callable = callable;
-        }
-        public T getRawResult() { return result; }
-        public void setRawResult(T v) { result = v; }
-        public boolean exec() {
-            try {
-                result = callable.call();
-                return true;
-            } catch (Error err) {
-                throw err;
-            } catch (RuntimeException rex) {
-                throw rex;
-            } catch (Exception ex) {
-                throw new RuntimeException(ex);
-            }
-        }
-        public void run() { invoke(); }
-    }
-
     public <T> List<Future<T>> invokeAll(Collection<? extends Callable<T>> tasks) {
-        ArrayList<ForkJoinTask<T>> ts =
-            new ArrayList<ForkJoinTask<T>>(tasks.size());
-        for (Callable<T> c : tasks)
-            ts.add(new AdaptedCallable<T>(c));
-        invoke(new InvokeAll<T>(ts));
-        return (List<Future<T>>)(List)ts;
-    }
-
-    static final class InvokeAll<T> extends RecursiveAction {
-        final ArrayList<ForkJoinTask<T>> tasks;
-        InvokeAll(ArrayList<ForkJoinTask<T>> tasks) { this.tasks = tasks; }
-        public void compute() {
-            try { invokeAll(tasks); } catch(Exception ignore) {}
+        // In previous versions of this class, this method constructed
+        // a task to run ForkJoinTask.invokeAll, but now external
+        // invocation of multiple tasks is at least as efficient.
+        List<ForkJoinTask<T>> fs = new ArrayList<ForkJoinTask<T>>(tasks.size());
+        // Workaround needed because method wasn't declared with
+        // wildcards in return type but should have been.
+        @SuppressWarnings({"unchecked", "rawtypes"})
+            List<Future<T>> futures = (List<Future<T>>) (List) fs; // same list object as fs, viewed as futures
+
+        boolean done = false;           // stays false if anything below throws
+        try {
+            for (Callable<T> t : tasks) {
+                ForkJoinTask<T> f = new ForkJoinTask.AdaptedCallable<T>(t);
+                doSubmit(f);
+                fs.add(f);              // recorded only after doSubmit returned normally
+            }
+            for (ForkJoinTask<T> f : fs)
+                f.quietlyJoin();        // every task is joined before returning
+            done = true;
+            return futures;
+        } finally {
+            if (!done)                  // exceptional exit: cancel every task recorded so far
+                for (ForkJoinTask<T> f : fs)
+                    f.cancel(false);
         }
     }
 
-    // Configuration and status settings and queries
-
     /**
-     * Returns the factory used for constructing new workers
+     * Returns the factory used for constructing new workers.
      *
      * @return the factory used for constructing new workers
      */
@@ -674,92 +2329,17 @@ public class ForkJoinPool /*extends AbstractExecutorService*/ {
     /**
      * Returns the handler for internal worker threads that terminate
      * due to unrecoverable errors encountered while executing tasks.
-     * @return the handler, or null if none
-     */
-    public Thread.UncaughtExceptionHandler getUncaughtExceptionHandler() {
-        Thread.UncaughtExceptionHandler h;
-        final ReentrantLock lock = this.workerLock;
-        lock.lock();
-        try {
-            h = ueh;
-        } finally {
-            lock.unlock();
-        }
-        return h;
-    }
-
-    /**
-     * Sets the handler for internal worker threads that terminate due
-     * to unrecoverable errors encountered while executing tasks.
-     * Unless set, the current default or ThreadGroup handler is used
-     * as handler.
      *
-     * @param h the new handler
-     * @return the old handler, or null if none
-     * @throws SecurityException if a security manager exists and
-     *         the caller is not permitted to modify threads
-     *         because it does not hold {@link
-     *         java.lang.RuntimePermission}<code>("modifyThread")</code>,
+     * @return the handler, or {@code null} if none
      */
-    public Thread.UncaughtExceptionHandler
-        setUncaughtExceptionHandler(Thread.UncaughtExceptionHandler h) {
-        checkPermission();
-        Thread.UncaughtExceptionHandler old = null;
-        final ReentrantLock lock = this.workerLock;
-        lock.lock();
-        try {
-            old = ueh;
-            ueh = h;
-            ForkJoinWorkerThread[] ws = workers;
-            if (ws != null) {
-                for (int i = 0; i < ws.length; ++i) {
-                    ForkJoinWorkerThread w = ws[i];
-                    if (w != null)
-                        w.setUncaughtExceptionHandler(h);
-                }
-            }
-        } finally {
-            lock.unlock();
-        }
-        return old;
-    }
-
-
-    /**
-     * Sets the target paralleism level of this pool.
-     * @param parallelism the target parallelism
-     * @throws IllegalArgumentException if parallelism less than or
-     * equal to zero or greater than maximum size bounds.
-     * @throws SecurityException if a security manager exists and
-     *         the caller is not permitted to modify threads
-     *         because it does not hold {@link
-     *         java.lang.RuntimePermission}<code>("modifyThread")</code>,
-     */
-    public void setParallelism(int parallelism) {
-        checkPermission();
-        if (parallelism <= 0 || parallelism > maxPoolSize)
-            throw new IllegalArgumentException();
-        final ReentrantLock lock = this.workerLock;
-        lock.lock();
-        try {
-            if (!isTerminating()) {
-                int p = this.parallelism;
-                this.parallelism = parallelism;
-                if (parallelism > p)
-                    createAndStartAddedWorkers();
-                else
-                    trimSpares();
-            }
-        } finally {
-            lock.unlock();
-        }
-        signalIdleWorkers();
+    public Thread.UncaughtExceptionHandler getUncaughtExceptionHandler() {
+        return ueh;
     }
 
     /**
-     * Returns the targeted number of worker threads in this pool.
+     * Returns the targeted parallelism level of this pool.
      *
-     * @return the targeted number of worker threads in this pool
+     * @return the targeted parallelism level of this pool
      */
     public int getParallelism() {
         return parallelism;
@@ -767,141 +2347,71 @@ public class ForkJoinPool /*extends AbstractExecutorService*/ {
 
     /**
      * Returns the number of worker threads that have started but not
-     * yet terminated.  This result returned by this method may differ
-     * from <code>getParallelism</code> when threads are created to
+     * yet terminated.  The result returned by this method may differ
+     * from {@link #getParallelism} when threads are created to
      * maintain parallelism when others are cooperatively blocked.
      *
      * @return the number of worker threads
      */
     public int getPoolSize() {
-        return totalCountOf(workerCounts);
+        return parallelism + (short)(ctl >>> TC_SHIFT);
     }
 
     /**
-     * Returns the maximum number of threads allowed to exist in the
-     * pool, even if there are insufficient unblocked running threads.
-     * @return the maximum
-     */
-    public int getMaximumPoolSize() {
-        return maxPoolSize;
-    }
-
-    /**
-     * Sets the maximum number of threads allowed to exist in the
-     * pool, even if there are insufficient unblocked running threads.
-     * Setting this value has no effect on current pool size. It
-     * controls construction of new threads.
-     * @throws IllegalArgumentException if negative or greater then
-     * internal implementation limit.
-     */
-    public void setMaximumPoolSize(int newMax) {
-        if (newMax < 0 || newMax > MAX_THREADS)
-            throw new IllegalArgumentException();
-        maxPoolSize = newMax;
-    }
-
-
-    /**
-     * Returns true if this pool dynamically maintains its target
-     * parallelism level. If false, new threads are added only to
-     * avoid possible starvation.
-     * This setting is by default true;
-     * @return true if maintains parallelism
-     */
-    public boolean getMaintainsParallelism() {
-        return maintainsParallelism;
-    }
-
-    /**
-     * Sets whether this pool dynamically maintains its target
-     * parallelism level. If false, new threads are added only to
-     * avoid possible starvation.
-     * @param enable true to maintains parallelism
-     */
-    public void setMaintainsParallelism(boolean enable) {
-        maintainsParallelism = enable;
-    }
-
-    /**
-     * Establishes local first-in-first-out scheduling mode for forked
-     * tasks that are never joined. This mode may be more appropriate
-     * than default locally stack-based mode in applications in which
-     * worker threads only process asynchronous tasks.  This method is
-     * designed to be invoked only when pool is quiescent, and
-     * typically only before any tasks are submitted. The effects of
-     * invocations at ather times may be unpredictable.
-     *
-     * @param async if true, use locally FIFO scheduling
-     * @return the previous mode.
-     */
-    public boolean setAsyncMode(boolean async) {
-        boolean oldMode = locallyFifo;
-        locallyFifo = async;
-        ForkJoinWorkerThread[] ws = workers;
-        if (ws != null) {
-            for (int i = 0; i < ws.length; ++i) {
-                ForkJoinWorkerThread t = ws[i];
-                if (t != null)
-                    t.setAsyncMode(async);
-            }
-        }
-        return oldMode;
-    }
-
-    /**
-     * Returns true if this pool uses local first-in-first-out
+     * Returns {@code true} if this pool uses local first-in-first-out
      * scheduling mode for forked tasks that are never joined.
      *
-     * @return true if this pool uses async mode.
+     * @return {@code true} if this pool uses async mode
      */
     public boolean getAsyncMode() {
-        return locallyFifo;
+        return localMode != 0;
     }
 
     /**
      * Returns an estimate of the number of worker threads that are
      * not blocked waiting to join tasks or for other managed
-     * synchronization.
+     * synchronization. This method may overestimate the
+     * number of running threads.
      *
      * @return the number of worker threads
      */
     public int getRunningThreadCount() {
-        return runningCountOf(workerCounts);
+        int rc = 0;
+        WorkQueue[] ws; WorkQueue w;
+        if ((ws = workQueues) != null) {
+            for (int i = 1; i < ws.length; i += 2) {
+                if ((w = ws[i]) != null && w.isApparentlyUnblocked())
+                    ++rc;
+            }
+        }
+        return rc;
     }
 
     /**
      * Returns an estimate of the number of threads that are currently
      * stealing or executing tasks. This method may overestimate the
      * number of active threads.
-     * @return the number of active threads.
+     *
+     * @return the number of active threads
      */
     public int getActiveThreadCount() {
-        return activeCountOf(runControl);
-    }
-
-    /**
-     * Returns an estimate of the number of threads that are currently
-     * idle waiting for tasks. This method may underestimate the
-     * number of idle threads.
-     * @return the number of idle threads.
-     */
-    final int getIdleThreadCount() {
-        int c = runningCountOf(workerCounts) - activeCountOf(runControl);
-        return (c <= 0)? 0 : c;
+        int r = parallelism + (int)(ctl >> AC_SHIFT);
+        return (r <= 0) ? 0 : r; // suppress momentarily negative values
     }
 
     /**
-     * Returns true if all worker threads are currently idle. An idle
-     * worker is one that cannot obtain a task to execute because none
-     * are available to steal from other threads, and there are no
-     * pending submissions to the pool. This method is conservative:
-     * It might not return true immediately upon idleness of all
-     * threads, but will eventually become true if threads remain
-     * inactive.
-     * @return true if all threads are currently idle
+     * Returns {@code true} if all worker threads are currently idle.
+     * An idle worker is one that cannot obtain a task to execute
+     * because none are available to steal from other threads, and
+     * there are no pending submissions to the pool. This method is
+     * conservative; it might not return {@code true} immediately upon
+     * idleness of all threads, but will eventually return {@code true}
+     * if threads remain inactive.
+     *
+     * @return {@code true} if all threads are currently idle
      */
     public boolean isQuiescent() {
-        return activeCountOf(runControl) == 0;
+        return (int)(ctl >> AC_SHIFT) + parallelism == 0;
     }
 
     /**
@@ -909,23 +2419,22 @@ public class ForkJoinPool /*extends AbstractExecutorService*/ {
      * one thread's work queue by another. The reported value
      * underestimates the actual total number of steals when the pool
      * is not quiescent. This value may be useful for monitoring and
-     * tuning fork/join programs: In general, steal counts should be
+     * tuning fork/join programs: in general, steal counts should be
      * high enough to keep threads busy, but low enough to avoid
      * overhead and contention across threads.
-     * @return the number of steals.
+     *
+     * @return the number of steals
      */
     public long getStealCount() {
-        return stealCount.get();
-    }
-
-    /**
-     * Accumulate steal count from a worker. Call only
-     * when worker known to be idle.
-     */
-    private void updateStealCount(ForkJoinWorkerThread w) {
-        int sc = w.getAndClearStealCount();
-        if (sc != 0)
-            stealCount.addAndGet(sc);
+        long count = stealCount.get();
+        WorkQueue[] ws; WorkQueue w;
+        if ((ws = workQueues) != null) {
+            for (int i = 1; i < ws.length; i += 2) {
+                if ((w = ws[i]) != null)
+                    count += w.totalSteals;
+            }
+        }
+        return count;
     }
 
     /**
@@ -935,77 +2444,106 @@ public class ForkJoinPool /*extends AbstractExecutorService*/ {
      * an approximation, obtained by iterating across all threads in
      * the pool. This method may be useful for tuning task
      * granularities.
-     * @return the number of queued tasks.
+     *
+     * @return the number of queued tasks
      */
     public long getQueuedTaskCount() {
         long count = 0;
-        ForkJoinWorkerThread[] ws = workers;
-        if (ws != null) {
-            for (int i = 0; i < ws.length; ++i) {
-                ForkJoinWorkerThread t = ws[i];
-                if (t != null)
-                    count += t.getQueueSize();
+        WorkQueue[] ws; WorkQueue w;
+        if ((ws = workQueues) != null) {
+            for (int i = 1; i < ws.length; i += 2) {
+                if ((w = ws[i]) != null)
+                    count += w.queueSize();
             }
         }
         return count;
     }
 
     /**
-     * Returns an estimate of the number tasks submitted to this pool
-     * that have not yet begun executing. This method takes time
-     * proportional to the number of submissions.
-     * @return the number of queued submissions.
+     * Returns an estimate of the number of tasks submitted to this
+     * pool that have not yet begun executing.  This method may take
+     * time proportional to the number of submissions.
+     *
+     * @return the number of queued submissions
      */
     public int getQueuedSubmissionCount() {
-        return submissionQueue.size();
+        int count = 0;
+        WorkQueue[] ws; WorkQueue w;
+        if ((ws = workQueues) != null) {
+            for (int i = 0; i < ws.length; i += 2) {
+                if ((w = ws[i]) != null)
+                    count += w.queueSize();
+            }
+        }
+        return count;
     }
 
     /**
-     * Returns true if there are any tasks submitted to this pool
-     * that have not yet begun executing.
-     * @return <code>true</code> if there are any queued submissions.
+     * Returns {@code true} if there are any tasks submitted to this
+     * pool that have not yet begun executing.
+     *
+     * @return {@code true} if there are any queued submissions
      */
     public boolean hasQueuedSubmissions() {
-        return !submissionQueue.isEmpty();
+        WorkQueue[] ws; WorkQueue w;
+        if ((ws = workQueues) != null) {
+            for (int i = 0; i < ws.length; i += 2) {
+                if ((w = ws[i]) != null && !w.isEmpty())
+                    return true;
+            }
+        }
+        return false;
     }
 
     /**
      * Removes and returns the next unexecuted submission if one is
      * available.  This method may be useful in extensions to this
      * class that re-assign work in systems with multiple pools.
-     * @return the next submission, or null if none
+     *
+     * @return the next submission, or {@code null} if none
      */
     protected ForkJoinTask<?> pollSubmission() {
-        return submissionQueue.poll();
+        WorkQueue[] ws; WorkQueue w; ForkJoinTask<?> t;
+        if ((ws = workQueues) != null) {
+            for (int i = 0; i < ws.length; i += 2) {
+                if ((w = ws[i]) != null && (t = w.poll()) != null)
+                    return t;
+            }
+        }
+        return null;
     }
 
     /**
      * Removes all available unexecuted submitted and forked tasks
      * from scheduling queues and adds them to the given collection,
      * without altering their execution status. These may include
-     * artifically generated or wrapped tasks. This method id designed
-     * to be invoked only when the pool is known to be
+     * artificially generated or wrapped tasks. This method is
+     * designed to be invoked only when the pool is known to be
      * quiescent. Invocations at other times may not remove all
      * tasks. A failure encountered while attempting to add elements
-     * to collection <tt>c</tt> may result in elements being in
+     * to collection {@code c} may result in elements being in
      * neither, either or both collections when the associated
      * exception is thrown.  The behavior of this operation is
      * undefined if the specified collection is modified while the
      * operation is in progress.
+     *
      * @param c the collection to transfer elements into
      * @return the number of elements transferred
      */
-    protected int drainTasksTo(Collection<ForkJoinTask<?>> c) {
-        int n = submissionQueue.drainTo(c);
-        ForkJoinWorkerThread[] ws = workers;
-        if (ws != null) {
+    protected int drainTasksTo(Collection<? super ForkJoinTask<?>> c) {
+        int count = 0;
+        WorkQueue[] ws; WorkQueue w; ForkJoinTask<?> t;
+        if ((ws = workQueues) != null) {
             for (int i = 0; i < ws.length; ++i) {
-                ForkJoinWorkerThread w = ws[i];
-                if (w != null)
-                    n += w.drainTasksTo(c);
+                if ((w = ws[i]) != null) {
+                    while ((t = w.poll()) != null) {
+                        c.add(t);
+                        ++count;
+                    }
+                }
             }
         }
-        return n;
+        return count;
     }
 
     /**
@@ -1016,101 +2554,124 @@ public class ForkJoinPool /*extends AbstractExecutorService*/ {
      * @return a string identifying this pool, as well as its state
      */
     public String toString() {
-        int ps = parallelism;
-        int wc = workerCounts;
-        int rc = runControl;
-        long st = getStealCount();
-        long qt = getQueuedTaskCount();
-        long qs = getQueuedSubmissionCount();
+        // Use a single pass through workQueues to collect counts
+        long qt = 0L, qs = 0L; int rc = 0;
+        long st = stealCount.get();
+        long c = ctl;
+        WorkQueue[] ws; WorkQueue w;
+        if ((ws = workQueues) != null) {
+            for (int i = 0; i < ws.length; ++i) {
+                if ((w = ws[i]) != null) {
+                    int size = w.queueSize();
+                    if ((i & 1) == 0)
+                        qs += size;
+                    else {
+                        qt += size;
+                        st += w.totalSteals;
+                        if (w.isApparentlyUnblocked())
+                            ++rc;
+                    }
+                }
+            }
+        }
+        int pc = parallelism;
+        int tc = pc + (short)(c >>> TC_SHIFT);
+        int ac = pc + (int)(c >> AC_SHIFT);
+        if (ac < 0) // ignore transient negative
+            ac = 0;
+        String level;
+        if ((c & STOP_BIT) != 0)
+            level = (tc == 0) ? "Terminated" : "Terminating";
+        else
+            level = runState < 0 ? "Shutting down" : "Running";
         return super.toString() +
-            "[" + runStateToString(runStateOf(rc)) +
-            ", parallelism = " + ps +
-            ", size = " + totalCountOf(wc) +
-            ", active = " + activeCountOf(rc) +
-            ", running = " + runningCountOf(wc) +
+            "[" + level +
+            ", parallelism = " + pc +
+            ", size = " + tc +
+            ", active = " + ac +
+            ", running = " + rc +
             ", steals = " + st +
             ", tasks = " + qt +
             ", submissions = " + qs +
             "]";
     }
 
-    private static String runStateToString(int rs) {
-        switch(rs) {
-        case RUNNING: return "Running";
-        case SHUTDOWN: return "Shutting down";
-        case TERMINATING: return "Terminating";
-        case TERMINATED: return "Terminated";
-        default: throw new Error("Unknown run state");
-        }
-    }
-
-    // lifecycle control
-
     /**
      * Initiates an orderly shutdown in which previously submitted
      * tasks are executed, but no new tasks will be accepted.
      * Invocation has no additional effect if already shut down.
      * Tasks that are in the process of being submitted concurrently
      * during the course of this method may or may not be rejected.
+     *
      * @throws SecurityException if a security manager exists and
      *         the caller is not permitted to modify threads
      *         because it does not hold {@link
-     *         java.lang.RuntimePermission}<code>("modifyThread")</code>,
+     *         java.lang.RuntimePermission}{@code ("modifyThread")}
      */
     public void shutdown() {
         checkPermission();
-        transitionRunStateTo(SHUTDOWN);
-        if (canTerminateOnShutdown(runControl))
-            terminateOnShutdown();
+        tryTerminate(false, true);
     }
 
     /**
-     * Attempts to stop all actively executing tasks, and cancels all
-     * waiting tasks.  Tasks that are in the process of being
-     * submitted or executed concurrently during the course of this
-     * method may or may not be rejected. Unlike some other executors,
-     * this method cancels rather than collects non-executed tasks
-     * upon termination, so always returns an empty list. However, you
-     * can use method <code>drainTasksTo</code> before invoking this
-     * method to transfer unexecuted tasks to another collection.
+     * Attempts to cancel and/or stop all tasks, and reject all
+     * subsequently submitted tasks.  Tasks that are in the process of
+     * being submitted or executed concurrently during the course of
+     * this method may or may not be rejected. This method cancels
+     * both existing and unexecuted tasks, in order to permit
+     * termination in the presence of task dependencies. So the method
+     * always returns an empty list (unlike the case for some other
+     * Executors).
+     *
      * @return an empty list
      * @throws SecurityException if a security manager exists and
      *         the caller is not permitted to modify threads
      *         because it does not hold {@link
-     *         java.lang.RuntimePermission}<code>("modifyThread")</code>,
+     *         java.lang.RuntimePermission}{@code ("modifyThread")}
      */
     public List<Runnable> shutdownNow() {
         checkPermission();
-        terminate();
+        tryTerminate(true, true);
         return Collections.emptyList();
     }
 
     /**
-     * Returns <code>true</code> if all tasks have completed following shut down.
+     * Returns {@code true} if all tasks have completed following shut down.
      *
-     * @return <code>true</code> if all tasks have completed following shut down
+     * @return {@code true} if all tasks have completed following shut down
      */
     public boolean isTerminated() {
-        return runStateOf(runControl) == TERMINATED;
+        long c = ctl;
+        return ((c & STOP_BIT) != 0L &&
+                (short)(c >>> TC_SHIFT) == -parallelism);
     }
 
     /**
-     * Returns <code>true</code> if the process of termination has
-     * commenced but possibly not yet completed.
+     * Returns {@code true} if the process of termination has
+     * commenced but not yet completed.  This method may be useful for
+     * debugging. If this method still returns {@code true} a
+     * sufficient period after shutdown, submitted tasks may have
+     * ignored or suppressed interruption, or may be waiting for I/O,
+     * preventing this executor from terminating properly. (See the
+     * advisory notes for class {@link ForkJoinTask} stating that
+     * tasks should not normally entail blocking operations; if
+     * they do, they must abort them on interrupt.)
      *
-     * @return <code>true</code> if terminating
+     * @return {@code true} if terminating but not yet terminated
      */
     public boolean isTerminating() {
-        return runStateOf(runControl) >= TERMINATING;
+        long c = ctl;
+        return ((c & STOP_BIT) != 0L &&
+                (short)(c >>> TC_SHIFT) != -parallelism);
     }
 
     /**
-     * Returns <code>true</code> if this pool has been shut down.
+     * Returns {@code true} if this pool has been shut down.
      *
-     * @return <code>true</code> if this pool has been shut down
+     * @return {@code true} if this pool has been shut down
      */
     public boolean isShutdown() {
-        return runStateOf(runControl) >= SHUTDOWN;
+        return runState < 0;
     }
 
     /**
@@ -1120,14 +2681,14 @@ public class ForkJoinPool /*extends AbstractExecutorService*/ {
      *
      * @param timeout the maximum time to wait
      * @param unit the time unit of the timeout argument
-     * @return <code>true</code> if this executor terminated and
-     *         <code>false</code> if the timeout elapsed before termination
+     * @return {@code true} if this executor terminated and
+     *         {@code false} if the timeout elapsed before termination
      * @throws InterruptedException if interrupted while waiting
      */
     public boolean awaitTermination(long timeout, TimeUnit unit)
         throws InterruptedException {
         long nanos = unit.toNanos(timeout);
-        final ReentrantLock lock = this.workerLock;
+        final Mutex lock = this.lock;
         lock.lock();
         try {
             for (;;) {
@@ -1142,729 +2703,189 @@ public class ForkJoinPool /*extends AbstractExecutorService*/ {
         }
     }
 
-    // Shutdown and termination support
-
-    /**
-     * Callback from terminating worker. Null out the corresponding
-     * workers slot, and if terminating, try to terminate, else try to
-     * shrink workers array.
-     * @param w the worker
-     */
-    final void workerTerminated(ForkJoinWorkerThread w) {
-        updateStealCount(w);
-        updateWorkerCount(-1);
-        final ReentrantLock lock = this.workerLock;
-        lock.lock();
-        try {
-            ForkJoinWorkerThread[] ws = workers;
-            if (ws != null) {
-                int idx = w.poolIndex;
-                if (idx >= 0 && idx < ws.length && ws[idx] == w)
-                    ws[idx] = null;
-                if (totalCountOf(workerCounts) == 0) {
-                    terminate(); // no-op if already terminating
-                    transitionRunStateTo(TERMINATED);
-                    termination.signalAll();
-                }
-                else if (!isTerminating()) {
-                    tryShrinkWorkerArray();
-                    tryResumeSpare(true); // allow replacement
-                }
-            }
-        } finally {
-            lock.unlock();
-        }
-        signalIdleWorkers();
-    }
-
-    /**
-     * Initiate termination.
-     */
-    private void terminate() {
-        if (transitionRunStateTo(TERMINATING)) {
-            stopAllWorkers();
-            resumeAllSpares();
-            signalIdleWorkers();
-            cancelQueuedSubmissions();
-            cancelQueuedWorkerTasks();
-            interruptUnterminatedWorkers();
-            signalIdleWorkers(); // resignal after interrupt
-        }
-    }
-
-    /**
-     * Possibly terminate when on shutdown state
-     */
-    private void terminateOnShutdown() {
-        if (!hasQueuedSubmissions() && canTerminateOnShutdown(runControl))
-            terminate();
-    }
-
-    /**
-     * Clear out and cancel submissions
-     */
-    private void cancelQueuedSubmissions() {
-        ForkJoinTask<?> task;
-        while ((task = pollSubmission()) != null)
-            task.cancel(false);
-    }
-
-    /**
-     * Clean out worker queues.
-     */
-    private void cancelQueuedWorkerTasks() {
-        final ReentrantLock lock = this.workerLock;
-        lock.lock();
-        try {
-            ForkJoinWorkerThread[] ws = workers;
-            if (ws != null) {
-                for (int i = 0; i < ws.length; ++i) {
-                    ForkJoinWorkerThread t = ws[i];
-                    if (t != null)
-                        t.cancelTasks();
-                }
-            }
-        } finally {
-            lock.unlock();
-        }
-    }
-
-    /**
-     * Set each worker's status to terminating. Requires lock to avoid
-     * conflicts with add/remove
-     */
-    private void stopAllWorkers() {
-        final ReentrantLock lock = this.workerLock;
-        lock.lock();
-        try {
-            ForkJoinWorkerThread[] ws = workers;
-            if (ws != null) {
-                for (int i = 0; i < ws.length; ++i) {
-                    ForkJoinWorkerThread t = ws[i];
-                    if (t != null)
-                        t.shutdownNow();
-                }
-            }
-        } finally {
-            lock.unlock();
-        }
-    }
-
-    /**
-     * Interrupt all unterminated workers.  This is not required for
-     * sake of internal control, but may help unstick user code during
-     * shutdown.
-     */
-    private void interruptUnterminatedWorkers() {
-        final ReentrantLock lock = this.workerLock;
-        lock.lock();
-        try {
-            ForkJoinWorkerThread[] ws = workers;
-            if (ws != null) {
-                for (int i = 0; i < ws.length; ++i) {
-                    ForkJoinWorkerThread t = ws[i];
-                    if (t != null && !t.isTerminated()) {
-                        try {
-                            t.interrupt();
-                        } catch (SecurityException ignore) {
-                        }
-                    }
-                }
-            }
-        } finally {
-            lock.unlock();
-        }
-    }
-
-
-    /*
-     * Nodes for event barrier to manage idle threads.  Queue nodes
-     * are basic Treiber stack nodes, also used for spare stack.
-     *
-     * The event barrier has an event count and a wait queue (actually
-     * a Treiber stack).  Workers are enabled to look for work when
-     * the eventCount is incremented. If they fail to find work, they
-     * may wait for next count. Upon release, threads help others wake
-     * up.
-     *
-     * Synchronization events occur only in enough contexts to
-     * maintain overall liveness:
-     *
-     *   - Submission of a new task to the pool
-     *   - Resizes or other changes to the workers array
-     *   - pool termination
-     *   - A worker pushing a task on an empty queue
-     *
-     * The case of pushing a task occurs often enough, and is heavy
-     * enough compared to simple stack pushes, to require special
-     * handling: Method signalWork returns without advancing count if
-     * the queue appears to be empty.  This would ordinarily result in
-     * races causing some queued waiters not to be woken up. To avoid
-     * this, the first worker enqueued in method sync (see
-     * syncIsReleasable) rescans for tasks after being enqueued, and
-     * helps signal if any are found. This works well because the
-     * worker has nothing better to do, and so might as well help
-     * alleviate the overhead and contention on the threads actually
-     * doing work.  Also, since event counts increments on task
-     * availability exist to maintain liveness (rather than to force
-     * refreshes etc), it is OK for callers to exit early if
-     * contending with another signaller.
-     */
-    static final class WaitQueueNode {
-        WaitQueueNode next; // only written before enqueued
-        volatile ForkJoinWorkerThread thread; // nulled to cancel wait
-        final long count; // unused for spare stack
-
-        WaitQueueNode(long c, ForkJoinWorkerThread w) {
-            count = c;
-            thread = w;
-        }
-
-        /**
-         * Wake up waiter, returning false if known to already
-         */
-        boolean signal() {
-            ForkJoinWorkerThread t = thread;
-            if (t == null)
-                return false;
-            thread = null;
-            LockSupport.unpark(t);
-            return true;
-        }
-
-        /**
-         * Await release on sync
-         */
-        void awaitSyncRelease(ForkJoinPool p) {
-            while (thread != null && !p.syncIsReleasable(this))
-                LockSupport.park(this);
-        }
-
-        /**
-         * Await resumption as spare
-         */
-        void awaitSpareRelease() {
-            while (thread != null) {
-                if (!Thread.interrupted())
-                    LockSupport.park(this);
-            }
-        }
-    }
-
-    /**
-     * Ensures that no thread is waiting for count to advance from the
-     * current value of eventCount read on entry to this method, by
-     * releasing waiting threads if necessary.
-     * @return the count
-     */
-    final long ensureSync() {
-        long c = eventCount;
-        WaitQueueNode q;
-        while ((q = syncStack) != null && q.count < c) {
-            if (casBarrierStack(q, null)) {
-                do {
-                    q.signal();
-                } while ((q = q.next) != null);
-                break;
-            }
-        }
-        return c;
-    }
-
-    /**
-     * Increments event count and releases waiting threads.
-     */
-    private void signalIdleWorkers() {
-        long c;
-        do;while (!casEventCount(c = eventCount, c+1));
-        ensureSync();
-    }
-
-    /**
-     * Signal threads waiting to poll a task. Because method sync
-     * rechecks availability, it is OK to only proceed if queue
-     * appears to be non-empty, and OK to skip under contention to
-     * increment count (since some other thread succeeded).
-     */
-    final void signalWork() {
-        long c;
-        WaitQueueNode q;
-        if (syncStack != null &&
-            casEventCount(c = eventCount, c+1) &&
-            (((q = syncStack) != null && q.count <= c) &&
-             (!casBarrierStack(q, q.next) || !q.signal())))
-            ensureSync();
-    }
-
-    /**
-     * Waits until event count advances from last value held by
-     * caller, or if excess threads, caller is resumed as spare, or
-     * caller or pool is terminating. Updates caller's event on exit.
-     * @param w the calling worker thread
-     */
-    final void sync(ForkJoinWorkerThread w) {
-        updateStealCount(w); // Transfer w's count while it is idle
-
-        while (!w.isShutdown() && !isTerminating() && !suspendIfSpare(w)) {
-            long prev = w.lastEventCount;
-            WaitQueueNode node = null;
-            WaitQueueNode h;
-            while (eventCount == prev &&
-                   ((h = syncStack) == null || h.count == prev)) {
-                if (node == null)
-                    node = new WaitQueueNode(prev, w);
-                if (casBarrierStack(node.next = h, node)) {
-                    node.awaitSyncRelease(this);
-                    break;
-                }
-            }
-            long ec = ensureSync();
-            if (ec != prev) {
-                w.lastEventCount = ec;
-                break;
-            }
-        }
-    }
-
-    /**
-     * Returns true if worker waiting on sync can proceed:
-     *  - on signal (thread == null)
-     *  - on event count advance (winning race to notify vs signaller)
-     *  - on Interrupt
-     *  - if the first queued node, we find work available
-     * If node was not signalled and event count not advanced on exit,
-     * then we also help advance event count.
-     * @return true if node can be released
-     */
-    final boolean syncIsReleasable(WaitQueueNode node) {
-        long prev = node.count;
-        if (!Thread.interrupted() && node.thread != null &&
-            (node.next != null ||
-             !ForkJoinWorkerThread.hasQueuedTasks(workers)) &&
-            eventCount == prev)
-            return false;
-        if (node.thread != null) {
-            node.thread = null;
-            long ec = eventCount;
-            if (prev <= ec) // help signal
-                casEventCount(ec, ec+1);
-        }
-        return true;
-    }
-
-    /**
-     * Returns true if a new sync event occurred since last call to
-     * sync or this method, if so, updating caller's count.
-     */
-    final boolean hasNewSyncEvent(ForkJoinWorkerThread w) {
-        long lc = w.lastEventCount;
-        long ec = ensureSync();
-        if (ec == lc)
-            return false;
-        w.lastEventCount = ec;
-        return true;
-    }
-
-    //  Parallelism maintenance
-
-    /**
-     * Decrement running count; if too low, add spare.
-     *
-     * Conceptually, all we need to do here is add or resume a
-     * spare thread when one is about to block (and remove or
-     * suspend it later when unblocked -- see suspendIfSpare).
-     * However, implementing this idea requires coping with
-     * several problems: We have imperfect information about the
-     * states of threads. Some count updates can and usually do
-     * lag run state changes, despite arrangements to keep them
-     * accurate (for example, when possible, updating counts
-     * before signalling or resuming), especially when running on
-     * dynamic JVMs that don't optimize the infrequent paths that
-     * update counts. Generating too many threads can make these
-     * problems become worse, because excess threads are more
-     * likely to be context-switched with others, slowing them all
-     * down, especially if there is no work available, so all are
-     * busy scanning or idling.  Also, excess spare threads can
-     * only be suspended or removed when they are idle, not
-     * immediately when they aren't needed. So adding threads will
-     * raise parallelism level for longer than necessary.  Also,
-     * FJ applications often enounter highly transient peaks when
-     * many threads are blocked joining, but for less time than it
-     * takes to create or resume spares.
-     *
-     * @param joinMe if non-null, return early if done
-     * @param maintainParallelism if true, try to stay within
-     * target counts, else create only to avoid starvation
-     * @return true if joinMe known to be done
-     */
-    final boolean preJoin(ForkJoinTask<?> joinMe, boolean maintainParallelism) {
-        maintainParallelism &= maintainsParallelism; // overrride
-        boolean dec = false;  // true when running count decremented
-        while (spareStack == null || !tryResumeSpare(dec)) {
-            int counts = workerCounts;
-            if (dec || (dec = casWorkerCounts(counts, --counts))) { // CAS cheat
-                if (!needSpare(counts, maintainParallelism))
-                    break;
-                if (joinMe.status < 0)
-                    return true;
-                if (tryAddSpare(counts))
-                    break;
-            }
-        }
-        return false;
-    }
-
-    /**
-     * Same idea as preJoin
-     */
-    final boolean preBlock(ManagedBlocker blocker, boolean maintainParallelism){
-        maintainParallelism &= maintainsParallelism;
-        boolean dec = false;
-        while (spareStack == null || !tryResumeSpare(dec)) {
-            int counts = workerCounts;
-            if (dec || (dec = casWorkerCounts(counts, --counts))) {
-                if (!needSpare(counts, maintainParallelism))
-                    break;
-                if (blocker.isReleasable())
-                    return true;
-                if (tryAddSpare(counts))
-                    break;
-            }
-        }
-        return false;
-    }
-
-    /**
-     * Returns true if a spare thread appears to be needed.  If
-     * maintaining parallelism, returns true when the deficit in
-     * running threads is more than the surplus of total threads, and
-     * there is apparently some work to do.  This self-limiting rule
-     * means that the more threads that have already been added, the
-     * less parallelism we will tolerate before adding another.
-     * @param counts current worker counts
-     * @param maintainParallelism try to maintain parallelism
-     */
-    private boolean needSpare(int counts, boolean maintainParallelism) {
-        int ps = parallelism;
-        int rc = runningCountOf(counts);
-        int tc = totalCountOf(counts);
-        int runningDeficit = ps - rc;
-        int totalSurplus = tc - ps;
-        return (tc < maxPoolSize &&
-                (rc == 0 || totalSurplus < 0 ||
-                 (maintainParallelism &&
-                  runningDeficit > totalSurplus &&
-                  ForkJoinWorkerThread.hasQueuedTasks(workers))));
-    }
-
-    /**
-     * Add a spare worker if lock available and no more than the
-     * expected numbers of threads exist
-     * @return true if successful
-     */
-    private boolean tryAddSpare(int expectedCounts) {
-        final ReentrantLock lock = this.workerLock;
-        int expectedRunning = runningCountOf(expectedCounts);
-        int expectedTotal = totalCountOf(expectedCounts);
-        boolean success = false;
-        boolean locked = false;
-        // confirm counts while locking; CAS after obtaining lock
-        try {
-            for (;;) {
-                int s = workerCounts;
-                int tc = totalCountOf(s);
-                int rc = runningCountOf(s);
-                if (rc > expectedRunning || tc > expectedTotal)
-                    break;
-                if (!locked && !(locked = lock.tryLock()))
-                    break;
-                if (casWorkerCounts(s, workerCountsFor(tc+1, rc+1))) {
-                    createAndStartSpare(tc);
-                    success = true;
-                    break;
-                }
-            }
-        } finally {
-            if (locked)
-                lock.unlock();
-        }
-        return success;
-    }
-
-    /**
-     * Add the kth spare worker. On entry, pool coounts are already
-     * adjusted to reflect addition.
-     */
-    private void createAndStartSpare(int k) {
-        ForkJoinWorkerThread w = null;
-        ForkJoinWorkerThread[] ws = ensureWorkerArrayCapacity(k + 1);
-        int len = ws.length;
-        // Probably, we can place at slot k. If not, find empty slot
-        if (k < len && ws[k] != null) {
-            for (k = 0; k < len && ws[k] != null; ++k)
-                ;
-        }
-        if (k < len && !isTerminating() && (w = createWorker(k)) != null) {
-            ws[k] = w;
-            w.start();
-        }
-        else
-            updateWorkerCount(-1); // adjust on failure
-        signalIdleWorkers();
-    }
-
-    /**
-     * Suspend calling thread w if there are excess threads.  Called
-     * only from sync.  Spares are enqueued in a Treiber stack
-     * using the same WaitQueueNodes as barriers.  They are resumed
-     * mainly in preJoin, but are also woken on pool events that
-     * require all threads to check run state.
-     * @param w the caller
-     */
-    private boolean suspendIfSpare(ForkJoinWorkerThread w) {
-        WaitQueueNode node = null;
-        int s;
-        while (parallelism < runningCountOf(s = workerCounts)) {
-            if (node == null)
-                node = new WaitQueueNode(0, w);
-            if (casWorkerCounts(s, s-1)) { // representation-dependent
-                // push onto stack
-                do;while (!casSpareStack(node.next = spareStack, node));
-                // block until released by resumeSpare
-                node.awaitSpareRelease();
-                return true;
-            }
-        }
-        return false;
-    }
-
-    /**
-     * Try to pop and resume a spare thread.
-     * @param updateCount if true, increment running count on success
-     * @return true if successful
-     */
-    private boolean tryResumeSpare(boolean updateCount) {
-        WaitQueueNode q;
-        while ((q = spareStack) != null) {
-            if (casSpareStack(q, q.next)) {
-                if (updateCount)
-                    updateRunningCount(1);
-                q.signal();
-                return true;
-            }
-        }
-        return false;
-    }
-
-    /**
-     * Pop and resume all spare threads. Same idea as ensureSync.
-     * @return true if any spares released
-     */
-    private boolean resumeAllSpares() {
-        WaitQueueNode q;
-        while ( (q = spareStack) != null) {
-            if (casSpareStack(q, null)) {
-                do {
-                    updateRunningCount(1);
-                    q.signal();
-                } while ((q = q.next) != null);
-                return true;
-            }
-        }
-        return false;
-    }
-
-    /**
-     * Pop and shutdown excessive spare threads. Call only while
-     * holding lock. This is not guaranteed to eliminate all excess
-     * threads, only those suspended as spares, which are the ones
-     * unlikely to be needed in the future.
-     */
-    private void trimSpares() {
-        int surplus = totalCountOf(workerCounts) - parallelism;
-        WaitQueueNode q;
-        while (surplus > 0 && (q = spareStack) != null) {
-            if (casSpareStack(q, null)) {
-                do {
-                    updateRunningCount(1);
-                    ForkJoinWorkerThread w = q.thread;
-                    if (w != null && surplus > 0 &&
-                        runningCountOf(workerCounts) > 0 && w.shutdown())
-                        --surplus;
-                    q.signal();
-                } while ((q = q.next) != null);
-            }
-        }
-    }
-
     /**
      * Interface for extending managed parallelism for tasks running
-     * in ForkJoinPools. A ManagedBlocker provides two methods.
-     * Method <code>isReleasable</code> must return true if blocking is not
-     * necessary. Method <code>block</code> blocks the current thread
-     * if necessary (perhaps internally invoking isReleasable before
-     * actually blocking.).
+     * in {@link ForkJoinPool}s.
+     *
+     * <p>A {@code ManagedBlocker} provides two methods.  Method
+     * {@code isReleasable} must return {@code true} if blocking is
+     * not necessary. Method {@code block} blocks the current thread
+     * if necessary (perhaps internally invoking {@code isReleasable}
+     * before actually blocking). These actions are performed by any
+     * thread invoking {@link ForkJoinPool#managedBlock}.  The
+     * unusual methods in this API accommodate synchronizers that may,
+     * but don't usually, block for long periods. Similarly, they
+     * allow more efficient internal handling of cases in which
+     * additional workers may be, but usually are not, needed to
+     * ensure sufficient parallelism.  Toward this end,
+     * implementations of method {@code isReleasable} must be amenable
+     * to repeated invocation.
+     *
      * <p>For example, here is a ManagedBlocker based on a
      * ReentrantLock:
-     * <pre>
-     *   class ManagedLocker implements ManagedBlocker {
-     *     final ReentrantLock lock;
-     *     boolean hasLock = false;
-     *     ManagedLocker(ReentrantLock lock) { this.lock = lock; }
-     *     public boolean block() {
-     *        if (!hasLock)
-     *           lock.lock();
-     *        return true;
-     *     }
-     *     public boolean isReleasable() {
-     *        return hasLock || (hasLock = lock.tryLock());
-     *     }
+     *  <pre> {@code
+     * class ManagedLocker implements ManagedBlocker {
+     *   final ReentrantLock lock;
+     *   boolean hasLock = false;
+     *   ManagedLocker(ReentrantLock lock) { this.lock = lock; }
+     *   public boolean block() {
+     *     if (!hasLock)
+     *       lock.lock();
+     *     return true;
+     *   }
+     *   public boolean isReleasable() {
+     *     return hasLock || (hasLock = lock.tryLock());
+     *   }
+     * }}</pre>
+     *
+     * <p>Here is a class that possibly blocks waiting for an
+     * item on a given queue:
+     *  <pre> {@code
+     * class QueueTaker<E> implements ManagedBlocker {
+     *   final BlockingQueue<E> queue;
+     *   volatile E item = null;
+     *   QueueTaker(BlockingQueue<E> q) { this.queue = q; }
+     *   public boolean block() throws InterruptedException {
+     *     if (item == null)
+     *       item = queue.take();
+     *     return true;
      *   }
-     * </pre>
+     *   public boolean isReleasable() {
+     *     return item != null || (item = queue.poll()) != null;
+     *   }
+     *   public E getItem() { // call after pool.managedBlock completes
+     *     return item;
+     *   }
+     * }}</pre>
      */
     public static interface ManagedBlocker {
         /**
          * Possibly blocks the current thread, for example waiting for
          * a lock or condition.
-         * @return true if no additional blocking is necessary (i.e.,
-         * if isReleasable would return true).
+         *
+         * @return {@code true} if no additional blocking is necessary
+         * (i.e., if {@code isReleasable} would return {@code true})
          * @throws InterruptedException if interrupted while waiting
-         * (the method is not required to do so, but is allowe to).
+         * (the method is not required to do so, but is allowed to)
          */
         boolean block() throws InterruptedException;
 
         /**
-         * Returns true if blocking is unnecessary.
+         * Returns {@code true} if blocking is unnecessary.
          */
         boolean isReleasable();
     }
 
     /**
      * Blocks in accord with the given blocker.  If the current thread
-     * is a ForkJoinWorkerThread, this method possibly arranges for a
-     * spare thread to be activated if necessary to ensure parallelism
-     * while the current thread is blocked.  If
-     * <code>maintainParallelism</code> is true and the pool supports
-     * it ({@link #getMaintainsParallelism}), this method attempts to
-     * maintain the pool's nominal parallelism. Otherwise if activates
-     * a thread only if necessary to avoid complete starvation. This
-     * option may be preferable when blockages use timeouts, or are
-     * almost always brief.
-     *
-     * <p> If the caller is not a ForkJoinTask, this method is behaviorally
-     * equivalent to
-     * <pre>
-     *   while (!blocker.isReleasable())
-     *      if (blocker.block())
-     *         return;
-     * </pre>
-     * If the caller is a ForkJoinTask, then the pool may first
-     * be expanded to ensure parallelism, and later adjusted.
+     * is a {@link ForkJoinWorkerThread}, this method possibly
+     * arranges for a spare thread to be activated if necessary to
+     * ensure sufficient parallelism while the current thread is blocked.
+     *
+     * <p>If the caller is not a {@link ForkJoinTask}, this method is
+     * behaviorally equivalent to
+     *  <pre> {@code
+     * while (!blocker.isReleasable())
+     *   if (blocker.block())
+     *     return;
+     * }</pre>
+     *
+     * If the caller is a {@code ForkJoinTask}, then the pool may
+     * first be expanded to ensure parallelism, and later adjusted.
      *
      * @param blocker the blocker
-     * @param maintainParallelism if true and supported by this pool,
-     * attempt to maintain the pool's nominal parallelism; otherwise
-     * activate a thread only if necessary to avoid complete
-     * starvation.
-     * @throws InterruptedException if blocker.block did so.
-     */
-    public static void managedBlock(ManagedBlocker blocker,
-                                    boolean maintainParallelism)
+     * @throws InterruptedException if blocker.block did so
+     */
+    public static void managedBlock(ManagedBlocker blocker)
         throws InterruptedException {
         Thread t = Thread.currentThread();
-        ForkJoinPool pool = (t instanceof ForkJoinWorkerThread?
-                             ((ForkJoinWorkerThread)t).pool : null);
-        if (!blocker.isReleasable()) {
-            try {
-                if (pool == null ||
-                    !pool.preBlock(blocker, maintainParallelism))
-                    awaitBlocker(blocker);
-            } finally {
-                if (pool != null)
-                    pool.updateRunningCount(1);
+        ForkJoinPool p = ((t instanceof ForkJoinWorkerThread) ?
+                          ((ForkJoinWorkerThread)t).pool : null);
+        while (!blocker.isReleasable()) {
+            if (p == null || p.tryCompensate(null, blocker)) {
+                try {
+                    do {} while (!blocker.isReleasable() && !blocker.block());
+                } finally {
+                    if (p != null)
+                        p.incrementActiveCount();
+                }
+                break;
             }
         }
     }
 
-    private static void awaitBlocker(ManagedBlocker blocker)
-        throws InterruptedException {
-        do;while (!blocker.isReleasable() && !blocker.block());
-    }
-
-    // AbstractExecutorService overrides
+    // AbstractExecutorService overrides.  These rely on the undocumented
+    // fact that ForkJoinTask.adapt returns ForkJoinTasks that also
+    // implement RunnableFuture.
 
     protected <T> RunnableFuture<T> newTaskFor(Runnable runnable, T value) {
-        return new AdaptedRunnable(runnable, value);
+        return new ForkJoinTask.AdaptedRunnable<T>(runnable, value);
     }
 
     protected <T> RunnableFuture<T> newTaskFor(Callable<T> callable) {
-        return new AdaptedCallable(callable);
+        return new ForkJoinTask.AdaptedCallable<T>(callable);
     }
 
+    // Unsafe mechanics
+    private static final sun.misc.Unsafe U;
+    private static final long CTL;
+    private static final long PARKBLOCKER;
+    private static final int ABASE;
+    private static final int ASHIFT;
 
-    // Temporary Unsafe mechanics for preliminary release
-    private static Unsafe getUnsafe() throws Throwable {
+    static {
+        poolNumberGenerator = new AtomicInteger();
+        nextSubmitterSeed = new AtomicInteger(0x55555555);
+        modifyThreadPermission = new RuntimePermission("modifyThread");
+        defaultForkJoinWorkerThreadFactory =
+            new DefaultForkJoinWorkerThreadFactory();
+        submitters = new ThreadSubmitter();
+        int s;
         try {
-            return Unsafe.getUnsafe();
+            U = getUnsafe();
+            Class<?> k = ForkJoinPool.class;
+            Class<?> ak = ForkJoinTask[].class;
+            CTL = U.objectFieldOffset
+                (k.getDeclaredField("ctl"));
+            Class<?> tk = Thread.class;
+            PARKBLOCKER = U.objectFieldOffset
+                (tk.getDeclaredField("parkBlocker"));
+            ABASE = U.arrayBaseOffset(ak);
+            s = U.arrayIndexScale(ak);
+        } catch (Exception e) {
+            throw new Error(e);
+        }
+        if ((s & (s-1)) != 0)
+            throw new Error("data type scale not a power of two");
+        ASHIFT = 31 - Integer.numberOfLeadingZeros(s);
+    }
+
+    /**
+     * Returns a sun.misc.Unsafe.  Suitable for use in a 3rd party package.
+     * Replace with a simple call to Unsafe.getUnsafe when integrating
+     * into a jdk.
+     *
+     * @return a sun.misc.Unsafe
+     */
+    private static sun.misc.Unsafe getUnsafe() {
+        try {
+            return sun.misc.Unsafe.getUnsafe();
         } catch (SecurityException se) {
             try {
                 return java.security.AccessController.doPrivileged
-                    (new java.security.PrivilegedExceptionAction<Unsafe>() {
-                        public Unsafe run() throws Exception {
-                            return getUnsafePrivileged();
+                    (new java.security
+                     .PrivilegedExceptionAction<sun.misc.Unsafe>() {
+                        public sun.misc.Unsafe run() throws Exception {
+                            java.lang.reflect.Field f = sun.misc
+                                .Unsafe.class.getDeclaredField("theUnsafe");
+                            f.setAccessible(true);
+                            return (sun.misc.Unsafe) f.get(null);
                         }});
             } catch (java.security.PrivilegedActionException e) {
-                throw e.getCause();
+                throw new RuntimeException("Could not initialize intrinsics",
+                                           e.getCause());
             }
         }
     }
 
-    private static Unsafe getUnsafePrivileged()
-            throws NoSuchFieldException, IllegalAccessException {
-        Field f = Unsafe.class.getDeclaredField("theUnsafe");
-        f.setAccessible(true);
-        return (Unsafe) f.get(null);
-    }
-
-    private static long fieldOffset(String fieldName)
-            throws NoSuchFieldException {
-        return _unsafe.objectFieldOffset
-            (ForkJoinPool.class.getDeclaredField(fieldName));
-    }
-
-    static final Unsafe _unsafe;
-    static final long eventCountOffset;
-    static final long workerCountsOffset;
-    static final long runControlOffset;
-    static final long syncStackOffset;
-    static final long spareStackOffset;
-
-    static {
-        try {
-            _unsafe = getUnsafe();
-            eventCountOffset = fieldOffset("eventCount");
-            workerCountsOffset = fieldOffset("workerCounts");
-            runControlOffset = fieldOffset("runControl");
-            syncStackOffset = fieldOffset("syncStack");
-            spareStackOffset = fieldOffset("spareStack");
-        } catch (Throwable e) {
-            throw new RuntimeException("Could not initialize intrinsics", e);
-        }
-    }
-
-    private boolean casEventCount(long cmp, long val) {
-        return _unsafe.compareAndSwapLong(this, eventCountOffset, cmp, val);
-    }
-    private boolean casWorkerCounts(int cmp, int val) {
-        return _unsafe.compareAndSwapInt(this, workerCountsOffset, cmp, val);
-    }
-    private boolean casRunControl(int cmp, int val) {
-        return _unsafe.compareAndSwapInt(this, runControlOffset, cmp, val);
-    }
-    private boolean casSpareStack(WaitQueueNode cmp, WaitQueueNode val) {
-        return _unsafe.compareAndSwapObject(this, spareStackOffset, cmp, val);
-    }
-    private boolean casBarrierStack(WaitQueueNode cmp, WaitQueueNode val) {
-        return _unsafe.compareAndSwapObject(this, syncStackOffset, cmp, val);
-    }
 }
diff --git a/src/forkjoin/scala/concurrent/forkjoin/ForkJoinTask.java b/src/forkjoin/scala/concurrent/forkjoin/ForkJoinTask.java
index dc1a6bcccc..344f6887a6 100644
--- a/src/forkjoin/scala/concurrent/forkjoin/ForkJoinTask.java
+++ b/src/forkjoin/scala/concurrent/forkjoin/ForkJoinTask.java
@@ -1,470 +1,597 @@
 /*
  * Written by Doug Lea with assistance from members of JCP JSR-166
  * Expert Group and released to the public domain, as explained at
- * http://creativecommons.org/licenses/publicdomain
+ * http://creativecommons.org/publicdomain/zero/1.0/
  */
 
 package scala.concurrent.forkjoin;
 import java.io.Serializable;
-import java.util.*;
-import java.util.concurrent.*;
-import java.util.concurrent.atomic.*;
-import sun.misc.Unsafe;
-import java.lang.reflect.*;
+import java.util.Collection;
+import java.util.List;
+import java.util.RandomAccess;
+import java.lang.ref.WeakReference;
+import java.lang.ref.ReferenceQueue;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Future;
+import java.util.concurrent.RejectedExecutionException;
+//import java.util.concurrent.RunnableFuture;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.locks.ReentrantLock;
+import java.lang.reflect.Constructor;
 
 /**
- * Abstract base class for tasks that run within a {@link
- * ForkJoinPool}.  A ForkJoinTask is a thread-like entity that is much
+ * Abstract base class for tasks that run within a {@link ForkJoinPool}.
+ * A {@code ForkJoinTask} is a thread-like entity that is much
  * lighter weight than a normal thread.  Huge numbers of tasks and
  * subtasks may be hosted by a small number of actual threads in a
  * ForkJoinPool, at the price of some usage limitations.
  *
- * <p> A "main" ForkJoinTask begins execution when submitted to a
- * {@link ForkJoinPool}. Once started, it will usually in turn start
- * other subtasks.  As indicated by the name of this class, many
- * programs using ForkJoinTasks employ only methods <code>fork</code>
- * and <code>join</code>, or derivatives such as
- * <code>invokeAll</code>.  However, this class also provides a number
- * of other methods that can come into play in advanced usages, as
- * well as extension mechanics that allow support of new forms of
- * fork/join processing.
+ * <p>A "main" {@code ForkJoinTask} begins execution when submitted
+ * to a {@link ForkJoinPool}.  Once started, it will usually in turn
+ * start other subtasks.  As indicated by the name of this class,
+ * many programs using {@code ForkJoinTask} employ only methods
+ * {@link #fork} and {@link #join}, or derivatives such as {@link
+ * #invokeAll(ForkJoinTask...) invokeAll}.  However, this class also
+ * provides a number of other methods that can come into play in
+ * advanced usages, as well as extension mechanics that allow
+ * support of new forms of fork/join processing.
  *
- * <p>A ForkJoinTask is a lightweight form of {@link Future}.  The
- * efficiency of ForkJoinTasks stems from a set of restrictions (that
- * are only partially statically enforceable) reflecting their
- * intended use as computational tasks calculating pure functions or
- * operating on purely isolated objects.  The primary coordination
- * mechanisms are {@link #fork}, that arranges asynchronous execution,
- * and {@link #join}, that doesn't proceed until the task's result has
- * been computed.  Computations should avoid <code>synchronized</code>
- * methods or blocks, and should minimize other blocking
- * synchronization apart from joining other tasks or using
- * synchronizers such as Phasers that are advertised to cooperate with
- * fork/join scheduling. Tasks should also not perform blocking IO,
- * and should ideally access variables that are completely independent
- * of those accessed by other running tasks. Minor breaches of these
- * restrictions, for example using shared output streams, may be
- * tolerable in practice, but frequent use may result in poor
- * performance, and the potential to indefinitely stall if the number
- * of threads not waiting for IO or other external synchronization
- * becomes exhausted. This usage restriction is in part enforced by
- * not permitting checked exceptions such as <code>IOExceptions</code>
- * to be thrown. However, computations may still encounter unchecked
- * exceptions, that are rethrown to callers attempting join
- * them. These exceptions may additionally include
- * RejectedExecutionExceptions stemming from internal resource
- * exhaustion such as failure to allocate internal task queues.
+ * <p>A {@code ForkJoinTask} is a lightweight form of {@link Future}.
+ * The efficiency of {@code ForkJoinTask}s stems from a set of
+ * restrictions (that are only partially statically enforceable)
+ * reflecting their main use as computational tasks calculating pure
+ * functions or operating on purely isolated objects.  The primary
+ * coordination mechanisms are {@link #fork}, that arranges
+ * asynchronous execution, and {@link #join}, that doesn't proceed
+ * until the task's result has been computed.  Computations should
+ * ideally avoid {@code synchronized} methods or blocks, and should
+ * minimize other blocking synchronization apart from joining other
+ * tasks or using synchronizers such as Phasers that are advertised to
+ * cooperate with fork/join scheduling. Subdividable tasks should also
+ * not perform blocking IO, and should ideally access variables that
+ * are completely independent of those accessed by other running
+ * tasks. These guidelines are loosely enforced by not permitting
+ * checked exceptions such as {@code IOExceptions} to be
+ * thrown. However, computations may still encounter unchecked
+ * exceptions, that are rethrown to callers attempting to join
+ * them. These exceptions may additionally include {@link
+ * RejectedExecutionException} stemming from internal resource
+ * exhaustion, such as failure to allocate internal task
+ * queues. Rethrown exceptions behave in the same way as regular
+ * exceptions, but, when possible, contain stack traces (as displayed
+ * for example using {@code ex.printStackTrace()}) of both the thread
+ * that initiated the computation as well as the thread actually
+ * encountering the exception; minimally only the latter.
+ *
+ * <p>It is possible to define and use ForkJoinTasks that may block,
+ * but doing so requires three further considerations: (1) Completion
+ * of few if any <em>other</em> tasks should be dependent on a task
+ * that blocks on external synchronization or IO. Event-style async
+ * tasks that are never joined often fall into this category.  (2) To
+ * minimize resource impact, tasks should be small; ideally performing
+ * only the (possibly) blocking action. (3) Unless the {@link
+ * ForkJoinPool.ManagedBlocker} API is used, or the number of possibly
+ * blocked tasks is known to be less than the pool's {@link
+ * ForkJoinPool#getParallelism} level, the pool cannot guarantee that
+ * enough threads will be available to ensure progress or good
+ * performance.
  *
  * <p>The primary method for awaiting completion and extracting
  * results of a task is {@link #join}, but there are several variants:
  * The {@link Future#get} methods support interruptible and/or timed
- * waits for completion and report results using <code>Future</code>
- * conventions. Method {@link #helpJoin} enables callers to actively
- * execute other tasks while awaiting joins, which is sometimes more
- * efficient but only applies when all subtasks are known to be
- * strictly tree-structured. Method {@link #invoke} is semantically
- * equivalent to <code>fork(); join()</code> but always attempts to
- * begin execution in the current thread. The "<em>quiet</em>" forms
- * of these methods do not extract results or report exceptions. These
+ * waits for completion and report results using {@code Future}
+ * conventions. Method {@link #invoke} is semantically
+ * equivalent to {@code fork(); join()} but always attempts to begin
+ * execution in the current thread. The "<em>quiet</em>" forms of
+ * these methods do not extract results or report exceptions. These
  * may be useful when a set of tasks are being executed, and you need
  * to delay processing of results or exceptions until all complete.
- * Method <code>invokeAll</code> (available in multiple versions)
+ * Method {@code invokeAll} (available in multiple versions)
  * performs the most common form of parallel invocation: forking a set
  * of tasks and joining them all.
  *
- * <p> The ForkJoinTask class is not usually directly subclassed.
+ * <p>In the most typical usages, a fork-join pair act like a call
+ * (fork) and return (join) from a parallel recursive function. As is
+ * the case with other forms of recursive calls, returns (joins)
+ * should be performed innermost-first. For example, {@code a.fork();
+ * b.fork(); b.join(); a.join();} is likely to be substantially more
+ * efficient than joining {@code a} before {@code b}.
+ *
+ * <p>The execution status of tasks may be queried at several levels
+ * of detail: {@link #isDone} is true if a task completed in any way
+ * (including the case where a task was cancelled without executing);
+ * {@link #isCompletedNormally} is true if a task completed without
+ * cancellation or encountering an exception; {@link #isCancelled} is
+ * true if the task was cancelled (in which case {@link #getException}
+ * returns a {@link java.util.concurrent.CancellationException}); and
+ * {@link #isCompletedAbnormally} is true if a task was either
+ * cancelled or encountered an exception, in which case {@link
+ * #getException} will return either the encountered exception or
+ * {@link java.util.concurrent.CancellationException}.
+ *
+ * <p>The ForkJoinTask class is not usually directly subclassed.
  * Instead, you subclass one of the abstract classes that support a
- * particular style of fork/join processing.  Normally, a concrete
+ * particular style of fork/join processing, typically {@link
+ * RecursiveAction} for computations that do not return results, or
+ * {@link RecursiveTask} for those that do.  Normally, a concrete
  * ForkJoinTask subclass declares fields comprising its parameters,
- * established in a constructor, and then defines a <code>compute</code>
+ * established in a constructor, and then defines a {@code compute}
  * method that somehow uses the control methods supplied by this base
- * class. While these methods have <code>public</code> access (to allow
- * instances of different task subclasses to call each others
+ * class. While these methods have {@code public} access (to allow
+ * instances of different task subclasses to call each other's
  * methods), some of them may only be called from within other
- * ForkJoinTasks. Attempts to invoke them in other contexts result in
- * exceptions or errors possibly including ClassCastException.
+ * ForkJoinTasks (as may be determined using method {@link
+ * #inForkJoinPool}).  Attempts to invoke them in other contexts
+ * result in exceptions or errors, possibly including
+ * {@code ClassCastException}.
  *
- * <p>Most base support methods are <code>final</code> because their
- * implementations are intrinsically tied to the underlying
- * lightweight task scheduling framework, and so cannot be overridden.
- * Developers creating new basic styles of fork/join processing should
- * minimally implement <code>protected</code> methods
- * <code>exec</code>, <code>setRawResult</code>, and
- * <code>getRawResult</code>, while also introducing an abstract
- * computational method that can be implemented in its subclasses,
- * possibly relying on other <code>protected</code> methods provided
- * by this class.
+ * <p>Method {@link #join} and its variants are appropriate for use
+ * only when completion dependencies are acyclic; that is, the
+ * parallel computation can be described as a directed acyclic graph
+ * (DAG). Otherwise, executions may encounter a form of deadlock as
+ * tasks cyclically wait for each other.  However, this framework
+ * supports other methods and techniques (for example the use of
+ * {@link Phaser}, {@link #helpQuiesce}, and {@link #complete}) that
+ * may be of use in constructing custom subclasses for problems that
+ * are not statically structured as DAGs. To support such usages a
+ * ForkJoinTask may be atomically <em>marked</em> using {@link
+ * #markForkJoinTask} and checked for marking using {@link
+ * #isMarkedForkJoinTask}. The ForkJoinTask implementation does not
+ * use these {@code protected} methods or marks for any purpose, but
+ * they may be of use in the construction of specialized subclasses.
+ * For example, parallel graph traversals can use the supplied methods
+ * to avoid revisiting nodes/tasks that have already been processed.
+ * Also, completion based designs can use them to record that one
+ * subtask has completed. (Method names for marking are bulky in part
+ * to encourage definition of methods that reflect their usage
+ * patterns.)
+ *
+ * <p>Most base support methods are {@code final}, to prevent
+ * overriding of implementations that are intrinsically tied to the
+ * underlying lightweight task scheduling framework.  Developers
+ * creating new basic styles of fork/join processing should minimally
+ * implement {@code protected} methods {@link #exec}, {@link
+ * #setRawResult}, and {@link #getRawResult}, while also introducing
+ * an abstract computational method that can be implemented in its
+ * subclasses, possibly relying on other {@code protected} methods
+ * provided by this class.
  *
  * <p>ForkJoinTasks should perform relatively small amounts of
- * computations, othewise splitting into smaller tasks. As a very
- * rough rule of thumb, a task should perform more than 100 and less
- * than 10000 basic computational steps. If tasks are too big, then
- * parellelism cannot improve throughput. If too small, then memory
- * and internal task maintenance overhead may overwhelm processing.
+ * computation. Large tasks should be split into smaller subtasks,
+ * usually via recursive decomposition. As a very rough rule of thumb,
+ * a task should perform more than 100 and less than 10000 basic
+ * computational steps, and should avoid indefinite looping. If tasks
+ * are too big, then parallelism cannot improve throughput. If too
+ * small, then memory and internal task maintenance overhead may
+ * overwhelm processing.
+ *
+ * <p>This class provides {@code adapt} methods for {@link Runnable}
+ * and {@link Callable}, that may be of use when mixing execution of
+ * {@code ForkJoinTasks} with other kinds of tasks. When all tasks are
+ * of this form, consider using a pool constructed in <em>asyncMode</em>.
  *
- * <p>ForkJoinTasks are <code>Serializable</code>, which enables them
- * to be used in extensions such as remote execution frameworks. It is
- * in general sensible to serialize tasks only before or after, but
- * not during execution. Serialization is not relied on during
- * execution itself.
+ * <p>ForkJoinTasks are {@code Serializable}, which enables them to be
+ * used in extensions such as remote execution frameworks. It is
+ * sensible to serialize tasks only before or after, but not during,
+ * execution. Serialization is not relied on during execution itself.
+ *
+ * @since 1.7
+ * @author Doug Lea
  */
 public abstract class ForkJoinTask<V> implements Future<V>, Serializable {
 
-    /**
-     * Run control status bits packed into a single int to minimize
-     * footprint and to ensure atomicity (via CAS).  Status is
-     * initially zero, and takes on nonnegative values until
-     * completed, upon which status holds COMPLETED. CANCELLED, or
-     * EXCEPTIONAL, which use the top 3 bits.  Tasks undergoing
-     * blocking waits by other threads have SIGNAL_MASK bits set --
-     * bit 15 for external (nonFJ) waits, and the rest a count of
-     * waiting FJ threads.  (This representation relies on
-     * ForkJoinPool max thread limits). Completion of a stolen task
-     * with SIGNAL_MASK bits set awakens waiter via notifyAll. Even
-     * though suboptimal for some purposes, we use basic builtin
-     * wait/notify to take advantage of "monitor inflation" in JVMs
-     * that we would otherwise need to emulate to avoid adding further
-     * per-task bookkeeping overhead. Note that bits 16-28 are
-     * currently unused. Also value 0x80000000 is available as spare
-     * completion value.
+    /*
+     * See the internal documentation of class ForkJoinPool for a
+     * general implementation overview.  ForkJoinTasks are mainly
+     * responsible for maintaining their "status" field amidst relays
+     * to methods in ForkJoinWorkerThread and ForkJoinPool.
+     *
+     * The methods of this class are more-or-less layered into
+     * (1) basic status maintenance
+     * (2) execution and awaiting completion
+     * (3) user-level methods that additionally report results.
+     * This is sometimes hard to see because this file orders exported
+     * methods in a way that flows well in javadocs.
      */
-    volatile int status; // accessed directy by pool and workers
 
-    static final int COMPLETION_MASK      = 0xe0000000;
-    static final int NORMAL               = 0xe0000000; // == mask
-    static final int CANCELLED            = 0xc0000000;
-    static final int EXCEPTIONAL          = 0xa0000000;
-    static final int SIGNAL_MASK          = 0x0000ffff;
-    static final int INTERNAL_SIGNAL_MASK = 0x00007fff;
-    static final int EXTERNAL_SIGNAL      = 0x00008000; // top bit of low word
-
-    /**
-     * Table of exceptions thrown by tasks, to enable reporting by
-     * callers. Because exceptions are rare, we don't directly keep
-     * them with task objects, but instead us a weak ref table.  Note
-     * that cancellation exceptions don't appear in the table, but are
-     * instead recorded as status values.
-     * Todo: Use ConcurrentReferenceHashMap
+    /*
+     * The status field holds run control status bits packed into a
+     * single int to minimize footprint and to ensure atomicity (via
+     * CAS).  Status is initially zero, and takes on nonnegative
+     * values until completed, upon which status (anded with
+     * DONE_MASK) holds value NORMAL, CANCELLED, or EXCEPTIONAL. Tasks
+     * undergoing blocking waits by other threads have the SIGNAL bit
+     * set.  Completion of a stolen task with SIGNAL set awakens any
+     * waiters via notifyAll. Even though suboptimal for some
+     * purposes, we use basic builtin wait/notify to take advantage of
+     * "monitor inflation" in JVMs that we would otherwise need to
+     * emulate to avoid adding further per-task bookkeeping overhead.
+     * We want these monitors to be "fat", i.e., not use biasing or
+     * thin-lock techniques, so use some odd coding idioms that tend
+     * to avoid them, mainly by arranging that every synchronized
+     * block performs a wait, notifyAll or both.
      */
-    static final Map<ForkJoinTask<?>, Throwable> exceptionMap =
-        Collections.synchronizedMap
-        (new WeakHashMap<ForkJoinTask<?>, Throwable>());
 
-    // within-package utilities
+    /** The run status of this task */
+    volatile int status; // accessed directly by pool and workers
+    static final int DONE_MASK   = 0xf0000000;  // mask out non-completion bits
+    static final int NORMAL      = 0xf0000000;  // must be negative
+    static final int CANCELLED   = 0xc0000000;  // must be < NORMAL
+    static final int EXCEPTIONAL = 0x80000000;  // must be < CANCELLED
+    static final int SIGNAL      = 0x00000001;
+    static final int MARKED      = 0x00000002;
 
     /**
-     * Get current worker thread, or null if not a worker thread
-     */
-    static ForkJoinWorkerThread getWorker() {
-        Thread t = Thread.currentThread();
-        return ((t instanceof ForkJoinWorkerThread)?
-                (ForkJoinWorkerThread)t : null);
-    }
-
-    final boolean casStatus(int cmp, int val) {
-        return _unsafe.compareAndSwapInt(this, statusOffset, cmp, val);
-    }
-
-    /**
-     * Workaround for not being able to rethrow unchecked exceptions.
-     */
-    static void rethrowException(Throwable ex) {
-        if (ex != null)
-            _unsafe.throwException(ex);
-    }
-
-    // Setting completion status
-
-    /**
-     * Mark completion and wake up threads waiting to join this task.
+     * Marks completion and wakes up threads waiting to join this
+     * task. A specialization for NORMAL completion is in method
+     * doExec.
+     *
      * @param completion one of NORMAL, CANCELLED, EXCEPTIONAL
+     * @return completion status on exit
      */
-    final void setCompletion(int completion) {
-        ForkJoinPool pool = getPool();
-        if (pool != null) {
-            int s; // Clear signal bits while setting completion status
-            do;while ((s = status) >= 0 && !casStatus(s, completion));
-
-            if ((s & SIGNAL_MASK) != 0) {
-                if ((s &= INTERNAL_SIGNAL_MASK) != 0)
-                    pool.updateRunningCount(s);
-                synchronized(this) { notifyAll(); }
+    private int setCompletion(int completion) {
+        for (int s;;) {
+            if ((s = status) < 0)
+                return s;
+            if (U.compareAndSwapInt(this, STATUS, s, s | completion)) {
+                if ((s & SIGNAL) != 0)
+                    synchronized (this) { notifyAll(); }
+                return completion;
             }
         }
-        else
-            externallySetCompletion(completion);
-    }
-
-    /**
-     * Version of setCompletion for non-FJ threads.  Leaves signal
-     * bits for unblocked threads to adjust, and always notifies.
-     */
-    private void externallySetCompletion(int completion) {
-        int s;
-        do;while ((s = status) >= 0 &&
-                  !casStatus(s, (s & SIGNAL_MASK) | completion));
-        synchronized(this) { notifyAll(); }
-    }
-
-    /**
-     * Sets status to indicate normal completion
-     */
-    final void setNormalCompletion() {
-        // Try typical fast case -- single CAS, no signal, not already done.
-        // Manually expand casStatus to improve chances of inlining it
-        if (!_unsafe.compareAndSwapInt(this, statusOffset, 0, NORMAL))
-            setCompletion(NORMAL);
-    }
-
-    // internal waiting and notification
-
-    /**
-     * Performs the actual monitor wait for awaitDone
-     */
-    private void doAwaitDone() {
-        // Minimize lock bias and in/de-flation effects by maximizing
-        // chances of waiting inside sync
-        try {
-            while (status >= 0)
-                synchronized(this) { if (status >= 0) wait(); }
-        } catch (InterruptedException ie) {
-            onInterruptedWait();
-        }
     }
 
     /**
-     * Performs the actual monitor wait for awaitDone
+     * Primary execution method for stolen tasks. Unless done, calls
+     * exec and records status if completed, but doesn't wait for
+     * completion otherwise.
+     *
+     * @return status on exit from this method
      */
-    private void doAwaitDone(long startTime, long nanos) {
-        synchronized(this) {
+    final int doExec() {
+        int s; boolean completed;
+        if ((s = status) >= 0) {
             try {
-                while (status >= 0) {
-                    long nt = nanos - System.nanoTime() - startTime;
-                    if (nt <= 0)
-                        break;
-                    wait(nt / 1000000, (int)(nt % 1000000));
+                completed = exec();
+            } catch (Throwable rex) {
+                return setExceptionalCompletion(rex);
+            }
+            while ((s = status) >= 0 && completed) {
+                if (U.compareAndSwapInt(this, STATUS, s, s | NORMAL)) {
+                    if ((s & SIGNAL) != 0)
+                        synchronized (this) { notifyAll(); }
+                    return NORMAL;
                 }
-            } catch (InterruptedException ie) {
-                onInterruptedWait();
             }
         }
+        return s;
     }
 
-    // Awaiting completion
+    /**
+     * Tries to set SIGNAL status. Used by ForkJoinPool. Other
+     * variants are directly incorporated into externalAwaitDone etc.
+     *
+     * @return true if successful
+     */
+    final boolean trySetSignal() {
+        int s;
+        return U.compareAndSwapInt(this, STATUS, s = status, s | SIGNAL);
+    }
 
     /**
-     * Sets status to indicate there is joiner, then waits for join,
-     * surrounded with pool notifications.
-     * @return status upon exit
+     * Blocks a non-worker-thread until completion.
+     * @return status upon completion
      */
-    private int awaitDone(ForkJoinWorkerThread w, boolean maintainParallelism) {
-        ForkJoinPool pool = w == null? null : w.pool;
+    private int externalAwaitDone() {
+        boolean interrupted = false;
         int s;
         while ((s = status) >= 0) {
-            if (casStatus(s, pool == null? s|EXTERNAL_SIGNAL : s+1)) {
-                if (pool == null || !pool.preJoin(this, maintainParallelism))
-                    doAwaitDone();
-                if (((s = status) & INTERNAL_SIGNAL_MASK) != 0)
-                    adjustPoolCountsOnUnblock(pool);
-                break;
+            if (U.compareAndSwapInt(this, STATUS, s, s | SIGNAL)) {
+                synchronized (this) {
+                    if (status >= 0) {
+                        try {
+                            wait();
+                        } catch (InterruptedException ie) {
+                            interrupted = true;
+                        }
+                    }
+                    else
+                        notifyAll();
+                }
             }
         }
+        if (interrupted)
+            Thread.currentThread().interrupt();
         return s;
     }
 
     /**
-     * Timed version of awaitDone
-     * @return status upon exit
+     * Blocks a non-worker-thread until completion or interruption.
      */
-    private int awaitDone(ForkJoinWorkerThread w, long nanos) {
-        ForkJoinPool pool = w == null? null : w.pool;
+    private int externalInterruptibleAwaitDone() throws InterruptedException {
         int s;
+        if (Thread.interrupted())
+            throw new InterruptedException();
         while ((s = status) >= 0) {
-            if (casStatus(s, pool == null? s|EXTERNAL_SIGNAL : s+1)) {
-                long startTime = System.nanoTime();
-                if (pool == null || !pool.preJoin(this, false))
-                    doAwaitDone(startTime, nanos);
-                if ((s = status) >= 0) {
-                    adjustPoolCountsOnCancelledWait(pool);
-                    s = status;
+            if (U.compareAndSwapInt(this, STATUS, s, s | SIGNAL)) {
+                synchronized (this) {
+                    if (status >= 0)
+                        wait();
+                    else
+                        notifyAll();
                 }
-                if (s < 0 && (s & INTERNAL_SIGNAL_MASK) != 0)
-                    adjustPoolCountsOnUnblock(pool);
-                break;
             }
         }
         return s;
     }
 
-    /**
-     * Notify pool that thread is unblocked. Called by signalled
-     * threads when woken by non-FJ threads (which is atypical).
-     */
-    private void adjustPoolCountsOnUnblock(ForkJoinPool pool) {
-        int s;
-        do;while ((s = status) < 0 && !casStatus(s, s & COMPLETION_MASK));
-        if (pool != null && (s &= INTERNAL_SIGNAL_MASK) != 0)
-            pool.updateRunningCount(s);
-    }
 
     /**
-     * Notify pool to adjust counts on cancelled or timed out wait
+     * Implementation for join, get, quietlyJoin. Directly handles
+     * only cases of already-completed, external wait, and
+     * unfork+exec.  Others are relayed to ForkJoinPool.awaitJoin.
+     *
+     * @return status upon completion
      */
-    private void adjustPoolCountsOnCancelledWait(ForkJoinPool pool) {
-        if (pool != null) {
-            int s;
-            while ((s = status) >= 0 && (s & INTERNAL_SIGNAL_MASK) != 0) {
-                if (casStatus(s, s - 1)) {
-                    pool.updateRunningCount(1);
-                    break;
-                }
+    private int doJoin() {
+        int s; Thread t; ForkJoinWorkerThread wt; ForkJoinPool.WorkQueue w;
+        if ((s = status) >= 0) {
+            if (((t = Thread.currentThread()) instanceof ForkJoinWorkerThread)) {
+                if (!(w = (wt = (ForkJoinWorkerThread)t).workQueue).
+                    tryUnpush(this) || (s = doExec()) >= 0)
+                    s = wt.pool.awaitJoin(w, this);
             }
+            else
+                s = externalAwaitDone();
         }
+        return s;
     }
 
     /**
-     * Handle interruptions during waits.
+     * Implementation for invoke, quietlyInvoke.
+     *
+     * @return status upon completion
      */
-    private void onInterruptedWait() {
-        ForkJoinWorkerThread w = getWorker();
-        if (w == null)
-            Thread.currentThread().interrupt(); // re-interrupt
-        else if (w.isTerminating())
-            cancelIgnoringExceptions();
-        // else if FJworker, ignore interrupt
+    private int doInvoke() {
+        int s; Thread t; ForkJoinWorkerThread wt;
+        if ((s = doExec()) >= 0) {
+            if ((t = Thread.currentThread()) instanceof ForkJoinWorkerThread)
+                s = (wt = (ForkJoinWorkerThread)t).pool.awaitJoin(wt.workQueue,
+                                                                  this);
+            else
+                s = externalAwaitDone();
+        }
+        return s;
     }
 
-    // Recording and reporting exceptions
+    // Exception table support
 
-    private void setDoneExceptionally(Throwable rex) {
-        exceptionMap.put(this, rex);
-        setCompletion(EXCEPTIONAL);
-    }
+    /**
+     * Table of exceptions thrown by tasks, to enable reporting by
+     * callers. Because exceptions are rare, we don't directly keep
+     * them with task objects, but instead use a weak ref table.  Note
+     * that cancellation exceptions don't appear in the table, but are
+     * instead recorded as status values.
+     *
+     * Note: These statics are initialized below in static block.
+     */
+    private static final ExceptionNode[] exceptionTable;
+    private static final ReentrantLock exceptionTableLock;
+    private static final ReferenceQueue<Object> exceptionTableRefQueue;
 
     /**
-     * Throws the exception associated with status s;
-     * @throws the exception
+     * Fixed capacity for exceptionTable.
      */
-    private void reportException(int s) {
-        if ((s &= COMPLETION_MASK) < NORMAL) {
-            if (s == CANCELLED)
-                throw new CancellationException();
-            else
-                rethrowException(exceptionMap.get(this));
+    private static final int EXCEPTION_MAP_CAPACITY = 32;
+
+    /**
+     * Key-value nodes for exception table.  The chained hash table
+     * uses identity comparisons, full locking, and weak references
+     * for keys. The table has a fixed capacity because it only
+     * maintains task exceptions long enough for joiners to access
+     * them, so should never become very large for sustained
+     * periods. However, since we do not know when the last joiner
+     * completes, we must use weak references and expunge them. We do
+     * so on each operation (hence full locking). Also, some thread in
+     * any ForkJoinPool will call helpExpungeStaleExceptions when its
+     * pool becomes quiescent.
+     */
+    static final class ExceptionNode extends WeakReference<ForkJoinTask<?>> {
+        final Throwable ex; // exception recorded for the referent task
+        ExceptionNode next; // next node in the same hash-bucket chain
+        final long thrower;  // use id not ref to avoid weak cycles
+        ExceptionNode(ForkJoinTask<?> task, Throwable ex, ExceptionNode next) {
+            super(task, exceptionTableRefQueue); // weak key, registered with the shared reference queue
+            this.ex = ex;
+            this.next = next;
+            this.thrower = Thread.currentThread().getId(); // id of the thread that constructs the node
         }
     }
 
     /**
-     * Returns result or throws exception using j.u.c.Future conventions
-     * Only call when isDone known to be true.
+     * Records exception and sets exceptional completion.
+     *
+     * @return status on exit
      */
-    private V reportFutureResult()
-        throws ExecutionException, InterruptedException {
-        int s = status & COMPLETION_MASK;
-        if (s < NORMAL) {
-            Throwable ex;
-            if (s == CANCELLED)
-                throw new CancellationException();
-            if (s == EXCEPTIONAL && (ex = exceptionMap.get(this)) != null)
-                throw new ExecutionException(ex);
-            if (Thread.interrupted())
-                throw new InterruptedException();
+    private int setExceptionalCompletion(Throwable ex) {
+        int h = System.identityHashCode(this); // identity hash, matching the == comparison on keys below
+        final ReentrantLock lock = exceptionTableLock;
+        lock.lock();
+        try {
+            expungeStaleExceptions(); // requires the lock, which is held here
+            ExceptionNode[] t = exceptionTable;
+            int i = h & (t.length - 1); // bucket index; NOTE(review): masking assumes a power-of-two table length - confirm in the static initializer
+            for (ExceptionNode e = t[i]; ; e = e.next) {
+                if (e == null) {
+                    t[i] = new ExceptionNode(this, ex, t[i]); // no node for this task in the chain: push one at the bucket head
+                    break;
+                }
+                if (e.get() == this) // already present
+                    break;
+            }
+        } finally {
+            lock.unlock();
         }
-        return getRawResult();
+        return setCompletion(EXCEPTIONAL); // status is updated only after the exception is recorded and the lock released
     }
 
     /**
-     * Returns result or throws exception using j.u.c.Future conventions
-     * with timeouts
+     * Cancels, ignoring any exceptions thrown by cancel. Used during
+     * worker and pool shutdown. Cancel is spec'ed not to throw any
+     * exceptions, but if it does anyway, we have no recourse during
+     * shutdown, so guard against this case.
      */
-    private V reportTimedFutureResult()
-        throws InterruptedException, ExecutionException, TimeoutException {
-        Throwable ex;
-        int s = status & COMPLETION_MASK;
-        if (s == NORMAL)
-            return getRawResult();
-        if (s == CANCELLED)
-            throw new CancellationException();
-        if (s == EXCEPTIONAL && (ex = exceptionMap.get(this)) != null)
-            throw new ExecutionException(ex);
-        if (Thread.interrupted())
-            throw new InterruptedException();
-        throw new TimeoutException();
+    static final void cancelIgnoringExceptions(ForkJoinTask<?> t) {
+        if (t != null && t.status >= 0) { // skip null tasks and tasks that are already done (isDone() is status < 0)
+            try {
+                t.cancel(false);
+            } catch (Throwable ignore) { // deliberately swallowed: there is no recourse during shutdown
+            }
+        }
     }
 
-    // internal execution methods
-
     /**
-     * Calls exec, recording completion, and rethrowing exception if
-     * encountered. Caller should normally check status before calling
-     * @return true if completed normally
+     * Removes this task's exception node, if any, and resets status to zero.
      */
-    private boolean tryExec() {
-        try { // try block must contain only call to exec
-            if (!exec())
-                return false;
-        } catch (Throwable rex) {
-            setDoneExceptionally(rex);
-            rethrowException(rex);
-            return false; // not reached
+    private void clearExceptionalCompletion() {
+        int h = System.identityHashCode(this); // same identity hash used when the node was inserted
+        final ReentrantLock lock = exceptionTableLock;
+        lock.lock();
+        try {
+            ExceptionNode[] t = exceptionTable;
+            int i = h & (t.length - 1);
+            ExceptionNode e = t[i];
+            ExceptionNode pred = null;
+            while (e != null) {
+                ExceptionNode next = e.next; // read the successor before possibly unlinking e
+                if (e.get() == this) {
+                    if (pred == null)
+                        t[i] = next; // e was the bucket head
+                    else
+                        pred.next = next; // splice e out of the chain
+                    break;
+                }
+                pred = e;
+                e = next;
+            }
+            expungeStaleExceptions(); // requires the lock, which is held here
+            status = 0; // non-negative status: the task no longer reads as done (isDone() is status < 0)
+        } finally {
+            lock.unlock();
         }
-        setNormalCompletion();
-        return true;
     }
 
     /**
-     * Main execution method used by worker threads. Invokes
-     * base computation unless already complete
+     * Returns a rethrowable exception for the given task, if
+     * available. To provide accurate stack traces, if the exception
+     * was not thrown by the current thread, we try to create a new
+     * exception of the same type as the one thrown, but with the
+     * recorded exception as its cause. If there is no such
+     * constructor, we instead try to use a no-arg constructor,
+     * followed by initCause, to the same effect. If none of these
+     * apply, or any fail due to other exceptions, we return the
+     * recorded exception, which is still correct, although it may
+     * contain a misleading stack trace.
+     *
+     * @return the exception, or null if none
      */
-    final void quietlyExec() {
-        if (status >= 0) {
+    private Throwable getThrowableException() {
+        if ((status & DONE_MASK) != EXCEPTIONAL)
+            return null; // task did not complete exceptionally
+        int h = System.identityHashCode(this);
+        ExceptionNode e;
+        final ReentrantLock lock = exceptionTableLock;
+        lock.lock();
+        try {
+            expungeStaleExceptions(); // requires the lock, which is held here
+            ExceptionNode[] t = exceptionTable;
+            e = t[h & (t.length - 1)];
+            while (e != null && e.get() != this) // walk the bucket chain looking for this task's node
+                e = e.next;
+        } finally {
+            lock.unlock();
+        }
+        Throwable ex;
+        if (e == null || (ex = e.ex) == null)
+            return null; // nothing recorded for this task
+        if (e.thrower != Thread.currentThread().getId()) { // recorded by another thread: try to wrap it in a fresh exception
+            Class<? extends Throwable> ec = ex.getClass();
             try {
-                if (!exec())
-                    return;
-            } catch(Throwable rex) {
-                setDoneExceptionally(rex);
-                return;
+                Constructor<?> noArgCtor = null;
+                Constructor<?>[] cs = ec.getConstructors();// public ctors only
+                for (int i = 0; i < cs.length; ++i) {
+                    Constructor<?> c = cs[i];
+                    Class<?>[] ps = c.getParameterTypes();
+                    if (ps.length == 0)
+                        noArgCtor = c; // remembered as the fallback
+                    else if (ps.length == 1 && ps[0] == Throwable.class)
+                        return (Throwable)(c.newInstance(ex)); // preferred: same type, recorded exception as cause
+                }
+                if (noArgCtor != null) {
+                    Throwable wx = (Throwable)(noArgCtor.newInstance());
+                    wx.initCause(ex); // fallback: attach the recorded exception as the cause
+                    return wx;
+                }
+            } catch (Exception ignore) { // reflection failed: fall through and return the recorded exception
             }
-            setNormalCompletion();
         }
+        return ex; // same thread, or wrapping was not possible
     }
 
     /**
-     * Calls exec, recording but not rethrowing exception
-     * Caller should normally check status before calling
-     * @return true if completed normally
+     * Poll stale refs and remove them. Call only while holding lock.
      */
-    private boolean tryQuietlyInvoke() {
-        try {
-            if (!exec())
-                return false;
-        } catch (Throwable rex) {
-            setDoneExceptionally(rex);
-            return false;
+    private static void expungeStaleExceptions() {
+        for (Object x; (x = exceptionTableRefQueue.poll()) != null;) {
+            if (x instanceof ExceptionNode) {
+                // An enqueued weak reference is already cleared, so get() is
+                // null here and cannot be rehashed; search every bucket for x.
+                ExceptionNode[] t = exceptionTable;
+                boolean found = false;
+                for (int i = 0; i < t.length && !found; ++i) {
+                    ExceptionNode pred = null;
+                    for (ExceptionNode e = t[i]; e != null; e = e.next) {
+                        if (e == x) {
+                            if (pred == null)
+                                t[i] = e.next; // x was the bucket head
+                            else
+                                pred.next = e.next; // splice x out of the chain
+                            found = true;
+                            break;
+                        }
+                        pred = e;
+                    }
+                }
+            }
         }
-        setNormalCompletion();
-        return true;
     }
 
     /**
-     * Cancel, ignoring any exceptions it throws
+     * If lock is available, poll stale refs and remove them.
+     * Called from ForkJoinPool when pools become quiescent.
      */
-    final void cancelIgnoringExceptions() {
-        try {
-            cancel(false);
-        } catch(Throwable ignore) {
+    static final void helpExpungeStaleExceptions() {
+        final ReentrantLock lock = exceptionTableLock;
+        if (lock.tryLock()) { // non-blocking: skip the cleanup when another thread holds the lock
+            try {
+                expungeStaleExceptions();
+            } finally {
+                lock.unlock();
+            }
         }
     }
 
     /**
-     * Main implementation of helpJoin
+     * Throws exception, if any, associated with the given status.
      */
-    private int busyJoin(ForkJoinWorkerThread w) {
-        int s;
-        ForkJoinTask<?> t;
-        while ((s = status) >= 0 && (t = w.scanWhileJoining(this)) != null)
-            t.quietlyExec();
-        return (s >= 0)? awaitDone(w, false) : s; // block if no work
+    private void reportException(int s) {
+        Throwable ex = ((s == CANCELLED) ?  new CancellationException() :
+                        (s == EXCEPTIONAL) ? getThrowableException() : // may be null if no exception is recorded
+                        null); // any other status: nothing to throw
+        if (ex != null)
+            U.throwException(ex); // NOTE(review): U is declared outside this excerpt; presumably an Unsafe instance used to throw without a throws clause - confirm
     }
 
     // public methods
@@ -472,70 +599,111 @@ public abstract class ForkJoinTask<V> implements Future<V>, Serializable {
     /**
      * Arranges to asynchronously execute this task.  While it is not
      * necessarily enforced, it is a usage error to fork a task more
-     * than once unless it has completed and been reinitialized.  This
-     * method may be invoked only from within ForkJoinTask
-     * computations. Attempts to invoke in other contexts result in
-     * exceptions or errors possibly including ClassCastException.
+     * than once unless it has completed and been reinitialized.
+     * Subsequent modifications to the state of this task or any data
+     * it operates on are not necessarily consistently observable by
+     * any thread other than the one executing it unless preceded by a
+     * call to {@link #join} or related methods, or a call to {@link
+     * #isDone} returning {@code true}.
+     *
+     * <p>This method may be invoked only from within {@code
+     * ForkJoinPool} computations (as may be determined using method
+     * {@link #inForkJoinPool}).  Attempts to invoke in other contexts
+     * result in exceptions or errors, possibly including {@code
+     * ClassCastException}.
+     *
+     * @return {@code this}, to simplify usage
      */
-    public final void fork() {
-        ((ForkJoinWorkerThread)(Thread.currentThread())).pushTask(this);
+    public final ForkJoinTask<V> fork() {
+        ((ForkJoinWorkerThread)Thread.currentThread()).workQueue.push(this); // the cast fails with ClassCastException when the caller is not a ForkJoinWorkerThread
+        return this; // returned so calls can be chained
     }
 
     /**
-     * Returns the result of the computation when it is ready.
-     * This method differs from <code>get</code> in that abnormal
-     * completion results in RuntimeExceptions or Errors, not
-     * ExecutionExceptions.
+     * Returns the result of the computation when it {@link #isDone is
+     * done}.  This method differs from {@link #get()} in that
+     * abnormal completion results in {@code RuntimeException} or
+     * {@code Error}, not {@code ExecutionException}, and that
+     * interrupts of the calling thread do <em>not</em> cause the
+     * method to abruptly return by throwing {@code
+     * InterruptedException}.
      *
      * @return the computed result
      */
     public final V join() {
-        ForkJoinWorkerThread w = getWorker();
-        if (w == null || status < 0 || !w.unpushTask(this) || !tryExec())
-            reportException(awaitDone(w, true));
+        int s;
+        if ((s = doJoin() & DONE_MASK) != NORMAL) // doJoin's result is masked with DONE_MASK before the comparison
+            reportException(s); // throws for CANCELLED, and for EXCEPTIONAL when an exception is available
         return getRawResult();
     }
 
     /**
      * Commences performing this task, awaits its completion if
-     * necessary, and return its result.
-     * @throws Throwable (a RuntimeException, Error, or unchecked
-     * exception) if the underlying computation did so.
+     * necessary, and returns its result, or throws an (unchecked)
+     * {@code RuntimeException} or {@code Error} if the underlying
+     * computation did so.
+     *
      * @return the computed result
      */
     public final V invoke() {
-        if (status >= 0 && tryExec())
-            return getRawResult();
-        else
-            return join();
+        int s;
+        if ((s = doInvoke() & DONE_MASK) != NORMAL) // doInvoke's result is masked with DONE_MASK before the comparison
+            reportException(s); // throws for CANCELLED, and for EXCEPTIONAL when an exception is available
+        return getRawResult();
     }
 
     /**
-     * Forks both tasks, returning when <code>isDone</code> holds for
-     * both of them or an exception is encountered. This method may be
-     * invoked only from within ForkJoinTask computations. Attempts to
-     * invoke in other contexts result in exceptions or errors
-     * possibly including ClassCastException.
-     * @param t1 one task
-     * @param t2 the other task
-     * @throws NullPointerException if t1 or t2 are null
-     * @throws RuntimeException or Error if either task did so.
+     * Forks the given tasks, returning when {@code isDone} holds for
+     * each task or an (unchecked) exception is encountered, in which
+     * case the exception is rethrown. If more than one task
+     * encounters an exception, then this method throws any one of
+     * these exceptions. If any task encounters an exception, the
+     * other may be cancelled. However, the execution status of
+     * individual tasks is not guaranteed upon exceptional return. The
+     * status of each task may be obtained using {@link
+     * #getException()} and related methods to check if they have been
+     * cancelled, completed normally or exceptionally, or left
+     * unprocessed.
+     *
+     * <p>This method may be invoked only from within {@code
+     * ForkJoinPool} computations (as may be determined using method
+     * {@link #inForkJoinPool}).  Attempts to invoke in other contexts
+     * result in exceptions or errors, possibly including {@code
+     * ClassCastException}.
+     *
+     * @param t1 the first task
+     * @param t2 the second task
+     * @throws NullPointerException if any task is null
      */
-    public static void invokeAll(ForkJoinTask<?>t1, ForkJoinTask<?> t2) {
+    public static void invokeAll(ForkJoinTask<?> t1, ForkJoinTask<?> t2) {
+        int s1, s2;
         t2.fork();
-        t1.invoke();
-        t2.join();
+        if ((s1 = t1.doInvoke() & DONE_MASK) != NORMAL) // t1 is invoked via doInvoke after t2 has been forked
+            t1.reportException(s1); // if this throws, t2 is not joined here
+        if ((s2 = t2.doJoin() & DONE_MASK) != NORMAL)
+            t2.reportException(s2);
     }
 
     /**
-     * Forks the given tasks, returning when <code>isDone</code> holds
-     * for all of them. If any task encounters an exception, others
-     * may be cancelled.  This method may be invoked only from within
-     * ForkJoinTask computations. Attempts to invoke in other contexts
-     * result in exceptions or errors possibly including ClassCastException.
-     * @param tasks the array of tasks
-     * @throws NullPointerException if tasks or any element are null.
-     * @throws RuntimeException or Error if any task did so.
+     * Forks the given tasks, returning when {@code isDone} holds for
+     * each task or an (unchecked) exception is encountered, in which
+     * case the exception is rethrown. If more than one task
+     * encounters an exception, then this method throws any one of
+     * these exceptions. If any task encounters an exception, others
+     * may be cancelled. However, the execution status of individual
+     * tasks is not guaranteed upon exceptional return. The status of
+     * each task may be obtained using {@link #getException()} and
+     * related methods to check if they have been cancelled, completed
+     * normally or exceptionally, or left unprocessed.
+     *
+     * <p>This method may be invoked only from within {@code
+     * ForkJoinPool} computations (as may be determined using method
+     * {@link #inForkJoinPool}).  Attempts to invoke in other contexts
+     * result in exceptions or errors, possibly including {@code
+     * ClassCastException}.
+     *
+     * @param tasks the tasks
+     * @throws NullPointerException if any task is null
      */
     public static void invokeAll(ForkJoinTask<?>... tasks) {
         Throwable ex = null;
@@ -548,46 +716,53 @@ public abstract class ForkJoinTask<V> implements Future<V>, Serializable {
             }
             else if (i != 0)
                 t.fork();
-            else {
-                t.quietlyInvoke();
-                if (ex == null)
-                    ex = t.getException();
-            }
+            else if (t.doInvoke() < NORMAL && ex == null)
+                ex = t.getException();
         }
         for (int i = 1; i <= last; ++i) {
             ForkJoinTask<?> t = tasks[i];
             if (t != null) {
                 if (ex != null)
                     t.cancel(false);
-                else {
-                    t.quietlyJoin();
-                    if (ex == null)
-                        ex = t.getException();
-                }
+                else if (t.doJoin() < NORMAL)
+                    ex = t.getException();
             }
         }
         if (ex != null)
-            rethrowException(ex);
+            U.throwException(ex);
     }
 
     /**
-     * Forks all tasks in the collection, returning when
-     * <code>isDone</code> holds for all of them. If any task
-     * encounters an exception, others may be cancelled.  This method
-     * may be invoked only from within ForkJoinTask
-     * computations. Attempts to invoke in other contexts resul!t in
-     * exceptions or errors possibly including ClassCastException.
+     * Forks all tasks in the specified collection, returning when
+     * {@code isDone} holds for each task or an (unchecked) exception
+     * is encountered, in which case the exception is rethrown. If
+     * more than one task encounters an exception, then this method
+     * throws any one of these exceptions. If any task encounters an
+     * exception, others may be cancelled. However, the execution
+     * status of individual tasks is not guaranteed upon exceptional
+     * return. The status of each task may be obtained using {@link
+     * #getException()} and related methods to check if they have been
+     * cancelled, completed normally or exceptionally, or left
+     * unprocessed.
+     *
+     * <p>This method may be invoked only from within {@code
+     * ForkJoinPool} computations (as may be determined using method
+     * {@link #inForkJoinPool}).  Attempts to invoke in other contexts
+     * result in exceptions or errors, possibly including {@code
+     * ClassCastException}.
+     *
      * @param tasks the collection of tasks
-     * @throws NullPointerException if tasks or any element are null.
-     * @throws RuntimeException or Error if any task did so.
+     * @return the tasks argument, to simplify usage
+     * @throws NullPointerException if tasks or any element are null
      */
-    public static void invokeAll(Collection<? extends ForkJoinTask<?>> tasks) {
-        if (!(tasks instanceof List)) {
-            invokeAll(tasks.toArray(new ForkJoinTask[tasks.size()]));
-            return;
+    public static <T extends ForkJoinTask<?>> Collection<T> invokeAll(Collection<T> tasks) {
+        if (!(tasks instanceof RandomAccess) || !(tasks instanceof List<?>)) {
+            invokeAll(tasks.toArray(new ForkJoinTask<?>[tasks.size()]));
+            return tasks;
         }
+        @SuppressWarnings("unchecked")
         List<? extends ForkJoinTask<?>> ts =
-            (List<? extends ForkJoinTask<?>>)tasks;
+            (List<? extends ForkJoinTask<?>>) tasks;
         Throwable ex = null;
         int last = ts.size() - 1;
         for (int i = last; i >= 0; --i) {
@@ -598,253 +773,326 @@ public abstract class ForkJoinTask<V> implements Future<V>, Serializable {
             }
             else if (i != 0)
                 t.fork();
-            else {
-                t.quietlyInvoke();
-                if (ex == null)
-                    ex = t.getException();
-            }
+            else if (t.doInvoke() < NORMAL && ex == null)
+                ex = t.getException();
         }
         for (int i = 1; i <= last; ++i) {
             ForkJoinTask<?> t = ts.get(i);
             if (t != null) {
                 if (ex != null)
                     t.cancel(false);
-                else {
-                    t.quietlyJoin();
-                    if (ex == null)
-                        ex = t.getException();
-                }
+                else if (t.doJoin() < NORMAL)
+                    ex = t.getException();
             }
         }
         if (ex != null)
-            rethrowException(ex);
+            U.throwException(ex);
+        return tasks;
     }
 
     /**
-     * Returns true if the computation performed by this task has
-     * completed (or has been cancelled).
-     * @return true if this computation has completed
+     * Attempts to cancel execution of this task. This attempt will
+     * fail if the task has already completed or could not be
+     * cancelled for some other reason. If successful, and this task
+     * has not started when {@code cancel} is called, execution of
+     * this task is suppressed. After this method returns
+     * successfully, unless there is an intervening call to {@link
+     * #reinitialize}, subsequent calls to {@link #isCancelled},
+     * {@link #isDone}, and {@code cancel} will return {@code true}
+     * and calls to {@link #join} and related methods will result in
+     * {@code CancellationException}.
+     *
+     * <p>This method may be overridden in subclasses, but if so, must
+     * still ensure that these properties hold. In particular, the
+     * {@code cancel} method itself must not throw exceptions.
+     *
+     * <p>This method is designed to be invoked by <em>other</em>
+     * tasks. To terminate the current task, you can just return or
+     * throw an unchecked exception from its computation method, or
+     * invoke {@link #completeExceptionally}.
+     *
+     * @param mayInterruptIfRunning this value has no effect in the
+     * default implementation because interrupts are not used to
+     * control cancellation.
+     *
+     * @return {@code true} if this task is now cancelled
      */
+    public boolean cancel(boolean mayInterruptIfRunning) {
+        return (setCompletion(CANCELLED) & DONE_MASK) == CANCELLED; // true only when the status returned by setCompletion is CANCELLED; mayInterruptIfRunning is not read
+    }
+
     public final boolean isDone() {
         return status < 0;
     }
 
-    /**
-     * Returns true if this task was cancelled.
-     * @return true if this task was cancelled
-     */
     public final boolean isCancelled() {
-        return (status & COMPLETION_MASK) == CANCELLED;
+        return (status & DONE_MASK) == CANCELLED; // DONE_MASK takes the place of the former COMPLETION_MASK
     }
 
     /**
-     * Asserts that the results of this task's computation will not be
-     * used. If a cancellation occurs before atempting to execute this
-     * task, then execution will be suppressed, <code>isCancelled</code>
-     * will report true, and <code>join</code> will result in a
-     * <code>CancellationException</code> being thrown. Otherwise, when
-     * cancellation races with completion, there are no guarantees
-     * about whether <code>isCancelled</code> will report true, whether
-     * <code>join</code> will return normally or via an exception, or
-     * whether these behaviors will remain consistent upon repeated
-     * invocation.
-     *
-     * <p>This method may be overridden in subclasses, but if so, must
-     * still ensure that these minimal properties hold. In particular,
-     * the cancel method itself must not throw exceptions.
-     *
-     * <p> This method is designed to be invoked by <em>other</em>
-     * tasks. To terminate the current task, you can just return or
-     * throw an unchecked exception from its computation method, or
-     * invoke <code>completeExceptionally</code>.
-     *
-     * @param mayInterruptIfRunning this value is ignored in the
-     * default implementation because tasks are not in general
-     * cancelled via interruption.
+     * Returns {@code true} if this task threw an exception or was cancelled.
      *
-     * @return true if this task is now cancelled
+     * @return {@code true} if this task threw an exception or was cancelled
      */
-    public boolean cancel(boolean mayInterruptIfRunning) {
-        setCompletion(CANCELLED);
-        return (status & COMPLETION_MASK) == CANCELLED;
+    public final boolean isCompletedAbnormally() {
+        return status < NORMAL; // NOTE(review): unmasked comparison; relies on the numeric ordering of the status constants, which are defined outside this excerpt
     }
 
     /**
-     * Returns true if this task threw an exception or was cancelled
-     * @return true if this task threw an exception or was cancelled
+     * Returns {@code true} if this task completed without throwing an
+     * exception and was not cancelled.
+     *
+     * @return {@code true} if this task completed without throwing an
+     * exception and was not cancelled
      */
-    public final boolean isCompletedAbnormally() {
-        return (status & COMPLETION_MASK) < NORMAL;
+    public final boolean isCompletedNormally() {
+        return (status & DONE_MASK) == NORMAL; // status is masked with DONE_MASK before the equality test
     }
 
     /**
      * Returns the exception thrown by the base computation, or a
-     * CancellationException if cancelled, or null if none or if the
-     * method has not yet completed.
-     * @return the exception, or null if none
+     * {@code CancellationException} if cancelled, or {@code null} if
+     * none or if the method has not yet completed.
+     *
+     * @return the exception, or {@code null} if none
      */
     public final Throwable getException() {
-        int s = status & COMPLETION_MASK;
-        if (s >= NORMAL)
-            return null;
-        if (s == CANCELLED)
-            return new CancellationException();
-        return exceptionMap.get(this);
+        int s = status & DONE_MASK; // status is read once and masked
+        return ((s >= NORMAL)    ? null : // no exception to report
+                (s == CANCELLED) ? new CancellationException() : // a fresh instance on every call
+                getThrowableException()); // looked up in the exception table; may be null
     }
 
     /**
      * Completes this task abnormally, and if not already aborted or
      * cancelled, causes it to throw the given exception upon
-     * <code>join</code> and related operations. This method may be used
+     * {@code join} and related operations. This method may be used
      * to induce exceptions in asynchronous tasks, or to force
      * completion of tasks that would not otherwise complete.  Its use
-     * in other situations is likely to be wrong.  This method is
-     * overridable, but overridden versions must invoke <code>super</code>
+     * in other situations is discouraged.  This method is
+     * overridable, but overridden versions must invoke {@code super}
      * implementation to maintain guarantees.
      *
-     * @param ex the exception to throw. If this exception is
-     * not a RuntimeException or Error, the actual exception thrown
-     * will be a RuntimeException with cause ex.
+     * @param ex the exception to throw. If this exception is not a
+     * {@code RuntimeException} or {@code Error}, the actual exception
+     * thrown will be a {@code RuntimeException} with cause {@code ex}.
      */
     public void completeExceptionally(Throwable ex) {
-        setDoneExceptionally((ex instanceof RuntimeException) ||
-                             (ex instanceof Error)? ex :
-                             new RuntimeException(ex));
+        setExceptionalCompletion((ex instanceof RuntimeException) ||
+                                 (ex instanceof Error) ? ex : // unchecked throwables are recorded as-is
+                                 new RuntimeException(ex)); // anything else is wrapped, with ex as the cause
     }
 
     /**
      * Completes this task, and if not already aborted or cancelled,
-     * returning a <code>null</code> result upon <code>join</code> and related
-     * operations. This method may be used to provide results for
-     * asynchronous tasks, or to provide alternative handling for
-     * tasks that would not otherwise complete normally. Its use in
-     * other situations is likely to be wrong. This method is
-     * overridable, but overridden versions must invoke <code>super</code>
-     * implementation to maintain guarantees.
+     * returning the given value as the result of subsequent
+     * invocations of {@code join} and related operations. This method
+     * may be used to provide results for asynchronous tasks, or to
+     * provide alternative handling for tasks that would not otherwise
+     * complete normally. Its use in other situations is
+     * discouraged. This method is overridable, but overridden
+     * versions must invoke {@code super} implementation to maintain
+     * guarantees.
      *
-     * @param value the result value for this task.
+     * @param value the result value for this task
      */
     public void complete(V value) {
         try {
             setRawResult(value);
-        } catch(Throwable rex) {
-            setDoneExceptionally(rex);
+        } catch (Throwable rex) {
+            setExceptionalCompletion(rex); // setRawResult threw: record it instead of completing normally
             return;
         }
-        setNormalCompletion();
-    }
-
-    public final V get() throws InterruptedException, ExecutionException {
-        ForkJoinWorkerThread w = getWorker();
-        if (w == null || status < 0 || !w.unpushTask(this) || !tryQuietlyInvoke())
-            awaitDone(w, true);
-        return reportFutureResult();
-    }
-
-    public final V get(long timeout, TimeUnit unit)
-        throws InterruptedException, ExecutionException, TimeoutException {
-        ForkJoinWorkerThread w = getWorker();
-        if (w == null || status < 0 || !w.unpushTask(this) || !tryQuietlyInvoke())
-            awaitDone(w, unit.toNanos(timeout));
-        return reportTimedFutureResult();
+        setCompletion(NORMAL); // reached only when setRawResult returned without throwing
     }
 
     /**
-     * Possibly executes other tasks until this task is ready, then
-     * returns the result of the computation.  This method may be more
-     * efficient than <code>join</code>, but is only applicable when
-     * there are no potemtial dependencies between continuation of the
-     * current task and that of any other task that might be executed
-     * while helping. (This usually holds for pure divide-and-conquer
-     * tasks). This method may be invoked only from within
-     * ForkJoinTask computations. Attempts to invoke in other contexts
-     * resul!t in exceptions or errors possibly including ClassCastException.
+     * Waits if necessary for the computation to complete, and then
+     * retrieves its result.
+     *
      * @return the computed result
+     * @throws CancellationException if the computation was cancelled
+     * @throws ExecutionException if the computation threw an
+     * exception
+     * @throws InterruptedException if the current thread is not a
+     * member of a ForkJoinPool and was interrupted while waiting
      */
-    public final V helpJoin() {
-        ForkJoinWorkerThread w = (ForkJoinWorkerThread)(Thread.currentThread());
-        if (status < 0 || !w.unpushTask(this) || !tryExec())
-            reportException(busyJoin(w));
+    public final V get() throws InterruptedException, ExecutionException {
+        int s = (Thread.currentThread() instanceof ForkJoinWorkerThread) ?
+            doJoin() : externalInterruptibleAwaitDone(); // worker threads use doJoin; all other callers take the externalInterruptibleAwaitDone path
+        Throwable ex;
+        if ((s &= DONE_MASK) == CANCELLED)
+            throw new CancellationException();
+        if (s == EXCEPTIONAL && (ex = getThrowableException()) != null)
+            throw new ExecutionException(ex); // the recorded exception becomes the cause
         return getRawResult();
     }
 
     /**
-     * Possibly executes other tasks until this task is ready.  This
-     * method may be invoked only from within ForkJoinTask
-     * computations. Attempts to invoke in other contexts resul!t in
-     * exceptions or errors possibly including ClassCastException.
+     * Waits if necessary for at most the given time for the computation
+     * to complete, and then retrieves its result, if available.
+     *
+     * @param timeout the maximum time to wait
+     * @param unit the time unit of the timeout argument
+     * @return the computed result
+     * @throws CancellationException if the computation was cancelled
+     * @throws ExecutionException if the computation threw an
+     * exception
+     * @throws InterruptedException if the current thread is not a
+     * member of a ForkJoinPool and was interrupted while waiting
+     * @throws TimeoutException if the wait timed out
      */
-    public final void quietlyHelpJoin() {
-        if (status >= 0) {
-            ForkJoinWorkerThread w =
-                (ForkJoinWorkerThread)(Thread.currentThread());
-            if (!w.unpushTask(this) || !tryQuietlyInvoke())
-                busyJoin(w);
+    public final V get(long timeout, TimeUnit unit)
+        throws InterruptedException, ExecutionException, TimeoutException {
+        if (Thread.interrupted()) // checks and clears a pending interrupt, for every caller
+            throw new InterruptedException();
+        // Messy in part because we measure in nanosecs, but wait in millisecs
+        int s; long ns, ms;
+        if ((s = status) >= 0 && (ns = unit.toNanos(timeout)) > 0L) { // else skip waiting
+            long deadline = System.nanoTime() + ns;
+            ForkJoinPool p = null; // stays null unless the caller is a ForkJoinWorkerThread
+            ForkJoinPool.WorkQueue w = null;
+            Thread t = Thread.currentThread();
+            if (t instanceof ForkJoinWorkerThread) {
+                ForkJoinWorkerThread wt = (ForkJoinWorkerThread)t;
+                p = wt.pool;
+                w = wt.workQueue;
+                s = p.helpJoinOnce(w, this); // no retries on failure
+            }
+            boolean canBlock = false; // true once p == null or p.tryCompensate(...) succeeded
+            boolean interrupted = false; // only ever set for callers with p == null
+            try {
+                while ((s = status) >= 0) { // status is re-read on every pass
+                    if (w != null && w.runState < 0)
+                        cancelIgnoringExceptions(this);
+                    else if (!canBlock) {
+                        if (p == null || p.tryCompensate(this, null))
+                            canBlock = true;
+                    }
+                    else {
+                        if ((ms = TimeUnit.NANOSECONDS.toMillis(ns)) > 0L &&
+                            U.compareAndSwapInt(this, STATUS, s, s | SIGNAL)) {
+                            synchronized (this) {
+                                if (status >= 0) {
+                                    try {
+                                        wait(ms); // ms > 0 here, never wait(0)
+                                    } catch (InterruptedException ie) {
+                                        if (p == null) // worker-thread callers drop the interrupt
+                                            interrupted = true;
+                                    }
+                                }
+                                else
+                                    notifyAll(); // status now < 0: wake waiters, don't wait
+                            }
+                        }
+                        if ((s = status) < 0 || interrupted ||
+                            (ns = deadline - System.nanoTime()) <= 0L)
+                            break; // status < 0, interrupt recorded, or deadline passed
+                    }
+                }
+            } finally {
+                if (p != null && canBlock) // NOTE(review): presumably pairs with tryCompensate - confirm
+                    p.incrementActiveCount();
+            }
+            if (interrupted)
+                throw new InterruptedException();
+        }
+        if ((s &= DONE_MASK) != NORMAL) { // s is the masked status from here on
+            Throwable ex;
+            if (s == CANCELLED)
+                throw new CancellationException();
+            if (s != EXCEPTIONAL) // not NORMAL, CANCELLED or EXCEPTIONAL
+                throw new TimeoutException();
+            if ((ex = getThrowableException()) != null) // null: fall through to getRawResult()
+                throw new ExecutionException(ex);
         }
+        return getRawResult();
     }
 
     /**
-     * Joins this task, without returning its result or throwing an
+     * Joins this task, without returning its result or throwing its
      * exception. This method may be useful when processing
      * collections of tasks when some have been cancelled or otherwise
      * known to have aborted.
      */
     public final void quietlyJoin() {
-        if (status >= 0) {
-            ForkJoinWorkerThread w = getWorker();
-            if (w == null || !w.unpushTask(this) || !tryQuietlyInvoke())
-                awaitDone(w, true);
-        }
+        doJoin();
     }
 
     /**
      * Commences performing this task and awaits its completion if
-     * necessary, without returning its result or throwing an
-     * exception. This method may be useful when processing
-     * collections of tasks when some have been cancelled or otherwise
-     * known to have aborted.
+     * necessary, without returning its result or throwing its
+     * exception.
      */
     public final void quietlyInvoke() {
-        if (status >= 0 && !tryQuietlyInvoke())
-            quietlyJoin();
+        doInvoke();
     }
 
     /**
      * Possibly executes tasks until the pool hosting the current task
-     * {@link ForkJoinPool#isQuiescent}. This method may be of use in
-     * designs in which many tasks are forked, but none are explicitly
-     * joined, instead executing them until all are processed.
+     * {@link ForkJoinPool#isQuiescent is quiescent}. This method may
+     * be of use in designs in which many tasks are forked, but none
+     * are explicitly joined, instead executing them until all are
+     * processed.
+     *
+     * <p>This method may be invoked only from within {@code
+     * ForkJoinPool} computations (as may be determined using method
+     * {@link #inForkJoinPool}).  Attempts to invoke in other contexts
+     * result in exceptions or errors, possibly including {@code
+     * ClassCastException}.
      */
     public static void helpQuiesce() {
-        ((ForkJoinWorkerThread)(Thread.currentThread())).
-            helpQuiescePool();
+        ForkJoinWorkerThread wt =
+            (ForkJoinWorkerThread)Thread.currentThread();
+        wt.pool.helpQuiescePool(wt.workQueue);
     }
 
     /**
      * Resets the internal bookkeeping state of this task, allowing a
-     * subsequent <code>fork</code>. This method allows repeated reuse of
+     * subsequent {@code fork}. This method allows repeated reuse of
      * this task, but only if reuse occurs when this task has either
      * never been forked, or has been forked, then completed and all
      * outstanding joins of this task have also completed. Effects
-     * under any other usage conditions are not guaranteed, and are
-     * almost surely wrong. This method may be useful when executing
+     * under any other usage conditions are not guaranteed.
+     * This method may be useful when executing
      * pre-constructed trees of subtasks in loops.
+     *
+     * <p>Upon completion of this method, {@code isDone()} reports
+     * {@code false}, and {@code getException()} reports {@code
+     * null}. However, the value returned by {@code getRawResult} is
+     * unaffected. To clear this value, you can invoke {@code
+     * setRawResult(null)}.
      */
     public void reinitialize() {
-        if ((status & COMPLETION_MASK) == EXCEPTIONAL)
-            exceptionMap.remove(this);
-        status = 0;
+        if ((status & DONE_MASK) == EXCEPTIONAL)
+            clearExceptionalCompletion();
+        else
+            status = 0;
     }
 
     /**
      * Returns the pool hosting the current task execution, or null
-     * if this task is executing outside of any pool.
-     * @return the pool, or null if none.
+     * if this task is executing outside of any ForkJoinPool.
+     *
+     * @see #inForkJoinPool
+     * @return the pool, or {@code null} if none
      */
     public static ForkJoinPool getPool() {
         Thread t = Thread.currentThread();
-        return ((t instanceof ForkJoinWorkerThread)?
-                ((ForkJoinWorkerThread)t).pool : null);
+        return (t instanceof ForkJoinWorkerThread) ?
+            ((ForkJoinWorkerThread) t).pool : null;
+    }
+
+    /**
+     * Returns {@code true} if the current thread is a {@link
+     * ForkJoinWorkerThread} executing as a ForkJoinPool computation.
+     *
+     * @return {@code true} if the current thread is a {@link
+     * ForkJoinWorkerThread} executing as a ForkJoinPool computation,
+     * or {@code false} otherwise
+     */
+    public static boolean inForkJoinPool() { // same instanceof test used by get() and getPool()
+        return Thread.currentThread() instanceof ForkJoinWorkerThread;
     }
 
     /**
@@ -853,13 +1101,19 @@ public abstract class ForkJoinTask<V> implements Future<V>, Serializable {
      * by the current thread, and has not commenced executing in
      * another thread.  This method may be useful when arranging
      * alternative local processing of tasks that could have been, but
-     * were not, stolen. This method may be invoked only from within
-     * ForkJoinTask computations. Attempts to invoke in other contexts
-     * result in exceptions or errors possibly including ClassCastException.
-     * @return true if unforked
+     * were not, stolen.
+     *
+     * <p>This method may be invoked only from within {@code
+     * ForkJoinPool} computations (as may be determined using method
+     * {@link #inForkJoinPool}).  Attempts to invoke in other contexts
+     * result in exceptions or errors, possibly including {@code
+     * ClassCastException}.
+     *
+     * @return {@code true} if unforked
      */
     public boolean tryUnfork() {
-        return ((ForkJoinWorkerThread)(Thread.currentThread())).unpushTask(this);
+        return ((ForkJoinWorkerThread)Thread.currentThread())
+            .workQueue.tryUnpush(this);
     }
 
     /**
@@ -867,15 +1121,22 @@ public abstract class ForkJoinTask<V> implements Future<V>, Serializable {
      * forked by the current worker thread but not yet executed. This
      * value may be useful for heuristic decisions about whether to
      * fork other tasks.
+     *
+     * <p>This method may be invoked only from within {@code
+     * ForkJoinPool} computations (as may be determined using method
+     * {@link #inForkJoinPool}).  Attempts to invoke in other contexts
+     * result in exceptions or errors, possibly including {@code
+     * ClassCastException}.
+     *
      * @return the number of tasks
      */
     public static int getQueuedTaskCount() {
-        return ((ForkJoinWorkerThread)(Thread.currentThread())).
-            getQueueSize();
+        return ((ForkJoinWorkerThread) Thread.currentThread())
+            .workQueue.queueSize();
     }
 
     /**
-     * Returns a estimate of how many more locally queued tasks are
+     * Returns an estimate of how many more locally queued tasks are
      * held by the current worker thread than there are other worker
      * threads that might steal them.  This value may be useful for
      * heuristic decisions about whether to fork other tasks. In many
@@ -883,23 +1144,74 @@ public abstract class ForkJoinTask<V> implements Future<V>, Serializable {
      * aim to maintain a small constant surplus (for example, 3) of
      * tasks, and to process computations locally if this threshold is
      * exceeded.
+     *
+     * <p>This method may be invoked only from within {@code
+     * ForkJoinPool} computations (as may be determined using method
+     * {@link #inForkJoinPool}).  Attempts to invoke in other contexts
+     * result in exceptions or errors, possibly including {@code
+     * ClassCastException}.
+     *
      * @return the surplus number of tasks, which may be negative
      */
     public static int getSurplusQueuedTaskCount() {
-        return ((ForkJoinWorkerThread)(Thread.currentThread()))
-            .getEstimatedSurplusTaskCount();
+        /*
+         * The aim of this method is to return a cheap heuristic guide
+         * for task partitioning when programmers, frameworks, tools,
+         * or languages have little or no idea about task granularity.
+         * In essence by offering this method, we ask users only about
+         * tradeoffs in overhead vs expected throughput and its
+         * variance, rather than how finely to partition tasks.
+         *
+         * In a steady state strict (tree-structured) computation,
+         * each thread makes available for stealing enough tasks for
+         * other threads to remain active. Inductively, if all threads
+         * play by the same rules, each thread should make available
+         * only a constant number of tasks.
+         *
+         * The minimum useful constant is just 1. But using a value of
+         * 1 would require immediate replenishment upon each steal to
+         * maintain enough tasks, which is infeasible.  Further,
+         * partitionings/granularities of offered tasks should
+         * minimize steal rates, which in general means that threads
+         * nearer the top of the computation tree should generate more
+         * than those nearer the bottom. In perfect steady state, each
+         * thread is at approximately the same level of the computation
+         * tree. However, producing extra tasks amortizes the
+         * uncertainty of progress and diffusion assumptions.
+         *
+         * So, users will want to use values larger, but not much
+         * larger, than 1 to both smooth over transient shortages and
+         * hedge against uneven progress, as traded off against the
+         * cost of extra task overhead. We leave the user to pick a
+         * threshold value to compare with the results of this call to
+         * guide decisions, but recommend values such as 3.
+         *
+         * When all threads are active, it is on average OK to
+         * estimate surplus strictly locally. In steady-state, if one
+         * thread is maintaining say 2 surplus tasks, then so are
+         * others. So we can just use estimated queue length.
+         * However, this strategy alone leads to serious mis-estimates
+         * in some non-steady-state conditions (ramp-up, ramp-down,
+         * other stalls). We can detect many of these by further
+         * considering the number of "idle" threads that are known to
+         * have zero queued tasks, and so compensate by a factor of
+         * (#idle/#active) threads.
+         */
+        ForkJoinWorkerThread wt =
+            (ForkJoinWorkerThread)Thread.currentThread();
+        return wt.workQueue.queueSize() - wt.pool.idlePerActive();
     }
 
     // Extension methods
 
     /**
-     * Returns the result that would be returned by <code>join</code>,
-     * even if this task completed abnormally, or null if this task is
-     * not known to have been completed.  This method is designed to
-     * aid debugging, as well as to support extensions. Its use in any
-     * other context is discouraged.
+     * Returns the result that would be returned by {@link #join}, even
+     * if this task completed abnormally, or {@code null} if this task
+     * is not known to have been completed.  This method is designed
+     * to aid debugging, as well as to support extensions. Its use in
+     * any other context is discouraged.
      *
-     * @return the result, or null if not completed.
+     * @return the result, or {@code null} if not completed
      */
     public abstract V getRawResult();
 
@@ -918,42 +1230,52 @@ public abstract class ForkJoinTask<V> implements Future<V>, Serializable {
      * called otherwise. The return value controls whether this task
      * is considered to be done normally. It may return false in
      * asynchronous actions that require explicit invocations of
-     * <code>complete</code> to become joinable. It may throw exceptions
-     * to indicate abnormal exit.
-     * @return true if completed normally
-     * @throws Error or RuntimeException if encountered during computation
+     * {@link #complete} to become joinable. It may also throw an
+     * (unchecked) exception to indicate abnormal exit.
+     *
+     * @return {@code true} if completed normally
      */
     protected abstract boolean exec();
 
     /**
-     * Returns, but does not unschedule or execute, the task queued by
-     * the current thread but not yet executed, if one is
+     * Returns, but does not unschedule or execute, a task queued by
+     * the current thread but not yet executed, if one is immediately
      * available. There is no guarantee that this task will actually
-     * be polled or executed next.  This method is designed primarily
-     * to support extensions, and is unlikely to be useful otherwise.
-     * This method may be invoked only from within ForkJoinTask
-     * computations. Attempts to invoke in other contexts result in
-     * exceptions or errors possibly including ClassCastException.
+     * be polled or executed next. Conversely, this method may return
+     * null even if a task exists but cannot be accessed without
+     * contention with other threads.  This method is designed
+     * primarily to support extensions, and is unlikely to be useful
+     * otherwise.
+     *
+     * <p>This method may be invoked only from within {@code
+     * ForkJoinPool} computations (as may be determined using method
+     * {@link #inForkJoinPool}).  Attempts to invoke in other contexts
+     * result in exceptions or errors, possibly including {@code
+     * ClassCastException}.
      *
-     * @return the next task, or null if none are available
+     * @return the next task, or {@code null} if none are available
      */
     protected static ForkJoinTask<?> peekNextLocalTask() {
-        return ((ForkJoinWorkerThread)(Thread.currentThread())).peekTask();
+        return ((ForkJoinWorkerThread) Thread.currentThread()).workQueue.peek();
     }
 
     /**
      * Unschedules and returns, without executing, the next task
      * queued by the current thread but not yet executed.  This method
      * is designed primarily to support extensions, and is unlikely to
-     * be useful otherwise.  This method may be invoked only from
-     * within ForkJoinTask computations. Attempts to invoke in other
-     * contexts result in exceptions or errors possibly including
-     * ClassCastException.
+     * be useful otherwise.
+     *
+     * <p>This method may be invoked only from within {@code
+     * ForkJoinPool} computations (as may be determined using method
+     * {@link #inForkJoinPool}).  Attempts to invoke in other contexts
+     * result in exceptions or errors, possibly including {@code
+     * ClassCastException}.
      *
-     * @return the next task, or null if none are available
+     * @return the next task, or {@code null} if none are available
      */
     protected static ForkJoinTask<?> pollNextLocalTask() {
-        return ((ForkJoinWorkerThread)(Thread.currentThread())).pollLocalTask();
+        return ((ForkJoinWorkerThread) Thread.currentThread())
+            .workQueue.nextLocalTask();
     }
 
     /**
@@ -961,19 +1283,170 @@ public abstract class ForkJoinTask<V> implements Future<V>, Serializable {
      * queued by the current thread but not yet executed, if one is
      * available, or if not available, a task that was forked by some
      * other thread, if available. Availability may be transient, so a
-     * <code>null</code> result does not necessarily imply quiecence
+     * {@code null} result does not necessarily imply quiescence
      * of the pool this task is operating in.  This method is designed
      * primarily to support extensions, and is unlikely to be useful
-     * otherwise.  This method may be invoked only from within
-     * ForkJoinTask computations. Attempts to invoke in other contexts
-     * result in exceptions or errors possibly including
-     * ClassCastException.
+     * otherwise.
      *
-     * @return a task, or null if none are available
+     * <p>This method may be invoked only from within {@code
+     * ForkJoinPool} computations (as may be determined using method
+     * {@link #inForkJoinPool}).  Attempts to invoke in other contexts
+     * result in exceptions or errors, possibly including {@code
+     * ClassCastException}.
+     *
+     * @return a task, or {@code null} if none are available
      */
     protected static ForkJoinTask<?> pollTask() {
-        return ((ForkJoinWorkerThread)(Thread.currentThread())).
-            pollTask();
+        ForkJoinWorkerThread wt =
+            (ForkJoinWorkerThread)Thread.currentThread();
+        return wt.pool.nextTaskFor(wt.workQueue);
+    }
+
+    // Mark-bit operations
+
+    /**
+     * Returns true if this task is marked.
+     *
+     * @return true if this task is marked
+     * @since 1.8
+     */
+    public final boolean isMarkedForkJoinTask() {
+        return (status & MARKED) != 0; // single read of status; no CAS involved
+    }
+
+    /**
+     * Atomically sets the mark on this task.
+     *
+     * @return true if this task was previously unmarked
+     * @since 1.8
+     */
+    public final boolean markForkJoinTask() {
+        for (int s;;) { // retry until the mark is seen set or our CAS wins
+            if (((s = status) & MARKED) != 0)
+                return false; // already marked; status left untouched
+            if (U.compareAndSwapInt(this, STATUS, s, s | MARKED))
+                return true; // this call set the mark
+        }
+    }
+
+    /**
+     * Atomically clears the mark on this task.
+     *
+     * @return true if this task was previously marked
+     * @since 1.8
+     */
+    public final boolean unmarkForkJoinTask() {
+        for (int s;;) { // retry until the mark is seen clear or our CAS wins
+            if (((s = status) & MARKED) == 0)
+                return false; // already unmarked; status left untouched
+            if (U.compareAndSwapInt(this, STATUS, s, s & ~MARKED))
+                return true; // this call cleared the mark
+        }
+    }
+
+    /**
+     * Adaptor for Runnables. This implements RunnableFuture
+     * to be compliant with AbstractExecutorService constraints
+     * when used in ForkJoinPool.
+     */
+    static final class AdaptedRunnable<T> extends ForkJoinTask<T>
+        implements RunnableFuture<T> {
+        final Runnable runnable; // action run by exec(); constructor rejects null
+        T result; // value handed back by getRawResult()
+        AdaptedRunnable(Runnable runnable, T result) {
+            if (runnable == null) throw new NullPointerException();
+            this.runnable = runnable;
+            this.result = result; // OK to set this even before completion
+        }
+        public final T getRawResult() { return result; }
+        public final void setRawResult(T v) { result = v; }
+        public final boolean exec() { runnable.run(); return true; } // true whenever run() returns
+        public final void run() { invoke(); } // RunnableFuture.run() delegates to invoke()
+        private static final long serialVersionUID = 5232453952276885070L;
+    }
+
+    /**
+     * Adaptor for Runnables without results; getRawResult() is always null.
+     */
+    static final class AdaptedRunnableAction extends ForkJoinTask<Void>
+        implements RunnableFuture<Void> { // NOTE(review): serialVersionUID equals AdaptedRunnable's; confirm intended
+        final Runnable runnable; // action run by exec(); constructor rejects null
+        AdaptedRunnableAction(Runnable runnable) {
+            if (runnable == null) throw new NullPointerException();
+            this.runnable = runnable;
+        }
+        public final Void getRawResult() { return null; } // there is never a result value
+        public final void setRawResult(Void v) { } // nothing to store
+        public final boolean exec() { runnable.run(); return true; } // true whenever run() returns
+        public final void run() { invoke(); } // RunnableFuture.run() delegates to invoke()
+        private static final long serialVersionUID = 5232453952276885070L;
+    }
+
+    /**
+     * Adaptor for Callables; checked exceptions are rethrown as RuntimeException.
+     */
+    static final class AdaptedCallable<T> extends ForkJoinTask<T>
+        implements RunnableFuture<T> {
+        final Callable<? extends T> callable; // computation run by exec(); constructor rejects null
+        T result; // assigned by exec() or setRawResult()
+        AdaptedCallable(Callable<? extends T> callable) {
+            if (callable == null) throw new NullPointerException();
+            this.callable = callable;
+        }
+        public final T getRawResult() { return result; }
+        public final void setRawResult(T v) { result = v; }
+        public final boolean exec() {
+            try {
+                result = callable.call();
+                return true; // reached only when call() returned normally
+            } catch (Error err) {
+                throw err; // unchecked: rethrown unchanged
+            } catch (RuntimeException rex) {
+                throw rex; // unchecked: rethrown unchanged
+            } catch (Exception ex) {
+                throw new RuntimeException(ex); // checked: wrapped, original kept as cause
+            }
+        }
+        public final void run() { invoke(); } // RunnableFuture.run() delegates to invoke()
+        private static final long serialVersionUID = 2838392045355241008L;
+    }
+
+    /**
+     * Returns a new {@code ForkJoinTask} that performs the {@code run}
+     * method of the given {@code Runnable} as its action, and returns
+     * a null result upon {@link #join}.
+     *
+     * @param runnable the runnable action
+     * @return the task
+     */
+    public static ForkJoinTask<?> adapt(Runnable runnable) {
+        return new AdaptedRunnableAction(runnable); // constructor throws NullPointerException on null
+    }
+
+    /**
+     * Returns a new {@code ForkJoinTask} that performs the {@code run}
+     * method of the given {@code Runnable} as its action, and returns
+     * the given result upon {@link #join}.
+     *
+     * @param runnable the runnable action
+     * @param result the result upon completion
+     * @return the task
+     */
+    public static <T> ForkJoinTask<T> adapt(Runnable runnable, T result) {
+        return new AdaptedRunnable<T>(runnable, result); // null runnable -> NullPointerException
+    }
+
+    /**
+     * Returns a new {@code ForkJoinTask} that performs the {@code call}
+     * method of the given {@code Callable} as its action, and returns
+     * its result upon {@link #join}, translating any checked exceptions
+     * encountered into {@code RuntimeException}.
+     *
+     * @param callable the callable action
+     * @return the task
+     */
+    public static <T> ForkJoinTask<T> adapt(Callable<? extends T> callable) {
+        return new AdaptedCallable<T>(callable); // null callable -> NullPointerException
     }
 
     // Serialization support
@@ -981,11 +1454,10 @@ public abstract class ForkJoinTask<V> implements Future<V>, Serializable {
     private static final long serialVersionUID = -7721805057305804111L;
 
     /**
-     * Save the state to a stream.
+     * Saves this task to a stream (that is, serializes it).
      *
      * @serialData the current run status and the exception thrown
-     * during execution, or null if none.
-     * @param s the stream
+     * during execution, or {@code null} if none
      */
     private void writeObject(java.io.ObjectOutputStream s)
         throws java.io.IOException {
@@ -994,70 +1466,57 @@ public abstract class ForkJoinTask<V> implements Future<V>, Serializable {
     }
 
     /**
-     * Reconstitute the instance from a stream.
-     * @param s the stream
+     * Reconstitutes this task from a stream (that is, deserializes it).
      */
     private void readObject(java.io.ObjectInputStream s)
         throws java.io.IOException, ClassNotFoundException {
         s.defaultReadObject();
-        status &= ~INTERNAL_SIGNAL_MASK; // clear internal signal counts
-        status |= EXTERNAL_SIGNAL; // conservatively set external signal
         Object ex = s.readObject();
         if (ex != null)
-            setDoneExceptionally((Throwable)ex);
+            setExceptionalCompletion((Throwable)ex);
+    }
+
+    // Unsafe mechanics
+    private static final sun.misc.Unsafe U; // used for compareAndSwapInt on status
+    private static final long STATUS; // offset of the "status" field, as passed to U
+    static {
+        exceptionTableLock = new ReentrantLock();
+        exceptionTableRefQueue = new ReferenceQueue<Object>();
+        exceptionTable = new ExceptionNode[EXCEPTION_MAP_CAPACITY];
+        try {
+            U = getUnsafe();
+            STATUS = U.objectFieldOffset
+                (ForkJoinTask.class.getDeclaredField("status"));
+        } catch (Exception e) { // includes the RuntimeException getUnsafe() may throw
+            throw new Error(e); // any failure here aborts class initialization
+        }
     }
 
-    // Temporary Unsafe mechanics for preliminary release
-    private static Unsafe getUnsafe() throws Throwable {
+    /**
+     * Returns a sun.misc.Unsafe.  Suitable for use in a 3rd party package.
+     * Replace with a simple call to Unsafe.getUnsafe when integrating
+     * into a JDK.
+     *
+     * @return a sun.misc.Unsafe
+     */
+    private static sun.misc.Unsafe getUnsafe() {
         try {
-            return Unsafe.getUnsafe();
+            return sun.misc.Unsafe.getUnsafe();
         } catch (SecurityException se) {
             try {
                 return java.security.AccessController.doPrivileged
-                    (new java.security.PrivilegedExceptionAction<Unsafe>() {
-                        public Unsafe run() throws Exception {
-                            return getUnsafePrivileged();
+                    (new java.security
+                     .PrivilegedExceptionAction<sun.misc.Unsafe>() {
+                        public sun.misc.Unsafe run() throws Exception {
+                            java.lang.reflect.Field f = sun.misc
+                                .Unsafe.class.getDeclaredField("theUnsafe");
+                            f.setAccessible(true);
+                            return (sun.misc.Unsafe) f.get(null);
                         }});
             } catch (java.security.PrivilegedActionException e) {
-                throw e.getCause();
+                throw new RuntimeException("Could not initialize intrinsics",
+                                           e.getCause());
             }
         }
     }
-
-    private static Unsafe getUnsafePrivileged()
-            throws NoSuchFieldException, IllegalAccessException {
-        Field f = Unsafe.class.getDeclaredField("theUnsafe");
-        f.setAccessible(true);
-        return (Unsafe) f.get(null);
-    }
-
-    private static long fieldOffset(String fieldName, Unsafe unsafe)
-            throws NoSuchFieldException {
-        // do not use _unsafe to avoid NPE
-        return unsafe.objectFieldOffset
-            (ForkJoinTask.class.getDeclaredField(fieldName));
-    }
-
-    static final Unsafe _unsafe;
-    static final long statusOffset;
-
-    static {
-        Unsafe tmpUnsafe = null;
-        long tmpStatusOffset = 0;
-        try {
-            tmpUnsafe = getUnsafe();
-            tmpStatusOffset = fieldOffset("status", tmpUnsafe);
-        } catch (Throwable e) {
-            // Ignore the failure to load sun.misc.Unsafe on Android so
-            // that platform can use the actor library without the
-            // fork/join scheduler.
-            String vmVendor = System.getProperty("java.vm.vendor");
-            if (!vmVendor.contains("Android")) {
-	        throw new RuntimeException("Could not initialize intrinsics", e);
-            }
-        }
-        _unsafe = tmpUnsafe;
-	statusOffset = tmpStatusOffset;
-    }
-
 }
diff --git a/src/forkjoin/scala/concurrent/forkjoin/ForkJoinWorkerThread.java b/src/forkjoin/scala/concurrent/forkjoin/ForkJoinWorkerThread.java
index b4d889750c..90a0af5723 100644
--- a/src/forkjoin/scala/concurrent/forkjoin/ForkJoinWorkerThread.java
+++ b/src/forkjoin/scala/concurrent/forkjoin/ForkJoinWorkerThread.java
@@ -1,224 +1,55 @@
 /*
  * Written by Doug Lea with assistance from members of JCP JSR-166
  * Expert Group and released to the public domain, as explained at
- * http://creativecommons.org/licenses/publicdomain
+ * http://creativecommons.org/publicdomain/zero/1.0/
  */
 
 package scala.concurrent.forkjoin;
-import java.util.*;
-import java.util.concurrent.*;
-import java.util.concurrent.atomic.*;
-import java.util.concurrent.locks.*;
-import sun.misc.Unsafe;
-import java.lang.reflect.*;
 
 /**
- * A thread managed by a {@link ForkJoinPool}.  This class is
- * subclassable solely for the sake of adding functionality -- there
- * are no overridable methods dealing with scheduling or
- * execution. However, you can override initialization and termination
- * methods surrounding the main task processing loop.  If you do
- * create such a subclass, you will also need to supply a custom
- * ForkJoinWorkerThreadFactory to use it in a ForkJoinPool.
+ * A thread managed by a {@link ForkJoinPool}, which executes
+ * {@link ForkJoinTask}s.
+ * This class is subclassable solely for the sake of adding
+ * functionality -- there are no overridable methods dealing with
+ * scheduling or execution.  However, you can override initialization
+ * and termination methods surrounding the main task processing loop.
+ * If you do create such a subclass, you will also need to supply a
+ * custom {@link ForkJoinPool.ForkJoinWorkerThreadFactory} to use it
+ * in a {@code ForkJoinPool}.
  *
+ * @since 1.7
+ * @author Doug Lea
  */
 public class ForkJoinWorkerThread extends Thread {
     /*
-     * Algorithm overview:
-     *
-     * 1. Work-Stealing: Work-stealing queues are special forms of
-     * Deques that support only three of the four possible
-     * end-operations -- push, pop, and deq (aka steal), and only do
-     * so under the constraints that push and pop are called only from
-     * the owning thread, while deq may be called from other threads.
-     * (If you are unfamiliar with them, you probably want to read
-     * Herlihy and Shavit's book "The Art of Multiprocessor
-     * programming", chapter 16 describing these in more detail before
-     * proceeding.)  The main work-stealing queue design is roughly
-     * similar to "Dynamic Circular Work-Stealing Deque" by David
-     * Chase and Yossi Lev, SPAA 2005
-     * (http://research.sun.com/scalable/pubs/index.html).  The main
-     * difference ultimately stems from gc requirements that we null
-     * out taken slots as soon as we can, to maintain as small a
-     * footprint as possible even in programs generating huge numbers
-     * of tasks. To accomplish this, we shift the CAS arbitrating pop
-     * vs deq (steal) from being on the indices ("base" and "sp") to
-     * the slots themselves (mainly via method "casSlotNull()"). So,
-     * both a successful pop and deq mainly entail CAS'ing a nonnull
-     * slot to null.  Because we rely on CASes of references, we do
-     * not need tag bits on base or sp.  They are simple ints as used
-     * in any circular array-based queue (see for example ArrayDeque).
-     * Updates to the indices must still be ordered in a way that
-     * guarantees that (sp - base) > 0 means the queue is empty, but
-     * otherwise may err on the side of possibly making the queue
-     * appear nonempty when a push, pop, or deq have not fully
-     * committed. Note that this means that the deq operation,
-     * considered individually, is not wait-free. One thief cannot
-     * successfully continue until another in-progress one (or, if
-     * previously empty, a push) completes.  However, in the
-     * aggregate, we ensure at least probablistic non-blockingness. If
-     * an attempted steal fails, a thief always chooses a different
-     * random victim target to try next. So, in order for one thief to
-     * progress, it suffices for any in-progress deq or new push on
-     * any empty queue to complete. One reason this works well here is
-     * that apparently-nonempty often means soon-to-be-stealable,
-     * which gives threads a chance to activate if necessary before
-     * stealing (see below).
-     *
-     * Efficient implementation of this approach currently relies on
-     * an uncomfortable amount of "Unsafe" mechanics. To maintain
-     * correct orderings, reads and writes of variable base require
-     * volatile ordering.  Variable sp does not require volatile write
-     * but needs cheaper store-ordering on writes.  Because they are
-     * protected by volatile base reads, reads of the queue array and
-     * its slots do not need volatile load semantics, but writes (in
-     * push) require store order and CASes (in pop and deq) require
-     * (volatile) CAS semantics. Since these combinations aren't
-     * supported using ordinary volatiles, the only way to accomplish
-     * these effciently is to use direct Unsafe calls. (Using external
-     * AtomicIntegers and AtomicReferenceArrays for the indices and
-     * array is significantly slower because of memory locality and
-     * indirection effects.) Further, performance on most platforms is
-     * very sensitive to placement and sizing of the (resizable) queue
-     * array.  Even though these queues don't usually become all that
-     * big, the initial size must be large enough to counteract cache
-     * contention effects across multiple queues (especially in the
-     * presence of GC cardmarking). Also, to improve thread-locality,
-     * queues are currently initialized immediately after the thread
-     * gets the initial signal to start processing tasks.  However,
-     * all queue-related methods except pushTask are written in a way
-     * that allows them to instead be lazily allocated and/or disposed
-     * of when empty. All together, these low-level implementation
-     * choices produce as much as a factor of 4 performance
-     * improvement compared to naive implementations, and enable the
-     * processing of billions of tasks per second, sometimes at the
-     * expense of ugliness.
-     *
-     * 2. Run control: The primary run control is based on a global
-     * counter (activeCount) held by the pool. It uses an algorithm
-     * similar to that in Herlihy and Shavit section 17.6 to cause
-     * threads to eventually block when all threads declare they are
-     * inactive. (See variable "scans".)  For this to work, threads
-     * must be declared active when executing tasks, and before
-     * stealing a task. They must be inactive before blocking on the
-     * Pool Barrier (awaiting a new submission or other Pool
-     * event). In between, there is some free play which we take
-     * advantage of to avoid contention and rapid flickering of the
-     * global activeCount: If inactive, we activate only if a victim
-     * queue appears to be nonempty (see above).  Similarly, a thread
-     * tries to inactivate only after a full scan of other threads.
-     * The net effect is that contention on activeCount is rarely a
-     * measurable performance issue. (There are also a few other cases
-     * where we scan for work rather than retry/block upon
-     * contention.)
-     *
-     * 3. Selection control. We maintain policy of always choosing to
-     * run local tasks rather than stealing, and always trying to
-     * steal tasks before trying to run a new submission. All steals
-     * are currently performed in randomly-chosen deq-order. It may be
-     * worthwhile to bias these with locality / anti-locality
-     * information, but doing this well probably requires more
-     * lower-level information from JVMs than currently provided.
-     */
-
-    /**
-     * Capacity of work-stealing queue array upon initialization.
-     * Must be a power of two. Initial size must be at least 2, but is
-     * padded to minimize cache effects.
-     */
-    private static final int INITIAL_QUEUE_CAPACITY = 1 << 13;
-
-    /**
-     * Maximum work-stealing queue array size.  Must be less than or
-     * equal to 1 << 28 to ensure lack of index wraparound. (This
-     * is less than usual bounds, because we need leftshift by 3
-     * to be in int range).
-     */
-    private static final int MAXIMUM_QUEUE_CAPACITY = 1 << 28;
-
-    /**
-     * The pool this thread works in. Accessed directly by ForkJoinTask
-     */
-    final ForkJoinPool pool;
-
-    /**
-     * The work-stealing queue array. Size must be a power of two.
-     * Initialized when thread starts, to improve memory locality.
-     */
-    private ForkJoinTask<?>[] queue;
-
-    /**
-     * Index (mod queue.length) of next queue slot to push to or pop
-     * from. It is written only by owner thread, via ordered store.
-     * Both sp and base are allowed to wrap around on overflow, but
-     * (sp - base) still estimates size.
-     */
-    private volatile int sp;
-
-    /**
-     * Index (mod queue.length) of least valid queue slot, which is
-     * always the next position to steal from if nonempty.
-     */
-    private volatile int base;
-
-    /**
-     * Activity status. When true, this worker is considered active.
-     * Must be false upon construction. It must be true when executing
-     * tasks, and BEFORE stealing a task. It must be false before
-     * calling pool.sync
-     */
-    private boolean active;
-
-    /**
-     * Run state of this worker. Supports simple versions of the usual
-     * shutdown/shutdownNow control.
-     */
-    private volatile int runState;
-
-    /**
-     * Seed for random number generator for choosing steal victims.
-     * Uses Marsaglia xorshift. Must be nonzero upon initialization.
-     */
-    private int seed;
-
-    /**
-     * Number of steals, transferred to pool when idle
+     * ForkJoinWorkerThreads are managed by ForkJoinPools and perform
+     * ForkJoinTasks. For explanation, see the internal documentation
+     * of class ForkJoinPool.
      */
-    private int stealCount;
 
-    /**
-     * Index of this worker in pool array. Set once by pool before
-     * running, and accessed directly by pool during cleanup etc
-     */
-    int poolIndex;
-
-    /**
-     * The last barrier event waited for. Accessed in pool callback
-     * methods, but only by current thread.
-     */
-    long lastEventCount;
-
-    /**
-     * True if use local fifo, not default lifo, for local polling
-     */
-    private boolean locallyFifo;
+    final ForkJoinPool.WorkQueue workQueue; // Work-stealing mechanics
+    final ForkJoinPool pool;                // the pool this thread works in
 
     /**
      * Creates a ForkJoinWorkerThread operating in the given pool.
+     *
      * @param pool the pool this thread works in
      * @throws NullPointerException if pool is null
      */
     protected ForkJoinWorkerThread(ForkJoinPool pool) {
-        if (pool == null) throw new NullPointerException();
+        super(pool.nextWorkerName());
+        setDaemon(true);
+        Thread.UncaughtExceptionHandler ueh = pool.ueh;
+        if (ueh != null)
+            setUncaughtExceptionHandler(ueh);
         this.pool = pool;
-        // Note: poolIndex is set by pool during construction
-        // Remaining initialization is deferred to onStart
+        pool.registerWorker(this.workQueue = new ForkJoinPool.WorkQueue
+                            (pool, this, pool.localMode));
     }
 
-    // Public access methods
-
     /**
-     * Returns the pool hosting this thread
+     * Returns the pool hosting this thread.
+     *
      * @return the pool
      */
     public ForkJoinPool getPool() {
@@ -231,543 +62,58 @@ public class ForkJoinWorkerThread extends Thread {
      * threads (minus one) that have ever been created in the pool.
      * This method may be useful for applications that track status or
      * collect results per-worker rather than per-task.
-     * @return the index number.
+     *
+     * @return the index number
      */
     public int getPoolIndex() {
-        return poolIndex;
-    }
-
-    /**
-     * Establishes local first-in-first-out scheduling mode for forked
-     * tasks that are never joined.
-     * @param async if true, use locally FIFO scheduling
-     */
-    void setAsyncMode(boolean async) {
-        locallyFifo = async;
-    }
-
-    // Runstate management
-
-    // Runstate values. Order matters
-    private static final int RUNNING     = 0;
-    private static final int SHUTDOWN    = 1;
-    private static final int TERMINATING = 2;
-    private static final int TERMINATED  = 3;
-
-    final boolean isShutdown()    { return runState >= SHUTDOWN;  }
-    final boolean isTerminating() { return runState >= TERMINATING;  }
-    final boolean isTerminated()  { return runState == TERMINATED; }
-    final boolean shutdown()      { return transitionRunStateTo(SHUTDOWN); }
-    final boolean shutdownNow()   { return transitionRunStateTo(TERMINATING); }
-
-    /**
-     * Transition to at least the given state. Return true if not
-     * already at least given state.
-     */
-    private boolean transitionRunStateTo(int state) {
-        for (;;) {
-            int s = runState;
-            if (s >= state)
-                return false;
-            if (_unsafe.compareAndSwapInt(this, runStateOffset, s, state))
-                return true;
-        }
-    }
-
-    /**
-     * Try to set status to active; fail on contention
-     */
-    private boolean tryActivate() {
-        if (!active) {
-            if (!pool.tryIncrementActiveCount())
-                return false;
-            active = true;
-        }
-        return true;
-    }
-
-    /**
-     * Try to set status to active; fail on contention
-     */
-    private boolean tryInactivate() {
-        if (active) {
-            if (!pool.tryDecrementActiveCount())
-                return false;
-            active = false;
-        }
-        return true;
-    }
-
-    /**
-     * Computes next value for random victim probe. Scans don't
-     * require a very high quality generator, but also not a crummy
-     * one. Marsaglia xor-shift is cheap and works well.
-     */
-    private static int xorShift(int r) {
-        r ^= r << 1;
-        r ^= r >>> 3;
-        r ^= r << 10;
-        return r;
-    }
-
-    // Lifecycle methods
-
-    /**
-     * This method is required to be public, but should never be
-     * called explicitly. It performs the main run loop to execute
-     * ForkJoinTasks.
-     */
-    public void run() {
-        Throwable exception = null;
-        try {
-            onStart();
-            pool.sync(this); // await first pool event
-            mainLoop();
-        } catch (Throwable ex) {
-            exception = ex;
-        } finally {
-            onTermination(exception);
-        }
-    }
-
-    /**
-     * Execute tasks until shut down.
-     */
-    private void mainLoop() {
-        while (!isShutdown()) {
-            ForkJoinTask<?> t = pollTask();
-            if (t != null || (t = pollSubmission()) != null)
-                t.quietlyExec();
-            else if (tryInactivate())
-                pool.sync(this);
-        }
+        return workQueue.poolIndex;
     }
 
     /**
      * Initializes internal state after construction but before
      * processing any tasks. If you override this method, you must
-     * invoke super.onStart() at the beginning of the method.
+     * invoke {@code super.onStart()} at the beginning of the method.
      * Initialization requires care: Most fields must have legal
      * default values, to ensure that attempted accesses from other
      * threads work correctly even before this thread starts
      * processing tasks.
      */
     protected void onStart() {
-        // Allocate while starting to improve chances of thread-local
-        // isolation
-        queue = new ForkJoinTask<?>[INITIAL_QUEUE_CAPACITY];
-        // Initial value of seed need not be especially random but
-        // should differ across workers and must be nonzero
-        int p = poolIndex + 1;
-        seed = p + (p << 8) + (p << 16) + (p << 24); // spread bits
     }
 
     /**
-     * Perform cleanup associated with termination of this worker
+     * Performs cleanup associated with termination of this worker
      * thread.  If you override this method, you must invoke
-     * super.onTermination at the end of the overridden method.
+     * {@code super.onTermination} at the end of the overridden method.
      *
      * @param exception the exception causing this thread to abort due
-     * to an unrecoverable error, or null if completed normally.
+     * to an unrecoverable error, or {@code null} if completed normally
      */
     protected void onTermination(Throwable exception) {
-        // Execute remaining local tasks unless aborting or terminating
-        while (exception == null &&  !pool.isTerminating() && base != sp) {
-            try {
-                ForkJoinTask<?> t = popTask();
-                if (t != null)
-                    t.quietlyExec();
-            } catch(Throwable ex) {
-                exception = ex;
-            }
-        }
-        // Cancel other tasks, transition status, notify pool, and
-        // propagate exception to uncaught exception handler
-        try {
-            do;while (!tryInactivate()); // ensure inactive
-            cancelTasks();
-            runState = TERMINATED;
-            pool.workerTerminated(this);
-        } catch (Throwable ex) {        // Shouldn't ever happen
-            if (exception == null)      // but if so, at least rethrown
-                exception = ex;
-        } finally {
-            if (exception != null)
-                ForkJoinTask.rethrowException(exception);
-        }
     }
 
-    // Intrinsics-based support for queue operations.
-
     /**
-     * Add in store-order the given task at given slot of q to
-     * null. Caller must ensure q is nonnull and index is in range.
-     */
-    private static void setSlot(ForkJoinTask<?>[] q, int i,
-                                ForkJoinTask<?> t){
-        _unsafe.putOrderedObject(q, (i << qShift) + qBase, t);
-    }
-
-    /**
-     * CAS given slot of q to null. Caller must ensure q is nonnull
-     * and index is in range.
-     */
-    private static boolean casSlotNull(ForkJoinTask<?>[] q, int i,
-                                       ForkJoinTask<?> t) {
-        return _unsafe.compareAndSwapObject(q, (i << qShift) + qBase, t, null);
-    }
-
-    /**
-     * Sets sp in store-order.
-     */
-    private void storeSp(int s) {
-        _unsafe.putOrderedInt(this, spOffset, s);
-    }
-
-    // Main queue methods
-
-    /**
-     * Pushes a task. Called only by current thread.
-     * @param t the task. Caller must ensure nonnull
-     */
-    final void pushTask(ForkJoinTask<?> t) {
-        ForkJoinTask<?>[] q = queue;
-        int mask = q.length - 1;
-        int s = sp;
-        setSlot(q, s & mask, t);
-        storeSp(++s);
-        if ((s -= base) == 1)
-            pool.signalWork();
-        else if (s >= mask)
-            growQueue();
-    }
-
-    /**
-     * Tries to take a task from the base of the queue, failing if
-     * either empty or contended.
-     * @return a task, or null if none or contended.
-     */
-    final ForkJoinTask<?> deqTask() {
-        ForkJoinTask<?> t;
-        ForkJoinTask<?>[] q;
-        int i;
-        int b;
-        if (sp != (b = base) &&
-            (q = queue) != null && // must read q after b
-            (t = q[i = (q.length - 1) & b]) != null &&
-            casSlotNull(q, i, t)) {
-            base = b + 1;
-            return t;
-        }
-        return null;
-    }
-
-    /**
-     * Returns a popped task, or null if empty. Ensures active status
-     * if nonnull. Called only by current thread.
-     */
-    final ForkJoinTask<?> popTask() {
-        int s = sp;
-        while (s != base) {
-            if (tryActivate()) {
-                ForkJoinTask<?>[] q = queue;
-                int mask = q.length - 1;
-                int i = (s - 1) & mask;
-                ForkJoinTask<?> t = q[i];
-                if (t == null || !casSlotNull(q, i, t))
-                    break;
-                storeSp(s - 1);
-                return t;
-            }
-        }
-        return null;
-    }
-
-    /**
-     * Specialized version of popTask to pop only if
-     * topmost element is the given task. Called only
-     * by current thread while active.
-     * @param t the task. Caller must ensure nonnull
-     */
-    final boolean unpushTask(ForkJoinTask<?> t) {
-        ForkJoinTask<?>[] q = queue;
-        int mask = q.length - 1;
-        int s = sp - 1;
-        if (casSlotNull(q, s & mask, t)) {
-            storeSp(s);
-            return true;
-        }
-        return false;
-    }
-
-    /**
-     * Returns next task.
-     */
-    final ForkJoinTask<?> peekTask() {
-        ForkJoinTask<?>[] q = queue;
-        if (q == null)
-            return null;
-        int mask = q.length - 1;
-        int i = locallyFifo? base : (sp - 1);
-        return q[i & mask];
-    }
-
-    /**
-     * Doubles queue array size. Transfers elements by emulating
-     * steals (deqs) from old array and placing, oldest first, into
-     * new array.
-     */
-    private void growQueue() {
-        ForkJoinTask<?>[] oldQ = queue;
-        int oldSize = oldQ.length;
-        int newSize = oldSize << 1;
-        if (newSize > MAXIMUM_QUEUE_CAPACITY)
-            throw new RejectedExecutionException("Queue capacity exceeded");
-        ForkJoinTask<?>[] newQ = queue = new ForkJoinTask<?>[newSize];
-
-        int b = base;
-        int bf = b + oldSize;
-        int oldMask = oldSize - 1;
-        int newMask = newSize - 1;
-        do {
-            int oldIndex = b & oldMask;
-            ForkJoinTask<?> t = oldQ[oldIndex];
-            if (t != null && !casSlotNull(oldQ, oldIndex, t))
-                t = null;
-            setSlot(newQ, b & newMask, t);
-        } while (++b != bf);
-        pool.signalWork();
-    }
-
-    /**
-     * Tries to steal a task from another worker. Starts at a random
-     * index of workers array, and probes workers until finding one
-     * with non-empty queue or finding that all are empty.  It
-     * randomly selects the first n probes. If these are empty, it
-     * resorts to a full circular traversal, which is necessary to
-     * accurately set active status by caller. Also restarts if pool
-     * events occurred since last scan, which forces refresh of
-     * workers array, in case barrier was associated with resize.
-     *
-     * This method must be both fast and quiet -- usually avoiding
-     * memory accesses that could disrupt cache sharing etc other than
-     * those needed to check for and take tasks. This accounts for,
-     * among other things, updating random seed in place without
-     * storing it until exit.
-     *
-     * @return a task, or null if none found
-     */
-    private ForkJoinTask<?> scan() {
-        ForkJoinTask<?> t = null;
-        int r = seed;                    // extract once to keep scan quiet
-        ForkJoinWorkerThread[] ws;       // refreshed on outer loop
-        int mask;                        // must be power 2 minus 1 and > 0
-        outer:do {
-            if ((ws = pool.workers) != null && (mask = ws.length - 1) > 0) {
-                int idx = r;
-                int probes = ~mask;      // use random index while negative
-                for (;;) {
-                    r = xorShift(r);     // update random seed
-                    ForkJoinWorkerThread v = ws[mask & idx];
-                    if (v == null || v.sp == v.base) {
-                        if (probes <= mask)
-                            idx = (probes++ < 0)? r : (idx + 1);
-                        else
-                            break;
-                    }
-                    else if (!tryActivate() || (t = v.deqTask()) == null)
-                        continue outer;  // restart on contention
-                    else
-                        break outer;
-                }
-            }
-        } while (pool.hasNewSyncEvent(this)); // retry on pool events
-        seed = r;
-        return t;
-    }
-
-    /**
-     * gets and removes a local or stolen a task
-     * @return a task, if available
-     */
-    final ForkJoinTask<?> pollTask() {
-        ForkJoinTask<?> t = locallyFifo? deqTask() : popTask();
-        if (t == null && (t = scan()) != null)
-            ++stealCount;
-        return t;
-    }
-
-    /**
-     * gets a local task
-     * @return a task, if available
-     */
-    final ForkJoinTask<?> pollLocalTask() {
-        return locallyFifo? deqTask() : popTask();
-    }
-
-    /**
-     * Returns a pool submission, if one exists, activating first.
-     * @return a submission, if available
-     */
-    private ForkJoinTask<?> pollSubmission() {
-        ForkJoinPool p = pool;
-        while (p.hasQueuedSubmissions()) {
-            ForkJoinTask<?> t;
-            if (tryActivate() && (t = p.pollSubmission()) != null)
-                return t;
-        }
-        return null;
-    }
-
-    // Methods accessed only by Pool
-
-    /**
-     * Removes and cancels all tasks in queue.  Can be called from any
-     * thread.
-     */
-    final void cancelTasks() {
-        ForkJoinTask<?> t;
-        while (base != sp && (t = deqTask()) != null)
-            t.cancelIgnoringExceptions();
-    }
-
-    /**
-     * Drains tasks to given collection c
-     * @return the number of tasks drained
-     */
-    final int drainTasksTo(Collection<ForkJoinTask<?>> c) {
-        int n = 0;
-        ForkJoinTask<?> t;
-        while (base != sp && (t = deqTask()) != null) {
-            c.add(t);
-            ++n;
-        }
-        return n;
-    }
-
-    /**
-     * Get and clear steal count for accumulation by pool.  Called
-     * only when known to be idle (in pool.sync and termination).
-     */
-    final int getAndClearStealCount() {
-        int sc = stealCount;
-        stealCount = 0;
-        return sc;
-    }
-
-    /**
-     * Returns true if at least one worker in the given array appears
-     * to have at least one queued task.
-     * @param ws array of workers
-     */
-    static boolean hasQueuedTasks(ForkJoinWorkerThread[] ws) {
-        if (ws != null) {
-            int len = ws.length;
-            for (int j = 0; j < 2; ++j) { // need two passes for clean sweep
-                for (int i = 0; i < len; ++i) {
-                    ForkJoinWorkerThread w = ws[i];
-                    if (w != null && w.sp != w.base)
-                        return true;
-                }
-            }
-        }
-        return false;
-    }
-
-    // Support methods for ForkJoinTask
-
-    /**
-     * Returns an estimate of the number of tasks in the queue.
-     */
-    final int getQueueSize() {
-        int n = sp - base;
-        return n < 0? 0 : n; // suppress momentarily negative values
-    }
-
-    /**
-     * Returns an estimate of the number of tasks, offset by a
-     * function of number of idle workers.
-     */
-    final int getEstimatedSurplusTaskCount() {
-        // The halving approximates weighting idle vs non-idle workers
-        return (sp - base) - (pool.getIdleThreadCount() >>> 1);
-    }
-
-    /**
-     * Scan, returning early if joinMe done
-     */
-    final ForkJoinTask<?> scanWhileJoining(ForkJoinTask<?> joinMe) {
-        ForkJoinTask<?> t = pollTask();
-        if (t != null && joinMe.status < 0 && sp == base) {
-            pushTask(t); // unsteal if done and this task would be stealable
-            t = null;
-        }
-        return t;
-    }
-
-    /**
-     * Runs tasks until pool isQuiescent
+     * This method is required to be public, but should never be
+     * called explicitly. It performs the main run loop to execute
+     * {@link ForkJoinTask}s.
      */
-    final void helpQuiescePool() {
-        for (;;) {
-            ForkJoinTask<?> t = pollTask();
-            if (t != null)
-                t.quietlyExec();
-            else if (tryInactivate() && pool.isQuiescent())
-                break;
-        }
-        do;while (!tryActivate()); // re-activate on exit
-    }
-
-    // Temporary Unsafe mechanics for preliminary release
-    private static Unsafe getUnsafe() throws Throwable {
+    public void run() {
+        Throwable exception = null;
         try {
-            return Unsafe.getUnsafe();
-        } catch (SecurityException se) {
+            onStart();
+            pool.runWorker(workQueue);
+        } catch (Throwable ex) {
+            exception = ex;
+        } finally {
             try {
-                return java.security.AccessController.doPrivileged
-                    (new java.security.PrivilegedExceptionAction<Unsafe>() {
-                        public Unsafe run() throws Exception {
-                            return getUnsafePrivileged();
-                        }});
-            } catch (java.security.PrivilegedActionException e) {
-                throw e.getCause();
+                onTermination(exception);
+            } catch (Throwable ex) {
+                if (exception == null)
+                    exception = ex;
+            } finally {
+                pool.deregisterWorker(this, exception);
             }
         }
     }
-
-    private static Unsafe getUnsafePrivileged()
-            throws NoSuchFieldException, IllegalAccessException {
-        Field f = Unsafe.class.getDeclaredField("theUnsafe");
-        f.setAccessible(true);
-        return (Unsafe) f.get(null);
-    }
-
-    private static long fieldOffset(String fieldName)
-            throws NoSuchFieldException {
-        return _unsafe.objectFieldOffset
-            (ForkJoinWorkerThread.class.getDeclaredField(fieldName));
-    }
-
-    static final Unsafe _unsafe;
-    static final long baseOffset;
-    static final long spOffset;
-    static final long runStateOffset;
-    static final long qBase;
-    static final int qShift;
-    static {
-        try {
-            _unsafe = getUnsafe();
-            baseOffset = fieldOffset("base");
-            spOffset = fieldOffset("sp");
-            runStateOffset = fieldOffset("runState");
-            qBase = _unsafe.arrayBaseOffset(ForkJoinTask[].class);
-            int s = _unsafe.arrayIndexScale(ForkJoinTask[].class);
-            if ((s & (s-1)) != 0)
-                throw new Error("data type scale not a power of two");
-            qShift = 31 - Integer.numberOfLeadingZeros(s);
-        } catch (Throwable e) {
-            throw new RuntimeException("Could not initialize intrinsics", e);
-        }
-    }
 }
+
diff --git a/src/forkjoin/scala/concurrent/forkjoin/LinkedTransferQueue.java b/src/forkjoin/scala/concurrent/forkjoin/LinkedTransferQueue.java
index 3b46c176ff..ceeb9212d5 100644
--- a/src/forkjoin/scala/concurrent/forkjoin/LinkedTransferQueue.java
+++ b/src/forkjoin/scala/concurrent/forkjoin/LinkedTransferQueue.java
@@ -1,30 +1,38 @@
 /*
  * Written by Doug Lea with assistance from members of JCP JSR-166
  * Expert Group and released to the public domain, as explained at
- * http://creativecommons.org/licenses/publicdomain
+ * http://creativecommons.org/publicdomain/zero/1.0/
  */
 
 package scala.concurrent.forkjoin;
-import java.util.concurrent.*;
-import java.util.concurrent.locks.*;
-import java.util.concurrent.atomic.*;
-import java.util.*;
-import java.io.*;
-import sun.misc.Unsafe;
-import java.lang.reflect.*;
+
+import java.util.AbstractQueue;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.Queue;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.locks.LockSupport;
 
 /**
- * An unbounded {@linkplain TransferQueue} based on linked nodes.
+ * An unbounded {@link TransferQueue} based on linked nodes.
  * This queue orders elements FIFO (first-in-first-out) with respect
  * to any given producer.  The <em>head</em> of the queue is that
  * element that has been on the queue the longest time for some
  * producer.  The <em>tail</em> of the queue is that element that has
  * been on the queue the shortest time for some producer.
  *
- * <p>Beware that, unlike in most collections, the {@code size}
- * method is <em>NOT</em> a constant-time operation. Because of the
+ * <p>Beware that, unlike in most collections, the {@code size} method
+ * is <em>NOT</em> a constant-time operation. Because of the
  * asynchronous nature of these queues, determining the current number
- * of elements requires a traversal of the elements.
+ * of elements requires a traversal of the elements, and so may report
+ * inaccurate results if this collection is modified during traversal.
+ * Additionally, the bulk operations {@code addAll},
+ * {@code removeAll}, {@code retainAll}, {@code containsAll},
+ * {@code equals}, and {@code toArray} are <em>not</em> guaranteed
+ * to be performed atomically. For example, an iterator operating
+ * concurrently with an {@code addAll} operation might view only some
+ * of the added elements.
  *
  * <p>This class and its iterator implement all of the
  * <em>optional</em> methods of the {@link Collection} and {@link
@@ -44,381 +52,938 @@ import java.lang.reflect.*;
  * @since 1.7
  * @author Doug Lea
  * @param <E> the type of elements held in this collection
- *
  */
 public class LinkedTransferQueue<E> extends AbstractQueue<E>
     implements TransferQueue<E>, java.io.Serializable {
     private static final long serialVersionUID = -3223113410248163686L;
 
     /*
-     * This class extends the approach used in FIFO-mode
-     * SynchronousQueues. See the internal documentation, as well as
-     * the PPoPP 2006 paper "Scalable Synchronous Queues" by Scherer,
-     * Lea & Scott
-     * (http://www.cs.rice.edu/~wns1/papers/2006-PPoPP-SQ.pdf)
+     * *** Overview of Dual Queues with Slack ***
+     *
+     * Dual Queues, introduced by Scherer and Scott
+     * (http://www.cs.rice.edu/~wns1/papers/2004-DISC-DDS.pdf) are
+     * (linked) queues in which nodes may represent either data or
+     * requests.  When a thread tries to enqueue a data node, but
+     * encounters a request node, it instead "matches" and removes it;
+     * and vice versa for enqueuing requests. Blocking Dual Queues
+     * arrange that threads enqueuing unmatched requests block until
+     * other threads provide the match. Dual Synchronous Queues (see
+     * Scherer, Lea, & Scott
+     * http://www.cs.rochester.edu/u/scott/papers/2009_Scherer_CACM_SSQ.pdf)
+     * additionally arrange that threads enqueuing unmatched data also
+     * block.  Dual Transfer Queues support all of these modes, as
+     * dictated by callers.
+     *
+     * A FIFO dual queue may be implemented using a variation of the
+     * Michael & Scott (M&S) lock-free queue algorithm
+     * (http://www.cs.rochester.edu/u/scott/papers/1996_PODC_queues.pdf).
+     * It maintains two pointer fields, "head", pointing to a
+     * (matched) node that in turn points to the first actual
+     * (unmatched) queue node (or null if empty); and "tail" that
+     * points to the last node on the queue (or again null if
+     * empty). For example, here is a possible queue with four data
+     * elements:
+     *
+     *  head                tail
+     *    |                   |
+     *    v                   v
+     *    M -> U -> U -> U -> U
+     *
+     * The M&S queue algorithm is known to be prone to scalability and
+     * overhead limitations when maintaining (via CAS) these head and
+     * tail pointers. This has led to the development of
+     * contention-reducing variants such as elimination arrays (see
+     * Moir et al http://portal.acm.org/citation.cfm?id=1074013) and
+     * optimistic back pointers (see Ladan-Mozes & Shavit
+     * http://people.csail.mit.edu/edya/publications/OptimisticFIFOQueue-journal.pdf).
+     * However, the nature of dual queues enables a simpler tactic for
+     * improving M&S-style implementations when dual-ness is needed.
+     *
+     * In a dual queue, each node must atomically maintain its match
+     * status. While there are other possible variants, we implement
+     * this here as: for a data-mode node, matching entails CASing an
+     * "item" field from a non-null data value to null upon match, and
+     * vice-versa for request nodes, CASing from null to a data
+     * value. (Note that the linearization properties of this style of
+     * queue are easy to verify -- elements are made available by
+     * linking, and unavailable by matching.) Compared to plain M&S
+     * queues, this property of dual queues requires one additional
+     * successful atomic operation per enq/deq pair. But it also
+     * enables lower cost variants of queue maintenance mechanics. (A
+     * variation of this idea applies even for non-dual queues that
+     * support deletion of interior elements, such as
+     * j.u.c.ConcurrentLinkedQueue.)
+     *
+     * Once a node is matched, its match status can never again
+     * change.  We may thus arrange that the linked list of them
+     * contain a prefix of zero or more matched nodes, followed by a
+     * suffix of zero or more unmatched nodes. (Note that we allow
+     * both the prefix and suffix to be zero length, which in turn
+     * means that we do not use a dummy header.)  If we were not
+     * concerned with either time or space efficiency, we could
+     * correctly perform enqueue and dequeue operations by traversing
+     * from a pointer to the initial node; CASing the item of the
+     * first unmatched node on match and CASing the next field of the
+     * trailing node on appends. (Plus some special-casing when
+     * initially empty).  While this would be a terrible idea in
+     * itself, it does have the benefit of not requiring ANY atomic
+     * updates on head/tail fields.
+     *
+     * We introduce here an approach that lies between the extremes of
+     * never versus always updating queue (head and tail) pointers.
+     * This offers a tradeoff between sometimes requiring extra
+     * traversal steps to locate the first and/or last unmatched
+     * nodes, versus the reduced overhead and contention of fewer
+     * updates to queue pointers. For example, a possible snapshot of
+     * a queue is:
+     *
+     *  head           tail
+     *    |              |
+     *    v              v
+     *    M -> M -> U -> U -> U -> U
+     *
+     * The best value for this "slack" (the targeted maximum distance
+     * between the value of "head" and the first unmatched node, and
+     * similarly for "tail") is an empirical matter. We have found
+     * that using very small constants in the range of 1-3 works best
+     * over a range of platforms. Larger values introduce increasing
+     * costs of cache misses and risks of long traversal chains, while
+     * smaller values increase CAS contention and overhead.
+     *
+     * Dual queues with slack differ from plain M&S dual queues by
+     * virtue of only sometimes updating head or tail pointers when
+     * matching, appending, or even traversing nodes; in order to
+     * maintain a targeted slack.  The idea of "sometimes" may be
+     * operationalized in several ways. The simplest is to use a
+     * per-operation counter incremented on each traversal step, and
+     * to try (via CAS) to update the associated queue pointer
+     * whenever the count exceeds a threshold. Another, that requires
+     * more overhead, is to use random number generators to update
+     * with a given probability per traversal step.
+     *
+     * In any strategy along these lines, because CASes updating
+     * fields may fail, the actual slack may exceed targeted
+     * slack. However, they may be retried at any time to maintain
+     * targets.  Even when using very small slack values, this
+     * approach works well for dual queues because it allows all
+     * operations up to the point of matching or appending an item
+     * (hence potentially allowing progress by another thread) to be
+     * read-only, thus not introducing any further contention. As
+     * described below, we implement this by performing slack
+     * maintenance retries only after these points.
+     *
+     * As an accompaniment to such techniques, traversal overhead can
+     * be further reduced without increasing contention of head
+     * pointer updates: Threads may sometimes shortcut the "next" link
+     * path from the current "head" node to be closer to the currently
+     * known first unmatched node, and similarly for tail. Again, this
+     * may be triggered using thresholds or randomization.
+     *
+     * These ideas must be further extended to avoid unbounded amounts
+     * of costly-to-reclaim garbage caused by the sequential "next"
+     * links of nodes starting at old forgotten head nodes: As first
+     * described in detail by Boehm
+     * (http://portal.acm.org/citation.cfm?doid=503272.503282) if a GC
+     * delays noticing that any arbitrarily old node has become
+     * garbage, all newer dead nodes will also be unreclaimed.
+     * (Similar issues arise in non-GC environments.)  To cope with
+     * this in our implementation, upon CASing to advance the head
+     * pointer, we set the "next" link of the previous head to point
+     * only to itself; thus limiting the length of connected dead lists.
+     * (We also take similar care to wipe out possibly garbage
+     * retaining values held in other Node fields.)  However, doing so
+     * adds some further complexity to traversal: If any "next"
+     * pointer links to itself, it indicates that the current thread
+     * has lagged behind a head-update, and so the traversal must
+     * continue from the "head".  Traversals trying to find the
+     * current tail starting from "tail" may also encounter
+     * self-links, in which case they also continue at "head".
+     *
+     * It is tempting in a slack-based scheme to not even use CAS for
+     * updates (similarly to Ladan-Mozes & Shavit). However, this
+     * cannot be done for head updates under the above link-forgetting
+     * mechanics because an update may leave head at a detached node.
+     * And while direct writes are possible for tail updates, they
+     * increase the risk of long retraversals, and hence long garbage
+     * chains, which can be much more costly than is worthwhile
+     * considering that the cost difference of performing a CAS vs
+     * write is smaller when they are not triggered on each operation
+     * (especially considering that writes and CASes equally require
+     * additional GC bookkeeping ("write barriers") that are sometimes
+     * more costly than the writes themselves because of contention).
+     *
+     * *** Overview of implementation ***
+     *
+     * We use a threshold-based approach to updates, with a slack
+     * threshold of two -- that is, we update head/tail when the
+     * current pointer appears to be two or more steps away from the
+     * first/last node. The slack value is hard-wired: a path greater
+     * than one is naturally implemented by checking equality of
+     * traversal pointers except when the list has only one element,
+     * in which case we keep slack threshold at one. Avoiding tracking
+     * explicit counts across method calls slightly simplifies an
+     * already-messy implementation. Using randomization would
+     * probably work better if there were a low-quality dirt-cheap
+     * per-thread one available, but even ThreadLocalRandom is too
+     * heavy for these purposes.
+     *
+     * With such a small slack threshold value, it is not worthwhile
+     * to augment this with path short-circuiting (i.e., unsplicing
+     * interior nodes) except in the case of cancellation/removal (see
+     * below).
+     *
+     * We allow both the head and tail fields to be null before any
+     * nodes are enqueued; initializing upon first append.  This
+     * simplifies some other logic, as well as providing more
+     * efficient explicit control paths instead of letting JVMs insert
+     * implicit NullPointerExceptions when they are null.  While not
+     * currently fully implemented, we also leave open the possibility
+     * of re-nulling these fields when empty (which is complicated to
+     * arrange, for little benefit.)
+     *
+     * All enqueue/dequeue operations are handled by the single method
+     * "xfer" with parameters indicating whether to act as some form
+     * of offer, put, poll, take, or transfer (each possibly with
+     * timeout). The relative complexity of using one monolithic
+     * method is outweighed by the code bulk and maintenance problems
+     * of using separate methods for each case.
      *
-     * The main extension is to provide different Wait modes for the
-     * main "xfer" method that puts or takes items.  These don't
-     * impact the basic dual-queue logic, but instead control whether
-     * or how threads block upon insertion of request or data nodes
-     * into the dual queue. It also uses slightly different
-     * conventions for tracking whether nodes are off-list or
-     * cancelled.
+     * Operation consists of up to three phases. The first is
+     * implemented within method xfer, the second in tryAppend, and
+     * the third in method awaitMatch.
+     *
+     * 1. Try to match an existing node
+     *
+     *    Starting at head, skip already-matched nodes until finding
+     *    an unmatched node of opposite mode, if one exists, in which
+     *    case matching it and returning, also if necessary updating
+     *    head to one past the matched node (or the node itself if the
+     *    list has no other unmatched nodes). If the CAS misses, then
+     *    a loop retries advancing head by two steps until either
+     *    success or the slack is at most two. By requiring that each
+     *    attempt advances head by two (if applicable), we ensure that
+     *    the slack does not grow without bound. Traversals also check
+     *    if the initial head is now off-list, in which case they
+     *    start at the new head.
+     *
+     *    If no candidates are found and the call was untimed
+     *    poll/tryTransfer (argument "how" is NOW), return.
+     *
+     * 2. Try to append a new node (method tryAppend)
+     *
+     *    Starting at current tail pointer, find the actual last node
+     *    and try to append a new node (or if head was null, establish
+     *    the first node). Nodes can be appended only if their
+     *    predecessors are either already matched or are of the same
+     *    mode. If we detect otherwise, then a new node with opposite
+     *    mode must have been appended during traversal, so we must
+     *    restart at phase 1. The traversal and update steps are
+     *    otherwise similar to phase 1: Retrying upon CAS misses and
+     *    checking for staleness.  In particular, if a self-link is
+     *    encountered, then we can safely jump to a node on the list
+     *    by continuing the traversal at current head.
+     *
+     *    On successful append, if the call was ASYNC, return.
+     *
+     * 3. Await match or cancellation (method awaitMatch)
+     *
+     *    Wait for another thread to match node; instead cancelling if
+     *    the current thread was interrupted or the wait timed out. On
+     *    multiprocessors, we use front-of-queue spinning: If a node
+     *    appears to be the first unmatched node in the queue, it
+     *    spins a bit before blocking. In either case, before blocking
+     *    it tries to unsplice any nodes between the current "head"
+     *    and the first unmatched node.
+     *
+     *    Front-of-queue spinning vastly improves performance of
+     *    heavily contended queues. And so long as it is relatively
+     *    brief and "quiet", spinning does not much impact performance
+     *    of less-contended queues.  During spins threads check their
+     *    interrupt status and generate a thread-local random number
+     *    to decide to occasionally perform a Thread.yield. While
+     *    yield has underdefined specs, we assume that it might help,
+     *    and will not hurt, in limiting impact of spinning on busy
+     *    systems.  We also use smaller (1/2) spins for nodes that are
+     *    not known to be front but whose predecessors have not
+     *    blocked -- these "chained" spins avoid artifacts of
+     *    front-of-queue rules which otherwise lead to alternating
+     *    nodes spinning vs blocking. Further, front threads that
+     *    represent phase changes (from data to request node or vice
+     *    versa) compared to their predecessors receive additional
+     *    chained spins, reflecting longer paths typically required to
+     *    unblock threads during phase changes.
+     *
+     *
+     * ** Unlinking removed interior nodes **
+     *
+     * In addition to minimizing garbage retention via self-linking
+     * described above, we also unlink removed interior nodes. These
+     * may arise due to timed out or interrupted waits, or calls to
+     * remove(x) or Iterator.remove.  Normally, given a node that was
+     * at one time known to be the predecessor of some node s that is
+     * to be removed, we can unsplice s by CASing the next field of
+     * its predecessor if it still points to s (otherwise s must
+     * already have been removed or is now offlist). But there are two
+     * situations in which we cannot guarantee to make node s
+     * unreachable in this way: (1) If s is the trailing node of list
+     * (i.e., with null next), then it is pinned as the target node
+     * for appends, so can only be removed later after other nodes are
+     * appended. (2) We cannot necessarily unlink s given a
+     * predecessor node that is matched (including the case of being
+     * cancelled): the predecessor may already be unspliced, in which
+     * case some previous reachable node may still point to s.
+     * (For further explanation see Herlihy & Shavit "The Art of
+     * Multiprocessor Programming" chapter 9).  Although, in both
+     * cases, we can rule out the need for further action if either s
+     * or its predecessor are (or can be made to be) at, or fall off
+     * from, the head of list.
+     *
+     * Without taking these into account, it would be possible for an
+     * unbounded number of supposedly removed nodes to remain
+     * reachable.  Situations leading to such buildup are uncommon but
+     * can occur in practice; for example when a series of short timed
+     * calls to poll repeatedly time out but never otherwise fall off
+     * the list because of an untimed call to take at the front of the
+     * queue.
+     *
+     * When these cases arise, rather than always retraversing the
+     * entire list to find an actual predecessor to unlink (which
+     * won't help for case (1) anyway), we record a conservative
+     * estimate of possible unsplice failures (in "sweepVotes").
+     * We trigger a full sweep when the estimate exceeds a threshold
+     * ("SWEEP_THRESHOLD") indicating the maximum number of estimated
+     * removal failures to tolerate before sweeping through, unlinking
+     * cancelled nodes that were not unlinked upon initial removal.
+     * We perform sweeps by the thread hitting threshold (rather than
+     * background threads or by spreading work to other threads)
+     * because in the main contexts in which removal occurs, the
+     * caller is already timed-out, cancelled, or performing a
+     * potentially O(n) operation (e.g. remove(x)), none of which are
+     * time-critical enough to warrant the overhead that alternatives
+     * would impose on other threads.
+     *
+     * Because the sweepVotes estimate is conservative, and because
+     * nodes become unlinked "naturally" as they fall off the head of
+     * the queue, and because we allow votes to accumulate even while
+     * sweeps are in progress, there are typically significantly fewer
+     * such nodes than estimated.  Choice of a threshold value
+     * balances the likelihood of wasted effort and contention, versus
+     * providing a worst-case bound on retention of interior nodes in
+     * quiescent queues. The value defined below was chosen
+     * empirically to balance these under various timeout scenarios.
+     *
+     * Note that we cannot self-link unlinked interior nodes during
+     * sweeps. However, the associated garbage chains terminate when
+     * some successor ultimately falls off the head of the list and is
+     * self-linked.
      */
 
-    // Wait modes for xfer method
-    static final int NOWAIT  = 0;
-    static final int TIMEOUT = 1;
-    static final int WAIT    = 2;
-
-    /** The number of CPUs, for spin control */
-    static final int NCPUS = Runtime.getRuntime().availableProcessors();
+    /** True if on multiprocessor */
+    private static final boolean MP =
+        Runtime.getRuntime().availableProcessors() > 1;
 
     /**
-     * The number of times to spin before blocking in timed waits.
-     * The value is empirically derived -- it works well across a
-     * variety of processors and OSes. Empirically, the best value
-     * seems not to vary with number of CPUs (beyond 2) so is just
-     * a constant.
+     * The number of times to spin (with randomly interspersed calls
+     * to Thread.yield) on multiprocessor before blocking when a node
+     * is apparently the first waiter in the queue.  See above for
+     * explanation. Must be a power of two. The value is empirically
+     * derived -- it works pretty well across a variety of processors,
+     * numbers of CPUs, and OSes.
      */
-    static final int maxTimedSpins = (NCPUS < 2)? 0 : 32;
+    private static final int FRONT_SPINS   = 1 << 7;
 
     /**
-     * The number of times to spin before blocking in untimed waits.
-     * This is greater than timed value because untimed waits spin
-     * faster since they don't need to check times on each spin.
+     * The number of times to spin before blocking when a node is
+     * preceded by another node that is apparently spinning.  Also
+     * serves as an increment to FRONT_SPINS on phase changes, and as
+     * base average frequency for yielding during spins. Must be a
+     * power of two.
      */
-    static final int maxUntimedSpins = maxTimedSpins * 16;
+    private static final int CHAINED_SPINS = FRONT_SPINS >>> 1;
 
     /**
-     * The number of nanoseconds for which it is faster to spin
-     * rather than to use timed park. A rough estimate suffices.
+     * The maximum number of estimated removal failures (sweepVotes)
+     * to tolerate before sweeping through the queue unlinking
+     * cancelled nodes that were not unlinked upon initial
+     * removal. See above for explanation. The value must be at least
+     * two to avoid useless sweeps when removing trailing nodes.
      */
-    static final long spinForTimeoutThreshold = 1000L;
+    static final int SWEEP_THRESHOLD = 32;
 
     /**
-     * Node class for LinkedTransferQueue. Opportunistically
-     * subclasses from AtomicReference to represent item. Uses Object,
-     * not E, to allow setting item to "this" after use, to avoid
-     * garbage retention. Similarly, setting the next field to this is
-     * used as sentinel that node is off list.
+     * Queue nodes. Uses Object, not E, for items to allow forgetting
+     * them after use.  Relies heavily on Unsafe mechanics to minimize
+     * unnecessary ordering constraints: Writes that are intrinsically
+     * ordered wrt other accesses or CASes use simple relaxed forms.
      */
-    static final class QNode extends AtomicReference<Object> {
-        volatile QNode next;
-        volatile Thread waiter;       // to control park/unpark
-        final boolean isData;
-        QNode(Object item, boolean isData) {
-            super(item);
+    static final class Node {
+        final boolean isData;   // false if this is a request node
+        volatile Object item;   // initially non-null if isData; CASed to match
+        volatile Node next;
+        volatile Thread waiter; // null until waiting
+
+        // CAS methods for fields
+        final boolean casNext(Node cmp, Node val) {
+            return UNSAFE.compareAndSwapObject(this, nextOffset, cmp, val);
+        }
+
+        final boolean casItem(Object cmp, Object val) {
+            // assert cmp == null || cmp.getClass() != Node.class;
+            return UNSAFE.compareAndSwapObject(this, itemOffset, cmp, val);
+        }
+
+        /**
+         * Constructs a new node.  Uses relaxed write because item can
+         * only be seen after publication via casNext.
+         */
+        Node(Object item, boolean isData) {
+            UNSAFE.putObject(this, itemOffset, item); // relaxed write
             this.isData = isData;
         }
 
-        static final AtomicReferenceFieldUpdater<QNode, QNode>
-            nextUpdater = AtomicReferenceFieldUpdater.newUpdater
-            (QNode.class, QNode.class, "next");
+        /**
+         * Links node to itself to avoid garbage retention.  Called
+         * only after CASing head field, so uses relaxed write.
+         */
+        final void forgetNext() {
+            UNSAFE.putObject(this, nextOffset, this);
+        }
 
-        final boolean casNext(QNode cmp, QNode val) {
-            return nextUpdater.compareAndSet(this, cmp, val);
+        /**
+         * Sets item to self and waiter to null, to avoid garbage
+         * retention after matching or cancelling. Uses relaxed writes
+         * because order is already constrained in the only calling
+         * contexts: item is forgotten only after volatile/atomic
+         * mechanics that extract items.  Similarly, clearing waiter
+         * follows either CAS or return from park (if ever parked;
+         * else we don't care).
+         */
+        final void forgetContents() {
+            UNSAFE.putObject(this, itemOffset, this);
+            UNSAFE.putObject(this, waiterOffset, null);
         }
 
-        final void clearNext() {
-            nextUpdater.lazySet(this, this);
+        /**
+         * Returns true if this node has been matched, including the
+         * case of artificial matches due to cancellation.
+         */
+        final boolean isMatched() {
+            Object x = item;
+            return (x == this) || ((x == null) == isData);
         }
 
-    }
+        /**
+         * Returns true if this is an unmatched request node.
+         */
+        final boolean isUnmatchedRequest() {
+            return !isData && item == null;
+        }
 
-    /**
-     * Padded version of AtomicReference used for head, tail and
-     * cleanMe, to alleviate contention across threads CASing one vs
-     * the other.
-     */
-    static final class PaddedAtomicReference<T> extends AtomicReference<T> {
-        // enough padding for 64bytes with 4byte refs
-        Object p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pa, pb, pc, pd, pe;
-        PaddedAtomicReference(T r) { super(r); }
+        /**
+         * Returns true if a node with the given mode cannot be
+         * appended to this node because this node is unmatched and
+         * has opposite data mode.
+         */
+        final boolean cannotPrecede(boolean haveData) {
+            boolean d = isData;
+            Object x;
+            return d != haveData && (x = item) != this && (x != null) == d;
+        }
+
+        /**
+         * Tries to artificially match a data node -- used by remove.
+         */
+        final boolean tryMatchData() {
+            // assert isData;
+            Object x = item;
+            if (x != null && x != this && casItem(x, null)) {
+                LockSupport.unpark(waiter);
+                return true;
+            }
+            return false;
+        }
+
+        private static final long serialVersionUID = -3375979862319811754L;
+
+        // Unsafe mechanics
+        private static final sun.misc.Unsafe UNSAFE;
+        private static final long itemOffset;
+        private static final long nextOffset;
+        private static final long waiterOffset;
+        static {
+            try {
+                UNSAFE = getUnsafe();
+                Class<?> k = Node.class;
+                itemOffset = UNSAFE.objectFieldOffset
+                    (k.getDeclaredField("item"));
+                nextOffset = UNSAFE.objectFieldOffset
+                    (k.getDeclaredField("next"));
+                waiterOffset = UNSAFE.objectFieldOffset
+                    (k.getDeclaredField("waiter"));
+            } catch (Exception e) {
+                throw new Error(e);
+            }
+        }
     }
 
+    /** head of the queue; null until first enqueue */
+    transient volatile Node head;
 
-    /** head of the queue */
-    private transient final PaddedAtomicReference<QNode> head;
-    /** tail of the queue */
-    private transient final PaddedAtomicReference<QNode> tail;
+    /** tail of the queue; null until first append */
+    private transient volatile Node tail;
 
-    /**
-     * Reference to a cancelled node that might not yet have been
-     * unlinked from queue because it was the last inserted node
-     * when it cancelled.
-     */
-    private transient final PaddedAtomicReference<QNode> cleanMe;
+    /** The number of apparent failures to unsplice removed nodes */
+    private transient volatile int sweepVotes;
 
-    /**
-     * Tries to cas nh as new head; if successful, unlink
-     * old head's next node to avoid garbage retention.
+    // CAS methods for fields
+    private boolean casTail(Node cmp, Node val) {
+        return UNSAFE.compareAndSwapObject(this, tailOffset, cmp, val);
+    }
+
+    private boolean casHead(Node cmp, Node val) {
+        return UNSAFE.compareAndSwapObject(this, headOffset, cmp, val);
+    }
+
+    private boolean casSweepVotes(int cmp, int val) {
+        return UNSAFE.compareAndSwapInt(this, sweepVotesOffset, cmp, val);
+    }
+
+    /*
+     * Possible values for "how" argument in xfer method.
      */
-    private boolean advanceHead(QNode h, QNode nh) {
-        if (h == head.get() && head.compareAndSet(h, nh)) {
-            h.clearNext(); // forget old next
-            return true;
-        }
-        return false;
+    private static final int NOW   = 0; // for untimed poll, tryTransfer
+    private static final int ASYNC = 1; // for offer, put, add
+    private static final int SYNC  = 2; // for transfer, take
+    private static final int TIMED = 3; // for timed poll, tryTransfer
+
+    @SuppressWarnings("unchecked")
+    static <E> E cast(Object item) {
+        // assert item == null || item.getClass() != Node.class;
+        return (E) item;
     }
 
     /**
-     * Puts or takes an item. Used for most queue operations (except
-     * poll() and tryTransfer()). See the similar code in
-     * SynchronousQueue for detailed explanation.
+     * Implements all queuing methods. See above for explanation.
      *
-     * @param e the item or if null, signifies that this is a take
-     * @param mode the wait mode: NOWAIT, TIMEOUT, WAIT
-     * @param nanos timeout in nanosecs, used only if mode is TIMEOUT
-     * @return an item, or null on failure
+     * @param e the item or null for take
+     * @param haveData true if this is a put, else a take
+     * @param how NOW, ASYNC, SYNC, or TIMED
+     * @param nanos timeout in nanosecs, used only if how is TIMED
+     * @return an item if matched, else e
+     * @throws NullPointerException if haveData mode but e is null
      */
-    private Object xfer(Object e, int mode, long nanos) {
-        boolean isData = (e != null);
-        QNode s = null;
-        final PaddedAtomicReference<QNode> head = this.head;
-        final PaddedAtomicReference<QNode> tail = this.tail;
-
-        for (;;) {
-            QNode t = tail.get();
-            QNode h = head.get();
-
-            if (t != null && (t == h || t.isData == isData)) {
-                if (s == null)
-                    s = new QNode(e, isData);
-                QNode last = t.next;
-                if (last != null) {
-                    if (t == tail.get())
-                        tail.compareAndSet(t, last);
-                }
-                else if (t.casNext(null, s)) {
-                    tail.compareAndSet(t, s);
-                    return awaitFulfill(t, s, e, mode, nanos);
+    private E xfer(E e, boolean haveData, int how, long nanos) {
+        if (haveData && (e == null))
+            throw new NullPointerException();
+        Node s = null;                        // the node to append, if needed
+
+        retry:
+        for (;;) {                            // restart on append race
+
+            for (Node h = head, p = h; p != null;) { // find & match first node
+                boolean isData = p.isData;
+                Object item = p.item;
+                if (item != p && (item != null) == isData) { // unmatched
+                    if (isData == haveData)   // can't match
+                        break;
+                    if (p.casItem(item, e)) { // match
+                        for (Node q = p; q != h;) {
+                            Node n = q.next;  // update by 2 unless singleton
+                            if (head == h && casHead(h, n == null ? q : n)) {
+                                h.forgetNext();
+                                break;
+                            }                 // advance and retry
+                            if ((h = head)   == null ||
+                                (q = h.next) == null || !q.isMatched())
+                                break;        // unless slack < 2
+                        }
+                        LockSupport.unpark(p.waiter);
+                        return LinkedTransferQueue.<E>cast(item);
+                    }
                 }
+                Node n = p.next;
+                p = (p != n) ? n : (h = head); // Use head if p offlist
             }
 
-            else if (h != null) {
-                QNode first = h.next;
-                if (t == tail.get() && first != null &&
-                    advanceHead(h, first)) {
-                    Object x = first.get();
-                    if (x != first && first.compareAndSet(x, e)) {
-                        LockSupport.unpark(first.waiter);
-                        return isData? e : x;
-                    }
-                }
+            if (how != NOW) {                 // No matches available
+                if (s == null)
+                    s = new Node(e, haveData);
+                Node pred = tryAppend(s, haveData);
+                if (pred == null)
+                    continue retry;           // lost race vs opposite mode
+                if (how != ASYNC)
+                    return awaitMatch(s, pred, e, (how == TIMED), nanos);
             }
+            return e; // not waiting
         }
     }
 
-
     /**
-     * Version of xfer for poll() and tryTransfer, which
-     * simplifies control paths both here and in xfer.
+     * Tries to append node s as tail.
+     *
+     * @param s the node to append
+     * @param haveData true if appending in data mode
+     * @return null on failure due to losing race with append in
+     * different mode, else s's predecessor, or s itself if no
+     * predecessor
      */
-    private Object fulfill(Object e) {
-        boolean isData = (e != null);
-        final PaddedAtomicReference<QNode> head = this.head;
-        final PaddedAtomicReference<QNode> tail = this.tail;
-
-        for (;;) {
-            QNode t = tail.get();
-            QNode h = head.get();
-
-            if (t != null && (t == h || t.isData == isData)) {
-                QNode last = t.next;
-                if (t == tail.get()) {
-                    if (last != null)
-                        tail.compareAndSet(t, last);
-                    else
-                        return null;
-                }
+    private Node tryAppend(Node s, boolean haveData) {
+        for (Node t = tail, p = t;;) {        // move p to last node and append
+            Node n, u;                        // temps for reads of next & tail
+            if (p == null && (p = head) == null) {
+                if (casHead(null, s))
+                    return s;                 // initialize
             }
-            else if (h != null) {
-                QNode first = h.next;
-                if (t == tail.get() &&
-                    first != null &&
-                    advanceHead(h, first)) {
-                    Object x = first.get();
-                    if (x != first && first.compareAndSet(x, e)) {
-                        LockSupport.unpark(first.waiter);
-                        return isData? e : x;
-                    }
+            else if (p.cannotPrecede(haveData))
+                return null;                  // lost race vs opposite mode
+            else if ((n = p.next) != null)    // not last; keep traversing
+                p = p != t && t != (u = tail) ? (t = u) : // stale tail
+                    (p != n) ? n : null;      // restart if off list
+            else if (!p.casNext(null, s))
+                p = p.next;                   // re-read on CAS failure
+            else {
+                if (p != t) {                 // update if slack now >= 2
+                    while ((tail != t || !casTail(t, s)) &&
+                           (t = tail)   != null &&
+                           (s = t.next) != null && // advance and retry
+                           (s = s.next) != null && s != t);
                 }
+                return p;
             }
         }
     }
 
     /**
-     * Spins/blocks until node s is fulfilled or caller gives up,
-     * depending on wait mode.
+     * Spins/yields/blocks until node s is matched or caller gives up.
      *
-     * @param pred the predecessor of waiting node
      * @param s the waiting node
+     * @param pred the predecessor of s, or s itself if it has no
+     * predecessor, or null if unknown (the null case does not occur
+     * in any current calls but may in possible future extensions)
      * @param e the comparison value for checking match
-     * @param mode mode
-     * @param nanos timeout value
-     * @return matched item, or s if cancelled
+     * @param timed if true, wait only until timeout elapses
+     * @param nanos timeout in nanosecs, used only if timed is true
+     * @return matched item, or e if unmatched on interrupt or timeout
      */
-    private Object awaitFulfill(QNode pred, QNode s, Object e,
-                                int mode, long nanos) {
-        if (mode == NOWAIT)
-            return null;
-
-        long lastTime = (mode == TIMEOUT)? System.nanoTime() : 0;
+    private E awaitMatch(Node s, Node pred, E e, boolean timed, long nanos) {
+        long lastTime = timed ? System.nanoTime() : 0L;
         Thread w = Thread.currentThread();
-        int spins = -1; // set to desired spin count below
+        int spins = -1; // initialized after first item and cancel checks
+        ThreadLocalRandom randomYields = null; // bound if needed
+
         for (;;) {
-            if (w.isInterrupted())
-                s.compareAndSet(e, s);
-            Object x = s.get();
-            if (x != e) {                 // Node was matched or cancelled
-                advanceHead(pred, s);     // unlink if head
-                if (x == s) {             // was cancelled
-                    clean(pred, s);
-                    return null;
-                }
-                else if (x != null) {
-                    s.set(s);             // avoid garbage retention
-                    return x;
-                }
-                else
-                    return e;
+            Object item = s.item;
+            if (item != e) {                  // matched
+                // assert item != s;
+                s.forgetContents();           // avoid garbage
+                return LinkedTransferQueue.<E>cast(item);
             }
-            if (mode == TIMEOUT) {
-                long now = System.nanoTime();
-                nanos -= now - lastTime;
-                lastTime = now;
-                if (nanos <= 0) {
-                    s.compareAndSet(e, s); // try to cancel
-                    continue;
-                }
+            if ((w.isInterrupted() || (timed && nanos <= 0)) &&
+                    s.casItem(e, s)) {        // cancel
+                unsplice(pred, s);
+                return e;
             }
-            if (spins < 0) {
-                QNode h = head.get(); // only spin if at head
-                spins = ((h != null && h.next == s) ?
-                         (mode == TIMEOUT?
-                          maxTimedSpins : maxUntimedSpins) : 0);
+
+            if (spins < 0) {                  // establish spins at/near front
+                if ((spins = spinsFor(pred, s.isData)) > 0)
+                    randomYields = ThreadLocalRandom.current();
             }
-            if (spins > 0)
+            else if (spins > 0) {             // spin
                 --spins;
-            else if (s.waiter == null)
-                s.waiter = w;
-            else if (mode != TIMEOUT) {
-                LockSupport.park(this);
-                s.waiter = null;
-                spins = -1;
+                if (randomYields.nextInt(CHAINED_SPINS) == 0)
+                    Thread.yield();           // occasionally yield
+            }
+            else if (s.waiter == null) {
+                s.waiter = w;                 // request unpark then recheck
             }
-            else if (nanos > spinForTimeoutThreshold) {
-                LockSupport.parkNanos(this, nanos);
-                s.waiter = null;
-                spins = -1;
+            else if (timed) {
+                long now = System.nanoTime();
+                if ((nanos -= now - lastTime) > 0)
+                    LockSupport.parkNanos(this, nanos);
+                lastTime = now;
+            }
+            else {
+                LockSupport.park(this);
             }
         }
     }
 
     /**
-     * Returns validated tail for use in cleaning methods.
+     * Returns spin/yield value for a node with given predecessor and
+     * data mode. See above for explanation.
      */
-    private QNode getValidatedTail() {
-        for (;;) {
-            QNode h = head.get();
-            QNode first = h.next;
-            if (first != null && first.next == first) { // help advance
-                advanceHead(h, first);
-                continue;
-            }
-            QNode t = tail.get();
-            QNode last = t.next;
-            if (t == tail.get()) {
-                if (last != null)
-                    tail.compareAndSet(t, last); // help advance
-                else
-                    return t;
+    private static int spinsFor(Node pred, boolean haveData) {
+        if (MP && pred != null) {
+            if (pred.isData != haveData)      // phase change
+                return FRONT_SPINS + CHAINED_SPINS;
+            if (pred.isMatched())             // probably at front
+                return FRONT_SPINS;
+            if (pred.waiter == null)          // pred apparently spinning
+                return CHAINED_SPINS;
+        }
+        return 0;
+    }
+
+    /* -------------- Traversal methods -------------- */
+
+    /**
+     * Returns the successor of p, or the head node if p.next has been
+     * linked to self, which will only be true if traversing with a
+     * stale pointer that is now off the list.
+     */
+    final Node succ(Node p) {
+        Node next = p.next;
+        return (p == next) ? head : next;
+    }
+
+    /**
+     * Returns the first unmatched node of the given mode, or null if
+     * none.  Used by methods isEmpty, hasWaitingConsumer.
+     */
+    private Node firstOfMode(boolean isData) {
+        for (Node p = head; p != null; p = succ(p)) {
+            if (!p.isMatched())
+                return (p.isData == isData) ? p : null;
+        }
+        return null;
+    }
+
+    /**
+     * Returns the item in the first unmatched node with isData; or
+     * null if none.  Used by peek.
+     */
+    private E firstDataItem() {
+        for (Node p = head; p != null; p = succ(p)) {
+            Object item = p.item;
+            if (p.isData) {
+                if (item != null && item != p)
+                    return LinkedTransferQueue.<E>cast(item);
             }
+            else if (item == null)
+                return null;
         }
+        return null;
     }
 
     /**
-     * Gets rid of cancelled node s with original predecessor pred.
-     *
-     * @param pred predecessor of cancelled node
-     * @param s the cancelled node
+     * Traverses and counts unmatched nodes of the given mode.
+     * Used by methods size and getWaitingConsumerCount.
      */
-    private void clean(QNode pred, QNode s) {
-        Thread w = s.waiter;
-        if (w != null) {             // Wake up thread
-            s.waiter = null;
-            if (w != Thread.currentThread())
-                LockSupport.unpark(w);
+    private int countOfMode(boolean data) {
+        int count = 0;
+        for (Node p = head; p != null; ) {
+            if (!p.isMatched()) {
+                if (p.isData != data)
+                    return 0;
+                if (++count == Integer.MAX_VALUE) // saturated
+                    break;
+            }
+            Node n = p.next;
+            if (n != p)
+                p = n;
+            else {
+                count = 0;
+                p = head;
+            }
         }
+        return count;
+    }
 
-        if (pred == null)
-            return;
+    final class Itr implements Iterator<E> {
+        private Node nextNode;   // next node to return item for
+        private E nextItem;      // the corresponding item
+        private Node lastRet;    // last returned node, to support remove
+        private Node lastPred;   // predecessor to unlink lastRet
 
-        /*
-         * At any given time, exactly one node on list cannot be
-         * deleted -- the last inserted node. To accommodate this, if
-         * we cannot delete s, we save its predecessor as "cleanMe",
-         * processing the previously saved version first. At least one
-         * of node s or the node previously saved can always be
-         * processed, so this always terminates.
+        /**
+         * Moves to next node after prev, or first node if prev null.
          */
-        while (pred.next == s) {
-            QNode oldpred = reclean();  // First, help get rid of cleanMe
-            QNode t = getValidatedTail();
-            if (s != t) {               // If not tail, try to unsplice
-                QNode sn = s.next;      // s.next == s means s already off list
-                if (sn == s || pred.casNext(s, sn))
+        private void advance(Node prev) {
+            /*
+             * To track and avoid buildup of deleted nodes in the face
+             * of calls to both Queue.remove and Itr.remove, we must
+             * include variants of unsplice and sweep upon each
+             * advance: Upon Itr.remove, we may need to catch up links
+             * from lastPred, and upon other removes, we might need to
+             * skip ahead from stale nodes and unsplice deleted ones
+             * found while advancing.
+             */
+
+            Node r, b; // reset lastPred upon possible deletion of lastRet
+            if ((r = lastRet) != null && !r.isMatched())
+                lastPred = r;    // next lastPred is old lastRet
+            else if ((b = lastPred) == null || b.isMatched())
+                lastPred = null; // at start of list
+            else {
+                Node s, n;       // help with removal of lastPred.next
+                while ((s = b.next) != null &&
+                       s != b && s.isMatched() &&
+                       (n = s.next) != null && n != s)
+                    b.casNext(s, n);
+            }
+
+            this.lastRet = prev;
+
+            for (Node p = prev, s, n;;) {
+                s = (p == null) ? head : p.next;
+                if (s == null)
+                    break;
+                else if (s == p) {
+                    p = null;
+                    continue;
+                }
+                Object item = s.item;
+                if (s.isData) {
+                    if (item != null && item != s) {
+                        nextItem = LinkedTransferQueue.<E>cast(item);
+                        nextNode = s;
+                        return;
+                    }
+                }
+                else if (item == null)
+                    break;
+                // assert s.isMatched();
+                if (p == null)
+                    p = s;
+                else if ((n = s.next) == null)
                     break;
+                else if (s == n)
+                    p = null;
+                else
+                    p.casNext(s, n);
             }
-            else if (oldpred == pred || // Already saved
-                     (oldpred == null && cleanMe.compareAndSet(null, pred)))
-                break;                  // Postpone cleaning
+            nextNode = null;
+            nextItem = null;
+        }
+
+        Itr() {
+            advance(null);
+        }
+
+        public final boolean hasNext() {
+            return nextNode != null;
+        }
+
+        public final E next() {
+            Node p = nextNode;
+            if (p == null) throw new NoSuchElementException();
+            E e = nextItem;
+            advance(p);
+            return e;
+        }
+
+        public final void remove() {
+            final Node lastRet = this.lastRet;
+            if (lastRet == null)
+                throw new IllegalStateException();
+            this.lastRet = null;
+            if (lastRet.tryMatchData())
+                unsplice(lastPred, lastRet);
         }
     }
 
+    /* -------------- Removal methods -------------- */
+
     /**
-     * Tries to unsplice the cancelled node held in cleanMe that was
-     * previously uncleanable because it was at tail.
+     * Unsplices (now or later) the given deleted/cancelled node with
+     * the given predecessor.
      *
-     * @return current cleanMe node (or null)
+     * @param pred a node that was at one time known to be the
+     * predecessor of s, or null or s itself if s is/was at head
+     * @param s the node to be unspliced
      */
-    private QNode reclean() {
+    final void unsplice(Node pred, Node s) {
+        s.forgetContents(); // forget unneeded fields
         /*
-         * cleanMe is, or at one time was, predecessor of cancelled
-         * node s that was the tail so could not be unspliced.  If s
-         * is no longer the tail, try to unsplice if necessary and
-         * make cleanMe slot available.  This differs from similar
-         * code in clean() because we must check that pred still
-         * points to a cancelled node that must be unspliced -- if
-         * not, we can (must) clear cleanMe without unsplicing.
-         * This can loop only due to contention on casNext or
-         * clearing cleanMe.
+         * See above for rationale. Briefly: if pred still points to
+         * s, try to unlink s.  If s cannot be unlinked, because it is
+         * the trailing node or pred might be unlinked, and neither
+         * pred nor s is head or offlist, add to sweepVotes, and if
+         * enough votes have accumulated, sweep.
          */
-        QNode pred;
-        while ((pred = cleanMe.get()) != null) {
-            QNode t = getValidatedTail();
-            QNode s = pred.next;
-            if (s != t) {
-                QNode sn;
-                if (s == null || s == pred || s.get() != s ||
-                    (sn = s.next) == s || pred.casNext(s, sn))
-                    cleanMe.compareAndSet(pred, null);
+        if (pred != null && pred != s && pred.next == s) {
+            Node n = s.next;
+            if (n == null ||
+                (n != s && pred.casNext(s, n) && pred.isMatched())) {
+                for (;;) {               // check if at, or could be, head
+                    Node h = head;
+                    if (h == pred || h == s || h == null)
+                        return;          // at head or list empty
+                    if (!h.isMatched())
+                        break;
+                    Node hn = h.next;
+                    if (hn == null)
+                        return;          // now empty
+                    if (hn != h && casHead(h, hn))
+                        h.forgetNext();  // advance head
+                }
+                if (pred.next != pred && s.next != s) { // recheck if offlist
+                    for (;;) {           // sweep now if enough votes
+                        int v = sweepVotes;
+                        if (v < SWEEP_THRESHOLD) {
+                            if (casSweepVotes(v, v + 1))
+                                break;
+                        }
+                        else if (casSweepVotes(v, 0)) {
+                            sweep();
+                            break;
+                        }
+                    }
+                }
             }
-            else // s is still tail; cannot clean
+        }
+    }
+
+    /**
+     * Unlinks matched (typically cancelled) nodes encountered in a
+     * traversal from head.
+     */
+    private void sweep() {
+        for (Node p = head, s, n; p != null && (s = p.next) != null; ) {
+            if (!s.isMatched())
+                // Unmatched nodes are never self-linked
+                p = s;
+            else if ((n = s.next) == null) // trailing node is pinned
                 break;
+            else if (s == n)    // stale
+                // No need to also check for p == s, since that implies s == n
+                p = head;
+            else
+                p.casNext(s, n);
         }
-        return pred;
     }
 
     /**
+     * Main implementation of remove(Object)
+     */
+    private boolean findAndRemove(Object e) {
+        if (e != null) {
+            for (Node pred = null, p = head; p != null; ) {
+                Object item = p.item;
+                if (p.isData) {
+                    if (item != null && item != p && e.equals(item) &&
+                        p.tryMatchData()) {
+                        unsplice(pred, p);
+                        return true;
+                    }
+                }
+                else if (item == null)
+                    break;
+                pred = p;
+                if ((p = p.next) == pred) { // stale
+                    pred = null;
+                    p = head;
+                }
+            }
+        }
+        return false;
+    }
+
+
+    /**
      * Creates an initially empty {@code LinkedTransferQueue}.
      */
     public LinkedTransferQueue() {
-        QNode dummy = new QNode(null, false);
-        head = new PaddedAtomicReference<QNode>(dummy);
-        tail = new PaddedAtomicReference<QNode>(dummy);
-        cleanMe = new PaddedAtomicReference<QNode>(null);
     }
 
     /**
@@ -435,252 +1000,200 @@ public class LinkedTransferQueue<E> extends AbstractQueue<E>
         addAll(c);
     }
 
-    public void put(E e) throws InterruptedException {
-        if (e == null) throw new NullPointerException();
-        if (Thread.interrupted()) throw new InterruptedException();
-        xfer(e, NOWAIT, 0);
+    /**
+     * Inserts the specified element at the tail of this queue.
+     * As the queue is unbounded, this method will never block.
+     *
+     * @throws NullPointerException if the specified element is null
+     */
+    public void put(E e) {
+        xfer(e, true, ASYNC, 0);
     }
 
-    public boolean offer(E e, long timeout, TimeUnit unit)
-        throws InterruptedException {
-        if (e == null) throw new NullPointerException();
-        if (Thread.interrupted()) throw new InterruptedException();
-        xfer(e, NOWAIT, 0);
+    /**
+     * Inserts the specified element at the tail of this queue.
+     * As the queue is unbounded, this method will never block or
+     * return {@code false}.
+     *
+     * @return {@code true} (as specified by
+     *  {@link java.util.concurrent.BlockingQueue#offer(Object,long,TimeUnit)
+     *  BlockingQueue.offer})
+     * @throws NullPointerException if the specified element is null
+     */
+    public boolean offer(E e, long timeout, TimeUnit unit) {
+        xfer(e, true, ASYNC, 0);
         return true;
     }
 
+    /**
+     * Inserts the specified element at the tail of this queue.
+     * As the queue is unbounded, this method will never return {@code false}.
+     *
+     * @return {@code true} (as specified by {@link Queue#offer})
+     * @throws NullPointerException if the specified element is null
+     */
     public boolean offer(E e) {
-        if (e == null) throw new NullPointerException();
-        xfer(e, NOWAIT, 0);
+        xfer(e, true, ASYNC, 0);
         return true;
     }
 
+    /**
+     * Inserts the specified element at the tail of this queue.
+     * As the queue is unbounded, this method will never throw
+     * {@link IllegalStateException} or return {@code false}.
+     *
+     * @return {@code true} (as specified by {@link Collection#add})
+     * @throws NullPointerException if the specified element is null
+     */
     public boolean add(E e) {
-        if (e == null) throw new NullPointerException();
-        xfer(e, NOWAIT, 0);
+        xfer(e, true, ASYNC, 0);
         return true;
     }
 
+    /**
+     * Transfers the element to a waiting consumer immediately, if possible.
+     *
+     * <p>More precisely, transfers the specified element immediately
+     * if there exists a consumer already waiting to receive it (in
+     * {@link #take} or timed {@link #poll(long,TimeUnit) poll}),
+     * otherwise returning {@code false} without enqueuing the element.
+     *
+     * @throws NullPointerException if the specified element is null
+     */
+    public boolean tryTransfer(E e) {
+        return xfer(e, true, NOW, 0) == null;
+    }
+
+    /**
+     * Transfers the element to a consumer, waiting if necessary to do so.
+     *
+     * <p>More precisely, transfers the specified element immediately
+     * if there exists a consumer already waiting to receive it (in
+     * {@link #take} or timed {@link #poll(long,TimeUnit) poll}),
+     * else inserts the specified element at the tail of this queue
+     * and waits until the element is received by a consumer.
+     *
+     * @throws NullPointerException if the specified element is null
+     */
     public void transfer(E e) throws InterruptedException {
-        if (e == null) throw new NullPointerException();
-        if (xfer(e, WAIT, 0) == null) {
-            Thread.interrupted();
+        if (xfer(e, true, SYNC, 0) != null) {
+            Thread.interrupted(); // failure possible only due to interrupt
             throw new InterruptedException();
         }
     }
 
+    /**
+     * Transfers the element to a consumer if it is possible to do so
+     * before the timeout elapses.
+     *
+     * <p>More precisely, transfers the specified element immediately
+     * if there exists a consumer already waiting to receive it (in
+     * {@link #take} or timed {@link #poll(long,TimeUnit) poll}),
+     * else inserts the specified element at the tail of this queue
+     * and waits until the element is received by a consumer,
+     * returning {@code false} if the specified wait time elapses
+     * before the element can be transferred.
+     *
+     * @throws NullPointerException if the specified element is null
+     */
     public boolean tryTransfer(E e, long timeout, TimeUnit unit)
         throws InterruptedException {
-        if (e == null) throw new NullPointerException();
-        if (xfer(e, TIMEOUT, unit.toNanos(timeout)) != null)
+        if (xfer(e, true, TIMED, unit.toNanos(timeout)) == null)
             return true;
         if (!Thread.interrupted())
             return false;
         throw new InterruptedException();
     }
 
-    public boolean tryTransfer(E e) {
-        if (e == null) throw new NullPointerException();
-        return fulfill(e) != null;
-    }
-
     public E take() throws InterruptedException {
-        Object e = xfer(null, WAIT, 0);
+        E e = xfer(null, false, SYNC, 0);
         if (e != null)
-            return (E)e;
+            return e;
         Thread.interrupted();
         throw new InterruptedException();
     }
 
     public E poll(long timeout, TimeUnit unit) throws InterruptedException {
-        Object e = xfer(null, TIMEOUT, unit.toNanos(timeout));
+        E e = xfer(null, false, TIMED, unit.toNanos(timeout));
         if (e != null || !Thread.interrupted())
-            return (E)e;
+            return e;
         throw new InterruptedException();
     }
 
     public E poll() {
-        return (E)fulfill(null);
+        return xfer(null, false, NOW, 0);
     }
 
+    /**
+     * @throws NullPointerException     {@inheritDoc}
+     * @throws IllegalArgumentException {@inheritDoc}
+     */
     public int drainTo(Collection<? super E> c) {
         if (c == null)
             throw new NullPointerException();
         if (c == this)
             throw new IllegalArgumentException();
         int n = 0;
-        E e;
-        while ( (e = poll()) != null) {
+        for (E e; (e = poll()) != null;) {
             c.add(e);
             ++n;
         }
         return n;
     }
 
+    /**
+     * @throws NullPointerException     {@inheritDoc}
+     * @throws IllegalArgumentException {@inheritDoc}
+     */
     public int drainTo(Collection<? super E> c, int maxElements) {
         if (c == null)
             throw new NullPointerException();
         if (c == this)
             throw new IllegalArgumentException();
         int n = 0;
-        E e;
-        while (n < maxElements && (e = poll()) != null) {
+        for (E e; n < maxElements && (e = poll()) != null;) {
             c.add(e);
             ++n;
         }
         return n;
     }
 
-    // Traversal-based methods
-
     /**
-     * Returns head after performing any outstanding helping steps.
+     * Returns an iterator over the elements in this queue in proper sequence.
+     * The elements will be returned in order from first (head) to last (tail).
+     *
+     * <p>The returned iterator is a "weakly consistent" iterator that
+     * will never throw {@link java.util.ConcurrentModificationException
+     * ConcurrentModificationException}, and guarantees to traverse
+     * elements as they existed upon construction of the iterator, and
+     * may (but is not guaranteed to) reflect any modifications
+     * subsequent to construction.
+     *
+     * @return an iterator over the elements in this queue in proper sequence
      */
-    private QNode traversalHead() {
-        for (;;) {
-            QNode t = tail.get();
-            QNode h = head.get();
-            if (h != null && t != null) {
-                QNode last = t.next;
-                QNode first = h.next;
-                if (t == tail.get()) {
-                    if (last != null)
-                        tail.compareAndSet(t, last);
-                    else if (first != null) {
-                        Object x = first.get();
-                        if (x == first)
-                            advanceHead(h, first);
-                        else
-                            return h;
-                    }
-                    else
-                        return h;
-                }
-            }
-            reclean();
-        }
-    }
-
-
     public Iterator<E> iterator() {
         return new Itr();
     }
 
-    /**
-     * Iterators. Basic strategy is to traverse list, treating
-     * non-data (i.e., request) nodes as terminating list.
-     * Once a valid data node is found, the item is cached
-     * so that the next call to next() will return it even
-     * if subsequently removed.
-     */
-    class Itr implements Iterator<E> {
-        QNode next;        // node to return next
-        QNode pnext;       // predecessor of next
-        QNode snext;       // successor of next
-        QNode curr;        // last returned node, for remove()
-        QNode pcurr;       // predecessor of curr, for remove()
-        E nextItem;        // Cache of next item, once commited to in next
-
-        Itr() {
-            findNext();
-        }
-
-        /**
-         * Ensures next points to next valid node, or null if none.
-         */
-        void findNext() {
-            for (;;) {
-                QNode pred = pnext;
-                QNode q = next;
-                if (pred == null || pred == q) {
-                    pred = traversalHead();
-                    q = pred.next;
-                }
-                if (q == null || !q.isData) {
-                    next = null;
-                    return;
-                }
-                Object x = q.get();
-                QNode s = q.next;
-                if (x != null && q != x && q != s) {
-                    nextItem = (E)x;
-                    snext = s;
-                    pnext = pred;
-                    next = q;
-                    return;
-                }
-                pnext = q;
-                next = s;
-            }
-        }
-
-        public boolean hasNext() {
-            return next != null;
-        }
-
-        public E next() {
-            if (next == null) throw new NoSuchElementException();
-            pcurr = pnext;
-            curr = next;
-            pnext = next;
-            next = snext;
-            E x = nextItem;
-            findNext();
-            return x;
-        }
-
-        public void remove() {
-            QNode p = curr;
-            if (p == null)
-                throw new IllegalStateException();
-            Object x = p.get();
-            if (x != null && x != p && p.compareAndSet(x, p))
-                clean(pcurr, p);
-        }
-    }
-
     public E peek() {
-        for (;;) {
-            QNode h = traversalHead();
-            QNode p = h.next;
-            if (p == null)
-                return null;
-            Object x = p.get();
-            if (p != x) {
-                if (!p.isData)
-                    return null;
-                if (x != null)
-                    return (E)x;
-            }
-        }
+        return firstDataItem();
     }
 
+    /**
+     * Returns {@code true} if this queue contains no elements.
+     *
+     * @return {@code true} if this queue contains no elements
+     */
     public boolean isEmpty() {
-        for (;;) {
-            QNode h = traversalHead();
-            QNode p = h.next;
-            if (p == null)
-                return true;
-            Object x = p.get();
-            if (p != x) {
-                if (!p.isData)
-                    return true;
-                if (x != null)
-                    return false;
-            }
+        for (Node p = head; p != null; p = succ(p)) {
+            if (!p.isMatched())
+                return !p.isData;
         }
+        return true;
     }
 
     public boolean hasWaitingConsumer() {
-        for (;;) {
-            QNode h = traversalHead();
-            QNode p = h.next;
-            if (p == null)
-                return false;
-            Object x = p.get();
-            if (p != x)
-                return !p.isData;
-        }
+        return firstOfMode(false) != null;
     }
 
     /**
@@ -696,58 +1209,64 @@ public class LinkedTransferQueue<E> extends AbstractQueue<E>
      * @return the number of elements in this queue
      */
     public int size() {
-        int count = 0;
-        QNode h = traversalHead();
-        for (QNode p = h.next; p != null && p.isData; p = p.next) {
-            Object x = p.get();
-            if (x != null && x != p) {
-                if (++count == Integer.MAX_VALUE) // saturated
-                    break;
-            }
-        }
-        return count;
+        return countOfMode(true);
     }
 
     public int getWaitingConsumerCount() {
-        int count = 0;
-        QNode h = traversalHead();
-        for (QNode p = h.next; p != null && !p.isData; p = p.next) {
-            if (p.get() == null) {
-                if (++count == Integer.MAX_VALUE)
-                    break;
-            }
-        }
-        return count;
+        return countOfMode(false);
     }
 
-    public int remainingCapacity() {
-        return Integer.MAX_VALUE;
+    /**
+     * Removes a single instance of the specified element from this queue,
+     * if it is present.  More formally, removes an element {@code e} such
+     * that {@code o.equals(e)}, if this queue contains one or more such
+     * elements.
+     * Returns {@code true} if this queue contained the specified element
+     * (or equivalently, if this queue changed as a result of the call).
+     *
+     * @param o element to be removed from this queue, if present
+     * @return {@code true} if this queue changed as a result of the call
+     */
+    public boolean remove(Object o) {
+        return findAndRemove(o);
     }
 
-    public boolean remove(Object o) {
-        if (o == null)
-            return false;
-        for (;;) {
-            QNode pred = traversalHead();
-            for (;;) {
-                QNode q = pred.next;
-                if (q == null || !q.isData)
-                    return false;
-                if (q == pred) // restart
-                    break;
-                Object x = q.get();
-                if (x != null && x != q && o.equals(x) &&
-                    q.compareAndSet(x, q)) {
-                    clean(pred, q);
+    /**
+     * Returns {@code true} if this queue contains the specified element.
+     * More formally, returns {@code true} if and only if this queue contains
+     * at least one element {@code e} such that {@code o.equals(e)}.
+     *
+     * @param o object to be checked for containment in this queue
+     * @return {@code true} if this queue contains the specified element
+     */
+    public boolean contains(Object o) {
+        if (o == null) return false;
+        for (Node p = head; p != null; p = succ(p)) {
+            Object item = p.item;
+            if (p.isData) {
+                if (item != null && item != p && o.equals(item))
                     return true;
-                }
-                pred = q;
             }
+            else if (item == null)
+                break;
         }
+        return false;
+    }
+
+    /**
+     * Always returns {@code Integer.MAX_VALUE} because a
+     * {@code LinkedTransferQueue} is not capacity constrained.
+     *
+     * @return {@code Integer.MAX_VALUE} (as specified by
+     *         {@link java.util.concurrent.BlockingQueue#remainingCapacity()
+     *         BlockingQueue.remainingCapacity})
+     */
+    public int remainingCapacity() {
+        return Integer.MAX_VALUE;
     }
 
     /**
-     * Save the state to a stream (that is, serialize it).
+     * Saves the state to a stream (that is, serializes it).
      *
      * @serialData All of the elements (each an {@code E}) in
      * the proper order, followed by a null
@@ -763,16 +1282,17 @@ public class LinkedTransferQueue<E> extends AbstractQueue<E>
     }
 
     /**
-     * Reconstitute the Queue instance from a stream (that is,
-     * deserialize it).
+     * Reconstitutes the Queue instance from a stream (that is,
+     * deserializes it).
+     *
      * @param s the stream
      */
     private void readObject(java.io.ObjectInputStream s)
         throws java.io.IOException, ClassNotFoundException {
         s.defaultReadObject();
-        resetHeadAndTail();
         for (;;) {
-            E item = (E)s.readObject();
+            @SuppressWarnings("unchecked")
+            E item = (E) s.readObject();
             if (item == null)
                 break;
             else
@@ -780,61 +1300,53 @@ public class LinkedTransferQueue<E> extends AbstractQueue<E>
         }
     }
 
+    // Unsafe mechanics
 
-    // Support for resetting head/tail while deserializing
-    private void resetHeadAndTail() {
-        QNode dummy = new QNode(null, false);
-        _unsafe.putObjectVolatile(this, headOffset,
-                                  new PaddedAtomicReference<QNode>(dummy));
-        _unsafe.putObjectVolatile(this, tailOffset,
-                                  new PaddedAtomicReference<QNode>(dummy));
-        _unsafe.putObjectVolatile(this, cleanMeOffset,
-                                  new PaddedAtomicReference<QNode>(null));
+    private static final sun.misc.Unsafe UNSAFE;
+    private static final long headOffset;
+    private static final long tailOffset;
+    private static final long sweepVotesOffset;
+    static {
+        try {
+            UNSAFE = getUnsafe();
+            Class<?> k = LinkedTransferQueue.class;
+            headOffset = UNSAFE.objectFieldOffset
+                (k.getDeclaredField("head"));
+            tailOffset = UNSAFE.objectFieldOffset
+                (k.getDeclaredField("tail"));
+            sweepVotesOffset = UNSAFE.objectFieldOffset
+                (k.getDeclaredField("sweepVotes"));
+        } catch (Exception e) {
+            throw new Error(e);
+        }
     }
 
-    // Temporary Unsafe mechanics for preliminary release
-    private static Unsafe getUnsafe() throws Throwable {
+    /**
+     * Returns a sun.misc.Unsafe.  Suitable for use in a 3rd party package.
+     * Replace with a simple call to Unsafe.getUnsafe when integrating
+     * into a jdk.
+     *
+     * @return a sun.misc.Unsafe
+     */
+    static sun.misc.Unsafe getUnsafe() {
         try {
-            return Unsafe.getUnsafe();
+            return sun.misc.Unsafe.getUnsafe();
         } catch (SecurityException se) {
             try {
                 return java.security.AccessController.doPrivileged
-                    (new java.security.PrivilegedExceptionAction<Unsafe>() {
-                        public Unsafe run() throws Exception {
-                            return getUnsafePrivileged();
+                    (new java.security
+                     .PrivilegedExceptionAction<sun.misc.Unsafe>() {
+                        public sun.misc.Unsafe run() throws Exception {
+                            java.lang.reflect.Field f = sun.misc
+                                .Unsafe.class.getDeclaredField("theUnsafe");
+                            f.setAccessible(true);
+                            return (sun.misc.Unsafe) f.get(null);
                         }});
             } catch (java.security.PrivilegedActionException e) {
-                throw e.getCause();
+                throw new RuntimeException("Could not initialize intrinsics",
+                                           e.getCause());
             }
         }
     }
 
-    private static Unsafe getUnsafePrivileged()
-            throws NoSuchFieldException, IllegalAccessException {
-        Field f = Unsafe.class.getDeclaredField("theUnsafe");
-        f.setAccessible(true);
-        return (Unsafe) f.get(null);
-    }
-
-    private static long fieldOffset(String fieldName)
-            throws NoSuchFieldException {
-        return _unsafe.objectFieldOffset
-            (LinkedTransferQueue.class.getDeclaredField(fieldName));
-    }
-
-    private static final Unsafe _unsafe;
-    private static final long headOffset;
-    private static final long tailOffset;
-    private static final long cleanMeOffset;
-    static {
-        try {
-            _unsafe = getUnsafe();
-            headOffset = fieldOffset("head");
-            tailOffset = fieldOffset("tail");
-            cleanMeOffset = fieldOffset("cleanMe");
-        } catch (Throwable e) {
-            throw new RuntimeException("Could not initialize intrinsics", e);
-        }
-    }
-
 }
diff --git a/src/forkjoin/scala/concurrent/forkjoin/RecursiveAction.java b/src/forkjoin/scala/concurrent/forkjoin/RecursiveAction.java
index 2d36f7eb33..1e7cdd952d 100644
--- a/src/forkjoin/scala/concurrent/forkjoin/RecursiveAction.java
+++ b/src/forkjoin/scala/concurrent/forkjoin/RecursiveAction.java
@@ -1,64 +1,73 @@
 /*
  * Written by Doug Lea with assistance from members of JCP JSR-166
  * Expert Group and released to the public domain, as explained at
- * http://creativecommons.org/licenses/publicdomain
+ * http://creativecommons.org/publicdomain/zero/1.0/
  */
 
 package scala.concurrent.forkjoin;
 
 /**
- * Recursive resultless ForkJoinTasks. This class establishes
- * conventions to parameterize resultless actions as <tt>Void</tt>
- * ForkJoinTasks. Because <tt>null</tt> is the only valid value of
- * <tt>Void</tt>, methods such as join always return <tt>null</tt>
- * upon completion.
+ * A recursive resultless {@link ForkJoinTask}.  This class
+ * establishes conventions to parameterize resultless actions as
+ * {@code Void} {@code ForkJoinTask}s. Because {@code null} is the
+ * only valid value of type {@code Void}, methods such as {@code join}
+ * always return {@code null} upon completion.
  *
- * <p><b>Sample Usages.</b> Here is a sketch of a ForkJoin sort that
- * sorts a given <tt>long[]</tt> array:
+ * <p><b>Sample Usages.</b> Here is a simple but complete ForkJoin
+ * sort that sorts a given {@code long[]} array:
  *
- * <pre>
- * class SortTask extends RecursiveAction {
- *   final long[] array; final int lo; final int hi;
+ *  <pre> {@code
+ * static class SortTask extends RecursiveAction {
+ *   final long[] array; final int lo, hi;
  *   SortTask(long[] array, int lo, int hi) {
  *     this.array = array; this.lo = lo; this.hi = hi;
  *   }
+ *   SortTask(long[] array) { this(array, 0, array.length); }
  *   protected void compute() {
- *     if (hi - lo &lt; THRESHOLD)
- *       sequentiallySort(array, lo, hi);
+ *     if (hi - lo < THRESHOLD)
+ *       sortSequentially(lo, hi);
  *     else {
- *       int mid = (lo + hi) &gt;&gt;&gt; 1;
+ *       int mid = (lo + hi) >>> 1;
  *       invokeAll(new SortTask(array, lo, mid),
  *                 new SortTask(array, mid, hi));
- *       merge(array, lo, hi);
+ *       merge(lo, mid, hi);
  *     }
  *   }
- * }
- * </pre>
+ *   // implementation details follow:
+ *   final static int THRESHOLD = 1000;
+ *   void sortSequentially(int lo, int hi) {
+ *     Arrays.sort(array, lo, hi);
+ *   }
+ *   void merge(int lo, int mid, int hi) {
+ *     long[] buf = Arrays.copyOfRange(array, lo, mid);
+ *     for (int i = 0, j = lo, k = mid; i < buf.length; j++)
+ *       array[j] = (k == hi || buf[i] < array[k]) ?
+ *         buf[i++] : array[k++];
+ *   }
+ * }}</pre>
  *
- * You could then sort anArray by creating <tt>new SortTask(anArray, 0,
- * anArray.length-1) </tt> and invoking it in a ForkJoinPool.
- * As a more concrete simple example, the following task increments
- * each element of an array:
- * <pre>
+ * You could then sort {@code anArray} by creating {@code new
+ * SortTask(anArray)} and invoking it in a ForkJoinPool.  As a more
+ * concrete simple example, the following task increments each element
+ * of an array:
+ *  <pre> {@code
  * class IncrementTask extends RecursiveAction {
- *   final long[] array; final int lo; final int hi;
+ *   final long[] array; final int lo, hi;
  *   IncrementTask(long[] array, int lo, int hi) {
  *     this.array = array; this.lo = lo; this.hi = hi;
  *   }
  *   protected void compute() {
- *     if (hi - lo &lt; THRESHOLD) {
- *       for (int i = lo; i &lt; hi; ++i)
+ *     if (hi - lo < THRESHOLD) {
+ *       for (int i = lo; i < hi; ++i)
  *         array[i]++;
  *     }
  *     else {
- *       int mid = (lo + hi) &gt;&gt;&gt; 1;
+ *       int mid = (lo + hi) >>> 1;
  *       invokeAll(new IncrementTask(array, lo, mid),
  *                 new IncrementTask(array, mid, hi));
  *     }
  *   }
- * }
- * </pre>
- *
+ * }}</pre>
  *
  * <p>The following example illustrates some refinements and idioms
  * that may lead to better performance: RecursiveActions need not be
@@ -66,33 +75,33 @@ package scala.concurrent.forkjoin;
  * divide-and-conquer approach. Here is a class that sums the squares
  * of each element of a double array, by subdividing out only the
  * right-hand-sides of repeated divisions by two, and keeping track of
- * them with a chain of <tt>next</tt> references. It uses a dynamic
- * threshold based on method <tt>surplus</tt>, but counterbalances
- * potential excess partitioning by directly performing leaf actions
- * on unstolen tasks rather than further subdividing.
+ * them with a chain of {@code next} references. It uses a dynamic
+ * threshold based on method {@code getSurplusQueuedTaskCount}, but
+ * counterbalances potential excess partitioning by directly
+ * performing leaf actions on unstolen tasks rather than further
+ * subdividing.
  *
- * <pre>
+ *  <pre> {@code
  * double sumOfSquares(ForkJoinPool pool, double[] array) {
  *   int n = array.length;
- *   int seqSize = 1 + n / (8 * pool.getParallelism());
- *   Applyer a = new Applyer(array, 0, n, seqSize, null);
+ *   Applyer a = new Applyer(array, 0, n, null);
  *   pool.invoke(a);
  *   return a.result;
  * }
  *
  * class Applyer extends RecursiveAction {
  *   final double[] array;
- *   final int lo, hi, seqSize;
+ *   final int lo, hi;
  *   double result;
  *   Applyer next; // keeps track of right-hand-side tasks
- *   Applyer(double[] array, int lo, int hi, int seqSize, Applyer next) {
+ *   Applyer(double[] array, int lo, int hi, Applyer next) {
  *     this.array = array; this.lo = lo; this.hi = hi;
- *     this.seqSize = seqSize; this.next = next;
+ *     this.next = next;
  *   }
  *
- *   double atLeaf(int l, int r) {
+ *   double atLeaf(int l, int h) {
  *     double sum = 0;
- *     for (int i = l; i &lt; h; ++i) // perform leftmost base step
+ *     for (int i = l; i < h; ++i) // perform leftmost base step
  *       sum += array[i] * array[i];
  *     return sum;
  *   }
@@ -101,10 +110,9 @@ package scala.concurrent.forkjoin;
  *     int l = lo;
  *     int h = hi;
  *     Applyer right = null;
- *     while (h - l &gt; 1 &amp;&amp;
- *        ForkJoinWorkerThread.getEstimatedSurplusTaskCount() &lt;= 3) {
- *        int mid = (l + h) &gt;&gt;&gt; 1;
- *        right = new Applyer(array, mid, h, seqSize, right);
+ *     while (h - l > 1 && getSurplusQueuedTaskCount() <= 3) {
+ *        int mid = (l + h) >>> 1;
+ *        right = new Applyer(array, mid, h, right);
  *        right.fork();
  *        h = mid;
  *     }
@@ -113,17 +121,20 @@ package scala.concurrent.forkjoin;
  *        if (right.tryUnfork()) // directly calculate if not stolen
  *          sum += right.atLeaf(right.lo, right.hi);
  *       else {
- *          right.helpJoin();
+ *          right.join();
  *          sum += right.result;
  *        }
  *        right = right.next;
  *      }
  *     result = sum;
  *   }
- * }
- * </pre>
+ * }}</pre>
+ *
+ * @since 1.7
+ * @author Doug Lea
  */
 public abstract class RecursiveAction extends ForkJoinTask<Void> {
+    private static final long serialVersionUID = 5232453952276485070L;
 
     /**
      * The main computation performed by this task.
@@ -131,7 +142,9 @@ public abstract class RecursiveAction extends ForkJoinTask<Void> {
     protected abstract void compute();
 
     /**
-     * Always returns null
+     * Always returns {@code null}.
+     *
+     * @return {@code null} always
      */
     public final Void getRawResult() { return null; }
 
@@ -141,7 +154,7 @@ public abstract class RecursiveAction extends ForkJoinTask<Void> {
     protected final void setRawResult(Void mustBeNull) { }
 
     /**
-     * Implements execution conventions for RecursiveActions
+     * Implements execution conventions for RecursiveActions.
      */
     protected final boolean exec() {
         compute();
diff --git a/src/forkjoin/scala/concurrent/forkjoin/RecursiveTask.java b/src/forkjoin/scala/concurrent/forkjoin/RecursiveTask.java
index a526f75597..d1e1547143 100644
--- a/src/forkjoin/scala/concurrent/forkjoin/RecursiveTask.java
+++ b/src/forkjoin/scala/concurrent/forkjoin/RecursiveTask.java
@@ -1,29 +1,29 @@
 /*
  * Written by Doug Lea with assistance from members of JCP JSR-166
  * Expert Group and released to the public domain, as explained at
- * http://creativecommons.org/licenses/publicdomain
+ * http://creativecommons.org/publicdomain/zero/1.0/
  */
 
 package scala.concurrent.forkjoin;
 
 /**
- * Recursive result-bearing ForkJoinTasks.
- * <p> For a classic example, here is a task computing Fibonacci numbers:
+ * A recursive result-bearing {@link ForkJoinTask}.
  *
- * <pre>
- * class Fibonacci extends RecursiveTask&lt;Integer&gt; {
+ * <p>For a classic example, here is a task computing Fibonacci numbers:
+ *
+ *  <pre> {@code
+ * class Fibonacci extends RecursiveTask<Integer> {
  *   final int n;
- *   Fibonnaci(int n) { this.n = n; }
+ *   Fibonacci(int n) { this.n = n; }
  *   Integer compute() {
- *     if (n &lt;= 1)
+ *     if (n <= 1)
  *        return n;
  *     Fibonacci f1 = new Fibonacci(n - 1);
  *     f1.fork();
  *     Fibonacci f2 = new Fibonacci(n - 2);
  *     return f2.compute() + f1.join();
  *   }
- * }
- * </pre>
+ * }}</pre>
  *
  * However, besides being a dumb way to compute Fibonacci functions
  * (there is a simple fast linear algorithm that you'd use in
@@ -33,17 +33,14 @@ package scala.concurrent.forkjoin;
  * minimum granularity size (for example 10 here) for which you always
  * sequentially solve rather than subdividing.
  *
+ * @since 1.7
+ * @author Doug Lea
  */
 public abstract class RecursiveTask<V> extends ForkJoinTask<V> {
+    private static final long serialVersionUID = 5232453952276485270L;
 
     /**
-     * Empty constructor for use by subclasses.
-     */
-    protected RecursiveTask() {
-    }
-
-    /**
-     * The result returned by compute method.
+     * The result of the computation.
      */
     V result;
 
@@ -61,7 +58,7 @@ public abstract class RecursiveTask<V> extends ForkJoinTask<V> {
     }
 
     /**
-     * Implements execution conventions for RecursiveTask
+     * Implements execution conventions for RecursiveTask.
      */
     protected final boolean exec() {
         result = compute();
diff --git a/src/forkjoin/scala/concurrent/forkjoin/ThreadLocalRandom.java b/src/forkjoin/scala/concurrent/forkjoin/ThreadLocalRandom.java
index 34e2e37f37..19237c9092 100644
--- a/src/forkjoin/scala/concurrent/forkjoin/ThreadLocalRandom.java
+++ b/src/forkjoin/scala/concurrent/forkjoin/ThreadLocalRandom.java
@@ -1,49 +1,53 @@
 /*
  * Written by Doug Lea with assistance from members of JCP JSR-166
  * Expert Group and released to the public domain, as explained at
- * http://creativecommons.org/licenses/publicdomain
+ * http://creativecommons.org/publicdomain/zero/1.0/
  */
 
 package scala.concurrent.forkjoin;
-import java.util.*;
+
+import java.util.Random;
 
 /**
- * A random number generator with the same properties as class {@link
- * Random} but isolated to the current Thread.  Like the global
- * generator used by the {@link java.lang.Math} class, a
- * ThreadLocalRandom is initialized with an internally generated seed
- * that may not otherwise be modified. When applicable, use of
- * ThreadLocalRandom rather than shared Random objects in concurrent
- * programs will typically encounter much less overhead and
- * contention.  ThreadLocalRandoms are particularly appropriate when
- * multiple tasks (for example, each a {@link ForkJoinTask}), use
- * random numbers in parallel in thread pools.
+ * A random number generator isolated to the current thread.  Like the
+ * global {@link java.util.Random} generator used by the {@link
+ * java.lang.Math} class, a {@code ThreadLocalRandom} is initialized
+ * with an internally generated seed that may not otherwise be
+ * modified. When applicable, use of {@code ThreadLocalRandom} rather
+ * than shared {@code Random} objects in concurrent programs will
+ * typically encounter much less overhead and contention.  Use of
+ * {@code ThreadLocalRandom} is particularly appropriate when multiple
+ * tasks (for example, each a {@link ForkJoinTask}) use random numbers
+ * in parallel in thread pools.
  *
  * <p>Usages of this class should typically be of the form:
- * <code>ThreadLocalRandom.current().nextX(...)</code> (where
- * <code>X</code> is <code>Int</code>, <code>Long</code>, etc).
+ * {@code ThreadLocalRandom.current().nextX(...)} (where
+ * {@code X} is {@code Int}, {@code Long}, etc).
  * When all usages are of this form, it is never possible to
- * accidently share ThreadLocalRandoms across multiple threads.
+ * accidentally share a {@code ThreadLocalRandom} across multiple threads.
  *
  * <p>This class also provides additional commonly used bounded random
  * generation methods.
+ *
+ * @since 1.7
+ * @author Doug Lea
  */
 public class ThreadLocalRandom extends Random {
     // same constants as Random, but must be redeclared because private
-    private final static long multiplier = 0x5DEECE66DL;
-    private final static long addend = 0xBL;
-    private final static long mask = (1L << 48) - 1;
+    private static final long multiplier = 0x5DEECE66DL;
+    private static final long addend = 0xBL;
+    private static final long mask = (1L << 48) - 1;
 
     /**
-     * The random seed. We can't use super.seed
+     * The random seed. We can't use super.seed.
      */
     private long rnd;
 
     /**
-     * Initialization flag to permit the first and only allowed call
-     * to setSeed (inside Random constructor) to succeed.  We can't
-     * allow others since it would cause setting seed in one part of a
-     * program to unintentionally impact other usages by the thread.
+     * Initialization flag to permit calls to setSeed to succeed only
+     * while executing the Random constructor.  We can't allow others
+     * since it would cause setting seed in one part of a program to
+     * unintentionally impact other usages by the thread.
      */
     boolean initialized;
 
@@ -65,40 +69,42 @@ public class ThreadLocalRandom extends Random {
 
     /**
      * Constructor called only by localRandom.initialValue.
-     * We rely on the fact that the superclass no-arg constructor
-     * invokes setSeed exactly once to initialize.
      */
     ThreadLocalRandom() {
         super();
+        initialized = true;
     }
 
     /**
-     * Returns the current Thread's ThreadLocalRandom
-     * @return the current Thread's ThreadLocalRandom
+     * Returns the current thread's {@code ThreadLocalRandom}.
+     *
+     * @return the current thread's {@code ThreadLocalRandom}
      */
     public static ThreadLocalRandom current() {
         return localRandom.get();
     }
 
     /**
-     * Throws UnsupportedOperationException. Setting seeds in this
-     * generator is unsupported.
+     * Throws {@code UnsupportedOperationException}.  Setting seeds in
+     * this generator is not supported.
+     *
      * @throws UnsupportedOperationException always
      */
     public void setSeed(long seed) {
         if (initialized)
             throw new UnsupportedOperationException();
-        initialized = true;
         rnd = (seed ^ multiplier) & mask;
     }
 
     protected int next(int bits) {
-        return (int)((rnd = (rnd * multiplier + addend) & mask) >>> (48-bits));
+        rnd = (rnd * multiplier + addend) & mask;
+        return (int) (rnd >>> (48-bits));
     }
 
     /**
      * Returns a pseudorandom, uniformly distributed value between the
      * given least value (inclusive) and bound (exclusive).
+     *
      * @param least the least value returned
      * @param bound the upper bound (exclusive)
      * @throws IllegalArgumentException if least greater than or equal
@@ -113,7 +119,8 @@ public class ThreadLocalRandom extends Random {
 
     /**
      * Returns a pseudorandom, uniformly distributed value
-     * between 0 (inclusive) and the specified value (exclusive)
+     * between 0 (inclusive) and the specified value (exclusive).
+     *
      * @param n the bound on the random number to be returned.  Must be
      *        positive.
      * @return the next value
@@ -131,17 +138,18 @@ public class ThreadLocalRandom extends Random {
         while (n >= Integer.MAX_VALUE) {
             int bits = next(2);
             long half = n >>> 1;
-            long nextn = ((bits & 2) == 0)? half : n - half;
+            long nextn = ((bits & 2) == 0) ? half : n - half;
             if ((bits & 1) == 0)
                 offset += n - nextn;
             n = nextn;
         }
-        return offset + nextInt((int)n);
+        return offset + nextInt((int) n);
     }
 
     /**
      * Returns a pseudorandom, uniformly distributed value between the
      * given least value (inclusive) and bound (exclusive).
+     *
      * @param least the least value returned
      * @param bound the upper bound (exclusive)
      * @return the next value
@@ -156,7 +164,8 @@ public class ThreadLocalRandom extends Random {
 
     /**
      * Returns a pseudorandom, uniformly distributed {@code double} value
-     * between 0 (inclusive) and the specified value (exclusive)
+     * between 0 (inclusive) and the specified value (exclusive).
+     *
      * @param n the bound on the random number to be returned.  Must be
      *        positive.
      * @return the next value
@@ -171,6 +180,7 @@ public class ThreadLocalRandom extends Random {
     /**
      * Returns a pseudorandom, uniformly distributed value between the
      * given least value (inclusive) and bound (exclusive).
+     *
      * @param least the least value returned
      * @param bound the upper bound (exclusive)
      * @return the next value
@@ -183,4 +193,5 @@ public class ThreadLocalRandom extends Random {
         return nextDouble() * (bound - least) + least;
     }
 
+    private static final long serialVersionUID = -5851777807851030925L;
 }
diff --git a/src/forkjoin/scala/concurrent/forkjoin/TransferQueue.java b/src/forkjoin/scala/concurrent/forkjoin/TransferQueue.java
index 9c7b2289c4..7d149c7ae5 100644
--- a/src/forkjoin/scala/concurrent/forkjoin/TransferQueue.java
+++ b/src/forkjoin/scala/concurrent/forkjoin/TransferQueue.java
@@ -1,7 +1,7 @@
 /*
  * Written by Doug Lea with assistance from members of JCP JSR-166
  * Expert Group and released to the public domain, as explained at
- * http://creativecommons.org/licenses/publicdomain
+ * http://creativecommons.org/publicdomain/zero/1.0/
  */
 
 package scala.concurrent.forkjoin;
@@ -11,21 +11,23 @@ import java.util.concurrent.*;
  * A {@link BlockingQueue} in which producers may wait for consumers
  * to receive elements.  A {@code TransferQueue} may be useful for
  * example in message passing applications in which producers
- * sometimes (using method {@code transfer}) await receipt of
- * elements by consumers invoking {@code take} or {@code poll},
- * while at other times enqueue elements (via method {@code put})
- * without waiting for receipt. Non-blocking and time-out versions of
- * {@code tryTransfer} are also available.  A TransferQueue may also
- * be queried via {@code hasWaitingConsumer} whether there are any
- * threads waiting for items, which is a converse analogy to a
- * {@code peek} operation.
+ * sometimes (using method {@link #transfer}) await receipt of
+ * elements by consumers invoking {@code take} or {@code poll}, while
+ * at other times enqueue elements (via method {@code put}) without
+ * waiting for receipt.
+ * {@linkplain #tryTransfer(Object) Non-blocking} and
+ * {@linkplain #tryTransfer(Object,long,TimeUnit) time-out} versions of
+ * {@code tryTransfer} are also available.
+ * A {@code TransferQueue} may also be queried, via {@link
+ * #hasWaitingConsumer}, whether there are any threads waiting for
+ * items, which is a converse analogy to a {@code peek} operation.
  *
- * <p>Like any {@code BlockingQueue}, a {@code TransferQueue} may be
- * capacity bounded. If so, an attempted {@code transfer} operation
- * may initially block waiting for available space, and/or
- * subsequently block waiting for reception by a consumer.  Note that
- * in a queue with zero capacity, such as {@link SynchronousQueue},
- * {@code put} and {@code transfer} are effectively synonymous.
+ * <p>Like other blocking queues, a {@code TransferQueue} may be
+ * capacity bounded.  If so, an attempted transfer operation may
+ * initially block waiting for available space, and/or subsequently
+ * block waiting for reception by a consumer.  Note that in a queue
+ * with zero capacity, such as {@link SynchronousQueue}, {@code put}
+ * and {@code transfer} are effectively synonymous.
  *
  * <p>This interface is a member of the
  * <a href="{@docRoot}/../technotes/guides/collections/index.html">
@@ -37,9 +39,12 @@ import java.util.concurrent.*;
  */
 public interface TransferQueue<E> extends BlockingQueue<E> {
     /**
-     * Transfers the specified element if there exists a consumer
-     * already waiting to receive it, otherwise returning {@code false}
-     * without enqueuing the element.
+     * Transfers the element to a waiting consumer immediately, if possible.
+     *
+     * <p>More precisely, transfers the specified element immediately
+     * if there exists a consumer already waiting to receive it (in
+     * {@link #take} or timed {@link #poll(long,TimeUnit) poll}),
+     * otherwise returning {@code false} without enqueuing the element.
      *
      * @param e the element to transfer
      * @return {@code true} if the element was transferred, else
@@ -53,13 +58,16 @@ public interface TransferQueue<E> extends BlockingQueue<E> {
     boolean tryTransfer(E e);
 
     /**
-     * Inserts the specified element into this queue, waiting if
-     * necessary for space to become available and the element to be
-     * dequeued by a consumer invoking {@code take} or {@code poll}.
+     * Transfers the element to a consumer, waiting if necessary to do so.
+     *
+     * <p>More precisely, transfers the specified element immediately
+     * if there exists a consumer already waiting to receive it (in
+     * {@link #take} or timed {@link #poll(long,TimeUnit) poll}),
+     * else waits until the element is received by a consumer.
      *
      * @param e the element to transfer
      * @throws InterruptedException if interrupted while waiting,
-     *         in which case the element is not enqueued.
+     *         in which case the element is not left enqueued
      * @throws ClassCastException if the class of the specified element
      *         prevents it from being added to this queue
      * @throws NullPointerException if the specified element is null
@@ -69,10 +77,15 @@ public interface TransferQueue<E> extends BlockingQueue<E> {
     void transfer(E e) throws InterruptedException;
 
     /**
-     * Inserts the specified element into this queue, waiting up to
-     * the specified wait time if necessary for space to become
-     * available and the element to be dequeued by a consumer invoking
-     * {@code take} or {@code poll}.
+     * Transfers the element to a consumer if it is possible to do so
+     * before the timeout elapses.
+     *
+     * <p>More precisely, transfers the specified element immediately
+     * if there exists a consumer already waiting to receive it (in
+     * {@link #take} or timed {@link #poll(long,TimeUnit) poll}),
+     * else waits until the element is received by a consumer,
+     * returning {@code false} if the specified wait time elapses
+     * before the element can be transferred.
      *
      * @param e the element to transfer
      * @param timeout how long to wait before giving up, in units of
@@ -81,9 +94,9 @@ public interface TransferQueue<E> extends BlockingQueue<E> {
      *        {@code timeout} parameter
      * @return {@code true} if successful, or {@code false} if
      *         the specified waiting time elapses before completion,
-     *         in which case the element is not enqueued.
+     *         in which case the element is not left enqueued
      * @throws InterruptedException if interrupted while waiting,
-     *         in which case the element is not enqueued.
+     *         in which case the element is not left enqueued
      * @throws ClassCastException if the class of the specified element
      *         prevents it from being added to this queue
      * @throws NullPointerException if the specified element is null
@@ -95,7 +108,8 @@ public interface TransferQueue<E> extends BlockingQueue<E> {
 
     /**
      * Returns {@code true} if there is at least one consumer waiting
-     * to dequeue an element via {@code take} or {@code poll}.
+     * to receive an element via {@link #take} or
+     * timed {@link #poll(long,TimeUnit) poll}.
      * The return value represents a momentary state of affairs.
      *
      * @return {@code true} if there is at least one waiting consumer
@@ -104,15 +118,16 @@ public interface TransferQueue<E> extends BlockingQueue<E> {
 
     /**
      * Returns an estimate of the number of consumers waiting to
-     * dequeue elements via {@code take} or {@code poll}. The return
-     * value is an approximation of a momentary state of affairs, that
-     * may be inaccurate if consumers have completed or given up
-     * waiting. The value may be useful for monitoring and heuristics,
-     * but not for synchronization control. Implementations of this
+     * receive elements via {@link #take} or timed
+     * {@link #poll(long,TimeUnit) poll}.  The return value is an
+     * approximation of a momentary state of affairs, which may be
+     * inaccurate if consumers have completed or given up waiting.
+     * The value may be useful for monitoring and heuristics, but
+     * not for synchronization control.  Implementations of this
      * method are likely to be noticeably slower than those for
      * {@link #hasWaitingConsumer}.
      *
-     * @return the number of consumers waiting to dequeue elements
+     * @return the number of consumers waiting to receive elements
      */
     int getWaitingConsumerCount();
 }
diff --git a/src/forkjoin/scala/concurrent/forkjoin/package-info.java b/src/forkjoin/scala/concurrent/forkjoin/package-info.java
index b8fa0fad02..3561b9b44a 100644
--- a/src/forkjoin/scala/concurrent/forkjoin/package-info.java
+++ b/src/forkjoin/scala/concurrent/forkjoin/package-info.java
@@ -1,7 +1,7 @@
 /*
  * Written by Doug Lea with assistance from members of JCP JSR-166
  * Expert Group and released to the public domain, as explained at
- * http://creativecommons.org/licenses/publicdomain
+ * http://creativecommons.org/publicdomain/zero/1.0/
  */
 
 
@@ -15,7 +15,7 @@
  * Threads. However, when applicable, they typically provide
  * significantly greater performance on multiprocessor platforms.
  *
- * <p> Candidates for fork/join processing mainly include those that
+ * <p>Candidates for fork/join processing mainly include those that
  * can be expressed using parallel divide-and-conquer techniques: To
  * solve a problem, break it in two (or more) parts, and then solve
  * those parts in parallel, continuing on in this way until the
@@ -24,6 +24,5 @@
  * available to other threads (normally one per CPU), that help
  * complete the tasks.  In general, the most efficient ForkJoinTasks
  * are those that directly implement this algorithmic design pattern.
- *
  */
 package scala.concurrent.forkjoin;
author	Heather Miller <heather.miller@epfl.ch>	2012-02-25 17:36:08 +0100
committer	Heather Miller <heather.miller@epfl.ch>	2012-02-25 17:36:08 +0100
commit	76e9da2ca4c31daec2b04848c3c2dbad6ecd426e (patch)
tree	d8f27ad3952d43c2049805cb8805ea3479431dc8
parent	0c2f493804db6b594d7ec68e49e76c75a316230b (diff)
download	scala-76e9da2ca4c31daec2b04848c3c2dbad6ecd426e.tar.gz scala-76e9da2ca4c31daec2b04848c3c2dbad6ecd426e.tar.bz2 scala-76e9da2ca4c31daec2b04848c3c2dbad6ecd426e.zip