1 files changed, 171 insertions, 236 deletions
diff --git a/java/core/src/main/java/com/google/protobuf/Utf8.java b/java/core/src/main/java/com/google/protobuf/Utf8.java
index 6968abb3..4512bf9b 100644
--- a/java/core/src/main/java/com/google/protobuf/Utf8.java
+++ b/java/core/src/main/java/com/google/protobuf/Utf8.java
@@ -42,39 +42,33 @@ import static java.lang.Character.isSurrogatePair;
 import static java.lang.Character.toCodePoint;
 
 import java.nio.ByteBuffer;
-import java.util.Arrays;
 
 /**
- * A set of low-level, high-performance static utility methods related
- * to the UTF-8 character encoding.  This class has no dependencies
- * outside of the core JDK libraries.
+ * A set of low-level, high-performance static utility methods related to the UTF-8 character
+ * encoding. This class has no dependencies outside of the core JDK libraries.
  *
- * <p>There are several variants of UTF-8.  The one implemented by
- * this class is the restricted definition of UTF-8 introduced in
- * Unicode 3.1, which mandates the rejection of "overlong" byte
- * sequences as well as rejection of 3-byte surrogate codepoint byte
- * sequences.  Note that the UTF-8 decoder included in Oracle's JDK
- * has been modified to also reject "overlong" byte sequences, but (as
- * of 2011) still accepts 3-byte surrogate codepoint byte sequences.
+ * <p>There are several variants of UTF-8. The one implemented by this class is the restricted
+ * definition of UTF-8 introduced in Unicode 3.1, which mandates the rejection of "overlong" byte
+ * sequences as well as rejection of 3-byte surrogate codepoint byte sequences. Note that the UTF-8
+ * decoder included in Oracle's JDK has been modified to also reject "overlong" byte sequences, but
+ * (as of 2011) still accepts 3-byte surrogate codepoint byte sequences.
  *
- * <p>The byte sequences considered valid by this class are exactly
- * those that can be roundtrip converted to Strings and back to bytes
- * using the UTF-8 charset, without loss: <pre> {@code
+ * <p>The byte sequences considered valid by this class are exactly those that can be roundtrip
+ * converted to Strings and back to bytes using the UTF-8 charset, without loss:
+ *
+ * <pre>{@code
  * Arrays.equals(bytes, new String(bytes, Internal.UTF_8).getBytes(Internal.UTF_8))
  * }</pre>
  *
- * <p>See the Unicode Standard,</br>
- * Table 3-6. <em>UTF-8 Bit Distribution</em>,</br>
- * Table 3-7. <em>Well Formed UTF-8 Byte Sequences</em>.
+ * <p>See the Unicode Standard,<br> Table 3-6. <em>UTF-8 Bit Distribution</em>,<br> Table 3-7.
+ * <em>Well Formed UTF-8 Byte Sequences</em>.
  *
- * <p>This class supports decoding of partial byte sequences, so that the
- * bytes in a complete UTF-8 byte sequences can be stored in multiple
- * segments.  Methods typically return {@link #MALFORMED} if the partial
- * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is
- * well-formed in the absence of additional input, or if the byte sequence
- * apparently terminated in the middle of a character, an opaque integer
- * "state" value containing enough information to decode the character when
- * passed to a subsequent invocation of a partial decoding method.
+ * <p>This class supports decoding of partial byte sequences, so that the bytes in a complete UTF-8
+ * byte sequence can be stored in multiple segments. Methods typically return {@link #MALFORMED} if
+ * the partial byte sequence is definitely not well-formed, {@link #COMPLETE} if it is well-formed
+ * in the absence of additional input, or if the byte sequence apparently terminated in the middle
+ * of a character, an opaque integer "state" value containing enough information to decode the
+ * character when passed to a subsequent invocation of a partial decoding method.
  *
  * @author martinrb@google.com (Martin Buchholz)
  */
@@ -87,7 +81,9 @@ final class Utf8 {
    * delegate for which all methods are delegated directly to.
    */
   private static final Processor processor =
-      UnsafeProcessor.isAvailable() ? new UnsafeProcessor() : new SafeProcessor();
+      (UnsafeProcessor.isAvailable() && !Android.isOnAndroidDevice())
+          ? new UnsafeProcessor()
+          : new SafeProcessor();
 
   /**
    * A mask used when performing unsafe reads to determine if a long value contains any non-ASCII
@@ -97,31 +93,28 @@ final class Utf8 {
 
   /**
    * Maximum number of bytes per Java UTF-16 char in UTF-8.
+   *
    * @see java.nio.charset.CharsetEncoder#maxBytesPerChar()
    */
   static final int MAX_BYTES_PER_CHAR = 3;
 
   /**
-   * State value indicating that the byte sequence is well-formed and
-   * complete (no further bytes are needed to complete a character).
+   * State value indicating that the byte sequence is well-formed and complete (no further bytes are
+   * needed to complete a character).
    */
   public static final int COMPLETE = 0;
 
-  /**
-   * State value indicating that the byte sequence is definitely not
-   * well-formed.
-   */
+  /** State value indicating that the byte sequence is definitely not well-formed. */
   public static final int MALFORMED = -1;
 
   /**
    * Used by {@code Unsafe} UTF-8 string validation logic to determine the minimum string length
    * above which to employ an optimized algorithm for counting ASCII characters. The reason for this
    * threshold is that for small strings, the optimization may not be beneficial or may even
-   * negatively impact performance since it requires additional logic to avoid unaligned reads
-   * (when calling {@code Unsafe.getLong}). This threshold guarantees that even if the initial
-   * offset is unaligned, we're guaranteed to make at least one call to {@code Unsafe.getLong()}
-   * which provides a performance improvement that entirely subsumes the cost of the additional
-   * logic.
+   * negatively impact performance since it requires additional logic to avoid unaligned reads (when
+   * calling {@code Unsafe.getLong}). This threshold guarantees that even if the initial offset is
+   * unaligned, we're guaranteed to make at least one call to {@code Unsafe.getLong()} which
+   * provides a performance improvement that entirely subsumes the cost of the additional logic.
    */
   private static final int UNSAFE_COUNT_ASCII_THRESHOLD = 16;
 
@@ -145,76 +138,69 @@ final class Utf8 {
   // are valid trailing bytes.
 
   /**
-   * Returns {@code true} if the given byte array is a well-formed
-   * UTF-8 byte sequence.
+   * Returns {@code true} if the given byte array is a well-formed UTF-8 byte sequence.
    *
-   * <p>This is a convenience method, equivalent to a call to {@code
-   * isValidUtf8(bytes, 0, bytes.length)}.
+   * <p>This is a convenience method, equivalent to a call to {@code isValidUtf8(bytes, 0,
+   * bytes.length)}.
    */
   public static boolean isValidUtf8(byte[] bytes) {
     return processor.isValidUtf8(bytes, 0, bytes.length);
   }
 
   /**
-   * Returns {@code true} if the given byte array slice is a
-   * well-formed UTF-8 byte sequence.  The range of bytes to be
-   * checked extends from index {@code index}, inclusive, to {@code
-   * limit}, exclusive.
+   * Returns {@code true} if the given byte array slice is a well-formed UTF-8 byte sequence. The
+   * range of bytes to be checked extends from index {@code index}, inclusive, to {@code limit},
+   * exclusive.
    *
-   * <p>This is a convenience method, equivalent to {@code
-   * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}.
+   * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(Utf8.COMPLETE, bytes,
+   * index, limit) == Utf8.COMPLETE}.
    */
   public static boolean isValidUtf8(byte[] bytes, int index, int limit) {
     return processor.isValidUtf8(bytes, index, limit);
   }
 
   /**
-   * Tells whether the given byte array slice is a well-formed,
-   * malformed, or incomplete UTF-8 byte sequence.  The range of bytes
-   * to be checked extends from index {@code index}, inclusive, to
+   * Tells whether the given byte array slice is a well-formed, malformed, or incomplete UTF-8 byte
+   * sequence. The range of bytes to be checked extends from index {@code index}, inclusive, to
    * {@code limit}, exclusive.
    *
-   * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding
-   * operation) or the value returned from a call to a partial decoding method
-   * for the previous bytes
-   *
-   * @return {@link #MALFORMED} if the partial byte sequence is
-   * definitely not well-formed, {@link #COMPLETE} if it is well-formed
-   * (no additional input needed), or if the byte sequence is
-   * "incomplete", i.e. apparently terminated in the middle of a character,
-   * an opaque integer "state" value containing enough information to
-   * decode the character when passed to a subsequent invocation of a
-   * partial decoding method.
+   * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding operation) or the
+   *     value returned from a call to a partial decoding method for the previous bytes
+   * @return {@link #MALFORMED} if the partial byte sequence is definitely not well-formed, {@link
+   *     #COMPLETE} if it is well-formed (no additional input needed), or if the byte sequence is
+   *     "incomplete", i.e. apparently terminated in the middle of a character, an opaque integer
+   *     "state" value containing enough information to decode the character when passed to a
+   *     subsequent invocation of a partial decoding method.
    */
   public static int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
     return processor.partialIsValidUtf8(state, bytes, index, limit);
   }
 
   private static int incompleteStateFor(int byte1) {
-    return (byte1 > (byte) 0xF4) ?
-        MALFORMED : byte1;
+    return (byte1 > (byte) 0xF4) ? MALFORMED : byte1;
   }
 
   private static int incompleteStateFor(int byte1, int byte2) {
-    return (byte1 > (byte) 0xF4 ||
-            byte2 > (byte) 0xBF) ?
-        MALFORMED : byte1 ^ (byte2 << 8);
+    return (byte1 > (byte) 0xF4 || byte2 > (byte) 0xBF) ? MALFORMED : byte1 ^ (byte2 << 8);
   }
 
   private static int incompleteStateFor(int byte1, int byte2, int byte3) {
-    return (byte1 > (byte) 0xF4 ||
-            byte2 > (byte) 0xBF ||
-            byte3 > (byte) 0xBF) ?
-        MALFORMED : byte1 ^ (byte2 << 8) ^ (byte3 << 16);
+    return (byte1 > (byte) 0xF4 || byte2 > (byte) 0xBF || byte3 > (byte) 0xBF)
+        ? MALFORMED
+        : byte1 ^ (byte2 << 8) ^ (byte3 << 16);
   }
 
   private static int incompleteStateFor(byte[] bytes, int index, int limit) {
     int byte1 = bytes[index - 1];
     switch (limit - index) {
-      case 0: return incompleteStateFor(byte1);
-      case 1: return incompleteStateFor(byte1, bytes[index]);
-      case 2: return incompleteStateFor(byte1, bytes[index], bytes[index + 1]);
-      default: throw new AssertionError();
+      case 0:
+        return incompleteStateFor(byte1);
+      case 1:
+        return incompleteStateFor(byte1, bytes[index]);
+      case 2:
+        return incompleteStateFor(byte1, bytes[index], bytes[index + 1]);
+      default:
+        throw new AssertionError();
     }
   }
 
@@ -235,7 +221,7 @@ final class Utf8 {
   // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw
   // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can
   // fallback to more lenient behavior.
-  
+
   static class UnpairedSurrogateException extends IllegalArgumentException {
     UnpairedSurrogateException(int index, int length) {
       super("Unpaired surrogate at index " + index + " of " + length);
@@ -243,9 +229,9 @@ final class Utf8 {
   }
 
   /**
-   * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
-   * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
-   * both time and space.
+   * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, this
+   * method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in both
+   * time and space.
    *
    * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
    *     surrogates)
@@ -265,7 +251,7 @@ final class Utf8 {
     for (; i < utf16Length; i++) {
       char c = sequence.charAt(i);
       if (c < 0x800) {
-        utf8Length += ((0x7f - c) >>> 31);  // branch free!
+        utf8Length += ((0x7f - c) >>> 31); // branch free!
       } else {
         utf8Length += encodedLengthGeneral(sequence, i);
         break;
@@ -274,8 +260,8 @@ final class Utf8 {
 
     if (utf8Length < utf16Length) {
       // Necessary and sufficient condition for overflow because of maximum 3x expansion
-      throw new IllegalArgumentException("UTF-8 length does not fit in int: "
-              + (utf8Length + (1L << 32)));
+      throw new IllegalArgumentException(
+          "UTF-8 length does not fit in int: " + (utf8Length + (1L << 32)));
     }
     return utf8Length;
   }
@@ -369,15 +355,15 @@ final class Utf8 {
   }
 
   /**
-   * Counts (approximately) the number of consecutive ASCII characters in the given buffer.
-   * The byte order of the {@link ByteBuffer} does not matter, so performance can be improved if
-   * native byte order is used (i.e. no byte-swapping in {@link ByteBuffer#getLong(int)}).
+   * Counts (approximately) the number of consecutive ASCII characters in the given buffer. The byte
+   * order of the {@link ByteBuffer} does not matter, so performance can be improved if native byte
+   * order is used (i.e. no byte-swapping in {@link ByteBuffer#getLong(int)}).
    *
    * @param buffer the buffer to be scanned for ASCII chars
    * @param index the starting index of the scan
    * @param limit the limit within buffer for the scan
-   * @return the number of ASCII characters found. The stopping position will be at or
-   * before the first non-ASCII byte.
+   * @return the number of ASCII characters found. The stopping position will be at or before the
+   *     first non-ASCII byte.
    */
   private static int estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit) {
     int i = index;
@@ -389,52 +375,43 @@ final class Utf8 {
     return i - index;
   }
 
-  /**
-   * A processor of UTF-8 strings, providing methods for checking validity and encoding.
-   */
+  /** A processor of UTF-8 strings, providing methods for checking validity and encoding. */
   // TODO(nathanmittler): Add support for Memory/MemoryBlock on Android.
   abstract static class Processor {
     /**
-     * Returns {@code true} if the given byte array slice is a
-     * well-formed UTF-8 byte sequence.  The range of bytes to be
-     * checked extends from index {@code index}, inclusive, to {@code
-     * limit}, exclusive.
+     * Returns {@code true} if the given byte array slice is a well-formed UTF-8 byte sequence. The
+     * range of bytes to be checked extends from index {@code index}, inclusive, to {@code limit},
+     * exclusive.
      *
-     * <p>This is a convenience method, equivalent to {@code
-     * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}.
+     * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(Utf8.COMPLETE,
+     * bytes, index, limit) == Utf8.COMPLETE}.
      */
     final boolean isValidUtf8(byte[] bytes, int index, int limit) {
       return partialIsValidUtf8(COMPLETE, bytes, index, limit) == COMPLETE;
     }
 
     /**
-     * Tells whether the given byte array slice is a well-formed,
-     * malformed, or incomplete UTF-8 byte sequence.  The range of bytes
-     * to be checked extends from index {@code index}, inclusive, to
-     * {@code limit}, exclusive.
-     *
-     * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding
-     * operation) or the value returned from a call to a partial decoding method
-     * for the previous bytes
+     * Tells whether the given byte array slice is a well-formed, malformed, or incomplete UTF-8
+     * byte sequence. The range of bytes to be checked extends from index {@code index}, inclusive,
+     * to {@code limit}, exclusive.
      *
-     * @return {@link #MALFORMED} if the partial byte sequence is
-     * definitely not well-formed, {@link #COMPLETE} if it is well-formed
-     * (no additional input needed), or if the byte sequence is
-     * "incomplete", i.e. apparently terminated in the middle of a character,
-     * an opaque integer "state" value containing enough information to
-     * decode the character when passed to a subsequent invocation of a
-     * partial decoding method.
+     * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding operation) or the
+     *     value returned from a call to a partial decoding method for the previous bytes
+     * @return {@link #MALFORMED} if the partial byte sequence is definitely not well-formed, {@link
+     *     #COMPLETE} if it is well-formed (no additional input needed), or if the byte sequence is
+     *     "incomplete", i.e. apparently terminated in the middle of a character, an opaque integer
+     *     "state" value containing enough information to decode the character when passed to a
+     *     subsequent invocation of a partial decoding method.
      */
     abstract int partialIsValidUtf8(int state, byte[] bytes, int index, int limit);
 
     /**
-     * Returns {@code true} if the given portion of the {@link ByteBuffer} is a
-     * well-formed UTF-8 byte sequence.  The range of bytes to be
-     * checked extends from index {@code index}, inclusive, to {@code
-     * limit}, exclusive.
+     * Returns {@code true} if the given portion of the {@link ByteBuffer} is a well-formed UTF-8
+     * byte sequence. The range of bytes to be checked extends from index {@code index}, inclusive,
+     * to {@code limit}, exclusive.
      *
-     * <p>This is a convenience method, equivalent to {@code
-     * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}.
+     * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(Utf8.COMPLETE,
+     * buffer, index, limit) == Utf8.COMPLETE}.
      */
     final boolean isValidUtf8(ByteBuffer buffer, int index, int limit) {
       return partialIsValidUtf8(COMPLETE, buffer, index, limit) == COMPLETE;
@@ -451,22 +428,20 @@ final class Utf8 {
       if (buffer.hasArray()) {
         final int offset = buffer.arrayOffset();
         return partialIsValidUtf8(state, buffer.array(), offset + index, offset + limit);
-      } else if (buffer.isDirect()){
+      } else if (buffer.isDirect()) {
         return partialIsValidUtf8Direct(state, buffer, index, limit);
       }
       return partialIsValidUtf8Default(state, buffer, index, limit);
     }
 
-    /**
-     * Performs validation for direct {@link ByteBuffer} instances.
-     */
+    /** Performs validation for direct {@link ByteBuffer} instances. */
     abstract int partialIsValidUtf8Direct(
         final int state, final ByteBuffer buffer, int index, final int limit);
 
     /**
      * Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather
-     * than potentially faster approaches. This first completes validation for the current
-     * character (provided by {@code state}) and then finishes validation for the sequence.
+     * than potentially faster approaches. This first completes validation for the current character
+     * (provided by {@code state}) and then finishes validation for the sequence.
      */
     final int partialIsValidUtf8Default(
         final int state, final ByteBuffer buffer, int index, final int limit) {
@@ -565,7 +540,7 @@ final class Utf8 {
     private static int partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit) {
       index += estimateConsecutiveAscii(buffer, index, limit);
 
-      for (;;) {
+      for (; ; ) {
         // Optimize for interior runs of ASCII bytes.
         // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
         // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
@@ -657,15 +632,13 @@ final class Utf8 {
       return decodeUtf8Default(buffer, index, size);
     }
 
-    /**
-     * Decodes direct {@link ByteBuffer} instances into {@link String}.
-     */
+    /** Decodes direct {@link ByteBuffer} instances into {@link String}. */
     abstract String decodeUtf8Direct(ByteBuffer buffer, int index, int size)
         throws InvalidProtocolBufferException;
 
     /**
-     * Decodes {@link ByteBuffer} instances using the {@link ByteBuffer} API rather than
-     * potentially faster approaches.
+     * Decodes {@link ByteBuffer} instances using the {@link ByteBuffer} API rather than potentially
+     * faster approaches.
      */
     final String decodeUtf8Default(ByteBuffer buffer, int index, int size)
         throws InvalidProtocolBufferException {
@@ -746,21 +719,22 @@ final class Utf8 {
     /**
      * Encodes an input character sequence ({@code in}) to UTF-8 in the target array ({@code out}).
      * For a string, this method is similar to
+     *
      * <pre>{@code
      * byte[] a = string.getBytes(UTF_8);
      * System.arraycopy(a, 0, bytes, offset, a.length);
      * return offset + a.length;
      * }</pre>
      *
-     * but is more efficient in both time and space. One key difference is that this method
-     * requires paired surrogates, and therefore does not support chunking.
-     * While {@code String.getBytes(UTF_8)} replaces unpaired surrogates with the default
-     * replacement character, this method throws {@link UnpairedSurrogateException}.
+     * but is more efficient in both time and space. One key difference is that this method requires
+     * paired surrogates, and therefore does not support chunking. While {@code
+     * String.getBytes(UTF_8)} replaces unpaired surrogates with the default replacement character,
+     * this method throws {@link UnpairedSurrogateException}.
      *
      * <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to
-     * compute the exact amount needed, or leave room for 
-     * {@code Utf8.MAX_BYTES_PER_CHAR * sequence.length()}, which is the largest possible number
-     * of bytes that any input can be encoded to.
+     * compute the exact amount needed, or leave room for {@code Utf8.MAX_BYTES_PER_CHAR *
+     * in.length()}, which is the largest possible number of bytes that any input can be
+     * encoded to.
      *
      * @param in the input character sequence to be encoded
      * @param out the target array
@@ -777,26 +751,24 @@ final class Utf8 {
     /**
      * Encodes an input character sequence ({@code in}) to UTF-8 in the target buffer ({@code out}).
      * Upon returning from this method, the {@code out} position will point to the position after
-     * the last encoded byte. This method requires paired surrogates, and therefore does not
-     * support chunking.
+     * the last encoded byte. This method requires paired surrogates, and therefore does not support
+     * chunking.
      *
      * <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to
-     * compute the exact amount needed, or leave room for
-     * {@code Utf8.MAX_BYTES_PER_CHAR * in.length()}, which is the largest possible number
-     * of bytes that any input can be encoded to.
+     * compute the exact amount needed, or leave room for {@code Utf8.MAX_BYTES_PER_CHAR *
+     * in.length()}, which is the largest possible number of bytes that any input can be encoded to.
      *
      * @param in the source character sequence to be encoded
      * @param out the target buffer
      * @throws UnpairedSurrogateException if {@code in} contains ill-formed UTF-16 (unpaired
      *     surrogates)
-     * @throws ArrayIndexOutOfBoundsException if {@code in} encoded in UTF-8 is longer than
-     *     {@code out.remaining()}
+     * @throws ArrayIndexOutOfBoundsException if {@code in} encoded in UTF-8 is longer than {@code
+     *     out.remaining()}
      */
     final void encodeUtf8(CharSequence in, ByteBuffer out) {
       if (out.hasArray()) {
         final int offset = out.arrayOffset();
-        int endIndex =
-            Utf8.encode(in, out.array(), offset + out.position(), out.remaining());
+        int endIndex = Utf8.encode(in, out.array(), offset + out.position(), out.remaining());
         out.position(endIndex - offset);
       } else if (out.isDirect()) {
         encodeUtf8Direct(in, out);
@@ -805,9 +777,7 @@ final class Utf8 {
       }
     }
 
-    /**
-     * Encodes the input character sequence to a direct {@link ByteBuffer} instance.
-     */
+    /** Encodes the input character sequence to a direct {@link ByteBuffer} instance. */
     abstract void encodeUtf8Direct(CharSequence in, ByteBuffer out);
 
     /**
@@ -886,9 +856,7 @@ final class Utf8 {
     }
   }
 
-  /**
-   * {@link Processor} implementation that does not use any {@code sun.misc.Unsafe} methods.
-   */
+  /** {@link Processor} implementation that does not use any {@code sun.misc.Unsafe} methods. */
   static final class SafeProcessor extends Processor {
     @Override
     int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
@@ -900,7 +868,7 @@ final class Utf8 {
         //
         // We expect such "straddler characters" to be rare.
 
-        if (index >= limit) {  // No bytes? No progress.
+        if (index >= limit) { // No bytes? No progress.
           return state;
         }
         int byte1 = (byte) state;
@@ -1097,8 +1065,7 @@ final class Utf8 {
           // Minimum code point represented by a surrogate pair is 0x10000, 17 bits,
           // four UTF-8 bytes
           final char low;
-          if (i + 1 == in.length()
-                  || !Character.isSurrogatePair(c, (low = in.charAt(++i)))) {
+          if (i + 1 == in.length() || !Character.isSurrogatePair(c, (low = in.charAt(++i)))) {
             throw new UnpairedSurrogateException((i - 1), utf16Length);
           }
           int codePoint = Character.toCodePoint(c, low);
@@ -1110,8 +1077,7 @@ final class Utf8 {
           // If we are surrogates and we're not a surrogate pair, always throw an
           // UnpairedSurrogateException instead of an ArrayOutOfBoundsException.
           if ((Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE)
-              && (i + 1 == in.length()
-                  || !Character.isSurrogatePair(c, in.charAt(i + 1)))) {
+              && (i + 1 == in.length() || !Character.isSurrogatePair(c, in.charAt(i + 1)))) {
             throw new UnpairedSurrogateException(i, utf16Length);
           }
           throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + j);
@@ -1137,7 +1103,7 @@ final class Utf8 {
     }
 
     private static int partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit) {
-      for (;;) {
+      for (; ; ) {
         int byte1, byte2;
 
         // Optimize for interior runs of ASCII bytes.
@@ -1157,8 +1123,7 @@ final class Utf8 {
 
           // Simultaneously checks for illegal trailing-byte in
           // leading position and overlong 2-byte form.
-          if (byte1 < (byte) 0xC2
-              || bytes[index++] > (byte) 0xBF) {
+          if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) {
             return MALFORMED;
           }
         } else if (byte1 < (byte) 0xF0) {
@@ -1179,7 +1144,7 @@ final class Utf8 {
         } else {
           // four-byte form
 
-          if (index >= limit - 2) {  // incomplete sequence
+          if (index >= limit - 2) { // incomplete sequence
             return incompleteStateFor(bytes, index, limit);
           }
           if ((byte2 = bytes[index++]) > (byte) 0xBF
@@ -1199,13 +1164,9 @@ final class Utf8 {
     }
   }
 
-  /**
-   * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance.
-   */
+  /** {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance. */
   static final class UnsafeProcessor extends Processor {
-    /**
-     * Indicates whether or not all required unsafe operations are supported on this platform.
-     */
+    /** Indicates whether or not all required unsafe operations are supported on this platform. */
     static boolean isAvailable() {
       return hasUnsafeArrayOperations() && hasUnsafeByteBufferOperations();
     }
@@ -1227,7 +1188,7 @@ final class Utf8 {
         //
         // We expect such "straddler characters" to be rare.
 
-        if (offset >= offsetLimit) {  // No bytes? No progress.
+        if (offset >= offsetLimit) { // No bytes? No progress.
           return state;
         }
         int byte1 = (byte) state;
@@ -1474,10 +1435,7 @@ final class Utf8 {
         }
       }
 
-      if (resultPos < resultArr.length) {
-        resultArr = Arrays.copyOf(resultArr, resultPos);
-      }
-      return UnsafeUtil.moveToString(resultArr);
+      return new String(resultArr, 0, resultPos);
     }
 
     @Override
@@ -1553,10 +1511,7 @@ final class Utf8 {
         }
       }
 
-      if (resultPos < resultArr.length) {
-        resultArr = Arrays.copyOf(resultArr, resultPos);
-      }
-      return UnsafeUtil.moveToString(resultArr);
+      return new String(resultArr, 0, resultPos);
     }
 
     @Override
@@ -1690,8 +1645,8 @@ final class Utf8 {
      * @param bytes the array containing the character sequence
      * @param offset the offset position of the index (same as index + arrayBaseOffset)
      * @param maxChars the maximum number of characters to count
-     * @return the number of ASCII characters found. The stopping position will be at or
-     * before the first non-ASCII byte.
+     * @return the number of ASCII characters found. The stopping position will be at or before the
+     *     first non-ASCII byte.
      */
     private static int unsafeEstimateConsecutiveAscii(
         byte[] bytes, long offset, final int maxChars) {
@@ -1733,24 +1688,24 @@ final class Utf8 {
       // To speed things up further, we're reading longs instead of bytes so we use a mask to
       // determine if any byte in the current long is non-ASCII.
       remaining -= unaligned;
-      for (; remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) == 0;
+      for (;
+          remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) == 0;
           address += 8, remaining -= 8) {}
       return maxChars - remaining;
     }
 
     private static int partialIsValidUtf8(final byte[] bytes, long offset, int remaining) {
-      // Skip past ASCII characters as quickly as possible. 
+      // Skip past ASCII characters as quickly as possible.
       final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remaining);
       remaining -= skipped;
       offset += skipped;
 
-      for (;;) {
+      for (; ; ) {
         // Optimize for interior runs of ASCII bytes.
         // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
         // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
         int byte1 = 0;
-        for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(bytes, offset++)) >= 0; --remaining) {
-        }
+        for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(bytes, offset++)) >= 0; --remaining) {}
         if (remaining == 0) {
           return COMPLETE;
         }
@@ -1767,8 +1722,7 @@ final class Utf8 {
 
           // Simultaneously checks for illegal trailing-byte in
           // leading position and overlong 2-byte form.
-          if (byte1 < (byte) 0xC2
-              || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
+          if (byte1 < (byte) 0xC2 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
             return MALFORMED;
           }
         } else if (byte1 < (byte) 0xF0) {
@@ -1820,13 +1774,12 @@ final class Utf8 {
       address += skipped;
       remaining -= skipped;
 
-      for (;;) {
+      for (; ; ) {
         // Optimize for interior runs of ASCII bytes.
         // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
         // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
         int byte1 = 0;
-        for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(address++)) >= 0; --remaining) {
-        }
+        for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(address++)) >= 0; --remaining) {}
         if (remaining == 0) {
           return COMPLETE;
         }
@@ -1891,40 +1844,32 @@ final class Utf8 {
       }
     }
 
-    private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long offset,
-        int remaining) {
+    private static int unsafeIncompleteStateFor(
+        byte[] bytes, int byte1, long offset, int remaining) {
       switch (remaining) {
-        case 0: {
+        case 0:
           return incompleteStateFor(byte1);
-        }
-        case 1: {
+        case 1:
           return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset));
-        }
-        case 2: {
-          return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset),
-              UnsafeUtil.getByte(bytes, offset + 1));
-        }
-        default: {
+        case 2:
+          return incompleteStateFor(
+              byte1, UnsafeUtil.getByte(bytes, offset), UnsafeUtil.getByte(bytes, offset + 1));
+        default:
           throw new AssertionError();
-        }
       }
     }
 
     private static int unsafeIncompleteStateFor(long address, final int byte1, int remaining) {
       switch (remaining) {
-        case 0: {
+        case 0:
           return incompleteStateFor(byte1);
-        }
-        case 1: {
+        case 1:
           return incompleteStateFor(byte1, UnsafeUtil.getByte(address));
-        }
-        case 2: {
-          return incompleteStateFor(byte1, UnsafeUtil.getByte(address),
-              UnsafeUtil.getByte(address + 1));
-        }
-        default: {
+        case 2:
+          return incompleteStateFor(
+              byte1, UnsafeUtil.getByte(address), UnsafeUtil.getByte(address + 1));
+        default:
           throw new AssertionError();
-        }
       }
     }
   }
@@ -1936,23 +1881,17 @@ final class Utf8 {
    */
   private static class DecodeUtil {
 
-    /**
-     * Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'.
-     */
+    /** Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'. */
     private static boolean isOneByte(byte b) {
       return b >= 0;
     }
 
-    /**
-     * Returns whether this is a two-byte codepoint with the form '10XXXXXX'.
-     */
+    /** Returns whether this is a two-byte codepoint with the form '110XXXXX'. */
     private static boolean isTwoBytes(byte b) {
       return b < (byte) 0xE0;
     }
 
-    /**
-     * Returns whether this is a three-byte codepoint with the form '110XXXXX'.
-     */
+    /** Returns whether this is a three-byte codepoint with the form '1110XXXX'. */
     private static boolean isThreeBytes(byte b) {
       return b < (byte) 0xF0;
     }
@@ -1961,13 +1900,11 @@ final class Utf8 {
       resultArr[resultPos] = (char) byte1;
     }
 
-    private static void handleTwoBytes(
-        byte byte1, byte byte2, char[] resultArr, int resultPos)
+    private static void handleTwoBytes(byte byte1, byte byte2, char[] resultArr, int resultPos)
         throws InvalidProtocolBufferException {
       // Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and
       // overlong 2-byte, '11000001'.
-      if (byte1 < (byte) 0xC2
-          || isNotTrailingByte(byte2)) {
+      if (byte1 < (byte) 0xC2 || isNotTrailingByte(byte2)) {
         throw InvalidProtocolBufferException.invalidUtf8();
       }
       resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) | trailingByteValue(byte2));
@@ -1984,13 +1921,14 @@ final class Utf8 {
           || isNotTrailingByte(byte3)) {
         throw InvalidProtocolBufferException.invalidUtf8();
       }
-      resultArr[resultPos] = (char)
-          (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3));
+      resultArr[resultPos] =
+          (char)
+              (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3));
     }
 
     private static void handleFourBytes(
         byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)
-        throws InvalidProtocolBufferException{
+        throws InvalidProtocolBufferException {
       if (isNotTrailingByte(byte2)
           // Check that 1 <= plane <= 16.  Tricky optimized form of:
           //   valid 4-byte leading byte?
@@ -2004,31 +1942,28 @@ final class Utf8 {
           || isNotTrailingByte(byte4)) {
         throw InvalidProtocolBufferException.invalidUtf8();
       }
-      int codepoint = ((byte1 & 0x07) << 18)
-          | (trailingByteValue(byte2) << 12)
-          | (trailingByteValue(byte3) << 6)
-          | trailingByteValue(byte4);
+      int codepoint =
+          ((byte1 & 0x07) << 18)
+              | (trailingByteValue(byte2) << 12)
+              | (trailingByteValue(byte3) << 6)
+              | trailingByteValue(byte4);
       resultArr[resultPos] = DecodeUtil.highSurrogate(codepoint);
       resultArr[resultPos + 1] = DecodeUtil.lowSurrogate(codepoint);
     }
 
-    /**
-     * Returns whether the byte is not a valid continuation of the form '10XXXXXX'.
-     */
+    /** Returns whether the byte is not a valid continuation of the form '10XXXXXX'. */
     private static boolean isNotTrailingByte(byte b) {
       return b > (byte) 0xBF;
     }
 
-    /**
-     * Returns the actual value of the trailing byte (removes the prefix '10') for composition.
-     */
+    /** Returns the actual value of the trailing byte (removes the prefix '10') for composition. */
     private static int trailingByteValue(byte b) {
       return b & 0x3F;
     }
 
     private static char highSurrogate(int codePoint) {
-      return (char) ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10))
-          + (codePoint >>> 10));
+      return (char)
+          ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)) + (codePoint >>> 10));
     }
 
     private static char lowSurrogate(int codePoint) {