3 files changed, 586 insertions, 37 deletions
diff --git a/java/core/src/main/java/com/google/protobuf/CodedInputStream.java b/java/core/src/main/java/com/google/protobuf/CodedInputStream.java
index e08a993b..7a3f0eb9 100644
--- a/java/core/src/main/java/com/google/protobuf/CodedInputStream.java
+++ b/java/core/src/main/java/com/google/protobuf/CodedInputStream.java
@@ -64,6 +64,14 @@ public abstract class CodedInputStream {
   // Integer.MAX_VALUE == 0x7FFFFFF == INT_MAX from limits.h
   private static final int DEFAULT_SIZE_LIMIT = Integer.MAX_VALUE;
 
+  /**
+   * Whether to enable our custom UTF-8 decode codepath which does not use {@link StringCoding}.
+   * Enabled by default, disable by setting
+   * {@code -Dcom.google.protobuf.enableCustomutf8Decode=false} in JVM args.
+   */
+  private static final boolean ENABLE_CUSTOM_UTF8_DECODE
+      = !"false".equals(System.getProperty("com.google.protobuf.enableCustomUtf8Decode"));
+
   /** Visible for subclasses. See setRecursionLimit() */
   int recursionDepth;
 
@@ -825,13 +833,19 @@ public abstract class CodedInputStream {
     public String readStringRequireUtf8() throws IOException {
       final int size = readRawVarint32();
       if (size > 0 && size <= (limit - pos)) {
-        // TODO(martinrb): We could save a pass by validating while decoding.
-        if (!Utf8.isValidUtf8(buffer, pos, pos + size)) {
-          throw InvalidProtocolBufferException.invalidUtf8();
+        if (ENABLE_CUSTOM_UTF8_DECODE) {
+          String result = Utf8.decodeUtf8(buffer, pos, size);
+          pos += size;
+          return result;
+        } else {
+          // TODO(martinrb): We could save a pass by validating while decoding.
+          if (!Utf8.isValidUtf8(buffer, pos, pos + size)) {
+            throw InvalidProtocolBufferException.invalidUtf8();
+          }
+          final int tempPos = pos;
+          pos += size;
+          return new String(buffer, tempPos, size, UTF_8);
         }
-        final int tempPos = pos;
-        pos += size;
-        return new String(buffer, tempPos, size, UTF_8);
       }
 
       if (size == 0) {
@@ -1524,6 +1538,8 @@ public abstract class CodedInputStream {
       final int size = readRawVarint32();
       if (size > 0 && size <= remaining()) {
         // TODO(nathanmittler): Is there a way to avoid this copy?
+        // TODO(anuraaga): It might be possible to share the optimized loop with
+        // readStringRequireUtf8 by implementing Java replacement logic there.
         // The same as readBytes' logic
         byte[] bytes = new byte[size];
         UnsafeUtil.copyMemory(pos, bytes, 0, size);
@@ -1544,19 +1560,26 @@ public abstract class CodedInputStream {
     @Override
     public String readStringRequireUtf8() throws IOException {
       final int size = readRawVarint32();
-      if (size >= 0 && size <= remaining()) {
-        // TODO(nathanmittler): Is there a way to avoid this copy?
-        // The same as readBytes' logic
-        byte[] bytes = new byte[size];
-        UnsafeUtil.copyMemory(pos, bytes, 0, size);
-        // TODO(martinrb): We could save a pass by validating while decoding.
-        if (!Utf8.isValidUtf8(bytes)) {
-          throw InvalidProtocolBufferException.invalidUtf8();
-        }
+      if (size > 0 && size <= remaining()) {
+        if (ENABLE_CUSTOM_UTF8_DECODE) {
+          final int bufferPos = bufferPos(pos);
+          String result = Utf8.decodeUtf8(buffer, bufferPos, size);
+          pos += size;
+          return result;
+        } else {
+          // TODO(nathanmittler): Is there a way to avoid this copy?
+          // The same as readBytes' logic
+          byte[] bytes = new byte[size];
+          UnsafeUtil.copyMemory(pos, bytes, 0, size);
+          // TODO(martinrb): We could save a pass by validating while decoding.
+          if (!Utf8.isValidUtf8(bytes)) {
+            throw InvalidProtocolBufferException.invalidUtf8();
+          }
 
-        String result = new String(bytes, UTF_8);
-        pos += size;
-        return result;
+          String result = new String(bytes, UTF_8);
+          pos += size;
+          return result;
+        }
       }
 
       if (size == 0) {
@@ -2324,11 +2347,15 @@ public abstract class CodedInputStream {
         bytes = readRawBytesSlowPath(size);
         tempPos = 0;
       }
-      // TODO(martinrb): We could save a pass by validating while decoding.
-      if (!Utf8.isValidUtf8(bytes, tempPos, tempPos + size)) {
-        throw InvalidProtocolBufferException.invalidUtf8();
+      if (ENABLE_CUSTOM_UTF8_DECODE) {
+        return Utf8.decodeUtf8(bytes, tempPos, size);
+      } else {
+        // TODO(martinrb): We could save a pass by validating while decoding.
+        if (!Utf8.isValidUtf8(bytes, tempPos, tempPos + size)) {
+          throw InvalidProtocolBufferException.invalidUtf8();
+        }
+        return new String(bytes, tempPos, size, UTF_8);
       }
-      return new String(bytes, tempPos, size, UTF_8);
     }
 
     @Override
@@ -3348,23 +3375,34 @@ public abstract class CodedInputStream {
     public String readStringRequireUtf8() throws IOException {
       final int size = readRawVarint32();
       if (size > 0 && size <= currentByteBufferLimit - currentByteBufferPos) {
-        byte[] bytes = new byte[size];
-        UnsafeUtil.copyMemory(currentByteBufferPos, bytes, 0, size);
-        if (!Utf8.isValidUtf8(bytes)) {
-          throw InvalidProtocolBufferException.invalidUtf8();
+        if (ENABLE_CUSTOM_UTF8_DECODE) {
+          final int bufferPos = (int) (currentByteBufferPos - currentByteBufferStartPos);
+          String result = Utf8.decodeUtf8(currentByteBuffer, bufferPos, size);
+          currentByteBufferPos += size;
+          return result;
+        } else {
+          byte[] bytes = new byte[size];
+          UnsafeUtil.copyMemory(currentByteBufferPos, bytes, 0, size);
+          if (!Utf8.isValidUtf8(bytes)) {
+            throw InvalidProtocolBufferException.invalidUtf8();
+          }
+          String result = new String(bytes, UTF_8);
+          currentByteBufferPos += size;
+          return result;
         }
-        String result = new String(bytes, UTF_8);
-        currentByteBufferPos += size;
-        return result;
       }
       if (size >= 0 && size <= remaining()) {
         byte[] bytes = new byte[size];
         readRawBytesTo(bytes, 0, size);
-        if (!Utf8.isValidUtf8(bytes)) {
-          throw InvalidProtocolBufferException.invalidUtf8();
+        if (ENABLE_CUSTOM_UTF8_DECODE) {
+          return Utf8.decodeUtf8(bytes, 0, size);
+        } else {
+          if (!Utf8.isValidUtf8(bytes)) {
+            throw InvalidProtocolBufferException.invalidUtf8();
+          }
+          String result = new String(bytes, UTF_8);
+          return result;
         }
-        String result = new String(bytes, UTF_8);
-        return result;
       }
 
       if (size == 0) {
diff --git a/java/core/src/main/java/com/google/protobuf/UnsafeUtil.java b/java/core/src/main/java/com/google/protobuf/UnsafeUtil.java
index 88315cb6..76fc687e 100644
--- a/java/core/src/main/java/com/google/protobuf/UnsafeUtil.java
+++ b/java/core/src/main/java/com/google/protobuf/UnsafeUtil.java
@@ -33,7 +33,6 @@ package com.google.protobuf;
 import java.lang.reflect.Field;
 import java.nio.Buffer;
 import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
 import java.security.AccessController;
 import java.security.PrivilegedExceptionAction;
 import java.util.logging.Level;
@@ -72,6 +71,8 @@ final class UnsafeUtil {
 
   private static final long BUFFER_ADDRESS_OFFSET = fieldOffset(bufferAddressField());
 
+  private static final long STRING_VALUE_OFFSET = fieldOffset(stringValueField());
+
   private UnsafeUtil() {}
 
   static boolean hasUnsafeArrayOperations() {
@@ -259,6 +260,26 @@ final class UnsafeUtil {
     return MEMORY_ACCESSOR.getLong(buffer, BUFFER_ADDRESS_OFFSET);
   }
 
+  /**
+   * Returns a new {@link String} backed by the given {@code chars}. The char array should not
+   * be mutated any more after calling this function.
+   */
+  static String moveToString(char[] chars) {
+    if (STRING_VALUE_OFFSET == -1) {
+      // In the off-chance that this JDK does not implement String as we'd expect, just do a copy.
+      return new String(chars);
+    }
+    final String str;
+    try {
+      str = (String) UNSAFE.allocateInstance(String.class);
+    } catch (InstantiationException e) {
+      // This should never happen, but return a copy as a fallback just in case.
+      return new String(chars);
+    }
+    putObject(str, STRING_VALUE_OFFSET, chars);
+    return str;
+  }
+
   static Object getStaticObject(Field field) {
     return MEMORY_ACCESSOR.getStaticObject(field);
   }
@@ -375,7 +396,12 @@ final class UnsafeUtil {
 
   /** Finds the address field within a direct {@link Buffer}. */
   private static Field bufferAddressField() {
-    return field(Buffer.class, "address");
+    return field(Buffer.class, "address", long.class);
+  }
+
+  /** Finds the value field within a {@link String}. */
+  private static Field stringValueField() {
+    return field(String.class, "value", char[].class);
   }
 
   /**
@@ -390,11 +416,14 @@ final class UnsafeUtil {
    * Gets the field with the given name within the class, or {@code null} if not found. If found,
    * the field is made accessible.
    */
-  private static Field field(Class<?> clazz, String fieldName) {
+  private static Field field(Class<?> clazz, String fieldName, Class<?> expectedType) {
     Field field;
     try {
       field = clazz.getDeclaredField(fieldName);
       field.setAccessible(true);
+      if (!field.getType().equals(expectedType)) {
+        return null;
+      }
     } catch (Throwable t) {
       // Failed to access the fields.
       field = null;
diff --git a/java/core/src/main/java/com/google/protobuf/Utf8.java b/java/core/src/main/java/com/google/protobuf/Utf8.java
index 1b136144..6968abb3 100644
--- a/java/core/src/main/java/com/google/protobuf/Utf8.java
+++ b/java/core/src/main/java/com/google/protobuf/Utf8.java
@@ -34,11 +34,15 @@ import static com.google.protobuf.UnsafeUtil.addressOffset;
 import static com.google.protobuf.UnsafeUtil.hasUnsafeArrayOperations;
 import static com.google.protobuf.UnsafeUtil.hasUnsafeByteBufferOperations;
 import static java.lang.Character.MAX_SURROGATE;
+import static java.lang.Character.MIN_HIGH_SURROGATE;
+import static java.lang.Character.MIN_LOW_SURROGATE;
+import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
 import static java.lang.Character.MIN_SURROGATE;
 import static java.lang.Character.isSurrogatePair;
 import static java.lang.Character.toCodePoint;
 
 import java.nio.ByteBuffer;
+import java.util.Arrays;
 
 /**
  * A set of low-level, high-performance static utility methods related
@@ -289,7 +293,7 @@ final class Utf8 {
         if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) {
           // Check that we have a well-formed surrogate pair.
           int cp = Character.codePointAt(sequence, i);
-          if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
+          if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
             throw new UnpairedSurrogateException(i, utf16Length);
           }
           i++;
@@ -331,6 +335,26 @@ final class Utf8 {
   }
 
   /**
+   * Decodes the given UTF-8 portion of the {@link ByteBuffer} into a {@link String}.
+   *
+   * @throws InvalidProtocolBufferException if the input is not valid UTF-8.
+   */
+  static String decodeUtf8(ByteBuffer buffer, int index, int size)
+      throws InvalidProtocolBufferException {
+    return processor.decodeUtf8(buffer, index, size);
+  }
+
+  /**
+   * Decodes the given UTF-8 encoded byte array slice into a {@link String}.
+   *
+   * @throws InvalidProtocolBufferException if the input is not valid UTF-8.
+   */
+  static String decodeUtf8(byte[] bytes, int index, int size)
+      throws InvalidProtocolBufferException {
+    return processor.decodeUtf8(bytes, index, size);
+  }
+
+  /**
    * Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding.
    *
    * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
@@ -610,6 +634,116 @@ final class Utf8 {
     }
 
     /**
+     * Decodes the given byte array slice into a {@link String}.
+     *
+     * @throws InvalidProtocolBufferException if the byte array slice is not valid UTF-8.
+     */
+    abstract String decodeUtf8(byte[] bytes, int index, int size)
+        throws InvalidProtocolBufferException;
+
+    /**
+     * Decodes the given portion of the {@link ByteBuffer} into a {@link String}.
+     *
+     * @throws InvalidProtocolBufferException if the portion of the buffer is not valid UTF-8.
+     */
+    final String decodeUtf8(ByteBuffer buffer, int index, int size)
+        throws InvalidProtocolBufferException {
+      if (buffer.hasArray()) {
+        final int offset = buffer.arrayOffset();
+        return decodeUtf8(buffer.array(), offset + index, size);
+      } else if (buffer.isDirect()) {
+        return decodeUtf8Direct(buffer, index, size);
+      }
+      return decodeUtf8Default(buffer, index, size);
+    }
+
+    /**
+     * Decodes direct {@link ByteBuffer} instances into {@link String}.
+     */
+    abstract String decodeUtf8Direct(ByteBuffer buffer, int index, int size)
+        throws InvalidProtocolBufferException;
+
+    /**
+     * Decodes {@link ByteBuffer} instances using the {@link ByteBuffer} API rather than
+     * potentially faster approaches.
+     */
+    final String decodeUtf8Default(ByteBuffer buffer, int index, int size)
+        throws InvalidProtocolBufferException {
+      // Bitwise OR combines the sign bits so any negative value fails the check.
+      if ((index | size | buffer.limit() - index - size) < 0) {
+        throw new ArrayIndexOutOfBoundsException(
+            String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(), index, size));
+      }
+
+      int offset = index;
+      final int limit = offset + size;
+
+      // The longest possible resulting String is the same as the number of input bytes, when it is
+      // all ASCII. For other cases, this over-allocates and we will truncate in the end.
+      char[] resultArr = new char[size];
+      int resultPos = 0;
+
+      // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
+      // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
+      while (offset < limit) {
+        byte b = buffer.get(offset);
+        if (!DecodeUtil.isOneByte(b)) {
+          break;
+        }
+        offset++;
+        DecodeUtil.handleOneByte(b, resultArr, resultPos++);
+      }
+
+      while (offset < limit) {
+        byte byte1 = buffer.get(offset++);
+        if (DecodeUtil.isOneByte(byte1)) {
+          DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
+          // It's common for there to be multiple ASCII characters in a run mixed in, so add an
+          // extra optimized loop to take care of these runs.
+          while (offset < limit) {
+            byte b = buffer.get(offset);
+            if (!DecodeUtil.isOneByte(b)) {
+              break;
+            }
+            offset++;
+            DecodeUtil.handleOneByte(b, resultArr, resultPos++);
+          }
+        } else if (DecodeUtil.isTwoBytes(byte1)) {
+          if (offset >= limit) {
+            throw InvalidProtocolBufferException.invalidUtf8();
+          }
+          DecodeUtil.handleTwoBytes(
+              byte1, /* byte2 */ buffer.get(offset++), resultArr, resultPos++);
+        } else if (DecodeUtil.isThreeBytes(byte1)) {
+          if (offset >= limit - 1) {
+            throw InvalidProtocolBufferException.invalidUtf8();
+          }
+          DecodeUtil.handleThreeBytes(
+              byte1,
+              /* byte2 */ buffer.get(offset++),
+              /* byte3 */ buffer.get(offset++),
+              resultArr,
+              resultPos++);
+        } else {
+          if (offset >= limit - 2) {
+            throw InvalidProtocolBufferException.invalidUtf8();
+          }
+          DecodeUtil.handleFourBytes(
+              byte1,
+              /* byte2 */ buffer.get(offset++),
+              /* byte3 */ buffer.get(offset++),
+              /* byte4 */ buffer.get(offset++),
+              resultArr,
+              resultPos++);
+          // 4-byte case requires two chars.
+          resultPos++;
+        }
+      }
+
+      return new String(resultArr, 0, resultPos);
+    }
+
+    /**
      * Encodes an input character sequence ({@code in}) to UTF-8 in the target array ({@code out}).
      * For a string, this method is similar to
      * <pre>{@code
@@ -851,6 +985,88 @@ final class Utf8 {
     }
 
     @Override
+    String decodeUtf8(byte[] bytes, int index, int size) throws InvalidProtocolBufferException {
+      // Bitwise OR combines the sign bits so any negative value fails the check.
+      if ((index | size | bytes.length - index - size) < 0) {
+        throw new ArrayIndexOutOfBoundsException(
+            String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size));
+      }
+
+      int offset = index;
+      final int limit = offset + size;
+
+      // The longest possible resulting String is the same as the number of input bytes, when it is
+      // all ASCII. For other cases, this over-allocates and we will truncate in the end.
+      char[] resultArr = new char[size];
+      int resultPos = 0;
+
+      // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
+      // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
+      while (offset < limit) {
+        byte b = bytes[offset];
+        if (!DecodeUtil.isOneByte(b)) {
+          break;
+        }
+        offset++;
+        DecodeUtil.handleOneByte(b, resultArr, resultPos++);
+      }
+
+      while (offset < limit) {
+        byte byte1 = bytes[offset++];
+        if (DecodeUtil.isOneByte(byte1)) {
+          DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
+          // It's common for there to be multiple ASCII characters in a run mixed in, so add an
+          // extra optimized loop to take care of these runs.
+          while (offset < limit) {
+            byte b = bytes[offset];
+            if (!DecodeUtil.isOneByte(b)) {
+              break;
+            }
+            offset++;
+            DecodeUtil.handleOneByte(b, resultArr, resultPos++);
+          }
+        } else if (DecodeUtil.isTwoBytes(byte1)) {
+          if (offset >= limit) {
+            throw InvalidProtocolBufferException.invalidUtf8();
+          }
+          DecodeUtil.handleTwoBytes(byte1, /* byte2 */ bytes[offset++], resultArr, resultPos++);
+        } else if (DecodeUtil.isThreeBytes(byte1)) {
+          if (offset >= limit - 1) {
+            throw InvalidProtocolBufferException.invalidUtf8();
+          }
+          DecodeUtil.handleThreeBytes(
+              byte1,
+              /* byte2 */ bytes[offset++],
+              /* byte3 */ bytes[offset++],
+              resultArr,
+              resultPos++);
+        } else {
+          if (offset >= limit - 2) {
+            throw InvalidProtocolBufferException.invalidUtf8();
+          }
+          DecodeUtil.handleFourBytes(
+              byte1,
+              /* byte2 */ bytes[offset++],
+              /* byte3 */ bytes[offset++],
+              /* byte4 */ bytes[offset++],
+              resultArr,
+              resultPos++);
+          // 4-byte case requires two chars.
+          resultPos++;
+        }
+      }
+
+      return new String(resultArr, 0, resultPos);
+    }
+
+    @Override
+    String decodeUtf8Direct(ByteBuffer buffer, int index, int size)
+        throws InvalidProtocolBufferException {
+      // For safe processing, we have to use the ByteBufferAPI.
+      return decodeUtf8Default(buffer, index, size);
+    }
+
+    @Override
     int encodeUtf8(CharSequence in, byte[] out, int offset, int length) {
       int utf16Length = in.length();
       int j = offset;
@@ -996,6 +1212,7 @@ final class Utf8 {
 
     @Override
     int partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit) {
+      // Bitwise OR combines the sign bits so any negative value fails the check.
       if ((index | limit | bytes.length - limit) < 0) {
         throw new ArrayIndexOutOfBoundsException(
             String.format("Array length=%d, index=%d, limit=%d", bytes.length, index, limit));
@@ -1091,6 +1308,7 @@ final class Utf8 {
     @Override
     int partialIsValidUtf8Direct(
         final int state, ByteBuffer buffer, final int index, final int limit) {
+      // Bitwise OR combines the sign bits so any negative value fails the check.
       if ((index | limit | buffer.limit() - limit) < 0) {
         throw new ArrayIndexOutOfBoundsException(
             String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(), index, limit));
@@ -1185,6 +1403,163 @@ final class Utf8 {
     }
 
     @Override
+    String decodeUtf8(byte[] bytes, int index, int size) throws InvalidProtocolBufferException {
+      if ((index | size | bytes.length - index - size) < 0) {
+        throw new ArrayIndexOutOfBoundsException(
+            String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size));
+      }
+
+      int offset = index;
+      final int limit = offset + size;
+
+      // The longest possible resulting String is the same as the number of input bytes, when it is
+      // all ASCII. For other cases, this over-allocates and we will truncate in the end.
+      char[] resultArr = new char[size];
+      int resultPos = 0;
+
+      // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
+      // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
+      while (offset < limit) {
+        byte b = UnsafeUtil.getByte(bytes, offset);
+        if (!DecodeUtil.isOneByte(b)) {
+          break;
+        }
+        offset++;
+        DecodeUtil.handleOneByte(b, resultArr, resultPos++);
+      }
+
+      while (offset < limit) {
+        byte byte1 = UnsafeUtil.getByte(bytes, offset++);
+        if (DecodeUtil.isOneByte(byte1)) {
+          DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
+          // It's common for there to be multiple ASCII characters in a run mixed in, so add an
+          // extra optimized loop to take care of these runs.
+          while (offset < limit) {
+            byte b = UnsafeUtil.getByte(bytes, offset);
+            if (!DecodeUtil.isOneByte(b)) {
+              break;
+            }
+            offset++;
+            DecodeUtil.handleOneByte(b, resultArr, resultPos++);
+          }
+        } else if (DecodeUtil.isTwoBytes(byte1)) {
+          if (offset >= limit) {
+            throw InvalidProtocolBufferException.invalidUtf8();
+          }
+          DecodeUtil.handleTwoBytes(
+              byte1, /* byte2 */ UnsafeUtil.getByte(bytes, offset++), resultArr, resultPos++);
+        } else if (DecodeUtil.isThreeBytes(byte1)) {
+          if (offset >= limit - 1) {
+            throw InvalidProtocolBufferException.invalidUtf8();
+          }
+          DecodeUtil.handleThreeBytes(
+              byte1,
+              /* byte2 */ UnsafeUtil.getByte(bytes, offset++),
+              /* byte3 */ UnsafeUtil.getByte(bytes, offset++),
+              resultArr,
+              resultPos++);
+        } else {
+          if (offset >= limit - 2) {
+            throw InvalidProtocolBufferException.invalidUtf8();
+          }
+          DecodeUtil.handleFourBytes(
+              byte1,
+              /* byte2 */ UnsafeUtil.getByte(bytes, offset++),
+              /* byte3 */ UnsafeUtil.getByte(bytes, offset++),
+              /* byte4 */ UnsafeUtil.getByte(bytes, offset++),
+              resultArr,
+              resultPos++);
+          // 4-byte case requires two chars.
+          resultPos++;
+        }
+      }
+
+      if (resultPos < resultArr.length) {
+        resultArr = Arrays.copyOf(resultArr, resultPos);
+      }
+      return UnsafeUtil.moveToString(resultArr);
+    }
+
+    @Override
+    String decodeUtf8Direct(ByteBuffer buffer, int index, int size)
+        throws InvalidProtocolBufferException {
+      // Bitwise OR combines the sign bits so any negative value fails the check.
+      if ((index | size | buffer.limit() - index - size) < 0) {
+        throw new ArrayIndexOutOfBoundsException(
+            String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(), index, size));
+      }
+      long address = UnsafeUtil.addressOffset(buffer) + index;
+      final long addressLimit = address + size;
+
+      // The longest possible resulting String is the same as the number of input bytes, when it is
+      // all ASCII. For other cases, this over-allocates and we will truncate in the end.
+      char[] resultArr = new char[size];
+      int resultPos = 0;
+
+      // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
+      // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
+      while (address < addressLimit) {
+        byte b = UnsafeUtil.getByte(address);
+        if (!DecodeUtil.isOneByte(b)) {
+          break;
+        }
+        address++;
+        DecodeUtil.handleOneByte(b, resultArr, resultPos++);
+      }
+
+      while (address < addressLimit) {
+        byte byte1 = UnsafeUtil.getByte(address++);
+        if (DecodeUtil.isOneByte(byte1)) {
+          DecodeUtil.handleOneByte(byte1, resultArr, resultPos++);
+          // It's common for there to be multiple ASCII characters in a run mixed in, so add an
+          // extra optimized loop to take care of these runs.
+          while (address < addressLimit) {
+            byte b = UnsafeUtil.getByte(address);
+            if (!DecodeUtil.isOneByte(b)) {
+              break;
+            }
+            address++;
+            DecodeUtil.handleOneByte(b, resultArr, resultPos++);
+          }
+        } else if (DecodeUtil.isTwoBytes(byte1)) {
+          if (address >= addressLimit) {
+            throw InvalidProtocolBufferException.invalidUtf8();
+          }
+          DecodeUtil.handleTwoBytes(
+              byte1, /* byte2 */ UnsafeUtil.getByte(address++), resultArr, resultPos++);
+        } else if (DecodeUtil.isThreeBytes(byte1)) {
+          if (address >= addressLimit - 1) {
+            throw InvalidProtocolBufferException.invalidUtf8();
+          }
+          DecodeUtil.handleThreeBytes(
+              byte1,
+              /* byte2 */ UnsafeUtil.getByte(address++),
+              /* byte3 */ UnsafeUtil.getByte(address++),
+              resultArr,
+              resultPos++);
+        } else {
+          if (address >= addressLimit - 2) {
+            throw InvalidProtocolBufferException.invalidUtf8();
+          }
+          DecodeUtil.handleFourBytes(
+              byte1,
+              /* byte2 */ UnsafeUtil.getByte(address++),
+              /* byte3 */ UnsafeUtil.getByte(address++),
+              /* byte4 */ UnsafeUtil.getByte(address++),
+              resultArr,
+              resultPos++);
+          // 4-byte case requires two chars.
+          resultPos++;
+        }
+      }
+
+      if (resultPos < resultArr.length) {
+        resultArr = Arrays.copyOf(resultArr, resultPos);
+      }
+      return UnsafeUtil.moveToString(resultArr);
+    }
+
+    @Override
     int encodeUtf8(final CharSequence in, final byte[] out, final int offset, final int length) {
       long outIx = offset;
       final long outLimit = outIx + length;
@@ -1554,5 +1929,112 @@ final class Utf8 {
     }
   }
 
+  /**
+   * Utility methods for decoding bytes into {@link String}. Callers are responsible for extracting
+   * bytes (possibly using Unsafe methods), and checking remaining bytes. All other UTF-8 validity
+   * checks and codepoint conversion happen in this class.
+   */
+  private static class DecodeUtil {
+
+    /**
+     * Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'.
+     */
+    private static boolean isOneByte(byte b) {
+      return b >= 0;
+    }
+
+    /**
+     * Returns whether this is a two-byte codepoint with the form '10XXXXXX'.
+     */
+    private static boolean isTwoBytes(byte b) {
+      return b < (byte) 0xE0;
+    }
+
+    /**
+     * Returns whether this is a three-byte codepoint with the form '110XXXXX'.
+     */
+    private static boolean isThreeBytes(byte b) {
+      return b < (byte) 0xF0;
+    }
+
+    private static void handleOneByte(byte byte1, char[] resultArr, int resultPos) {
+      resultArr[resultPos] = (char) byte1;
+    }
+
+    private static void handleTwoBytes(
+        byte byte1, byte byte2, char[] resultArr, int resultPos)
+        throws InvalidProtocolBufferException {
+      // Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and
+      // overlong 2-byte, '11000001'.
+      if (byte1 < (byte) 0xC2
+          || isNotTrailingByte(byte2)) {
+        throw InvalidProtocolBufferException.invalidUtf8();
+      }
+      resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) | trailingByteValue(byte2));
+    }
+
+    private static void handleThreeBytes(
+        byte byte1, byte byte2, byte byte3, char[] resultArr, int resultPos)
+        throws InvalidProtocolBufferException {
+      if (isNotTrailingByte(byte2)
+          // overlong? 5 most significant bits must not all be zero
+          || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
+          // check for illegal surrogate codepoints
+          || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
+          || isNotTrailingByte(byte3)) {
+        throw InvalidProtocolBufferException.invalidUtf8();
+      }
+      resultArr[resultPos] = (char)
+          (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3));
+    }
+
+    private static void handleFourBytes(
+        byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)
+        throws InvalidProtocolBufferException{
+      if (isNotTrailingByte(byte2)
+          // Check that 1 <= plane <= 16.  Tricky optimized form of:
+          //   valid 4-byte leading byte?
+          // if (byte1 > (byte) 0xF4 ||
+          //   overlong? 4 most significant bits must not all be zero
+          //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
+          //   codepoint larger than the highest code point (U+10FFFF)?
+          //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
+          || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
+          || isNotTrailingByte(byte3)
+          || isNotTrailingByte(byte4)) {
+        throw InvalidProtocolBufferException.invalidUtf8();
+      }
+      int codepoint = ((byte1 & 0x07) << 18)
+          | (trailingByteValue(byte2) << 12)
+          | (trailingByteValue(byte3) << 6)
+          | trailingByteValue(byte4);
+      resultArr[resultPos] = DecodeUtil.highSurrogate(codepoint);
+      resultArr[resultPos + 1] = DecodeUtil.lowSurrogate(codepoint);
+    }
+
+    /**
+     * Returns whether the byte is not a valid continuation of the form '10XXXXXX'.
+     */
+    private static boolean isNotTrailingByte(byte b) {
+      return b > (byte) 0xBF;
+    }
+
+    /**
+     * Returns the actual value of the trailing byte (removes the prefix '10') for composition.
+     */
+    private static int trailingByteValue(byte b) {
+      return b & 0x3F;
+    }
+
+    private static char highSurrogate(int codePoint) {
+      return (char) ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10))
+          + (codePoint >>> 10));
+    }
+
+    private static char lowSurrogate(int codePoint) {
+      return (char) (MIN_LOW_SURROGATE + (codePoint & 0x3ff));
+    }
+  }
+
   private Utf8() {}
 }