aboutsummaryrefslogtreecommitdiff
path: root/java/core/src/main/java/com/google/protobuf/Utf8.java
diff options
context:
space:
mode:
Diffstat (limited to 'java/core/src/main/java/com/google/protobuf/Utf8.java')
-rw-r--r--java/core/src/main/java/com/google/protobuf/Utf8.java407
1 files changed, 171 insertions, 236 deletions
diff --git a/java/core/src/main/java/com/google/protobuf/Utf8.java b/java/core/src/main/java/com/google/protobuf/Utf8.java
index 6968abb3..4512bf9b 100644
--- a/java/core/src/main/java/com/google/protobuf/Utf8.java
+++ b/java/core/src/main/java/com/google/protobuf/Utf8.java
@@ -42,39 +42,33 @@ import static java.lang.Character.isSurrogatePair;
import static java.lang.Character.toCodePoint;
import java.nio.ByteBuffer;
-import java.util.Arrays;
/**
- * A set of low-level, high-performance static utility methods related
- * to the UTF-8 character encoding. This class has no dependencies
- * outside of the core JDK libraries.
+ * A set of low-level, high-performance static utility methods related to the UTF-8 character
+ * encoding. This class has no dependencies outside of the core JDK libraries.
*
- * <p>There are several variants of UTF-8. The one implemented by
- * this class is the restricted definition of UTF-8 introduced in
- * Unicode 3.1, which mandates the rejection of "overlong" byte
- * sequences as well as rejection of 3-byte surrogate codepoint byte
- * sequences. Note that the UTF-8 decoder included in Oracle's JDK
- * has been modified to also reject "overlong" byte sequences, but (as
- * of 2011) still accepts 3-byte surrogate codepoint byte sequences.
+ * <p>There are several variants of UTF-8. The one implemented by this class is the restricted
+ * definition of UTF-8 introduced in Unicode 3.1, which mandates the rejection of "overlong" byte
+ * sequences as well as rejection of 3-byte surrogate codepoint byte sequences. Note that the UTF-8
+ * decoder included in Oracle's JDK has been modified to also reject "overlong" byte sequences, but
+ * (as of 2011) still accepts 3-byte surrogate codepoint byte sequences.
*
- * <p>The byte sequences considered valid by this class are exactly
- * those that can be roundtrip converted to Strings and back to bytes
- * using the UTF-8 charset, without loss: <pre> {@code
+ * <p>The byte sequences considered valid by this class are exactly those that can be roundtrip
+ * converted to Strings and back to bytes using the UTF-8 charset, without loss:
+ *
+ * <pre>{@code
* Arrays.equals(bytes, new String(bytes, Internal.UTF_8).getBytes(Internal.UTF_8))
* }</pre>
*
- * <p>See the Unicode Standard,</br>
- * Table 3-6. <em>UTF-8 Bit Distribution</em>,</br>
- * Table 3-7. <em>Well Formed UTF-8 Byte Sequences</em>.
+ * <p>See the Unicode Standard,<br> Table 3-6. <em>UTF-8 Bit Distribution</em>,<br> Table 3-7.
+ * <em>Well Formed UTF-8 Byte Sequences</em>.
*
- * <p>This class supports decoding of partial byte sequences, so that the
- * bytes in a complete UTF-8 byte sequences can be stored in multiple
- * segments. Methods typically return {@link #MALFORMED} if the partial
- * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is
- * well-formed in the absence of additional input, or if the byte sequence
- * apparently terminated in the middle of a character, an opaque integer
- * "state" value containing enough information to decode the character when
- * passed to a subsequent invocation of a partial decoding method.
+ * <p>This class supports decoding of partial byte sequences, so that the bytes in a complete UTF-8
+ * byte sequence can be stored in multiple segments. Methods typically return {@link #MALFORMED} if
+ * the partial byte sequence is definitely not well-formed, {@link #COMPLETE} if it is well-formed
+ * in the absence of additional input, or if the byte sequence apparently terminated in the middle
+ * of a character, an opaque integer "state" value containing enough information to decode the
+ * character when passed to a subsequent invocation of a partial decoding method.
*
* @author martinrb@google.com (Martin Buchholz)
*/
@@ -87,7 +81,9 @@ final class Utf8 {
* delegate for which all methods are delegated directly to.
*/
private static final Processor processor =
- UnsafeProcessor.isAvailable() ? new UnsafeProcessor() : new SafeProcessor();
+ (UnsafeProcessor.isAvailable() && !Android.isOnAndroidDevice())
+ ? new UnsafeProcessor()
+ : new SafeProcessor();
/**
* A mask used when performing unsafe reads to determine if a long value contains any non-ASCII
@@ -97,31 +93,28 @@ final class Utf8 {
/**
* Maximum number of bytes per Java UTF-16 char in UTF-8.
+ *
* @see java.nio.charset.CharsetEncoder#maxBytesPerChar()
*/
static final int MAX_BYTES_PER_CHAR = 3;
/**
- * State value indicating that the byte sequence is well-formed and
- * complete (no further bytes are needed to complete a character).
+ * State value indicating that the byte sequence is well-formed and complete (no further bytes are
+ * needed to complete a character).
*/
public static final int COMPLETE = 0;
- /**
- * State value indicating that the byte sequence is definitely not
- * well-formed.
- */
+ /** State value indicating that the byte sequence is definitely not well-formed. */
public static final int MALFORMED = -1;
/**
* Used by {@code Unsafe} UTF-8 string validation logic to determine the minimum string length
* above which to employ an optimized algorithm for counting ASCII characters. The reason for this
* threshold is that for small strings, the optimization may not be beneficial or may even
- * negatively impact performance since it requires additional logic to avoid unaligned reads
- * (when calling {@code Unsafe.getLong}). This threshold guarantees that even if the initial
- * offset is unaligned, we're guaranteed to make at least one call to {@code Unsafe.getLong()}
- * which provides a performance improvement that entirely subsumes the cost of the additional
- * logic.
+ * negatively impact performance since it requires additional logic to avoid unaligned reads (when
+ * calling {@code Unsafe.getLong}). This threshold guarantees that even if the initial offset is
+ * unaligned, we're guaranteed to make at least one call to {@code Unsafe.getLong()} which
+ * provides a performance improvement that entirely subsumes the cost of the additional logic.
*/
private static final int UNSAFE_COUNT_ASCII_THRESHOLD = 16;
@@ -145,76 +138,69 @@ final class Utf8 {
// are valid trailing bytes.
/**
- * Returns {@code true} if the given byte array is a well-formed
- * UTF-8 byte sequence.
+ * Returns {@code true} if the given byte array is a well-formed UTF-8 byte sequence.
*
- * <p>This is a convenience method, equivalent to a call to {@code
- * isValidUtf8(bytes, 0, bytes.length)}.
+ * <p>This is a convenience method, equivalent to a call to {@code isValidUtf8(bytes, 0,
+ * bytes.length)}.
*/
public static boolean isValidUtf8(byte[] bytes) {
return processor.isValidUtf8(bytes, 0, bytes.length);
}
/**
- * Returns {@code true} if the given byte array slice is a
- * well-formed UTF-8 byte sequence. The range of bytes to be
- * checked extends from index {@code index}, inclusive, to {@code
- * limit}, exclusive.
+ * Returns {@code true} if the given byte array slice is a well-formed UTF-8 byte sequence. The
+ * range of bytes to be checked extends from index {@code index}, inclusive, to {@code limit},
+ * exclusive.
*
- * <p>This is a convenience method, equivalent to {@code
- * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}.
+ * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(Utf8.COMPLETE, bytes,
+ * index, limit) == Utf8.COMPLETE}.
*/
public static boolean isValidUtf8(byte[] bytes, int index, int limit) {
return processor.isValidUtf8(bytes, index, limit);
}
/**
- * Tells whether the given byte array slice is a well-formed,
- * malformed, or incomplete UTF-8 byte sequence. The range of bytes
- * to be checked extends from index {@code index}, inclusive, to
+ * Tells whether the given byte array slice is a well-formed, malformed, or incomplete UTF-8 byte
+ * sequence. The range of bytes to be checked extends from index {@code index}, inclusive, to
* {@code limit}, exclusive.
*
- * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding
- * operation) or the value returned from a call to a partial decoding method
- * for the previous bytes
- *
- * @return {@link #MALFORMED} if the partial byte sequence is
- * definitely not well-formed, {@link #COMPLETE} if it is well-formed
- * (no additional input needed), or if the byte sequence is
- * "incomplete", i.e. apparently terminated in the middle of a character,
- * an opaque integer "state" value containing enough information to
- * decode the character when passed to a subsequent invocation of a
- * partial decoding method.
+ * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding operation) or the
+ * value returned from a call to a partial decoding method for the previous bytes
+ * @return {@link #MALFORMED} if the partial byte sequence is definitely not well-formed, {@link
+ * #COMPLETE} if it is well-formed (no additional input needed), or if the byte sequence is
+ * "incomplete", i.e. apparently terminated in the middle of a character, an opaque integer
+ * "state" value containing enough information to decode the character when passed to a
+ * subsequent invocation of a partial decoding method.
*/
public static int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
return processor.partialIsValidUtf8(state, bytes, index, limit);
}
private static int incompleteStateFor(int byte1) {
- return (byte1 > (byte) 0xF4) ?
- MALFORMED : byte1;
+ return (byte1 > (byte) 0xF4) ? MALFORMED : byte1;
}
private static int incompleteStateFor(int byte1, int byte2) {
- return (byte1 > (byte) 0xF4 ||
- byte2 > (byte) 0xBF) ?
- MALFORMED : byte1 ^ (byte2 << 8);
+ return (byte1 > (byte) 0xF4 || byte2 > (byte) 0xBF) ? MALFORMED : byte1 ^ (byte2 << 8);
}
private static int incompleteStateFor(int byte1, int byte2, int byte3) {
- return (byte1 > (byte) 0xF4 ||
- byte2 > (byte) 0xBF ||
- byte3 > (byte) 0xBF) ?
- MALFORMED : byte1 ^ (byte2 << 8) ^ (byte3 << 16);
+ return (byte1 > (byte) 0xF4 || byte2 > (byte) 0xBF || byte3 > (byte) 0xBF)
+ ? MALFORMED
+ : byte1 ^ (byte2 << 8) ^ (byte3 << 16);
}
private static int incompleteStateFor(byte[] bytes, int index, int limit) {
int byte1 = bytes[index - 1];
switch (limit - index) {
- case 0: return incompleteStateFor(byte1);
- case 1: return incompleteStateFor(byte1, bytes[index]);
- case 2: return incompleteStateFor(byte1, bytes[index], bytes[index + 1]);
- default: throw new AssertionError();
+ case 0:
+ return incompleteStateFor(byte1);
+ case 1:
+ return incompleteStateFor(byte1, bytes[index]);
+ case 2:
+ return incompleteStateFor(byte1, bytes[index], bytes[index + 1]);
+ default:
+ throw new AssertionError();
}
}
@@ -235,7 +221,7 @@ final class Utf8 {
// These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw
// a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can
// fallback to more lenient behavior.
-
+
static class UnpairedSurrogateException extends IllegalArgumentException {
UnpairedSurrogateException(int index, int length) {
super("Unpaired surrogate at index " + index + " of " + length);
@@ -243,9 +229,9 @@ final class Utf8 {
}
/**
- * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
- * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
- * both time and space.
+ * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, this
+ * method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in both
+ * time and space.
*
* @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
* surrogates)
@@ -265,7 +251,7 @@ final class Utf8 {
for (; i < utf16Length; i++) {
char c = sequence.charAt(i);
if (c < 0x800) {
- utf8Length += ((0x7f - c) >>> 31); // branch free!
+ utf8Length += ((0x7f - c) >>> 31); // branch free!
} else {
utf8Length += encodedLengthGeneral(sequence, i);
break;
@@ -274,8 +260,8 @@ final class Utf8 {
if (utf8Length < utf16Length) {
// Necessary and sufficient condition for overflow because of maximum 3x expansion
- throw new IllegalArgumentException("UTF-8 length does not fit in int: "
- + (utf8Length + (1L << 32)));
+ throw new IllegalArgumentException(
+ "UTF-8 length does not fit in int: " + (utf8Length + (1L << 32)));
}
return utf8Length;
}
@@ -369,15 +355,15 @@ final class Utf8 {
}
/**
- * Counts (approximately) the number of consecutive ASCII characters in the given buffer.
- * The byte order of the {@link ByteBuffer} does not matter, so performance can be improved if
- * native byte order is used (i.e. no byte-swapping in {@link ByteBuffer#getLong(int)}).
+ * Counts (approximately) the number of consecutive ASCII characters in the given buffer. The byte
+ * order of the {@link ByteBuffer} does not matter, so performance can be improved if native byte
+ * order is used (i.e. no byte-swapping in {@link ByteBuffer#getLong(int)}).
*
* @param buffer the buffer to be scanned for ASCII chars
* @param index the starting index of the scan
* @param limit the limit within buffer for the scan
- * @return the number of ASCII characters found. The stopping position will be at or
- * before the first non-ASCII byte.
+ * @return the number of ASCII characters found. The stopping position will be at or before the
+ * first non-ASCII byte.
*/
private static int estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit) {
int i = index;
@@ -389,52 +375,43 @@ final class Utf8 {
return i - index;
}
- /**
- * A processor of UTF-8 strings, providing methods for checking validity and encoding.
- */
+ /** A processor of UTF-8 strings, providing methods for checking validity and encoding. */
// TODO(nathanmittler): Add support for Memory/MemoryBlock on Android.
abstract static class Processor {
/**
- * Returns {@code true} if the given byte array slice is a
- * well-formed UTF-8 byte sequence. The range of bytes to be
- * checked extends from index {@code index}, inclusive, to {@code
- * limit}, exclusive.
+ * Returns {@code true} if the given byte array slice is a well-formed UTF-8 byte sequence. The
+ * range of bytes to be checked extends from index {@code index}, inclusive, to {@code limit},
+ * exclusive.
*
- * <p>This is a convenience method, equivalent to {@code
- * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}.
+ * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(Utf8.COMPLETE, bytes,
+ * index, limit) == Utf8.COMPLETE}.
*/
final boolean isValidUtf8(byte[] bytes, int index, int limit) {
return partialIsValidUtf8(COMPLETE, bytes, index, limit) == COMPLETE;
}
/**
- * Tells whether the given byte array slice is a well-formed,
- * malformed, or incomplete UTF-8 byte sequence. The range of bytes
- * to be checked extends from index {@code index}, inclusive, to
- * {@code limit}, exclusive.
- *
- * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding
- * operation) or the value returned from a call to a partial decoding method
- * for the previous bytes
+ * Tells whether the given byte array slice is a well-formed, malformed, or incomplete UTF-8
+ * byte sequence. The range of bytes to be checked extends from index {@code index}, inclusive,
+ * to {@code limit}, exclusive.
*
- * @return {@link #MALFORMED} if the partial byte sequence is
- * definitely not well-formed, {@link #COMPLETE} if it is well-formed
- * (no additional input needed), or if the byte sequence is
- * "incomplete", i.e. apparently terminated in the middle of a character,
- * an opaque integer "state" value containing enough information to
- * decode the character when passed to a subsequent invocation of a
- * partial decoding method.
+ * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding operation) or the
+ * value returned from a call to a partial decoding method for the previous bytes
+ * @return {@link #MALFORMED} if the partial byte sequence is definitely not well-formed, {@link
+ * #COMPLETE} if it is well-formed (no additional input needed), or if the byte sequence is
+ * "incomplete", i.e. apparently terminated in the middle of a character, an opaque integer
+ * "state" value containing enough information to decode the character when passed to a
+ * subsequent invocation of a partial decoding method.
*/
abstract int partialIsValidUtf8(int state, byte[] bytes, int index, int limit);
/**
- * Returns {@code true} if the given portion of the {@link ByteBuffer} is a
- * well-formed UTF-8 byte sequence. The range of bytes to be
- * checked extends from index {@code index}, inclusive, to {@code
- * limit}, exclusive.
+ * Returns {@code true} if the given portion of the {@link ByteBuffer} is a well-formed UTF-8
+ * byte sequence. The range of bytes to be checked extends from index {@code index}, inclusive,
+ * to {@code limit}, exclusive.
*
- * <p>This is a convenience method, equivalent to {@code
- * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}.
+ * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(Utf8.COMPLETE, buffer,
+ * index, limit) == Utf8.COMPLETE}.
*/
final boolean isValidUtf8(ByteBuffer buffer, int index, int limit) {
return partialIsValidUtf8(COMPLETE, buffer, index, limit) == COMPLETE;
@@ -451,22 +428,20 @@ final class Utf8 {
if (buffer.hasArray()) {
final int offset = buffer.arrayOffset();
return partialIsValidUtf8(state, buffer.array(), offset + index, offset + limit);
- } else if (buffer.isDirect()){
+ } else if (buffer.isDirect()) {
return partialIsValidUtf8Direct(state, buffer, index, limit);
}
return partialIsValidUtf8Default(state, buffer, index, limit);
}
- /**
- * Performs validation for direct {@link ByteBuffer} instances.
- */
+ /** Performs validation for direct {@link ByteBuffer} instances. */
abstract int partialIsValidUtf8Direct(
final int state, final ByteBuffer buffer, int index, final int limit);
/**
* Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather
- * than potentially faster approaches. This first completes validation for the current
- * character (provided by {@code state}) and then finishes validation for the sequence.
+ * than potentially faster approaches. This first completes validation for the current character
+ * (provided by {@code state}) and then finishes validation for the sequence.
*/
final int partialIsValidUtf8Default(
final int state, final ByteBuffer buffer, int index, final int limit) {
@@ -565,7 +540,7 @@ final class Utf8 {
private static int partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit) {
index += estimateConsecutiveAscii(buffer, index, limit);
- for (;;) {
+ for (; ; ) {
// Optimize for interior runs of ASCII bytes.
// TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
// Maybe after seeing a few in a row that are ASCII, go back to fast mode?
@@ -657,15 +632,13 @@ final class Utf8 {
return decodeUtf8Default(buffer, index, size);
}
- /**
- * Decodes direct {@link ByteBuffer} instances into {@link String}.
- */
+ /** Decodes direct {@link ByteBuffer} instances into {@link String}. */
abstract String decodeUtf8Direct(ByteBuffer buffer, int index, int size)
throws InvalidProtocolBufferException;
/**
- * Decodes {@link ByteBuffer} instances using the {@link ByteBuffer} API rather than
- * potentially faster approaches.
+ * Decodes {@link ByteBuffer} instances using the {@link ByteBuffer} API rather than potentially
+ * faster approaches.
*/
final String decodeUtf8Default(ByteBuffer buffer, int index, int size)
throws InvalidProtocolBufferException {
@@ -746,21 +719,22 @@ final class Utf8 {
/**
* Encodes an input character sequence ({@code in}) to UTF-8 in the target array ({@code out}).
* For a string, this method is similar to
+ *
* <pre>{@code
* byte[] a = string.getBytes(UTF_8);
* System.arraycopy(a, 0, bytes, offset, a.length);
* return offset + a.length;
* }</pre>
*
- * but is more efficient in both time and space. One key difference is that this method
- * requires paired surrogates, and therefore does not support chunking.
- * While {@code String.getBytes(UTF_8)} replaces unpaired surrogates with the default
- * replacement character, this method throws {@link UnpairedSurrogateException}.
+ * but is more efficient in both time and space. One key difference is that this method requires
+ * paired surrogates, and therefore does not support chunking. While {@code
+ * String.getBytes(UTF_8)} replaces unpaired surrogates with the default replacement character,
+ * this method throws {@link UnpairedSurrogateException}.
*
* <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to
- * compute the exact amount needed, or leave room for
- * {@code Utf8.MAX_BYTES_PER_CHAR * sequence.length()}, which is the largest possible number
- * of bytes that any input can be encoded to.
+ * compute the exact amount needed, or leave room for {@code Utf8.MAX_BYTES_PER_CHAR *
+ * sequence.length()}, which is the largest possible number of bytes that any input can be
+ * encoded to.
*
* @param in the input character sequence to be encoded
* @param out the target array
@@ -777,26 +751,24 @@ final class Utf8 {
/**
* Encodes an input character sequence ({@code in}) to UTF-8 in the target buffer ({@code out}).
* Upon returning from this method, the {@code out} position will point to the position after
- * the last encoded byte. This method requires paired surrogates, and therefore does not
- * support chunking.
+ * the last encoded byte. This method requires paired surrogates, and therefore does not support
+ * chunking.
*
* <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to
- * compute the exact amount needed, or leave room for
- * {@code Utf8.MAX_BYTES_PER_CHAR * in.length()}, which is the largest possible number
- * of bytes that any input can be encoded to.
+ * compute the exact amount needed, or leave room for {@code Utf8.MAX_BYTES_PER_CHAR *
+ * in.length()}, which is the largest possible number of bytes that any input can be encoded to.
*
* @param in the source character sequence to be encoded
* @param out the target buffer
* @throws UnpairedSurrogateException if {@code in} contains ill-formed UTF-16 (unpaired
* surrogates)
- * @throws ArrayIndexOutOfBoundsException if {@code in} encoded in UTF-8 is longer than
- * {@code out.remaining()}
+ * @throws ArrayIndexOutOfBoundsException if {@code in} encoded in UTF-8 is longer than {@code
+ * out.remaining()}
*/
final void encodeUtf8(CharSequence in, ByteBuffer out) {
if (out.hasArray()) {
final int offset = out.arrayOffset();
- int endIndex =
- Utf8.encode(in, out.array(), offset + out.position(), out.remaining());
+ int endIndex = Utf8.encode(in, out.array(), offset + out.position(), out.remaining());
out.position(endIndex - offset);
} else if (out.isDirect()) {
encodeUtf8Direct(in, out);
@@ -805,9 +777,7 @@ final class Utf8 {
}
}
- /**
- * Encodes the input character sequence to a direct {@link ByteBuffer} instance.
- */
+ /** Encodes the input character sequence to a direct {@link ByteBuffer} instance. */
abstract void encodeUtf8Direct(CharSequence in, ByteBuffer out);
/**
@@ -886,9 +856,7 @@ final class Utf8 {
}
}
- /**
- * {@link Processor} implementation that does not use any {@code sun.misc.Unsafe} methods.
- */
+ /** {@link Processor} implementation that does not use any {@code sun.misc.Unsafe} methods. */
static final class SafeProcessor extends Processor {
@Override
int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
@@ -900,7 +868,7 @@ final class Utf8 {
//
// We expect such "straddler characters" to be rare.
- if (index >= limit) { // No bytes? No progress.
+ if (index >= limit) { // No bytes? No progress.
return state;
}
int byte1 = (byte) state;
@@ -1097,8 +1065,7 @@ final class Utf8 {
// Minimum code point represented by a surrogate pair is 0x10000, 17 bits,
// four UTF-8 bytes
final char low;
- if (i + 1 == in.length()
- || !Character.isSurrogatePair(c, (low = in.charAt(++i)))) {
+ if (i + 1 == in.length() || !Character.isSurrogatePair(c, (low = in.charAt(++i)))) {
throw new UnpairedSurrogateException((i - 1), utf16Length);
}
int codePoint = Character.toCodePoint(c, low);
@@ -1110,8 +1077,7 @@ final class Utf8 {
// If we are surrogates and we're not a surrogate pair, always throw an
// UnpairedSurrogateException instead of an ArrayOutOfBoundsException.
if ((Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE)
- && (i + 1 == in.length()
- || !Character.isSurrogatePair(c, in.charAt(i + 1)))) {
+ && (i + 1 == in.length() || !Character.isSurrogatePair(c, in.charAt(i + 1)))) {
throw new UnpairedSurrogateException(i, utf16Length);
}
throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + j);
@@ -1137,7 +1103,7 @@ final class Utf8 {
}
private static int partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit) {
- for (;;) {
+ for (; ; ) {
int byte1, byte2;
// Optimize for interior runs of ASCII bytes.
@@ -1157,8 +1123,7 @@ final class Utf8 {
// Simultaneously checks for illegal trailing-byte in
// leading position and overlong 2-byte form.
- if (byte1 < (byte) 0xC2
- || bytes[index++] > (byte) 0xBF) {
+ if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) {
return MALFORMED;
}
} else if (byte1 < (byte) 0xF0) {
@@ -1179,7 +1144,7 @@ final class Utf8 {
} else {
// four-byte form
- if (index >= limit - 2) { // incomplete sequence
+ if (index >= limit - 2) { // incomplete sequence
return incompleteStateFor(bytes, index, limit);
}
if ((byte2 = bytes[index++]) > (byte) 0xBF
@@ -1199,13 +1164,9 @@ final class Utf8 {
}
}
- /**
- * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance.
- */
+ /** {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance. */
static final class UnsafeProcessor extends Processor {
- /**
- * Indicates whether or not all required unsafe operations are supported on this platform.
- */
+ /** Indicates whether or not all required unsafe operations are supported on this platform. */
static boolean isAvailable() {
return hasUnsafeArrayOperations() && hasUnsafeByteBufferOperations();
}
@@ -1227,7 +1188,7 @@ final class Utf8 {
//
// We expect such "straddler characters" to be rare.
- if (offset >= offsetLimit) { // No bytes? No progress.
+ if (offset >= offsetLimit) { // No bytes? No progress.
return state;
}
int byte1 = (byte) state;
@@ -1474,10 +1435,7 @@ final class Utf8 {
}
}
- if (resultPos < resultArr.length) {
- resultArr = Arrays.copyOf(resultArr, resultPos);
- }
- return UnsafeUtil.moveToString(resultArr);
+ return new String(resultArr, 0, resultPos);
}
@Override
@@ -1553,10 +1511,7 @@ final class Utf8 {
}
}
- if (resultPos < resultArr.length) {
- resultArr = Arrays.copyOf(resultArr, resultPos);
- }
- return UnsafeUtil.moveToString(resultArr);
+ return new String(resultArr, 0, resultPos);
}
@Override
@@ -1690,8 +1645,8 @@ final class Utf8 {
* @param bytes the array containing the character sequence
* @param offset the offset position of the index (same as index + arrayBaseOffset)
* @param maxChars the maximum number of characters to count
- * @return the number of ASCII characters found. The stopping position will be at or
- * before the first non-ASCII byte.
+ * @return the number of ASCII characters found. The stopping position will be at or before the
+ * first non-ASCII byte.
*/
private static int unsafeEstimateConsecutiveAscii(
byte[] bytes, long offset, final int maxChars) {
@@ -1733,24 +1688,24 @@ final class Utf8 {
// To speed things up further, we're reading longs instead of bytes so we use a mask to
// determine if any byte in the current long is non-ASCII.
remaining -= unaligned;
- for (; remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) == 0;
+ for (;
+ remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) == 0;
address += 8, remaining -= 8) {}
return maxChars - remaining;
}
private static int partialIsValidUtf8(final byte[] bytes, long offset, int remaining) {
- // Skip past ASCII characters as quickly as possible.
+ // Skip past ASCII characters as quickly as possible.
final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remaining);
remaining -= skipped;
offset += skipped;
- for (;;) {
+ for (; ; ) {
// Optimize for interior runs of ASCII bytes.
// TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
// Maybe after seeing a few in a row that are ASCII, go back to fast mode?
int byte1 = 0;
- for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(bytes, offset++)) >= 0; --remaining) {
- }
+ for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(bytes, offset++)) >= 0; --remaining) {}
if (remaining == 0) {
return COMPLETE;
}
@@ -1767,8 +1722,7 @@ final class Utf8 {
// Simultaneously checks for illegal trailing-byte in
// leading position and overlong 2-byte form.
- if (byte1 < (byte) 0xC2
- || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
+ if (byte1 < (byte) 0xC2 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
return MALFORMED;
}
} else if (byte1 < (byte) 0xF0) {
@@ -1820,13 +1774,12 @@ final class Utf8 {
address += skipped;
remaining -= skipped;
- for (;;) {
+ for (; ; ) {
// Optimize for interior runs of ASCII bytes.
// TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
// Maybe after seeing a few in a row that are ASCII, go back to fast mode?
int byte1 = 0;
- for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(address++)) >= 0; --remaining) {
- }
+ for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(address++)) >= 0; --remaining) {}
if (remaining == 0) {
return COMPLETE;
}
@@ -1891,40 +1844,32 @@ final class Utf8 {
}
}
- private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long offset,
- int remaining) {
+ private static int unsafeIncompleteStateFor(
+ byte[] bytes, int byte1, long offset, int remaining) {
switch (remaining) {
- case 0: {
+ case 0:
return incompleteStateFor(byte1);
- }
- case 1: {
+ case 1:
return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset));
- }
- case 2: {
- return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset),
- UnsafeUtil.getByte(bytes, offset + 1));
- }
- default: {
+ case 2:
+ return incompleteStateFor(
+ byte1, UnsafeUtil.getByte(bytes, offset), UnsafeUtil.getByte(bytes, offset + 1));
+ default:
throw new AssertionError();
- }
}
}
private static int unsafeIncompleteStateFor(long address, final int byte1, int remaining) {
switch (remaining) {
- case 0: {
+ case 0:
return incompleteStateFor(byte1);
- }
- case 1: {
+ case 1:
return incompleteStateFor(byte1, UnsafeUtil.getByte(address));
- }
- case 2: {
- return incompleteStateFor(byte1, UnsafeUtil.getByte(address),
- UnsafeUtil.getByte(address + 1));
- }
- default: {
+ case 2:
+ return incompleteStateFor(
+ byte1, UnsafeUtil.getByte(address), UnsafeUtil.getByte(address + 1));
+ default:
throw new AssertionError();
- }
}
}
}
@@ -1936,23 +1881,17 @@ final class Utf8 {
*/
private static class DecodeUtil {
- /**
- * Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'.
- */
+ /** Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'. */
private static boolean isOneByte(byte b) {
return b >= 0;
}
- /**
- * Returns whether this is a two-byte codepoint with the form '10XXXXXX'.
- */
+ /** Returns whether this is a two-byte codepoint with the form '110XXXXX'. */
private static boolean isTwoBytes(byte b) {
return b < (byte) 0xE0;
}
- /**
- * Returns whether this is a three-byte codepoint with the form '110XXXXX'.
- */
+ /** Returns whether this is a three-byte codepoint with the form '1110XXXX'. */
private static boolean isThreeBytes(byte b) {
return b < (byte) 0xF0;
}
@@ -1961,13 +1900,11 @@ final class Utf8 {
resultArr[resultPos] = (char) byte1;
}
- private static void handleTwoBytes(
- byte byte1, byte byte2, char[] resultArr, int resultPos)
+ private static void handleTwoBytes(byte byte1, byte byte2, char[] resultArr, int resultPos)
throws InvalidProtocolBufferException {
// Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and
// overlong 2-byte, '11000001'.
- if (byte1 < (byte) 0xC2
- || isNotTrailingByte(byte2)) {
+ if (byte1 < (byte) 0xC2 || isNotTrailingByte(byte2)) {
throw InvalidProtocolBufferException.invalidUtf8();
}
resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) | trailingByteValue(byte2));
@@ -1984,13 +1921,14 @@ final class Utf8 {
|| isNotTrailingByte(byte3)) {
throw InvalidProtocolBufferException.invalidUtf8();
}
- resultArr[resultPos] = (char)
- (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3));
+ resultArr[resultPos] =
+ (char)
+ (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3));
}
private static void handleFourBytes(
byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos)
- throws InvalidProtocolBufferException{
+ throws InvalidProtocolBufferException {
if (isNotTrailingByte(byte2)
// Check that 1 <= plane <= 16. Tricky optimized form of:
// valid 4-byte leading byte?
@@ -2004,31 +1942,28 @@ final class Utf8 {
|| isNotTrailingByte(byte4)) {
throw InvalidProtocolBufferException.invalidUtf8();
}
- int codepoint = ((byte1 & 0x07) << 18)
- | (trailingByteValue(byte2) << 12)
- | (trailingByteValue(byte3) << 6)
- | trailingByteValue(byte4);
+ int codepoint =
+ ((byte1 & 0x07) << 18)
+ | (trailingByteValue(byte2) << 12)
+ | (trailingByteValue(byte3) << 6)
+ | trailingByteValue(byte4);
resultArr[resultPos] = DecodeUtil.highSurrogate(codepoint);
resultArr[resultPos + 1] = DecodeUtil.lowSurrogate(codepoint);
}
- /**
- * Returns whether the byte is not a valid continuation of the form '10XXXXXX'.
- */
+ /** Returns whether the byte is not a valid continuation of the form '10XXXXXX'. */
private static boolean isNotTrailingByte(byte b) {
return b > (byte) 0xBF;
}
- /**
- * Returns the actual value of the trailing byte (removes the prefix '10') for composition.
- */
+ /** Returns the actual value of the trailing byte (removes the prefix '10') for composition. */
private static int trailingByteValue(byte b) {
return b & 0x3F;
}
private static char highSurrogate(int codePoint) {
- return (char) ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10))
- + (codePoint >>> 10));
+ return (char)
+ ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)) + (codePoint >>> 10));
}
private static char lowSurrogate(int codePoint) {