diff options
Diffstat (limited to 'java/core/src/main/java/com/google/protobuf/Utf8.java')
-rw-r--r-- | java/core/src/main/java/com/google/protobuf/Utf8.java | 392 |
1 file changed, 166 insertions, 226 deletions
diff --git a/java/core/src/main/java/com/google/protobuf/Utf8.java b/java/core/src/main/java/com/google/protobuf/Utf8.java index b4a81ca3..4512bf9b 100644 --- a/java/core/src/main/java/com/google/protobuf/Utf8.java +++ b/java/core/src/main/java/com/google/protobuf/Utf8.java @@ -44,36 +44,31 @@ import static java.lang.Character.toCodePoint; import java.nio.ByteBuffer; /** - * A set of low-level, high-performance static utility methods related - * to the UTF-8 character encoding. This class has no dependencies - * outside of the core JDK libraries. + * A set of low-level, high-performance static utility methods related to the UTF-8 character + * encoding. This class has no dependencies outside of the core JDK libraries. * - * <p>There are several variants of UTF-8. The one implemented by - * this class is the restricted definition of UTF-8 introduced in - * Unicode 3.1, which mandates the rejection of "overlong" byte - * sequences as well as rejection of 3-byte surrogate codepoint byte - * sequences. Note that the UTF-8 decoder included in Oracle's JDK - * has been modified to also reject "overlong" byte sequences, but (as - * of 2011) still accepts 3-byte surrogate codepoint byte sequences. + * <p>There are several variants of UTF-8. The one implemented by this class is the restricted + * definition of UTF-8 introduced in Unicode 3.1, which mandates the rejection of "overlong" byte + * sequences as well as rejection of 3-byte surrogate codepoint byte sequences. Note that the UTF-8 + * decoder included in Oracle's JDK has been modified to also reject "overlong" byte sequences, but + * (as of 2011) still accepts 3-byte surrogate codepoint byte sequences. 
* - * <p>The byte sequences considered valid by this class are exactly - * those that can be roundtrip converted to Strings and back to bytes - * using the UTF-8 charset, without loss: <pre> {@code + * <p>The byte sequences considered valid by this class are exactly those that can be roundtrip + * converted to Strings and back to bytes using the UTF-8 charset, without loss: + * + * <pre>{@code * Arrays.equals(bytes, new String(bytes, Internal.UTF_8).getBytes(Internal.UTF_8)) * }</pre> * - * <p>See the Unicode Standard,</br> - * Table 3-6. <em>UTF-8 Bit Distribution</em>,</br> - * Table 3-7. <em>Well Formed UTF-8 Byte Sequences</em>. + * <p>See the Unicode Standard,</br> Table 3-6. <em>UTF-8 Bit Distribution</em>,</br> Table 3-7. + * <em>Well Formed UTF-8 Byte Sequences</em>. * - * <p>This class supports decoding of partial byte sequences, so that the - * bytes in a complete UTF-8 byte sequences can be stored in multiple - * segments. Methods typically return {@link #MALFORMED} if the partial - * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is - * well-formed in the absence of additional input, or if the byte sequence - * apparently terminated in the middle of a character, an opaque integer - * "state" value containing enough information to decode the character when - * passed to a subsequent invocation of a partial decoding method. + * <p>This class supports decoding of partial byte sequences, so that the bytes in a complete UTF-8 + * byte sequences can be stored in multiple segments. Methods typically return {@link #MALFORMED} if + * the partial byte sequence is definitely not well-formed, {@link #COMPLETE} if it is well-formed + * in the absence of additional input, or if the byte sequence apparently terminated in the middle + * of a character, an opaque integer "state" value containing enough information to decode the + * character when passed to a subsequent invocation of a partial decoding method. 
* * @author martinrb@google.com (Martin Buchholz) */ @@ -98,31 +93,28 @@ final class Utf8 { /** * Maximum number of bytes per Java UTF-16 char in UTF-8. + * * @see java.nio.charset.CharsetEncoder#maxBytesPerChar() */ static final int MAX_BYTES_PER_CHAR = 3; /** - * State value indicating that the byte sequence is well-formed and - * complete (no further bytes are needed to complete a character). + * State value indicating that the byte sequence is well-formed and complete (no further bytes are + * needed to complete a character). */ public static final int COMPLETE = 0; - /** - * State value indicating that the byte sequence is definitely not - * well-formed. - */ + /** State value indicating that the byte sequence is definitely not well-formed. */ public static final int MALFORMED = -1; /** * Used by {@code Unsafe} UTF-8 string validation logic to determine the minimum string length * above which to employ an optimized algorithm for counting ASCII characters. The reason for this * threshold is that for small strings, the optimization may not be beneficial or may even - * negatively impact performance since it requires additional logic to avoid unaligned reads - * (when calling {@code Unsafe.getLong}). This threshold guarantees that even if the initial - * offset is unaligned, we're guaranteed to make at least one call to {@code Unsafe.getLong()} - * which provides a performance improvement that entirely subsumes the cost of the additional - * logic. + * negatively impact performance since it requires additional logic to avoid unaligned reads (when + * calling {@code Unsafe.getLong}). This threshold guarantees that even if the initial offset is + * unaligned, we're guaranteed to make at least one call to {@code Unsafe.getLong()} which + * provides a performance improvement that entirely subsumes the cost of the additional logic. */ private static final int UNSAFE_COUNT_ASCII_THRESHOLD = 16; @@ -146,76 +138,69 @@ final class Utf8 { // are valid trailing bytes. 
/** - * Returns {@code true} if the given byte array is a well-formed - * UTF-8 byte sequence. + * Returns {@code true} if the given byte array is a well-formed UTF-8 byte sequence. * - * <p>This is a convenience method, equivalent to a call to {@code - * isValidUtf8(bytes, 0, bytes.length)}. + * <p>This is a convenience method, equivalent to a call to {@code isValidUtf8(bytes, 0, + * bytes.length)}. */ public static boolean isValidUtf8(byte[] bytes) { return processor.isValidUtf8(bytes, 0, bytes.length); } /** - * Returns {@code true} if the given byte array slice is a - * well-formed UTF-8 byte sequence. The range of bytes to be - * checked extends from index {@code index}, inclusive, to {@code - * limit}, exclusive. + * Returns {@code true} if the given byte array slice is a well-formed UTF-8 byte sequence. The + * range of bytes to be checked extends from index {@code index}, inclusive, to {@code limit}, + * exclusive. * - * <p>This is a convenience method, equivalent to {@code - * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}. + * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(bytes, index, limit) + * == Utf8.COMPLETE}. */ public static boolean isValidUtf8(byte[] bytes, int index, int limit) { return processor.isValidUtf8(bytes, index, limit); } /** - * Tells whether the given byte array slice is a well-formed, - * malformed, or incomplete UTF-8 byte sequence. The range of bytes - * to be checked extends from index {@code index}, inclusive, to + * Tells whether the given byte array slice is a well-formed, malformed, or incomplete UTF-8 byte + * sequence. The range of bytes to be checked extends from index {@code index}, inclusive, to * {@code limit}, exclusive. 
* - * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding - * operation) or the value returned from a call to a partial decoding method - * for the previous bytes - * - * @return {@link #MALFORMED} if the partial byte sequence is - * definitely not well-formed, {@link #COMPLETE} if it is well-formed - * (no additional input needed), or if the byte sequence is - * "incomplete", i.e. apparently terminated in the middle of a character, - * an opaque integer "state" value containing enough information to - * decode the character when passed to a subsequent invocation of a - * partial decoding method. + * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding operation) or the + * value returned from a call to a partial decoding method for the previous bytes + * @return {@link #MALFORMED} if the partial byte sequence is definitely not well-formed, {@link + * #COMPLETE} if it is well-formed (no additional input needed), or if the byte sequence is + * "incomplete", i.e. apparently terminated in the middle of a character, an opaque integer + * "state" value containing enough information to decode the character when passed to a + * subsequent invocation of a partial decoding method. */ public static int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) { return processor.partialIsValidUtf8(state, bytes, index, limit); } private static int incompleteStateFor(int byte1) { - return (byte1 > (byte) 0xF4) ? - MALFORMED : byte1; + return (byte1 > (byte) 0xF4) ? MALFORMED : byte1; } private static int incompleteStateFor(int byte1, int byte2) { - return (byte1 > (byte) 0xF4 || - byte2 > (byte) 0xBF) ? - MALFORMED : byte1 ^ (byte2 << 8); + return (byte1 > (byte) 0xF4 || byte2 > (byte) 0xBF) ? MALFORMED : byte1 ^ (byte2 << 8); } private static int incompleteStateFor(int byte1, int byte2, int byte3) { - return (byte1 > (byte) 0xF4 || - byte2 > (byte) 0xBF || - byte3 > (byte) 0xBF) ? 
- MALFORMED : byte1 ^ (byte2 << 8) ^ (byte3 << 16); + return (byte1 > (byte) 0xF4 || byte2 > (byte) 0xBF || byte3 > (byte) 0xBF) + ? MALFORMED + : byte1 ^ (byte2 << 8) ^ (byte3 << 16); } private static int incompleteStateFor(byte[] bytes, int index, int limit) { int byte1 = bytes[index - 1]; switch (limit - index) { - case 0: return incompleteStateFor(byte1); - case 1: return incompleteStateFor(byte1, bytes[index]); - case 2: return incompleteStateFor(byte1, bytes[index], bytes[index + 1]); - default: throw new AssertionError(); + case 0: + return incompleteStateFor(byte1); + case 1: + return incompleteStateFor(byte1, bytes[index]); + case 2: + return incompleteStateFor(byte1, bytes[index], bytes[index + 1]); + default: + throw new AssertionError(); } } @@ -236,7 +221,7 @@ final class Utf8 { // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can // fallback to more lenient behavior. - + static class UnpairedSurrogateException extends IllegalArgumentException { UnpairedSurrogateException(int index, int length) { super("Unpaired surrogate at index " + index + " of " + length); @@ -244,9 +229,9 @@ final class Utf8 { } /** - * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, - * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in - * both time and space. + * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, this + * method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in both + * time and space. * * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired * surrogates) @@ -266,7 +251,7 @@ final class Utf8 { for (; i < utf16Length; i++) { char c = sequence.charAt(i); if (c < 0x800) { - utf8Length += ((0x7f - c) >>> 31); // branch free! 
+ utf8Length += ((0x7f - c) >>> 31); // branch free! } else { utf8Length += encodedLengthGeneral(sequence, i); break; @@ -275,8 +260,8 @@ final class Utf8 { if (utf8Length < utf16Length) { // Necessary and sufficient condition for overflow because of maximum 3x expansion - throw new IllegalArgumentException("UTF-8 length does not fit in int: " - + (utf8Length + (1L << 32))); + throw new IllegalArgumentException( + "UTF-8 length does not fit in int: " + (utf8Length + (1L << 32))); } return utf8Length; } @@ -370,15 +355,15 @@ final class Utf8 { } /** - * Counts (approximately) the number of consecutive ASCII characters in the given buffer. - * The byte order of the {@link ByteBuffer} does not matter, so performance can be improved if - * native byte order is used (i.e. no byte-swapping in {@link ByteBuffer#getLong(int)}). + * Counts (approximately) the number of consecutive ASCII characters in the given buffer. The byte + * order of the {@link ByteBuffer} does not matter, so performance can be improved if native byte + * order is used (i.e. no byte-swapping in {@link ByteBuffer#getLong(int)}). * * @param buffer the buffer to be scanned for ASCII chars * @param index the starting index of the scan * @param limit the limit within buffer for the scan - * @return the number of ASCII characters found. The stopping position will be at or - * before the first non-ASCII byte. + * @return the number of ASCII characters found. The stopping position will be at or before the + * first non-ASCII byte. */ private static int estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit) { int i = index; @@ -390,52 +375,43 @@ final class Utf8 { return i - index; } - /** - * A processor of UTF-8 strings, providing methods for checking validity and encoding. - */ + /** A processor of UTF-8 strings, providing methods for checking validity and encoding. */ // TODO(nathanmittler): Add support for Memory/MemoryBlock on Android. 
abstract static class Processor { /** - * Returns {@code true} if the given byte array slice is a - * well-formed UTF-8 byte sequence. The range of bytes to be - * checked extends from index {@code index}, inclusive, to {@code - * limit}, exclusive. + * Returns {@code true} if the given byte array slice is a well-formed UTF-8 byte sequence. The + * range of bytes to be checked extends from index {@code index}, inclusive, to {@code limit}, + * exclusive. * - * <p>This is a convenience method, equivalent to {@code - * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}. + * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(bytes, index, limit) + * == Utf8.COMPLETE}. */ final boolean isValidUtf8(byte[] bytes, int index, int limit) { return partialIsValidUtf8(COMPLETE, bytes, index, limit) == COMPLETE; } /** - * Tells whether the given byte array slice is a well-formed, - * malformed, or incomplete UTF-8 byte sequence. The range of bytes - * to be checked extends from index {@code index}, inclusive, to - * {@code limit}, exclusive. + * Tells whether the given byte array slice is a well-formed, malformed, or incomplete UTF-8 + * byte sequence. The range of bytes to be checked extends from index {@code index}, inclusive, + * to {@code limit}, exclusive. * - * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding - * operation) or the value returned from a call to a partial decoding method - * for the previous bytes - * - * @return {@link #MALFORMED} if the partial byte sequence is - * definitely not well-formed, {@link #COMPLETE} if it is well-formed - * (no additional input needed), or if the byte sequence is - * "incomplete", i.e. apparently terminated in the middle of a character, - * an opaque integer "state" value containing enough information to - * decode the character when passed to a subsequent invocation of a - * partial decoding method. 
+ * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding operation) or the + * value returned from a call to a partial decoding method for the previous bytes + * @return {@link #MALFORMED} if the partial byte sequence is definitely not well-formed, {@link + * #COMPLETE} if it is well-formed (no additional input needed), or if the byte sequence is + * "incomplete", i.e. apparently terminated in the middle of a character, an opaque integer + * "state" value containing enough information to decode the character when passed to a + * subsequent invocation of a partial decoding method. */ abstract int partialIsValidUtf8(int state, byte[] bytes, int index, int limit); /** - * Returns {@code true} if the given portion of the {@link ByteBuffer} is a - * well-formed UTF-8 byte sequence. The range of bytes to be - * checked extends from index {@code index}, inclusive, to {@code - * limit}, exclusive. + * Returns {@code true} if the given portion of the {@link ByteBuffer} is a well-formed UTF-8 + * byte sequence. The range of bytes to be checked extends from index {@code index}, inclusive, + * to {@code limit}, exclusive. * - * <p>This is a convenience method, equivalent to {@code - * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}. + * <p>This is a convenience method, equivalent to {@code partialIsValidUtf8(bytes, index, limit) + * == Utf8.COMPLETE}. */ final boolean isValidUtf8(ByteBuffer buffer, int index, int limit) { return partialIsValidUtf8(COMPLETE, buffer, index, limit) == COMPLETE; @@ -452,22 +428,20 @@ final class Utf8 { if (buffer.hasArray()) { final int offset = buffer.arrayOffset(); return partialIsValidUtf8(state, buffer.array(), offset + index, offset + limit); - } else if (buffer.isDirect()){ + } else if (buffer.isDirect()) { return partialIsValidUtf8Direct(state, buffer, index, limit); } return partialIsValidUtf8Default(state, buffer, index, limit); } - /** - * Performs validation for direct {@link ByteBuffer} instances. 
- */ + /** Performs validation for direct {@link ByteBuffer} instances. */ abstract int partialIsValidUtf8Direct( final int state, final ByteBuffer buffer, int index, final int limit); /** * Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather - * than potentially faster approaches. This first completes validation for the current - * character (provided by {@code state}) and then finishes validation for the sequence. + * than potentially faster approaches. This first completes validation for the current character + * (provided by {@code state}) and then finishes validation for the sequence. */ final int partialIsValidUtf8Default( final int state, final ByteBuffer buffer, int index, final int limit) { @@ -566,7 +540,7 @@ final class Utf8 { private static int partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit) { index += estimateConsecutiveAscii(buffer, index, limit); - for (;;) { + for (; ; ) { // Optimize for interior runs of ASCII bytes. // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold? // Maybe after seeing a few in a row that are ASCII, go back to fast mode? @@ -658,15 +632,13 @@ final class Utf8 { return decodeUtf8Default(buffer, index, size); } - /** - * Decodes direct {@link ByteBuffer} instances into {@link String}. - */ + /** Decodes direct {@link ByteBuffer} instances into {@link String}. */ abstract String decodeUtf8Direct(ByteBuffer buffer, int index, int size) throws InvalidProtocolBufferException; /** - * Decodes {@link ByteBuffer} instances using the {@link ByteBuffer} API rather than - * potentially faster approaches. + * Decodes {@link ByteBuffer} instances using the {@link ByteBuffer} API rather than potentially + * faster approaches. 
*/ final String decodeUtf8Default(ByteBuffer buffer, int index, int size) throws InvalidProtocolBufferException { @@ -747,21 +719,22 @@ final class Utf8 { /** * Encodes an input character sequence ({@code in}) to UTF-8 in the target array ({@code out}). * For a string, this method is similar to + * * <pre>{@code * byte[] a = string.getBytes(UTF_8); * System.arraycopy(a, 0, bytes, offset, a.length); * return offset + a.length; * }</pre> * - * but is more efficient in both time and space. One key difference is that this method - * requires paired surrogates, and therefore does not support chunking. - * While {@code String.getBytes(UTF_8)} replaces unpaired surrogates with the default - * replacement character, this method throws {@link UnpairedSurrogateException}. + * but is more efficient in both time and space. One key difference is that this method requires + * paired surrogates, and therefore does not support chunking. While {@code + * String.getBytes(UTF_8)} replaces unpaired surrogates with the default replacement character, + * this method throws {@link UnpairedSurrogateException}. * * <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to - * compute the exact amount needed, or leave room for - * {@code Utf8.MAX_BYTES_PER_CHAR * sequence.length()}, which is the largest possible number - * of bytes that any input can be encoded to. + * compute the exact amount needed, or leave room for {@code Utf8.MAX_BYTES_PER_CHAR * + * sequence.length()}, which is the largest possible number of bytes that any input can be + * encoded to. * * @param in the input character sequence to be encoded * @param out the target array @@ -778,26 +751,24 @@ final class Utf8 { /** * Encodes an input character sequence ({@code in}) to UTF-8 in the target buffer ({@code out}). * Upon returning from this method, the {@code out} position will point to the position after - * the last encoded byte. 
This method requires paired surrogates, and therefore does not - * support chunking. + * the last encoded byte. This method requires paired surrogates, and therefore does not support + * chunking. * * <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to - * compute the exact amount needed, or leave room for - * {@code Utf8.MAX_BYTES_PER_CHAR * in.length()}, which is the largest possible number - * of bytes that any input can be encoded to. + * compute the exact amount needed, or leave room for {@code Utf8.MAX_BYTES_PER_CHAR * + * in.length()}, which is the largest possible number of bytes that any input can be encoded to. * * @param in the source character sequence to be encoded * @param out the target buffer * @throws UnpairedSurrogateException if {@code in} contains ill-formed UTF-16 (unpaired * surrogates) - * @throws ArrayIndexOutOfBoundsException if {@code in} encoded in UTF-8 is longer than - * {@code out.remaining()} + * @throws ArrayIndexOutOfBoundsException if {@code in} encoded in UTF-8 is longer than {@code + * out.remaining()} */ final void encodeUtf8(CharSequence in, ByteBuffer out) { if (out.hasArray()) { final int offset = out.arrayOffset(); - int endIndex = - Utf8.encode(in, out.array(), offset + out.position(), out.remaining()); + int endIndex = Utf8.encode(in, out.array(), offset + out.position(), out.remaining()); out.position(endIndex - offset); } else if (out.isDirect()) { encodeUtf8Direct(in, out); @@ -806,9 +777,7 @@ final class Utf8 { } } - /** - * Encodes the input character sequence to a direct {@link ByteBuffer} instance. - */ + /** Encodes the input character sequence to a direct {@link ByteBuffer} instance. */ abstract void encodeUtf8Direct(CharSequence in, ByteBuffer out); /** @@ -887,9 +856,7 @@ final class Utf8 { } } - /** - * {@link Processor} implementation that does not use any {@code sun.misc.Unsafe} methods. 
- */ + /** {@link Processor} implementation that does not use any {@code sun.misc.Unsafe} methods. */ static final class SafeProcessor extends Processor { @Override int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) { @@ -901,7 +868,7 @@ final class Utf8 { // // We expect such "straddler characters" to be rare. - if (index >= limit) { // No bytes? No progress. + if (index >= limit) { // No bytes? No progress. return state; } int byte1 = (byte) state; @@ -1098,8 +1065,7 @@ final class Utf8 { // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, // four UTF-8 bytes final char low; - if (i + 1 == in.length() - || !Character.isSurrogatePair(c, (low = in.charAt(++i)))) { + if (i + 1 == in.length() || !Character.isSurrogatePair(c, (low = in.charAt(++i)))) { throw new UnpairedSurrogateException((i - 1), utf16Length); } int codePoint = Character.toCodePoint(c, low); @@ -1111,8 +1077,7 @@ final class Utf8 { // If we are surrogates and we're not a surrogate pair, always throw an // UnpairedSurrogateException instead of an ArrayOutOfBoundsException. if ((Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) - && (i + 1 == in.length() - || !Character.isSurrogatePair(c, in.charAt(i + 1)))) { + && (i + 1 == in.length() || !Character.isSurrogatePair(c, in.charAt(i + 1)))) { throw new UnpairedSurrogateException(i, utf16Length); } throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + j); @@ -1138,7 +1103,7 @@ final class Utf8 { } private static int partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit) { - for (;;) { + for (; ; ) { int byte1, byte2; // Optimize for interior runs of ASCII bytes. @@ -1158,8 +1123,7 @@ final class Utf8 { // Simultaneously checks for illegal trailing-byte in // leading position and overlong 2-byte form. 
- if (byte1 < (byte) 0xC2 - || bytes[index++] > (byte) 0xBF) { + if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) { return MALFORMED; } } else if (byte1 < (byte) 0xF0) { @@ -1180,7 +1144,7 @@ final class Utf8 { } else { // four-byte form - if (index >= limit - 2) { // incomplete sequence + if (index >= limit - 2) { // incomplete sequence return incompleteStateFor(bytes, index, limit); } if ((byte2 = bytes[index++]) > (byte) 0xBF @@ -1200,13 +1164,9 @@ final class Utf8 { } } - /** - * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance. - */ + /** {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance. */ static final class UnsafeProcessor extends Processor { - /** - * Indicates whether or not all required unsafe operations are supported on this platform. - */ + /** Indicates whether or not all required unsafe operations are supported on this platform. */ static boolean isAvailable() { return hasUnsafeArrayOperations() && hasUnsafeByteBufferOperations(); } @@ -1228,7 +1188,7 @@ final class Utf8 { // // We expect such "straddler characters" to be rare. - if (offset >= offsetLimit) { // No bytes? No progress. + if (offset >= offsetLimit) { // No bytes? No progress. return state; } int byte1 = (byte) state; @@ -1685,8 +1645,8 @@ final class Utf8 { * @param bytes the array containing the character sequence * @param offset the offset position of the index (same as index + arrayBaseOffset) * @param maxChars the maximum number of characters to count - * @return the number of ASCII characters found. The stopping position will be at or - * before the first non-ASCII byte. + * @return the number of ASCII characters found. The stopping position will be at or before the + * first non-ASCII byte. 
*/ private static int unsafeEstimateConsecutiveAscii( byte[] bytes, long offset, final int maxChars) { @@ -1728,24 +1688,24 @@ final class Utf8 { // To speed things up further, we're reading longs instead of bytes so we use a mask to // determine if any byte in the current long is non-ASCII. remaining -= unaligned; - for (; remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) == 0; + for (; + remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) == 0; address += 8, remaining -= 8) {} return maxChars - remaining; } private static int partialIsValidUtf8(final byte[] bytes, long offset, int remaining) { - // Skip past ASCII characters as quickly as possible. + // Skip past ASCII characters as quickly as possible. final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remaining); remaining -= skipped; offset += skipped; - for (;;) { + for (; ; ) { // Optimize for interior runs of ASCII bytes. // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold? // Maybe after seeing a few in a row that are ASCII, go back to fast mode? int byte1 = 0; - for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(bytes, offset++)) >= 0; --remaining) { - } + for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(bytes, offset++)) >= 0; --remaining) {} if (remaining == 0) { return COMPLETE; } @@ -1762,8 +1722,7 @@ final class Utf8 { // Simultaneously checks for illegal trailing-byte in // leading position and overlong 2-byte form. - if (byte1 < (byte) 0xC2 - || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { + if (byte1 < (byte) 0xC2 || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) { return MALFORMED; } } else if (byte1 < (byte) 0xF0) { @@ -1815,13 +1774,12 @@ final class Utf8 { address += skipped; remaining -= skipped; - for (;;) { + for (; ; ) { // Optimize for interior runs of ASCII bytes. // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold? 
// Maybe after seeing a few in a row that are ASCII, go back to fast mode? int byte1 = 0; - for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(address++)) >= 0; --remaining) { - } + for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(address++)) >= 0; --remaining) {} if (remaining == 0) { return COMPLETE; } @@ -1886,40 +1844,32 @@ final class Utf8 { } } - private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long offset, - int remaining) { + private static int unsafeIncompleteStateFor( + byte[] bytes, int byte1, long offset, int remaining) { switch (remaining) { - case 0: { + case 0: return incompleteStateFor(byte1); - } - case 1: { + case 1: return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset)); - } - case 2: { - return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset), - UnsafeUtil.getByte(bytes, offset + 1)); - } - default: { + case 2: + return incompleteStateFor( + byte1, UnsafeUtil.getByte(bytes, offset), UnsafeUtil.getByte(bytes, offset + 1)); + default: throw new AssertionError(); - } } } private static int unsafeIncompleteStateFor(long address, final int byte1, int remaining) { switch (remaining) { - case 0: { + case 0: return incompleteStateFor(byte1); - } - case 1: { + case 1: return incompleteStateFor(byte1, UnsafeUtil.getByte(address)); - } - case 2: { - return incompleteStateFor(byte1, UnsafeUtil.getByte(address), - UnsafeUtil.getByte(address + 1)); - } - default: { + case 2: + return incompleteStateFor( + byte1, UnsafeUtil.getByte(address), UnsafeUtil.getByte(address + 1)); + default: throw new AssertionError(); - } } } } @@ -1931,23 +1881,17 @@ final class Utf8 { */ private static class DecodeUtil { - /** - * Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'. - */ + /** Returns whether this is a single-byte codepoint (i.e., ASCII) with the form '0XXXXXXX'. 
*/ private static boolean isOneByte(byte b) { return b >= 0; } - /** - * Returns whether this is a two-byte codepoint with the form '10XXXXXX'. - */ + /** Returns whether this is a two-byte codepoint with the form '10XXXXXX'. */ private static boolean isTwoBytes(byte b) { return b < (byte) 0xE0; } - /** - * Returns whether this is a three-byte codepoint with the form '110XXXXX'. - */ + /** Returns whether this is a three-byte codepoint with the form '110XXXXX'. */ private static boolean isThreeBytes(byte b) { return b < (byte) 0xF0; } @@ -1956,13 +1900,11 @@ final class Utf8 { resultArr[resultPos] = (char) byte1; } - private static void handleTwoBytes( - byte byte1, byte byte2, char[] resultArr, int resultPos) + private static void handleTwoBytes(byte byte1, byte byte2, char[] resultArr, int resultPos) throws InvalidProtocolBufferException { // Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and // overlong 2-byte, '11000001'. - if (byte1 < (byte) 0xC2 - || isNotTrailingByte(byte2)) { + if (byte1 < (byte) 0xC2 || isNotTrailingByte(byte2)) { throw InvalidProtocolBufferException.invalidUtf8(); } resultArr[resultPos] = (char) (((byte1 & 0x1F) << 6) | trailingByteValue(byte2)); @@ -1979,13 +1921,14 @@ final class Utf8 { || isNotTrailingByte(byte3)) { throw InvalidProtocolBufferException.invalidUtf8(); } - resultArr[resultPos] = (char) - (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3)); + resultArr[resultPos] = + (char) + (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3)); } private static void handleFourBytes( byte byte1, byte byte2, byte byte3, byte byte4, char[] resultArr, int resultPos) - throws InvalidProtocolBufferException{ + throws InvalidProtocolBufferException { if (isNotTrailingByte(byte2) // Check that 1 <= plane <= 16. Tricky optimized form of: // valid 4-byte leading byte? 
@@ -1999,31 +1942,28 @@ final class Utf8 { || isNotTrailingByte(byte4)) { throw InvalidProtocolBufferException.invalidUtf8(); } - int codepoint = ((byte1 & 0x07) << 18) - | (trailingByteValue(byte2) << 12) - | (trailingByteValue(byte3) << 6) - | trailingByteValue(byte4); + int codepoint = + ((byte1 & 0x07) << 18) + | (trailingByteValue(byte2) << 12) + | (trailingByteValue(byte3) << 6) + | trailingByteValue(byte4); resultArr[resultPos] = DecodeUtil.highSurrogate(codepoint); resultArr[resultPos + 1] = DecodeUtil.lowSurrogate(codepoint); } - /** - * Returns whether the byte is not a valid continuation of the form '10XXXXXX'. - */ + /** Returns whether the byte is not a valid continuation of the form '10XXXXXX'. */ private static boolean isNotTrailingByte(byte b) { return b > (byte) 0xBF; } - /** - * Returns the actual value of the trailing byte (removes the prefix '10') for composition. - */ + /** Returns the actual value of the trailing byte (removes the prefix '10') for composition. */ private static int trailingByteValue(byte b) { return b & 0x3F; } private static char highSurrogate(int codePoint) { - return (char) ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)) - + (codePoint >>> 10)); + return (char) + ((MIN_HIGH_SURROGATE - (MIN_SUPPLEMENTARY_CODE_POINT >>> 10)) + (codePoint >>> 10)); } private static char lowSurrogate(int codePoint) { |