diff options
author | liujisi@google.com <liujisi@google.com@630680e5-0e50-0410-840e-4b1c322b438d> | 2010-11-02 13:14:58 +0000 |
---|---|---|
committer | liujisi@google.com <liujisi@google.com@630680e5-0e50-0410-840e-4b1c322b438d> | 2010-11-02 13:14:58 +0000 |
commit | 33165fe0d5c265c92f2a67fc2b437b567c24e294 (patch) | |
tree | 52def0850ddd2e976da238d1a437fbda79c96e44 /java/src/main/java/com/google/protobuf/Internal.java | |
parent | 80aa23df6c63750e8cdfdcf3996fbc37d63cac61 (diff) | |
download | protobuf-33165fe0d5c265c92f2a67fc2b437b567c24e294.tar.gz protobuf-33165fe0d5c265c92f2a67fc2b437b567c24e294.tar.bz2 protobuf-33165fe0d5c265c92f2a67fc2b437b567c24e294.zip |
Submit recent changes from internal branch. See CHANGES.txt for more details.
Diffstat (limited to 'java/src/main/java/com/google/protobuf/Internal.java')
-rw-r--r-- | java/src/main/java/com/google/protobuf/Internal.java | 85 |
1 files changed, 85 insertions, 0 deletions
diff --git a/java/src/main/java/com/google/protobuf/Internal.java b/java/src/main/java/com/google/protobuf/Internal.java index 965465e1..05eab57a 100644 --- a/java/src/main/java/com/google/protobuf/Internal.java +++ b/java/src/main/java/com/google/protobuf/Internal.java @@ -100,6 +100,91 @@ public class Internal { } /** + * Helper called by generated code to determine if a byte array is a valid + * UTF-8 encoded string such that the original bytes can be converted to + * a String object and then back to a byte array round tripping the bytes + * without loss. + * <p> + * This is inspired by UTF_8.java in sun.nio.cs. + * + * @param byteString the string to check + * @return whether the byte array is round trippable + */ + public static boolean isValidUtf8(ByteString byteString) { + int index = 0; + int size = byteString.size(); + // To avoid the masking, we could change this to use bytes; + // Then X > 0xC2 gets turned into X < -0xC2; X < 0x80 + // gets turned into X >= 0, etc. + + while (index < size) { + int byte1 = byteString.byteAt(index++) & 0xFF; + if (byte1 < 0x80) { + // fast loop for single bytes + continue; + + // we know from this point on that we have 2-4 byte forms + } else if (byte1 < 0xC2 || byte1 > 0xF4) { + // catch illegal first bytes: < C2 or > F4 + return false; + } + if (index >= size) { + // fail if we run out of bytes + return false; + } + int byte2 = byteString.byteAt(index++) & 0xFF; + if (byte2 < 0x80 || byte2 > 0xBF) { + // general trail-byte test + return false; + } + if (byte1 <= 0xDF) { + // two-byte form; general trail-byte test is sufficient + continue; + } + + // we know from this point on that we have 3 or 4 byte forms + if (index >= size) { + // fail if we run out of bytes + return false; + } + int byte3 = byteString.byteAt(index++) & 0xFF; + if (byte3 < 0x80 || byte3 > 0xBF) { + // general trail-byte test + return false; + } + if (byte1 <= 0xEF) { + // three-byte form. Vastly more frequent than four-byte forms + // The following has an extra test, but not worth restructuring + if (byte1 == 0xE0 && byte2 < 0xA0 || + byte1 == 0xED && byte2 > 0x9F) { + // check special cases of byte2 + return false; + } + + } else { + // four-byte form + + if (index >= size) { + // fail if we run out of bytes + return false; + } + int byte4 = byteString.byteAt(index++) & 0xFF; + if (byte4 < 0x80 || byte4 > 0xBF) { + // general trail-byte test + return false; + } + // The following has an extra test, but not worth restructuring + if (byte1 == 0xF0 && byte2 < 0x90 || + byte1 == 0xF4 && byte2 > 0x8F) { + // check special cases of byte2 + return false; + } + } + } + return true; + } + + /** * Interface for an enum value or value descriptor, to be used in FieldSet. * The lite library stores enum values directly in FieldSets but the full * library stores EnumValueDescriptors in order to better support reflection. |