Down-integrate from internal branch

author: xiaofeng@google.com <xiaofeng@google.com@630680e5-0e50-0410-840e-4b1c322b438d> 2012-09-22 02:40:50 +0000
committer: xiaofeng@google.com <xiaofeng@google.com@630680e5-0e50-0410-840e-4b1c322b438d> 2012-09-22 02:40:50 +0000
commit: b55a20fa2c669b181f47ea9219b8e74d1263da19 (patch)
tree: 3936a0e7c22196587a6d8397372de41434fe2129 /java/src/main/java/com/google/protobuf/Internal.java
parent: 9ced30caf94bb4e7e9629c199679ff44e8ca7389 (diff)
download: protobuf-b55a20fa2c669b181f47ea9219b8e74d1263da19.tar.gz
protobuf-b55a20fa2c669b181f47ea9219b8e74d1263da19.tar.bz2
protobuf-b55a20fa2c669b181f47ea9219b8e74d1263da19.zip
1 files changed, 21 insertions, 74 deletions
diff --git a/java/src/main/java/com/google/protobuf/Internal.java b/java/src/main/java/com/google/protobuf/Internal.java
index 05eab57a..81af2583 100644
--- a/java/src/main/java/com/google/protobuf/Internal.java
+++ b/java/src/main/java/com/google/protobuf/Internal.java
@@ -103,85 +103,32 @@ public class Internal {
    * Helper called by generated code to determine if a byte array is a valid
    * UTF-8 encoded string such that the original bytes can be converted to
    * a String object and then back to a byte array round tripping the bytes
-   * without loss.
-   * <p>
-   * This is inspired by UTF_8.java in sun.nio.cs.
+   * without loss.  More precisely, returns {@code true} whenever:
+   * <pre>   {@code
+   * Arrays.equals(byteString.toByteArray(),
+   *     new String(byteString.toByteArray(), "UTF-8").getBytes("UTF-8"))
+   * }</pre>
+   *
+   * <p>This method rejects "overlong" byte sequences, as well as
+   * 3-byte sequences that would map to a surrogate character, in
+   * accordance with the restricted definition of UTF-8 introduced in
+   * Unicode 3.1.  Note that the UTF-8 decoder included in Oracle's
+   * JDK has been modified to also reject "overlong" byte sequences,
+   * but currently (2011) still accepts 3-byte surrogate character
+   * byte sequences.
+   *
+   * <p>See the Unicode Standard,<br>
+   * Table 3-6. <em>UTF-8 Bit Distribution</em>,<br>
+   * Table 3-7. <em>Well Formed UTF-8 Byte Sequences</em>.
+   *
+   * <p>As of 2011-02, this method simply returns the result of {@link
+   * ByteString#isValidUtf8()}.  Calling that method directly is preferred.
    *
    * @param byteString the string to check
    * @return whether the byte array is round trippable
    */
   public static boolean isValidUtf8(ByteString byteString) {
-    int index = 0;
-    int size = byteString.size();
-    // To avoid the masking, we could change this to use bytes;
-    // Then X > 0xC2 gets turned into X < -0xC2; X < 0x80
-    // gets turned into X >= 0, etc.
-
-    while (index < size) {
-      int byte1 = byteString.byteAt(index++) & 0xFF;
-      if (byte1 < 0x80) {
-        // fast loop for single bytes
-        continue;
-
-        // we know from this point on that we have 2-4 byte forms
-      } else if (byte1 < 0xC2 || byte1 > 0xF4) {
-        // catch illegal first bytes: < C2 or > F4
-        return false;
-      }
-      if (index >= size) {
-        // fail if we run out of bytes
-        return false;
-      }
-      int byte2 = byteString.byteAt(index++) & 0xFF;
-      if (byte2 < 0x80 || byte2 > 0xBF) {
-        // general trail-byte test
-        return false;
-      }
-      if (byte1 <= 0xDF) {
-        // two-byte form; general trail-byte test is sufficient
-        continue;
-      }
-
-      // we know from this point on that we have 3 or 4 byte forms
-      if (index >= size) {
-        // fail if we run out of bytes
-        return false;
-      }
-      int byte3 = byteString.byteAt(index++) & 0xFF;
-      if (byte3 < 0x80 || byte3 > 0xBF) {
-        // general trail-byte test
-        return false;
-      }
-      if (byte1 <= 0xEF) {
-        // three-byte form. Vastly more frequent than four-byte forms
-        // The following has an extra test, but not worth restructuring
-        if (byte1 == 0xE0 && byte2 < 0xA0 ||
-            byte1 == 0xED && byte2 > 0x9F) {
-          // check special cases of byte2
-          return false;
-        }
-
-      } else {
-        // four-byte form
-
-        if (index >= size) {
-          // fail if we run out of bytes
-          return false;
-        }
-        int byte4 = byteString.byteAt(index++) & 0xFF;
-        if (byte4 < 0x80 || byte4 > 0xBF) {
-          // general trail-byte test
-          return false;
-        }
-        // The following has an extra test, but not worth restructuring
-        if (byte1 == 0xF0 && byte2 < 0x90 ||
-            byte1 == 0xF4 && byte2 > 0x8F) {
-          // check special cases of byte2
-          return false;
-        }
-      }
-    }
-    return true;
+    return byteString.isValidUtf8();
   }
 
   /**
author	xiaofeng@google.com <xiaofeng@google.com@630680e5-0e50-0410-840e-4b1c322b438d>	2012-09-22 02:40:50 +0000
committer	xiaofeng@google.com <xiaofeng@google.com@630680e5-0e50-0410-840e-4b1c322b438d>	2012-09-22 02:40:50 +0000
commit	b55a20fa2c669b181f47ea9219b8e74d1263da19 (patch)
tree	3936a0e7c22196587a6d8397372de41434fe2129 /java/src/main/java/com/google/protobuf/Internal.java
parent	9ced30caf94bb4e7e9629c199679ff44e8ca7389 (diff)
download	protobuf-b55a20fa2c669b181f47ea9219b8e74d1263da19.tar.gz protobuf-b55a20fa2c669b181f47ea9219b8e74d1263da19.tar.bz2 protobuf-b55a20fa2c669b181f47ea9219b8e74d1263da19.zip