perf: String#getBytes(Charset) vs getBytes(String)

author: Viktor Szathmáry <phraktle@gmail.com> 2014-09-09 16:31:51 +0200
committer: Tamir Duberstein <tamird@gmail.com> 2015-04-02 14:48:43 -0700
commit: e84893f6768f136cc86e2db69fc1d40ff2be7e3b (patch)
tree: c36057efe7fc3c3bf50381c96bae16cf73234fa5 /java
parent: 7139d1eff739682a088ea2c2dbdfef2f108321f8 (diff)
download: protobuf-e84893f6768f136cc86e2db69fc1d40ff2be7e3b.tar.gz
protobuf-e84893f6768f136cc86e2db69fc1d40ff2be7e3b.tar.bz2
protobuf-e84893f6768f136cc86e2db69fc1d40ff2be7e3b.zip
7 files changed, 117 insertions, 28 deletions
diff --git a/java/src/main/java/com/google/protobuf/ByteString.java b/java/src/main/java/com/google/protobuf/ByteString.java
index 637df5f4..ee2eddb0 100644
--- a/java/src/main/java/com/google/protobuf/ByteString.java
+++ b/java/src/main/java/com/google/protobuf/ByteString.java
@@ -37,6 +37,8 @@ import java.io.OutputStream;
 import java.io.Serializable;
 import java.io.UnsupportedEncodingException;
 import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.UnsupportedCharsetException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Iterator;
@@ -76,8 +78,7 @@ public abstract class ByteString implements Iterable<Byte>, Serializable {
   static final int MIN_READ_FROM_CHUNK_SIZE = 0x100;  // 256b
   static final int MAX_READ_FROM_CHUNK_SIZE = 0x2000;  // 8k
 
-  // Defined by java.nio.charset.Charset
-  protected static final String UTF_8 = "UTF-8";
+  protected static final Charset UTF_8 = Charset.forName("UTF-8");
 
   /**
    * Empty {@code ByteString}.
@@ -269,11 +270,7 @@ public abstract class ByteString implements Iterable<Byte>, Serializable {
    * @return new {@code ByteString}
    */
   public static ByteString copyFromUtf8(String text) {
-    try {
-      return new LiteralByteString(text.getBytes(UTF_8));
-    } catch (UnsupportedEncodingException e) {
-      throw new RuntimeException("UTF-8 not supported?", e);
-    }
+    return new LiteralByteString(text.getBytes(UTF_8));
   }
 
   // =================================================================
@@ -612,8 +609,36 @@ public abstract class ByteString implements Iterable<Byte>, Serializable {
    * @return new string
    * @throws UnsupportedEncodingException if charset isn't recognized
    */
-  public abstract String toString(String charsetName)
-      throws UnsupportedEncodingException;
+  public String toString(String charsetName) throws UnsupportedEncodingException {
+    Charset charset;
+    try {
+      charset = Charset.forName(charsetName);
+    } catch (IllegalArgumentException e) {  // unsupported, malformed or null charset name
+      throw (UnsupportedEncodingException)
+          new UnsupportedEncodingException(charsetName).initCause(e);
+    }
+    return toString(charset);
+  }
+
+  /**
+   * Constructs a new {@code String} by decoding the bytes using the
+   * specified charset. Returns the literal {@code ""} when size is zero.
+   *
+   * @param charset decode using this charset
+   * @return new string
+   */
+  public String toString(Charset charset) {
+    return size() == 0 ? "" : toStringInternal(charset);
+  }
+
+  /**
+   * Constructs a new {@code String} by decoding the bytes using the
+   * specified charset; {@link #toString(Charset)} calls this when non-empty.
+   *
+   * @param charset decode using this charset
+   * @return new string
+   */
+  protected abstract String toStringInternal(Charset charset);
 
   // =================================================================
   // UTF-8 decoding
@@ -624,11 +649,7 @@ public abstract class ByteString implements Iterable<Byte>, Serializable {
    * @return new string using UTF-8 encoding
    */
   public String toStringUtf8() {
-    try {
-      return toString(UTF_8);
-    } catch (UnsupportedEncodingException e) {
-      throw new RuntimeException("UTF-8 not supported?", e);
-    }
+    return toString(UTF_8);
   }
 
   /**
diff --git a/java/src/main/java/com/google/protobuf/LiteralByteString.java b/java/src/main/java/com/google/protobuf/LiteralByteString.java
index 81d8e74b..3462c395 100644
--- a/java/src/main/java/com/google/protobuf/LiteralByteString.java
+++ b/java/src/main/java/com/google/protobuf/LiteralByteString.java
@@ -36,6 +36,7 @@ import java.io.InputStream;
 import java.io.OutputStream;
 import java.io.UnsupportedEncodingException;
 import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.NoSuchElementException;
@@ -152,13 +153,8 @@ class LiteralByteString extends ByteString {
   }
 
   @Override
-  public String toString(String charsetName)
-      throws UnsupportedEncodingException {
-    // Optimize for empty strings, but ensure we don't silently ignore invalid
-    // encodings.
-    return size() == 0 && UTF_8.equals(charsetName)
-        ? ""
-        : new String(bytes, getOffsetIntoBytes(), size(), charsetName);
+  protected String toStringInternal(Charset charset) {
+    return new String(bytes, getOffsetIntoBytes(), size(), charset);
   }
 
   // =================================================================
diff --git a/java/src/main/java/com/google/protobuf/RopeByteString.java b/java/src/main/java/com/google/protobuf/RopeByteString.java
index 168bcce2..0900a042 100644
--- a/java/src/main/java/com/google/protobuf/RopeByteString.java
+++ b/java/src/main/java/com/google/protobuf/RopeByteString.java
@@ -38,6 +38,7 @@ import java.io.OutputStream;
 import java.io.UnsupportedEncodingException;
 import java.io.ByteArrayInputStream;
 import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Iterator;
@@ -418,13 +419,8 @@ class RopeByteString extends ByteString {
   }
 
   @Override
-  public String toString(String charsetName)
-      throws UnsupportedEncodingException {
-    // Optimize for empty strings, but ensure we don't silently ignore invalid
-    // encodings.
-    return size() == 0 && UTF_8.equals(charsetName)
-        ? ""
-        : new String(toByteArray(), charsetName);
+  protected String toStringInternal(Charset charset) {
+    return new String(toByteArray(), charset);
   }
 
   // =================================================================
diff --git a/java/src/test/java/com/google/protobuf/BoundedByteStringTest.java b/java/src/test/java/com/google/protobuf/BoundedByteStringTest.java
index 6c9596ca..a11bef2e 100644
--- a/java/src/test/java/com/google/protobuf/BoundedByteStringTest.java
+++ b/java/src/test/java/com/google/protobuf/BoundedByteStringTest.java
@@ -72,6 +72,19 @@ public class BoundedByteStringTest extends LiteralByteStringTest {
         testString.substring(2, testString.length() - 6), roundTripString);
   }
 
+  @Override
+  public void testCharsetToString() throws UnsupportedEncodingException {
+    String testString = "I love unicode \u1234\u5678 characters";
+    LiteralByteString unicode = new LiteralByteString(testString.getBytes(ByteString.UTF_8));
+    ByteString chopped = unicode.substring(2, unicode.size() - 6);
+    assertEquals(classUnderTest + ".substring() must have the expected type",
+        classUnderTest, getActualClassName(chopped));
+
+    String roundTripString = chopped.toString(ByteString.UTF_8);
+    assertEquals(classUnderTest + " unicode bytes must match",
+        testString.substring(2, testString.length() - 6), roundTripString);
+  }
+
   public void testJavaSerialization() throws Exception {
     ByteArrayOutputStream out = new ByteArrayOutputStream();
     ObjectOutputStream oos = new ObjectOutputStream(out);
diff --git a/java/src/test/java/com/google/protobuf/LiteralByteStringTest.java b/java/src/test/java/com/google/protobuf/LiteralByteStringTest.java
index f3ad774f..8607040e 100644
--- a/java/src/test/java/com/google/protobuf/LiteralByteStringTest.java
+++ b/java/src/test/java/com/google/protobuf/LiteralByteStringTest.java
@@ -298,6 +298,13 @@ public class LiteralByteStringTest extends TestCase {
     assertEquals(classUnderTest + " unicode must match", testString, roundTripString);
   }
 
+  public void testCharsetToString() throws UnsupportedEncodingException {
+    String testString = "I love unicode \u1234\u5678 characters";
+    LiteralByteString unicode = new LiteralByteString(testString.getBytes(ByteString.UTF_8));
+    String roundTripString = unicode.toString(ByteString.UTF_8);
+    assertEquals(classUnderTest + " unicode must match", testString, roundTripString);
+  }
+
   public void testToString_returnsCanonicalEmptyString() throws UnsupportedEncodingException{
     assertSame(classUnderTest + " must be the same string references",
         ByteString.EMPTY.toString(UTF_8), new LiteralByteString(new byte[]{}).toString(UTF_8));
diff --git a/java/src/test/java/com/google/protobuf/RopeByteStringSubstringTest.java b/java/src/test/java/com/google/protobuf/RopeByteStringSubstringTest.java
index 8106201d..43872d1d 100644
--- a/java/src/test/java/com/google/protobuf/RopeByteStringSubstringTest.java
+++ b/java/src/test/java/com/google/protobuf/RopeByteStringSubstringTest.java
@@ -94,4 +94,34 @@ public class RopeByteStringSubstringTest extends LiteralByteStringTest {
     assertEquals(classUnderTest + " string must must have same hashCode as the flat string",
         flatString.hashCode(), unicode.hashCode());
   }
+
+  @Override
+  public void testCharsetToString() throws UnsupportedEncodingException {
+    String sourceString = "I love unicode \u1234\u5678 characters";
+    ByteString sourceByteString = ByteString.copyFromUtf8(sourceString);
+    int copies = 250;
+
+    // By building the RopeByteString by concatenating, this is actually a fairly strenuous test.
+    StringBuilder builder = new StringBuilder(copies * sourceString.length());
+    ByteString unicode = ByteString.EMPTY;
+    for (int i = 0; i < copies; ++i) {
+      builder.append(sourceString);
+      unicode = RopeByteString.concatenate(unicode, sourceByteString);
+    }
+    String testString = builder.toString();
+
+    // Do the substring part
+    testString = testString.substring(2, testString.length() - 6);
+    unicode = unicode.substring(2, unicode.size() - 6);
+
+    assertEquals(classUnderTest + " from string must have the expected type",
+        classUnderTest, getActualClassName(unicode));
+    String roundTripString = unicode.toString(ByteString.UTF_8);
+    assertEquals(classUnderTest + " unicode bytes must match",
+        testString, roundTripString);
+    ByteString flatString = ByteString.copyFromUtf8(testString);
+    assertEquals(classUnderTest + " string must equal the flat string", flatString, unicode);
+    assertEquals(classUnderTest + " string must must have same hashCode as the flat string",
+        flatString.hashCode(), unicode.hashCode());
+  }
 }
diff --git a/java/src/test/java/com/google/protobuf/RopeByteStringTest.java b/java/src/test/java/com/google/protobuf/RopeByteStringTest.java
index 4775f03a..54eb9683 100644
--- a/java/src/test/java/com/google/protobuf/RopeByteStringTest.java
+++ b/java/src/test/java/com/google/protobuf/RopeByteStringTest.java
@@ -119,6 +119,32 @@ public class RopeByteStringTest extends LiteralByteStringTest {
   }
 
   @Override
+  public void testCharsetToString() throws UnsupportedEncodingException {
+    String sourceString = "I love unicode \u1234\u5678 characters";
+    ByteString sourceByteString = ByteString.copyFromUtf8(sourceString);
+    int copies = 250;
+
+    // By building the RopeByteString by concatenating, this is actually a fairly strenuous test.
+    StringBuilder builder = new StringBuilder(copies * sourceString.length());
+    ByteString unicode = ByteString.EMPTY;
+    for (int i = 0; i < copies; ++i) {
+      builder.append(sourceString);
+      unicode = RopeByteString.concatenate(unicode, sourceByteString);
+    }
+    String testString = builder.toString();
+
+    assertEquals(classUnderTest + " from string must have the expected type",
+        classUnderTest, getActualClassName(unicode));
+    String roundTripString = unicode.toString(ByteString.UTF_8);
+    assertEquals(classUnderTest + " unicode bytes must match",
+        testString, roundTripString);
+    ByteString flatString = ByteString.copyFromUtf8(testString);
+    assertEquals(classUnderTest + " string must equal the flat string", flatString, unicode);
+    assertEquals(classUnderTest + " string must must have same hashCode as the flat string",
+        flatString.hashCode(), unicode.hashCode());
+  }
+
+  @Override
   public void testToString_returnsCanonicalEmptyString() throws UnsupportedEncodingException {
     RopeByteString ropeByteString =
         RopeByteString.newInstanceForTest(ByteString.EMPTY, ByteString.EMPTY);
author	Viktor Szathmáry <phraktle@gmail.com>	2014-09-09 16:31:51 +0200
committer	Tamir Duberstein <tamird@gmail.com>	2015-04-02 14:48:43 -0700
commit	e84893f6768f136cc86e2db69fc1d40ff2be7e3b (patch)
tree	c36057efe7fc3c3bf50381c96bae16cf73234fa5 /java
parent	7139d1eff739682a088ea2c2dbdfef2f108321f8 (diff)
download	protobuf-e84893f6768f136cc86e2db69fc1d40ff2be7e3b.tar.gz protobuf-e84893f6768f136cc86e2db69fc1d40ff2be7e3b.tar.bz2 protobuf-e84893f6768f136cc86e2db69fc1d40ff2be7e3b.zip