From daee05168eb0f2cf102b7ef80c0af87c80729612 Mon Sep 17 00:00:00 2001
From: "kenton@google.com"
 <kenton@google.com@630680e5-0e50-0410-840e-4b1c322b438d>
Date: Mon, 1 Feb 2010 17:41:59 +0000
Subject: Optimize Java string serialization.  Patch from Evan Jones.

---
 CHANGES.txt                                        |  5 ++
 CONTRIBUTORS.txt                                   |  2 +
 .../com/google/protobuf/CodedOutputStream.java     | 17 +++++++
 .../com/google/protobuf/CodedOutputStreamTest.java | 24 +++++++++
 .../protobuf/compiler/java/java_primitive_field.cc | 58 ++++++++++++++++++----
 .../protobuf/compiler/java/java_primitive_field.h  |  2 +
 6 files changed, 99 insertions(+), 9 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 11b332ed..8262587b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,8 @@
+????-??-?? version 2.3.1:
+
+  Java
+  * Improved performance of string serialization.
+
 2010-01-08 version 2.3.0:
 
   General
diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt
index 6c002af9..7fe99ecb 100644
--- a/CONTRIBUTORS.txt
+++ b/CONTRIBUTORS.txt
@@ -80,6 +80,8 @@ Patch contributors:
     * Fixes for Solaris 10 32/64-bit confusion.
   Evan Jones <evanj@mit.edu>
     * Optimize Java serialization code when writing a small message to a stream.
+    * Optimize Java serialization of strings so that UTF-8 encoding happens only
+      once per string per serialization call.
     * Clean up some Java warnings.
   Michael Kucharski <m.kucharski@gmail.com>
     * Added CodedInputStream.getTotalBytesRead().
diff --git a/java/src/main/java/com/google/protobuf/CodedOutputStream.java b/java/src/main/java/com/google/protobuf/CodedOutputStream.java
index 58dd1506..18da6d9d 100644
--- a/java/src/main/java/com/google/protobuf/CodedOutputStream.java
+++ b/java/src/main/java/com/google/protobuf/CodedOutputStream.java
@@ -193,6 +193,23 @@ public final class CodedOutputStream {
     writeStringNoTag(value);
   }
 
+  /**
+   * Write a {@code string} field, including tag, to the stream, where bytes
+   * is the encoded version of value. Used by the SPEED version of messages
+   * to avoid performing the UTF-8 conversion twice. bytes is simply a hint
+   * and may be null. If it is null, value will be converted as usual.
+   */
+  public void writeStringCached(final int fieldNumber, final String value,
+                                ByteString bytes)
+                                throws IOException {
+    // The cache can be null if serializing without getting the size first, or
+    // if there are multiple threads.
+    if (bytes == null) {
+      bytes = ByteString.copyFromUtf8(value);
+    }
+    writeBytes(fieldNumber, bytes);
+  }
+
   /** Write a {@code group} field, including tag, to the stream. */
   public void writeGroup(final int fieldNumber, final MessageLite value)
                          throws IOException {
diff --git a/java/src/test/java/com/google/protobuf/CodedOutputStreamTest.java b/java/src/test/java/com/google/protobuf/CodedOutputStreamTest.java
index 48e54657..85691d60 100644
--- a/java/src/test/java/com/google/protobuf/CodedOutputStreamTest.java
+++ b/java/src/test/java/com/google/protobuf/CodedOutputStreamTest.java
@@ -36,6 +36,7 @@ import protobuf_unittest.UnittestProto.TestPackedTypes;
 import junit.framework.TestCase;
 
 import java.io.ByteArrayOutputStream;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 
@@ -211,6 +212,29 @@ public class CodedOutputStreamTest extends TestCase {
       0x9abcdef012345678L);
   }
 
+  /** Test writing cached strings. */
+  public void testWriteStringCached() throws IOException {
+    final ByteArrayOutputStream output = new ByteArrayOutputStream();
+    final CodedOutputStream stream = CodedOutputStream.newInstance(output);
+
+    // Test writing a string that is not cached
+    stream.writeStringCached(5, "hello", null);
+    stream.flush();
+    CodedInputStream in = CodedInputStream.newInstance(output.toByteArray());
+    assertEquals(WireFormat.makeTag(5, WireFormat.WIRETYPE_LENGTH_DELIMITED),
+                 in.readTag());
+    assertEquals("hello", in.readString());
+
+    // Write a cached string: the real string is ignored
+    output.reset();
+    stream.writeStringCached(5, "ignored", ByteString.copyFromUtf8("hello"));
+    stream.flush();
+    in = CodedInputStream.newInstance(output.toByteArray());
+    assertEquals(WireFormat.makeTag(5, WireFormat.WIRETYPE_LENGTH_DELIMITED),
+                 in.readTag());
+    assertEquals("hello", in.readString());
+  }
+
   /** Test encodeZigZag32() and encodeZigZag64(). */
   public void testEncodeZigZag() throws Exception {
     assertEquals(0, CodedOutputStream.encodeZigZag32( 0));
diff --git a/src/google/protobuf/compiler/java/java_primitive_field.cc b/src/google/protobuf/compiler/java/java_primitive_field.cc
index f6179bfa..d0fd081f 100644
--- a/src/google/protobuf/compiler/java/java_primitive_field.cc
+++ b/src/google/protobuf/compiler/java/java_primitive_field.cc
@@ -199,6 +199,14 @@ GenerateMembers(io::Printer* printer) const {
     "private $type$ $name$_ = $default$;\n"
     "public boolean has$capitalized_name$() { return has$capitalized_name$; }\n"
     "public $type$ get$capitalized_name$() { return $name$_; }\n");
+  // Avoid double encoding for Java strings
+  // This field does not need to be volatile because ByteString is immutable.
+  // http://www.cs.umd.edu/~pugh/java/memoryModel/jsr-133-faq.html#finalRight
+  // However, it seems better to be safe than sorry.
+  if (ShouldUseStringEncodingCache()) {
+    printer->Print(variables_,
+      "private volatile com.google.protobuf.ByteString $name$EncodedCache_;\n");
+  }
 }
 
 void PrimitiveFieldGenerator::
@@ -259,25 +267,57 @@ GenerateParsingCode(io::Printer* printer) const {
 
 void PrimitiveFieldGenerator::
 GenerateSerializationCode(io::Printer* printer) const {
-  printer->Print(variables_,
-    "if (has$capitalized_name$()) {\n"
-    "  output.write$capitalized_type$($number$, get$capitalized_name$());\n"
-    "}\n");
+  if (ShouldUseStringEncodingCache()) {
+    // Pass the cached serialized version, then forget it.
+    // The cached version could be null if we didn't compute the size first,
+    // or if there are two threads attempting to serialize simultaneously.
+    // CodedOutputStream.writeStringCached handles this for us.
+    printer->Print(variables_,
+      "if (has$capitalized_name$()) {\n"
+      "  output.write$capitalized_type$Cached($number$,\n"
+      "                                       get$capitalized_name$(),\n"
+      "                                       $name$EncodedCache_);\n"
+      "  $name$EncodedCache_ = null;\n"
+      "}\n");
+  } else {
+    printer->Print(variables_,
+      "if (has$capitalized_name$()) {\n"
+      "  output.write$capitalized_type$($number$, get$capitalized_name$());\n"
+      "}\n");
+  }
 }
 
 void PrimitiveFieldGenerator::
 GenerateSerializedSizeCode(io::Printer* printer) const {
-  printer->Print(variables_,
-    "if (has$capitalized_name$()) {\n"
-    "  size += com.google.protobuf.CodedOutputStream\n"
-    "    .compute$capitalized_type$Size($number$, get$capitalized_name$());\n"
-    "}\n");
+  // Avoid double encoding for strings: serialize the string here
+  if (ShouldUseStringEncodingCache()) {
+    printer->Print(variables_,
+      "if (has$capitalized_name$()) {\n"
+      "  com.google.protobuf.ByteString serialized = \n"
+      "    com.google.protobuf.ByteString.copyFromUtf8(\n"
+      "      get$capitalized_name$());\n"
+      "  $name$EncodedCache_ = serialized;\n"
+      "  size += com.google.protobuf.CodedOutputStream\n"
+      "    .computeBytesSize($number$, serialized);\n"
+      "}\n");
+  } else {
+    printer->Print(variables_,
+      "if (has$capitalized_name$()) {\n"
+      "  size += com.google.protobuf.CodedOutputStream\n"
+      "    .compute$capitalized_type$Size($number$, get$capitalized_name$());\n"
+      "}\n");
+  }
 }
 
 string PrimitiveFieldGenerator::GetBoxedType() const {
   return BoxedPrimitiveTypeName(GetJavaType(descriptor_));
 }
 
+bool PrimitiveFieldGenerator::ShouldUseStringEncodingCache() const {
+  return GetType(descriptor_) == FieldDescriptor::TYPE_STRING &&
+      descriptor_->file()->options().optimize_for() == FileOptions::SPEED;
+}
+
 // ===================================================================
 
 RepeatedPrimitiveFieldGenerator::
diff --git a/src/google/protobuf/compiler/java/java_primitive_field.h b/src/google/protobuf/compiler/java/java_primitive_field.h
index 4e482a05..9d6c3d5f 100644
--- a/src/google/protobuf/compiler/java/java_primitive_field.h
+++ b/src/google/protobuf/compiler/java/java_primitive_field.h
@@ -62,6 +62,8 @@ class PrimitiveFieldGenerator : public FieldGenerator {
   string GetBoxedType() const;
 
  private:
+  bool ShouldUseStringEncodingCache() const;
+
   const FieldDescriptor* descriptor_;
   map<string, string> variables_;
 
-- 
cgit v1.2.3