From daee05168eb0f2cf102b7ef80c0af87c80729612 Mon Sep 17 00:00:00 2001 From: "kenton@google.com" Date: Mon, 1 Feb 2010 17:41:59 +0000 Subject: Optimize Java string serialization. Patch from Evan Jones. --- CHANGES.txt | 5 ++ CONTRIBUTORS.txt | 2 + .../com/google/protobuf/CodedOutputStream.java | 17 +++++++ .../com/google/protobuf/CodedOutputStreamTest.java | 24 +++++++++ .../protobuf/compiler/java/java_primitive_field.cc | 58 ++++++++++++++++++---- .../protobuf/compiler/java/java_primitive_field.h | 2 + 6 files changed, 99 insertions(+), 9 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 11b332ed..8262587b 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,3 +1,8 @@ +????-??-?? version 2.3.1: + + Java + * Improved performance of string serialization. + 2010-01-08 version 2.3.0: General diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt index 6c002af9..7fe99ecb 100644 --- a/CONTRIBUTORS.txt +++ b/CONTRIBUTORS.txt @@ -80,6 +80,8 @@ Patch contributors: * Fixes for Solaris 10 32/64-bit confusion. Evan Jones * Optimize Java serialization code when writing a small message to a stream. + * Optimize Java serialization of strings so that UTF-8 encoding happens only + once per string per serialization call. * Clean up some Java warnings. Michael Kucharski * Added CodedInputStream.getTotalBytesRead(). diff --git a/java/src/main/java/com/google/protobuf/CodedOutputStream.java b/java/src/main/java/com/google/protobuf/CodedOutputStream.java index 58dd1506..18da6d9d 100644 --- a/java/src/main/java/com/google/protobuf/CodedOutputStream.java +++ b/java/src/main/java/com/google/protobuf/CodedOutputStream.java @@ -193,6 +193,23 @@ public final class CodedOutputStream { writeStringNoTag(value); } + /** + * Write a {@code string} field, including tag, to the stream, where bytes + * is the encoded version of value. Used by the SPEED version of messages + * to avoid performing the UTF-8 conversion twice. bytes is simply a hint + * and may be null. If it is null, value will be converted as usual. + */ + public void writeStringCached(final int fieldNumber, final String value, + ByteString bytes) + throws IOException { + // The cache can be null if serializing without getting the size first, or + // if there are multiple threads. + if (bytes == null) { + bytes = ByteString.copyFromUtf8(value); + } + writeBytes(fieldNumber, bytes); + } + /** Write a {@code group} field, including tag, to the stream. */ public void writeGroup(final int fieldNumber, final MessageLite value) throws IOException { diff --git a/java/src/test/java/com/google/protobuf/CodedOutputStreamTest.java b/java/src/test/java/com/google/protobuf/CodedOutputStreamTest.java index 48e54657..85691d60 100644 --- a/java/src/test/java/com/google/protobuf/CodedOutputStreamTest.java +++ b/java/src/test/java/com/google/protobuf/CodedOutputStreamTest.java @@ -36,6 +36,7 @@ import protobuf_unittest.UnittestProto.TestPackedTypes; import junit.framework.TestCase; import java.io.ByteArrayOutputStream; +import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -211,6 +212,29 @@ public class CodedOutputStreamTest extends TestCase { 0x9abcdef012345678L); } + /** Test writing cached strings. */ + public void testWriteStringCached() throws IOException { + final ByteArrayOutputStream output = new ByteArrayOutputStream(); + final CodedOutputStream stream = CodedOutputStream.newInstance(output); + + // Test writing a string that is not cached + stream.writeStringCached(5, "hello", null); + stream.flush(); + CodedInputStream in = CodedInputStream.newInstance(output.toByteArray()); + assertEquals(WireFormat.makeTag(5, WireFormat.WIRETYPE_LENGTH_DELIMITED), + in.readTag()); + assertEquals("hello", in.readString()); + + // Write a cached string: the real string is ignored + output.reset(); + stream.writeStringCached(5, "ignored", ByteString.copyFromUtf8("hello")); + stream.flush(); + in = CodedInputStream.newInstance(output.toByteArray()); + assertEquals(WireFormat.makeTag(5, WireFormat.WIRETYPE_LENGTH_DELIMITED), + in.readTag()); + assertEquals("hello", in.readString()); + } + /** Test encodeZigZag32() and encodeZigZag64(). */ public void testEncodeZigZag() throws Exception { assertEquals(0, CodedOutputStream.encodeZigZag32( 0)); diff --git a/src/google/protobuf/compiler/java/java_primitive_field.cc b/src/google/protobuf/compiler/java/java_primitive_field.cc index f6179bfa..d0fd081f 100644 --- a/src/google/protobuf/compiler/java/java_primitive_field.cc +++ b/src/google/protobuf/compiler/java/java_primitive_field.cc @@ -199,6 +199,14 @@ GenerateMembers(io::Printer* printer) const { "private $type$ $name$_ = $default$;\n" "public boolean has$capitalized_name$() { return has$capitalized_name$; }\n" "public $type$ get$capitalized_name$() { return $name$_; }\n"); + // Avoid double encoding for Java strings + // This field does not need to be volatile because ByteString is immutable. + // http://www.cs.umd.edu/~pugh/java/memoryModel/jsr-133-faq.html#finalRight + // However, it seems better to be safe than sorry. + if (ShouldUseStringEncodingCache()) { + printer->Print(variables_, + "private volatile com.google.protobuf.ByteString $name$EncodedCache_;\n"); + } } void PrimitiveFieldGenerator:: @@ -259,25 +267,57 @@ GenerateParsingCode(io::Printer* printer) const { void PrimitiveFieldGenerator:: GenerateSerializationCode(io::Printer* printer) const { - printer->Print(variables_, - "if (has$capitalized_name$()) {\n" - " output.write$capitalized_type$($number$, get$capitalized_name$());\n" - "}\n"); + if (ShouldUseStringEncodingCache()) { + // Pass the cached serialized version, then forget it. + // The cached version could be null if we didn't compute the size first, + // or if there are two threads attempting to serialize simultaneously. + // CodedOutputStream.writeStringCached handles this for us. + printer->Print(variables_, + "if (has$capitalized_name$()) {\n" + " output.write$capitalized_type$Cached($number$,\n" + " get$capitalized_name$(),\n" + " $name$EncodedCache_);\n" + " $name$EncodedCache_ = null;\n" + "}\n"); + } else { + printer->Print(variables_, + "if (has$capitalized_name$()) {\n" + " output.write$capitalized_type$($number$, get$capitalized_name$());\n" + "}\n"); + } } void PrimitiveFieldGenerator:: GenerateSerializedSizeCode(io::Printer* printer) const { - printer->Print(variables_, - "if (has$capitalized_name$()) {\n" - " size += com.google.protobuf.CodedOutputStream\n" - " .compute$capitalized_type$Size($number$, get$capitalized_name$());\n" - "}\n"); + // Avoid double encoding for strings: serialize the string here + if (ShouldUseStringEncodingCache()) { + printer->Print(variables_, + "if (has$capitalized_name$()) {\n" + " com.google.protobuf.ByteString serialized = \n" + " com.google.protobuf.ByteString.copyFromUtf8(\n" + " get$capitalized_name$());\n" + " $name$EncodedCache_ = serialized;\n" + " size += com.google.protobuf.CodedOutputStream\n" + " .computeBytesSize($number$, serialized);\n" + "}\n"); + } else { + printer->Print(variables_, + "if (has$capitalized_name$()) {\n" + " size += com.google.protobuf.CodedOutputStream\n" + " .compute$capitalized_type$Size($number$, get$capitalized_name$());\n" + "}\n"); + } } string PrimitiveFieldGenerator::GetBoxedType() const { return BoxedPrimitiveTypeName(GetJavaType(descriptor_)); } +bool PrimitiveFieldGenerator::ShouldUseStringEncodingCache() const { + return GetType(descriptor_) == FieldDescriptor::TYPE_STRING && + descriptor_->file()->options().optimize_for() == FileOptions::SPEED; +} + // =================================================================== RepeatedPrimitiveFieldGenerator:: diff --git a/src/google/protobuf/compiler/java/java_primitive_field.h b/src/google/protobuf/compiler/java/java_primitive_field.h index 4e482a05..9d6c3d5f 100644 --- a/src/google/protobuf/compiler/java/java_primitive_field.h +++ b/src/google/protobuf/compiler/java/java_primitive_field.h @@ -62,6 +62,8 @@ class PrimitiveFieldGenerator : public FieldGenerator { string GetBoxedType() const; private: + bool ShouldUseStringEncodingCache() const; + const FieldDescriptor* descriptor_; map variables_; -- cgit v1.2.3