Optimize Java string serialization. Patch from Evan Jones.

15 years ago · daee05168e
parent ab6950d75d
commit daee05168e
6 changed files with 99 additions and 9 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,8 @@
 ????-??-?? version 2.3.1:
  Java
  * Improved performance of string serialization.
 2010-01-08 version 2.3.0:
  General
--- a/CONTRIBUTORS.txt
+++ b/CONTRIBUTORS.txt
@@ -80,6 +80,8 @@ Patch contributors:
    * Fixes for Solaris 10 32/64-bit confusion.
  Evan Jones <evanj@mit.edu>
    * Optimize Java serialization code when writing a small message to a stream.
    * Optimize Java serialization of strings so that UTF-8 encoding happens only
      once per string per serialization call.
    * Clean up some Java warnings.
  Michael Kucharski <m.kucharski@gmail.com>
    * Added CodedInputStream.getTotalBytesRead().
--- a/java/src/main/java/com/google/protobuf/CodedOutputStream.java
+++ b/java/src/main/java/com/google/protobuf/CodedOutputStream.java
@@ -193,6 +193,23 @@ public final class CodedOutputStream {
    writeStringNoTag(value);
  }
  /**
   * Writes a {@code string} field, tag included, to the stream, reusing an
   * already-encoded copy of the string when the caller supplies one.
   *
   * <p>Intended for the SPEED version of messages so that the UTF-8
   * conversion is not performed twice. {@code bytes} is only a hint: when it
   * is null, {@code value} is encoded here; when it is non-null, it is
   * written as-is and {@code value} is not consulted.
   *
   * @param fieldNumber the field number to write in the tag
   * @param value the string to encode when no cached bytes are supplied
   * @param bytes the encoded form of {@code value}, or null if unavailable
   * @throws IOException if writing to the stream fails
   */
  public void writeStringCached(final int fieldNumber, final String value,
                                ByteString bytes)
                                throws IOException {
    // A null hint happens when serializing without computing the size first,
    // or when several threads serialize at once; encode on the spot then.
    final ByteString encoded =
        (bytes != null) ? bytes : ByteString.copyFromUtf8(value);
    writeBytes(fieldNumber, encoded);
  }
  /** Write a {@code group} field, including tag, to the stream. */
  public void writeGroup(final int fieldNumber, final MessageLite value)
                         throws IOException {
--- a/java/src/test/java/com/google/protobuf/CodedOutputStreamTest.java
+++ b/java/src/test/java/com/google/protobuf/CodedOutputStreamTest.java
@@ -36,6 +36,7 @@ import protobuf_unittest.UnittestProto.TestPackedTypes;
 import junit.framework.TestCase;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
@@ -211,6 +212,29 @@ public class CodedOutputStreamTest extends TestCase {
      0x9abcdef012345678L);
  }
  /**
   * Checks writeStringCached() in both modes: with no cached bytes (the
   * string itself is encoded) and with cached bytes (the bytes are written
   * and the string argument is ignored).
   */
  public void testWriteStringCached() throws IOException {
    final ByteArrayOutputStream rawOutput = new ByteArrayOutputStream();
    final CodedOutputStream coded = CodedOutputStream.newInstance(rawOutput);
    // Both writes below use field 5 as a length-delimited value.
    final int expectedTag =
        WireFormat.makeTag(5, WireFormat.WIRETYPE_LENGTH_DELIMITED);
    // Uncached case: a null hint forces the string to be encoded on write.
    coded.writeStringCached(5, "hello", null);
    coded.flush();
    CodedInputStream reader =
        CodedInputStream.newInstance(rawOutput.toByteArray());
    assertEquals(expectedTag, reader.readTag());
    assertEquals("hello", reader.readString());
    // Cached case: the supplied bytes win, so "ignored" must not appear.
    rawOutput.reset();
    coded.writeStringCached(5, "ignored", ByteString.copyFromUtf8("hello"));
    coded.flush();
    reader = CodedInputStream.newInstance(rawOutput.toByteArray());
    assertEquals(expectedTag, reader.readTag());
    assertEquals("hello", reader.readString());
  }
  /** Test encodeZigZag32() and encodeZigZag64(). */
  public void testEncodeZigZag() throws Exception {
    assertEquals(0, CodedOutputStream.encodeZigZag32( 0));
--- a/src/google/protobuf/compiler/java/java_primitive_field.cc
+++ b/src/google/protobuf/compiler/java/java_primitive_field.cc
@@ -199,6 +199,14 @@ GenerateMembers(io::Printer* printer) const {
    "private $type$ $name$_ = $default$;\n"
    "public boolean has$capitalized_name$() { return has$capitalized_name$; }\n"
    "public $type$ get$capitalized_name$() { return $name$_; }\n");
  // Avoid double encoding for Java strings
  // This field does not need to be volatile because ByteString is immutable.
  // http://www.cs.umd.edu/~pugh/java/memoryModel/jsr-133-faq.html#finalRight
  // However, it seems better to be safe than sorry.
  if (ShouldUseStringEncodingCache()) {
    printer->Print(variables_,
      "private volatile com.google.protobuf.ByteString $name$EncodedCache_;\n");
  }
 }
 void PrimitiveFieldGenerator::
@@ -259,25 +267,57 @@ GenerateParsingCode(io::Printer* printer) const {
 void PrimitiveFieldGenerator::
 GenerateSerializationCode(io::Printer* printer) const {
-  printer->Print(variables_,
+  if (ShouldUseStringEncodingCache()) {
-    "if (has$capitalized_name$()) {\n"
+    // Pass the cached serialized version, then forget it.
-    "  output.write$capitalized_type$($number$, get$capitalized_name$());\n"
+    // The cached version could be null if we didn't compute the size first,
-    "}\n");
+    // or if there are two threads attempting to serialize simultaneously.
    // CodedOutputStream.writeStringCached handles this for us.
    printer->Print(variables_,
      "if (has$capitalized_name$()) {\n"
      "  output.write$capitalized_type$Cached($number$,\n"
      "                                       get$capitalized_name$(),\n"
      "                                       $name$EncodedCache_);\n"
      "  $name$EncodedCache_ = null;\n"
      "}\n");
  } else {
    printer->Print(variables_,
      "if (has$capitalized_name$()) {\n"
      "  output.write$capitalized_type$($number$, get$capitalized_name$());\n"
      "}\n");
  }
 }
 void PrimitiveFieldGenerator::
 GenerateSerializedSizeCode(io::Printer* printer) const {
-  printer->Print(variables_,
+  // Avoid double encoding for strings: serialize the string here
-    "if (has$capitalized_name$()) {\n"
+  if (ShouldUseStringEncodingCache()) {
-    "  size += com.google.protobuf.CodedOutputStream\n"
+    printer->Print(variables_,
-    "    .compute$capitalized_type$Size($number$, get$capitalized_name$());\n"
+      "if (has$capitalized_name$()) {\n"
-    "}\n");
+      "  com.google.protobuf.ByteString serialized = \n"
      "    com.google.protobuf.ByteString.copyFromUtf8(\n"
      "      get$capitalized_name$());\n"
      "  $name$EncodedCache_ = serialized;\n"
      "  size += com.google.protobuf.CodedOutputStream\n"
      "    .computeBytesSize($number$, serialized);\n"
      "}\n");
  } else {
    printer->Print(variables_,
      "if (has$capitalized_name$()) {\n"
      "  size += com.google.protobuf.CodedOutputStream\n"
      "    .compute$capitalized_type$Size($number$, get$capitalized_name$());\n"
      "}\n");
  }
 }
 // Returns the result of BoxedPrimitiveTypeName() applied to the Java type
 // that GetJavaType() reports for this generator's field descriptor.
 string PrimitiveFieldGenerator::GetBoxedType() const {
  return BoxedPrimitiveTypeName(GetJavaType(descriptor_));
 }
 // Reports whether generated code for this field should cache the encoded
 // form of the string. True only when the field's type is TYPE_STRING and
 // the containing file is optimized for SPEED.
 bool PrimitiveFieldGenerator::ShouldUseStringEncodingCache() const {
  // Only string fields have an encoding worth caching.
  if (GetType(descriptor_) != FieldDescriptor::TYPE_STRING) {
    return false;
  }
  return descriptor_->file()->options().optimize_for() == FileOptions::SPEED;
 }
 // ===================================================================
 RepeatedPrimitiveFieldGenerator::
--- a/src/google/protobuf/compiler/java/java_primitive_field.h
+++ b/src/google/protobuf/compiler/java/java_primitive_field.h
@@ -62,6 +62,8 @@ class PrimitiveFieldGenerator : public FieldGenerator {
  string GetBoxedType() const;
 private:
  bool ShouldUseStringEncodingCache() const;
  const FieldDescriptor* descriptor_;
  map<string, string> variables_;