perf: String#getBytes(Charset) vs getBytes(String)

11 years ago · e84893f676
parent 7139d1eff7
commit e84893f676
7 changed files with 117 additions and 28 deletions
--- a/java/src/main/java/com/google/protobuf/ByteString.java
+++ b/java/src/main/java/com/google/protobuf/ByteString.java
@@ -37,6 +37,8 @@ import java.io.OutputStream;
 import java.io.Serializable;
 import java.io.UnsupportedEncodingException;
 import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.UnsupportedCharsetException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Iterator;
@@ -76,8 +78,7 @@ public abstract class ByteString implements Iterable<Byte>, Serializable {
  static final int MIN_READ_FROM_CHUNK_SIZE = 0x100;  // 256b
  static final int MAX_READ_FROM_CHUNK_SIZE = 0x2000;  // 8k

-  // Defined by java.nio.charset.Charset
-  protected static final String UTF_8 = "UTF-8";
+  protected static final Charset UTF_8 = Charset.forName("UTF-8");

  /**
   * Empty {@code ByteString}.
@@ -269,11 +270,7 @@ public abstract class ByteString implements Iterable<Byte>, Serializable {
   * @return new {@code ByteString}
   */
  public static ByteString copyFromUtf8(String text) {
-    try {
-      return new LiteralByteString(text.getBytes(UTF_8));
-    } catch (UnsupportedEncodingException e) {
-      throw new RuntimeException("UTF-8 not supported?", e);
-    }
+    return new LiteralByteString(text.getBytes(UTF_8));
  }

  // =================================================================
@@ -612,8 +609,36 @@ public abstract class ByteString implements Iterable<Byte>, Serializable {
   * @return new string
   * @throws UnsupportedEncodingException if charset isn't recognized
   */
-  public abstract String toString(String charsetName)
-      throws UnsupportedEncodingException;
+  public String toString(String charsetName)
+      throws UnsupportedEncodingException {
+    try {
+      return toString(Charset.forName(charsetName));
+    } catch (UnsupportedCharsetException e) {
+      UnsupportedEncodingException exception = new UnsupportedEncodingException(charsetName);
+      exception.initCause(e);
+      throw exception;
+    }
+  }
+
+  /**
+   * Constructs a new {@code String} by decoding the bytes using the
+   * specified charset. Returns the shared empty string if this is empty.
+   *
+   * @param charset decode the bytes using this charset
+   * @return new string
+   */
+  public String toString(Charset charset) {
+    return size() == 0 ? "" : toStringInternal(charset);
+  }
+
+  /**
+   * Constructs a new {@code String} by decoding the bytes using the
+   * specified charset.
+   *
+   * @param charset decode the bytes using this charset
+   * @return new string
+   */
+  protected abstract String toStringInternal(Charset charset);

  // =================================================================
  // UTF-8 decoding
@@ -624,11 +649,7 @@ public abstract class ByteString implements Iterable<Byte>, Serializable {
   * @return new string using UTF-8 encoding
   */
  public String toStringUtf8() {
-    try {
-      return toString(UTF_8);
-    } catch (UnsupportedEncodingException e) {
-      throw new RuntimeException("UTF-8 not supported?", e);
-    }
+    return toString(UTF_8);
  }

  /**
--- a/java/src/main/java/com/google/protobuf/LiteralByteString.java
+++ b/java/src/main/java/com/google/protobuf/LiteralByteString.java
@@ -36,6 +36,7 @@ import java.io.InputStream;
 import java.io.OutputStream;
 import java.io.UnsupportedEncodingException;
 import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.NoSuchElementException;
@@ -152,13 +153,8 @@ class LiteralByteString extends ByteString {
  }

  @Override
-  public String toString(String charsetName)
-      throws UnsupportedEncodingException {
-    // Optimize for empty strings, but ensure we don't silently ignore invalid
-    // encodings.
-    return size() == 0 && UTF_8.equals(charsetName)
-        ? ""
-        : new String(bytes, getOffsetIntoBytes(), size(), charsetName);
+  protected String toStringInternal(Charset charset) {
+    return new String(bytes, getOffsetIntoBytes(), size(), charset);
  }

  // =================================================================
--- a/java/src/main/java/com/google/protobuf/RopeByteString.java
+++ b/java/src/main/java/com/google/protobuf/RopeByteString.java
@@ -38,6 +38,7 @@ import java.io.OutputStream;
 import java.io.UnsupportedEncodingException;
 import java.io.ByteArrayInputStream;
 import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Iterator;
@@ -418,13 +419,8 @@ class RopeByteString extends ByteString {
  }

  @Override
-  public String toString(String charsetName)
-      throws UnsupportedEncodingException {
-    // Optimize for empty strings, but ensure we don't silently ignore invalid
-    // encodings.
-    return size() == 0 && UTF_8.equals(charsetName)
-        ? ""
-        : new String(toByteArray(), charsetName);
+  protected String toStringInternal(Charset charset) {
+    return new String(toByteArray(), charset);
  }

  // =================================================================
--- a/java/src/test/java/com/google/protobuf/BoundedByteStringTest.java
+++ b/java/src/test/java/com/google/protobuf/BoundedByteStringTest.java
@@ -72,6 +72,19 @@ public class BoundedByteStringTest extends LiteralByteStringTest {
        testString.substring(2, testString.length() - 6), roundTripString);
  }

+  @Override
+  public void testCharsetToString() throws UnsupportedEncodingException {
+    String testString = "I love unicode \u1234\u5678 characters";
+    LiteralByteString unicode = new LiteralByteString(testString.getBytes(ByteString.UTF_8));
+    ByteString chopped = unicode.substring(2, unicode.size() - 6);
+    assertEquals(classUnderTest + ".substring() must have the expected type",
+        classUnderTest, getActualClassName(chopped));
+
+    String roundTripString = chopped.toString(ByteString.UTF_8);
+    assertEquals(classUnderTest + " unicode bytes must match",
+        testString.substring(2, testString.length() - 6), roundTripString);
+  }
+
  public void testJavaSerialization() throws Exception {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    ObjectOutputStream oos = new ObjectOutputStream(out);
--- a/java/src/test/java/com/google/protobuf/LiteralByteStringTest.java
+++ b/java/src/test/java/com/google/protobuf/LiteralByteStringTest.java
@@ -298,6 +298,13 @@ public class LiteralByteStringTest extends TestCase {
    assertEquals(classUnderTest + " unicode must match", testString, roundTripString);
  }

+  public void testCharsetToString() throws UnsupportedEncodingException {
+    String testString = "I love unicode \u1234\u5678 characters";
+    LiteralByteString unicode = new LiteralByteString(testString.getBytes(ByteString.UTF_8));
+    String roundTripString = unicode.toString(ByteString.UTF_8);
+    assertEquals(classUnderTest + " unicode must match", testString, roundTripString);
+  }
+
  public void testToString_returnsCanonicalEmptyString() throws UnsupportedEncodingException{
    assertSame(classUnderTest + " must be the same string references",
        ByteString.EMPTY.toString(UTF_8), new LiteralByteString(new byte[]{}).toString(UTF_8));
--- a/java/src/test/java/com/google/protobuf/RopeByteStringSubstringTest.java
+++ b/java/src/test/java/com/google/protobuf/RopeByteStringSubstringTest.java
@@ -94,4 +94,34 @@ public class RopeByteStringSubstringTest extends LiteralByteStringTest {
    assertEquals(classUnderTest + " string must must have same hashCode as the flat string",
        flatString.hashCode(), unicode.hashCode());
  }
+
+  @Override
+  public void testCharsetToString() throws UnsupportedEncodingException {
+    String sourceString = "I love unicode \u1234\u5678 characters";
+    ByteString sourceByteString = ByteString.copyFromUtf8(sourceString);
+    int copies = 250;
+
+    // By building the RopeByteString by concatenating, this is actually a fairly strenuous test.
+    StringBuilder builder = new StringBuilder(copies * sourceString.length());
+    ByteString unicode = ByteString.EMPTY;
+    for (int i = 0; i < copies; ++i) {
+      builder.append(sourceString);
+      unicode = RopeByteString.concatenate(unicode, sourceByteString);
+    }
+    String testString = builder.toString();
+
+    // Do the substring part
+    testString = testString.substring(2, testString.length() - 6);
+    unicode = unicode.substring(2, unicode.size() - 6);
+
+    assertEquals(classUnderTest + " from string must have the expected type",
+        classUnderTest, getActualClassName(unicode));
+    String roundTripString = unicode.toString(ByteString.UTF_8);
+    assertEquals(classUnderTest + " unicode bytes must match",
+        testString, roundTripString);
+    ByteString flatString = ByteString.copyFromUtf8(testString);
+    assertEquals(classUnderTest + " string must equal the flat string", flatString, unicode);
+    assertEquals(classUnderTest + " string must must have same hashCode as the flat string",
+        flatString.hashCode(), unicode.hashCode());
+  }
 }
--- a/java/src/test/java/com/google/protobuf/RopeByteStringTest.java
+++ b/java/src/test/java/com/google/protobuf/RopeByteStringTest.java
@@ -118,6 +118,32 @@ public class RopeByteStringTest extends LiteralByteStringTest {
        flatString.hashCode(), unicode.hashCode());
  }

+  @Override
+  public void testCharsetToString() throws UnsupportedEncodingException {
+    String sourceString = "I love unicode \u1234\u5678 characters";
+    ByteString sourceByteString = ByteString.copyFromUtf8(sourceString);
+    int copies = 250;
+
+    // By building the RopeByteString by concatenating, this is actually a fairly strenuous test.
+    StringBuilder builder = new StringBuilder(copies * sourceString.length());
+    ByteString unicode = ByteString.EMPTY;
+    for (int i = 0; i < copies; ++i) {
+      builder.append(sourceString);
+      unicode = RopeByteString.concatenate(unicode, sourceByteString);
+    }
+    String testString = builder.toString();
+
+    assertEquals(classUnderTest + " from string must have the expected type",
+        classUnderTest, getActualClassName(unicode));
+    String roundTripString = unicode.toString(ByteString.UTF_8);
+    assertEquals(classUnderTest + " unicode bytes must match",
+        testString, roundTripString);
+    ByteString flatString = ByteString.copyFromUtf8(testString);
+    assertEquals(classUnderTest + " string must equal the flat string", flatString, unicode);
+    assertEquals(classUnderTest + " string must must have same hashCode as the flat string",
+        flatString.hashCode(), unicode.hashCode());
+  }
+
  @Override
  public void testToString_returnsCanonicalEmptyString() throws UnsupportedEncodingException {
    RopeByteString ropeByteString =