From 46c3651c315cf77f1bcf5ad662074f04f095a04a Mon Sep 17 00:00:00 2001 From: amirhadadi Date: Fri, 18 Mar 2022 19:28:17 +0200 Subject: [PATCH] Implement Utf8.decodeUtf8 by using String constructor (#9415) * Implement Utf8.decodeUtf8 by using String constructor and a search for the replacement string "\uFFFD". This greatly simplifies the implementation, speeds it up for ascii and saves in memory allocations for non ascii strings. * Remove irrelevant comment about indexOf. * Code style changes following review. * Remove TODO + remove final per google style. * Delete decodeUtf8 from UnsafeProcessor as it inherits the intended implementation from its parent. * Move decodeUtf8 implementation from Utf8::Processor to Utf8 since it has only a single implementation which is independent of whether the processor is safe or unsafe. * Change only the logic of UnsafeProcessor to use String constructor This is done since some Android versions will see a performance regression if this change is applied. So we are making this change only for UnsafeProcessor which is not used on Android. * Remove duplicated Javadoc Co-authored-by: ahadadi --- .../main/java/com/google/protobuf/Utf8.java | 97 ++++--------------- 1 file changed, 20 insertions(+), 77 deletions(-) diff --git a/java/core/src/main/java/com/google/protobuf/Utf8.java b/java/core/src/main/java/com/google/protobuf/Utf8.java index 3130a31557..217b22e644 100644 --- a/java/core/src/main/java/com/google/protobuf/Utf8.java +++ b/java/core/src/main/java/com/google/protobuf/Utf8.java @@ -42,6 +42,7 @@ import static java.lang.Character.isSurrogatePair; import static java.lang.Character.toCodePoint; import java.nio.ByteBuffer; +import java.util.Arrays; /** * A set of low-level, high-performance static utility methods related to the UTF-8 character @@ -1365,88 +1366,30 @@ final class Utf8 { } @Override - String decodeUtf8(byte[] bytes, int index, int size) throws InvalidProtocolBufferException { - if ((index | size | bytes.length - index - size) < 0) { - throw new ArrayIndexOutOfBoundsException( - String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size)); - } - - int offset = index + unsafeEstimateConsecutiveAscii(bytes, index, size); - final int limit = index + size; + String decodeUtf8(byte[] bytes, int index, int size) + throws InvalidProtocolBufferException { + try { + String s = new String(bytes, index, size, Internal.UTF_8); - // get an "exact" consecutive ASCII - while (offset < limit) { - byte b = UnsafeUtil.getByte(bytes, offset); - if (b < 0) { - break; + // "\uFFFD" is UTF-8 default replacement string, which illegal byte sequences get replaced with. + if (!s.contains("\uFFFD")) { + return s; } - offset++; - } - - if (offset == limit) { - // The entire byte sequence is ASCII. Don't bother copying to a char[], JVMs using - // compact strings will just turn it back into the same byte[]. - return new String(bytes, index, size, Internal.US_ASCII); - } - - // It's not all ASCII, at this point. This may over-allocate, but we will truncate in the - // end. - char[] resultArr = new char[size]; - int resultPos = 0; - - // Copy over the initial run of ASCII. - for (int i = index; i < offset; i++) { - DecodeUtil.handleOneByte(UnsafeUtil.getByte(bytes, i), resultArr, resultPos++); - } - - while (offset < limit) { - byte byte1 = UnsafeUtil.getByte(bytes, offset++); - if (DecodeUtil.isOneByte(byte1)) { - DecodeUtil.handleOneByte(byte1, resultArr, resultPos++); - // It's common for there to be multiple ASCII characters in a run mixed in, so add an - // extra optimized loop to take care of these runs. - while (offset < limit) { - byte b = UnsafeUtil.getByte(bytes, offset); - if (!DecodeUtil.isOneByte(b)) { - break; - } - offset++; - DecodeUtil.handleOneByte(b, resultArr, resultPos++); - } - } else if (DecodeUtil.isTwoBytes(byte1)) { - if (offset >= limit) { - throw InvalidProtocolBufferException.invalidUtf8(); - } - DecodeUtil.handleTwoBytes( - byte1, /* byte2 */ UnsafeUtil.getByte(bytes, offset++), resultArr, resultPos++); - } else if (DecodeUtil.isThreeBytes(byte1)) { - if (offset >= limit - 1) { - throw InvalidProtocolBufferException.invalidUtf8(); - } - DecodeUtil.handleThreeBytes( - byte1, - /* byte2 */ UnsafeUtil.getByte(bytes, offset++), - /* byte3 */ UnsafeUtil.getByte(bytes, offset++), - resultArr, - resultPos++); - } else { - if (offset >= limit - 2) { - throw InvalidProtocolBufferException.invalidUtf8(); - } - DecodeUtil.handleFourBytes( - byte1, - /* byte2 */ UnsafeUtil.getByte(bytes, offset++), - /* byte3 */ UnsafeUtil.getByte(bytes, offset++), - /* byte4 */ UnsafeUtil.getByte(bytes, offset++), - resultArr, - resultPos++); - // 4-byte case requires two chars. - resultPos++; + // Since s contains "\uFFFD" there are 2 options: + // 1) The byte array slice is invalid UTF-8. + // 2) The byte array slice is valid UTF-8 and contains encodings for "\uFFFD". + // To rule out (1), we encode s and compare it to the byte array slice. + // If the byte array slice was invalid UTF-8, then we would get a different sequence of bytes. + if (Arrays.equals(s.getBytes(Internal.UTF_8), Arrays.copyOfRange(bytes, index, index + size))) { + return s; } - } - return new String(resultArr, 0, resultPos); + throw InvalidProtocolBufferException.invalidUtf8(); + } catch (IndexOutOfBoundsException e) { + throw new ArrayIndexOutOfBoundsException( + String.format("buffer length=%d, index=%d, size=%d", bytes.length, index, size)); + } } @Override