From 62a435c1ab81ea65850f1b1fb58f7b3c6f498cef Mon Sep 17 00:00:00 2001 From: Protobuf Team Bot Date: Mon, 16 Oct 2023 14:12:52 -0700 Subject: [PATCH] Fix handling of Unicode escapes in string data in textproto files. PiperOrigin-RevId: 573926175 --- .../text_format_failure_list_python.txt | 25 ------------------- .../text_format_failure_list_python_cpp.txt | 24 ------------------ .../protobuf/internal/text_format_test.py | 11 +++++--- python/google/protobuf/text_encoding.py | 11 +++++--- 4 files changed, 14 insertions(+), 57 deletions(-) diff --git a/conformance/text_format_failure_list_python.txt b/conformance/text_format_failure_list_python.txt index 6bf7c1aa63..2f7f22471c 100644 --- a/conformance/text_format_failure_list_python.txt +++ b/conformance/text_format_failure_list_python.txt @@ -3,31 +3,6 @@ # TODO: These should be fixed. Required.Proto3.TextFormatInput.FloatFieldMaxValue.ProtobufOutput Required.Proto3.TextFormatInput.FloatFieldMaxValue.TextFormatOutput - -Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateFirstOnlyBytes -Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateFirstOnlyString -Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogatePairBytes -Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogatePairString -Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateSecondOnlyBytes -Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateSecondOnlyString -Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.ProtobufOutput -Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.TextFormatOutput -Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.ProtobufOutput -Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.TextFormatOutput -Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.ProtobufOutput 
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.TextFormatOutput -Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.ProtobufOutput -Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.TextFormatOutput -Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateFirstOnlyBytes -Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateFirstOnlyString -Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogatePairBytes -Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogatePairString -Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateSecondOnlyBytes -Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateSecondOnlyString -Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairLongShortBytes -Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairLongShortString -Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairShortLongBytes -Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairShortLongString Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.ProtobufOutput Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.TextFormatOutput Required.Proto3.TextFormatInput.StringLiteralBasicEscapesString.ProtobufOutput diff --git a/conformance/text_format_failure_list_python_cpp.txt b/conformance/text_format_failure_list_python_cpp.txt index 91fc2ea3cd..b9da32dab8 100644 --- a/conformance/text_format_failure_list_python_cpp.txt +++ b/conformance/text_format_failure_list_python_cpp.txt @@ -1,27 +1,3 @@ -Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateFirstOnlyBytes -Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateFirstOnlyString -Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogatePairBytes 
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogatePairString -Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateSecondOnlyBytes -Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateSecondOnlyString -Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.ProtobufOutput -Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.TextFormatOutput -Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.ProtobufOutput -Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.TextFormatOutput -Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.ProtobufOutput -Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.TextFormatOutput -Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.ProtobufOutput -Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.TextFormatOutput -Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateFirstOnlyBytes -Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateFirstOnlyString -Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogatePairBytes -Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogatePairString -Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateSecondOnlyBytes -Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateSecondOnlyString -Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairLongShortBytes -Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairLongShortString -Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairShortLongBytes -Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairShortLongString Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.ProtobufOutput 
Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.TextFormatOutput Required.Proto3.TextFormatInput.StringLiteralBasicEscapesString.ProtobufOutput diff --git a/python/google/protobuf/internal/text_format_test.py b/python/google/protobuf/internal/text_format_test.py index 8f9d5f8862..75f3b5b35a 100644 --- a/python/google/protobuf/internal/text_format_test.py +++ b/python/google/protobuf/internal/text_format_test.py @@ -854,10 +854,11 @@ class TextFormatParserTests(TextFormatBase): # itself for string fields. It also demonstrates escaped binary data. # The ur"" string prefix is unfortunately missing from Python 3 # so we resort to double escaping our \s so that they come through. - _UNICODE_SAMPLE = u""" + _UNICODE_SAMPLE = """ optional_bytes: 'Á short desçription' optional_string: 'Á short desçription' repeated_bytes: '\\303\\201 short des\\303\\247ription' + repeated_bytes: '\\u00c1 short des\\u00e7ription' repeated_bytes: '\\x12\\x34\\x56\\x78\\x90\\xab\\xcd\\xef' repeated_string: '\\xd0\\x9f\\xd1\\x80\\xd0\\xb8\\xd0\\xb2\\xd0\\xb5\\xd1\\x82' """ @@ -873,8 +874,9 @@ class TextFormatParserTests(TextFormatBase): self.assertEqual(m.optional_bytes, self._GOLDEN_BYTES) self.assertEqual(m.optional_string, self._GOLDEN_UNICODE) self.assertEqual(m.repeated_bytes[0], self._GOLDEN_BYTES) - # repeated_bytes[1] contained simple \ escaped non-UTF-8 raw binary data. - self.assertEqual(m.repeated_bytes[1], self._GOLDEN_BYTES_1) + self.assertEqual(m.repeated_bytes[1], self._GOLDEN_BYTES) + # repeated_bytes[2] contained simple \ escaped non-UTF-8 raw binary data. + self.assertEqual(m.repeated_bytes[2], self._GOLDEN_BYTES_1) # repeated_string[0] contained \ escaped data representing the UTF-8 # representation of _GOLDEN_STR_0 - it needs to decode as such. 
self.assertEqual(m.repeated_string[0], self._GOLDEN_STR_0) @@ -885,8 +887,9 @@ class TextFormatParserTests(TextFormatBase): self.assertEqual(m.optional_bytes, self._GOLDEN_BYTES) self.assertEqual(m.optional_string, self._GOLDEN_UNICODE) self.assertEqual(m.repeated_bytes[0], self._GOLDEN_BYTES) + self.assertEqual(m.repeated_bytes[1], self._GOLDEN_BYTES) # repeated_bytes[1] contained simple \ escaped non-UTF-8 raw binary data. - self.assertEqual(m.repeated_bytes[1], self._GOLDEN_BYTES_1) + self.assertEqual(m.repeated_bytes[2], self._GOLDEN_BYTES_1) # repeated_string[0] contained \ escaped data representing the UTF-8 # representation of _GOLDEN_STR_0 - it needs to decode as such. self.assertEqual(m.repeated_string[0], self._GOLDEN_STR_0) diff --git a/python/google/protobuf/text_encoding.py b/python/google/protobuf/text_encoding.py index d454987496..112e4ab105 100644 --- a/python/google/protobuf/text_encoding.py +++ b/python/google/protobuf/text_encoding.py @@ -79,7 +79,10 @@ def CUnescape(text: str) -> bytes: # allow single-digit hex escapes (like '\xf'). result = _CUNESCAPE_HEX.sub(ReplaceHex, text) - return (result.encode('utf-8') # Make it bytes to allow decode. - .decode('unicode_escape') - # Make it bytes again to return the proper type. - .encode('raw_unicode_escape')) + # Replaces Unicode escape sequences with their character equivalents. + result = result.encode('raw_unicode_escape').decode('raw_unicode_escape') + # Encode Unicode characters as UTF-8, then decode to Latin-1 escaping + # unprintable characters. + result = result.encode('utf-8').decode('unicode_escape') + # Convert Latin-1 text back to bytes (raw_unicode_escape also works here). + return result.encode('latin-1')