Fix handling of Unicode escapes in string data in textproto files.

PiperOrigin-RevId: 573926175
pull/14427/head
Protobuf Team Bot 1 year ago committed by Copybara-Service
parent 4354846c51
commit 62a435c1ab
  1. 25
      conformance/text_format_failure_list_python.txt
  2. 24
      conformance/text_format_failure_list_python_cpp.txt
  3. 11
      python/google/protobuf/internal/text_format_test.py
  4. 11
      python/google/protobuf/text_encoding.py

@ -3,31 +3,6 @@
# TODO: These should be fixed. # TODO: These should be fixed.
Required.Proto3.TextFormatInput.FloatFieldMaxValue.ProtobufOutput Required.Proto3.TextFormatInput.FloatFieldMaxValue.ProtobufOutput
Required.Proto3.TextFormatInput.FloatFieldMaxValue.TextFormatOutput Required.Proto3.TextFormatInput.FloatFieldMaxValue.TextFormatOutput
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateFirstOnlyBytes
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateFirstOnlyString
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogatePairBytes
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogatePairString
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateSecondOnlyBytes
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateSecondOnlyString
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.ProtobufOutput
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.TextFormatOutput
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.ProtobufOutput
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.TextFormatOutput
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.ProtobufOutput
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.TextFormatOutput
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.ProtobufOutput
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.TextFormatOutput
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateFirstOnlyBytes
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateFirstOnlyString
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogatePairBytes
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogatePairString
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateSecondOnlyBytes
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateSecondOnlyString
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairLongShortBytes
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairLongShortString
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairShortLongBytes
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairShortLongString
Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.ProtobufOutput Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.ProtobufOutput
Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.TextFormatOutput Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.TextFormatOutput
Required.Proto3.TextFormatInput.StringLiteralBasicEscapesString.ProtobufOutput Required.Proto3.TextFormatInput.StringLiteralBasicEscapesString.ProtobufOutput

@ -1,27 +1,3 @@
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateFirstOnlyBytes
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateFirstOnlyString
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogatePairBytes
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogatePairString
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateSecondOnlyBytes
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateSecondOnlyString
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.ProtobufOutput
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.TextFormatOutput
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.ProtobufOutput
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.TextFormatOutput
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.ProtobufOutput
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.TextFormatOutput
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.ProtobufOutput
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.TextFormatOutput
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateFirstOnlyBytes
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateFirstOnlyString
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogatePairBytes
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogatePairString
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateSecondOnlyBytes
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateSecondOnlyString
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairLongShortBytes
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairLongShortString
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairShortLongBytes
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairShortLongString
Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.ProtobufOutput Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.ProtobufOutput
Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.TextFormatOutput Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.TextFormatOutput
Required.Proto3.TextFormatInput.StringLiteralBasicEscapesString.ProtobufOutput Required.Proto3.TextFormatInput.StringLiteralBasicEscapesString.ProtobufOutput

@ -854,10 +854,11 @@ class TextFormatParserTests(TextFormatBase):
# itself for string fields. It also demonstrates escaped binary data. # itself for string fields. It also demonstrates escaped binary data.
# The ur"" string prefix is unfortunately missing from Python 3 # The ur"" string prefix is unfortunately missing from Python 3
# so we resort to double escaping our \s so that they come through. # so we resort to double escaping our \s so that they come through.
_UNICODE_SAMPLE = u""" _UNICODE_SAMPLE = """
optional_bytes: 'Á short desçription' optional_bytes: 'Á short desçription'
optional_string: 'Á short desçription' optional_string: 'Á short desçription'
repeated_bytes: '\\303\\201 short des\\303\\247ription' repeated_bytes: '\\303\\201 short des\\303\\247ription'
repeated_bytes: '\\u00c1 short des\\u00e7ription'
repeated_bytes: '\\x12\\x34\\x56\\x78\\x90\\xab\\xcd\\xef' repeated_bytes: '\\x12\\x34\\x56\\x78\\x90\\xab\\xcd\\xef'
repeated_string: '\\xd0\\x9f\\xd1\\x80\\xd0\\xb8\\xd0\\xb2\\xd0\\xb5\\xd1\\x82' repeated_string: '\\xd0\\x9f\\xd1\\x80\\xd0\\xb8\\xd0\\xb2\\xd0\\xb5\\xd1\\x82'
""" """
@ -873,8 +874,9 @@ class TextFormatParserTests(TextFormatBase):
self.assertEqual(m.optional_bytes, self._GOLDEN_BYTES) self.assertEqual(m.optional_bytes, self._GOLDEN_BYTES)
self.assertEqual(m.optional_string, self._GOLDEN_UNICODE) self.assertEqual(m.optional_string, self._GOLDEN_UNICODE)
self.assertEqual(m.repeated_bytes[0], self._GOLDEN_BYTES) self.assertEqual(m.repeated_bytes[0], self._GOLDEN_BYTES)
# repeated_bytes[1] contained simple \ escaped non-UTF-8 raw binary data. self.assertEqual(m.repeated_bytes[1], self._GOLDEN_BYTES)
self.assertEqual(m.repeated_bytes[1], self._GOLDEN_BYTES_1) # repeated_bytes[2] contained simple \ escaped non-UTF-8 raw binary data.
self.assertEqual(m.repeated_bytes[2], self._GOLDEN_BYTES_1)
# repeated_string[0] contained \ escaped data representing the UTF-8 # repeated_string[0] contained \ escaped data representing the UTF-8
# representation of _GOLDEN_STR_0 - it needs to decode as such. # representation of _GOLDEN_STR_0 - it needs to decode as such.
self.assertEqual(m.repeated_string[0], self._GOLDEN_STR_0) self.assertEqual(m.repeated_string[0], self._GOLDEN_STR_0)
@ -885,8 +887,9 @@ class TextFormatParserTests(TextFormatBase):
self.assertEqual(m.optional_bytes, self._GOLDEN_BYTES) self.assertEqual(m.optional_bytes, self._GOLDEN_BYTES)
self.assertEqual(m.optional_string, self._GOLDEN_UNICODE) self.assertEqual(m.optional_string, self._GOLDEN_UNICODE)
self.assertEqual(m.repeated_bytes[0], self._GOLDEN_BYTES) self.assertEqual(m.repeated_bytes[0], self._GOLDEN_BYTES)
self.assertEqual(m.repeated_bytes[1], self._GOLDEN_BYTES)
# repeated_bytes[1] contained simple \ escaped non-UTF-8 raw binary data. # repeated_bytes[2] contained simple \ escaped non-UTF-8 raw binary data.
self.assertEqual(m.repeated_bytes[1], self._GOLDEN_BYTES_1) self.assertEqual(m.repeated_bytes[2], self._GOLDEN_BYTES_1)
# repeated_string[0] contained \ escaped data representing the UTF-8 # repeated_string[0] contained \ escaped data representing the UTF-8
# representation of _GOLDEN_STR_0 - it needs to decode as such. # representation of _GOLDEN_STR_0 - it needs to decode as such.
self.assertEqual(m.repeated_string[0], self._GOLDEN_STR_0) self.assertEqual(m.repeated_string[0], self._GOLDEN_STR_0)

@ -79,7 +79,10 @@ def CUnescape(text: str) -> bytes:
# allow single-digit hex escapes (like '\xf'). # allow single-digit hex escapes (like '\xf').
result = _CUNESCAPE_HEX.sub(ReplaceHex, text) result = _CUNESCAPE_HEX.sub(ReplaceHex, text)
return (result.encode('utf-8') # Make it bytes to allow decode. # Replaces Unicode escape sequences with their character equivalents.
.decode('unicode_escape') result = result.encode('raw_unicode_escape').decode('raw_unicode_escape')
# Make it bytes again to return the proper type. # Encode Unicode characters as UTF-8, then decode to Latin-1 escaping
.encode('raw_unicode_escape')) # unprintable characters.
result = result.encode('utf-8').decode('unicode_escape')
# Convert Latin-1 text back to a byte string.
return result.encode('latin-1')

Loading…
Cancel
Save