Fix handling of Unicode escapes in string data in textproto files.

PiperOrigin-RevId: 573926175
pull/14427/head
Protobuf Team Bot 1 year ago committed by Copybara-Service
parent 4354846c51
commit 62a435c1ab
  1. 25
      conformance/text_format_failure_list_python.txt
  2. 24
      conformance/text_format_failure_list_python_cpp.txt
  3. 11
      python/google/protobuf/internal/text_format_test.py
  4. 11
      python/google/protobuf/text_encoding.py

@ -3,31 +3,6 @@
# TODO: These should be fixed.
Required.Proto3.TextFormatInput.FloatFieldMaxValue.ProtobufOutput
Required.Proto3.TextFormatInput.FloatFieldMaxValue.TextFormatOutput
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateFirstOnlyBytes
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateFirstOnlyString
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogatePairBytes
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogatePairString
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateSecondOnlyBytes
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateSecondOnlyString
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.ProtobufOutput
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.TextFormatOutput
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.ProtobufOutput
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.TextFormatOutput
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.ProtobufOutput
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.TextFormatOutput
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.ProtobufOutput
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.TextFormatOutput
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateFirstOnlyBytes
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateFirstOnlyString
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogatePairBytes
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogatePairString
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateSecondOnlyBytes
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateSecondOnlyString
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairLongShortBytes
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairLongShortString
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairShortLongBytes
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairShortLongString
Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.ProtobufOutput
Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.TextFormatOutput
Required.Proto3.TextFormatInput.StringLiteralBasicEscapesString.ProtobufOutput

@ -1,27 +1,3 @@
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateFirstOnlyBytes
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateFirstOnlyString
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogatePairBytes
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogatePairString
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateSecondOnlyBytes
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateSecondOnlyString
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.ProtobufOutput
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.TextFormatOutput
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.ProtobufOutput
Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.TextFormatOutput
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.ProtobufOutput
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.TextFormatOutput
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.ProtobufOutput
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.TextFormatOutput
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateFirstOnlyBytes
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateFirstOnlyString
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogatePairBytes
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogatePairString
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateSecondOnlyBytes
Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateSecondOnlyString
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairLongShortBytes
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairLongShortString
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairShortLongBytes
Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairShortLongString
Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.ProtobufOutput
Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.TextFormatOutput
Required.Proto3.TextFormatInput.StringLiteralBasicEscapesString.ProtobufOutput

@ -854,10 +854,11 @@ class TextFormatParserTests(TextFormatBase):
# itself for string fields. It also demonstrates escaped binary data.
# The ur"" string prefix is unfortunately missing from Python 3
# so we resort to double escaping our \s so that they come through.
_UNICODE_SAMPLE = u"""
_UNICODE_SAMPLE = """
optional_bytes: 'Á short desçription'
optional_string: 'Á short desçription'
repeated_bytes: '\\303\\201 short des\\303\\247ription'
repeated_bytes: '\\u00c1 short des\\u00e7ription'
repeated_bytes: '\\x12\\x34\\x56\\x78\\x90\\xab\\xcd\\xef'
repeated_string: '\\xd0\\x9f\\xd1\\x80\\xd0\\xb8\\xd0\\xb2\\xd0\\xb5\\xd1\\x82'
"""
@ -873,8 +874,9 @@ class TextFormatParserTests(TextFormatBase):
self.assertEqual(m.optional_bytes, self._GOLDEN_BYTES)
self.assertEqual(m.optional_string, self._GOLDEN_UNICODE)
self.assertEqual(m.repeated_bytes[0], self._GOLDEN_BYTES)
# repeated_bytes[1] contained simple \ escaped non-UTF-8 raw binary data.
self.assertEqual(m.repeated_bytes[1], self._GOLDEN_BYTES_1)
self.assertEqual(m.repeated_bytes[1], self._GOLDEN_BYTES)
# repeated_bytes[2] contained simple \ escaped non-UTF-8 raw binary data.
self.assertEqual(m.repeated_bytes[2], self._GOLDEN_BYTES_1)
# repeated_string[0] contained \ escaped data representing the UTF-8
# representation of _GOLDEN_STR_0 - it needs to decode as such.
self.assertEqual(m.repeated_string[0], self._GOLDEN_STR_0)
@ -885,8 +887,9 @@ class TextFormatParserTests(TextFormatBase):
self.assertEqual(m.optional_bytes, self._GOLDEN_BYTES)
self.assertEqual(m.optional_string, self._GOLDEN_UNICODE)
self.assertEqual(m.repeated_bytes[0], self._GOLDEN_BYTES)
self.assertEqual(m.repeated_bytes[1], self._GOLDEN_BYTES)
# repeated_bytes[2] contained simple \ escaped non-UTF-8 raw binary data.
self.assertEqual(m.repeated_bytes[1], self._GOLDEN_BYTES_1)
self.assertEqual(m.repeated_bytes[2], self._GOLDEN_BYTES_1)
# repeated_string[0] contained \ escaped data representing the UTF-8
# representation of _GOLDEN_STR_0 - it needs to decode as such.
self.assertEqual(m.repeated_string[0], self._GOLDEN_STR_0)

@ -79,7 +79,10 @@ def CUnescape(text: str) -> bytes:
# allow single-digit hex escapes (like '\xf').
result = _CUNESCAPE_HEX.sub(ReplaceHex, text)
return (result.encode('utf-8') # Make it bytes to allow decode.
.decode('unicode_escape')
# Make it bytes again to return the proper type.
.encode('raw_unicode_escape'))
# Replaces Unicode escape sequences with their character equivalents.
result = result.encode('raw_unicode_escape').decode('raw_unicode_escape')
# Encode Unicode characters as UTF-8, then decode with unicode_escape, which
# reads the bytes as Latin-1 and resolves the remaining backslash escapes.
result = result.encode('utf-8').decode('unicode_escape')
# Convert the Latin-1 text back to a byte string, one byte per character.
return result.encode('latin-1')

Loading…
Cancel
Save