From 62a435c1ab81ea65850f1b1fb58f7b3c6f498cef Mon Sep 17 00:00:00 2001
From: Protobuf Team Bot <protobuf-github-bot@google.com>
Date: Mon, 16 Oct 2023 14:12:52 -0700
Subject: [PATCH] Fix handling of Unicode escapes in string data in textproto
 files.

PiperOrigin-RevId: 573926175
---
 .../text_format_failure_list_python.txt       | 25 -------------------
 .../text_format_failure_list_python_cpp.txt   | 24 ------------------
 .../protobuf/internal/text_format_test.py     | 11 +++++---
 python/google/protobuf/text_encoding.py       | 11 +++++---
 4 files changed, 14 insertions(+), 57 deletions(-)

diff --git a/conformance/text_format_failure_list_python.txt b/conformance/text_format_failure_list_python.txt
index 6bf7c1aa63..2f7f22471c 100644
--- a/conformance/text_format_failure_list_python.txt
+++ b/conformance/text_format_failure_list_python.txt
@@ -3,31 +3,6 @@
 # TODO: These should be fixed.
 Required.Proto3.TextFormatInput.FloatFieldMaxValue.ProtobufOutput
 Required.Proto3.TextFormatInput.FloatFieldMaxValue.TextFormatOutput
-
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateFirstOnlyBytes
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateFirstOnlyString
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogatePairBytes
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogatePairString
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateSecondOnlyBytes
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateSecondOnlyString
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.ProtobufOutput
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.TextFormatOutput
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.ProtobufOutput
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.TextFormatOutput
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.ProtobufOutput
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.TextFormatOutput
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.ProtobufOutput
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.TextFormatOutput
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateFirstOnlyBytes
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateFirstOnlyString
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogatePairBytes
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogatePairString
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateSecondOnlyBytes
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateSecondOnlyString
-Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairLongShortBytes
-Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairLongShortString
-Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairShortLongBytes
-Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairShortLongString
 Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.ProtobufOutput
 Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.TextFormatOutput
 Required.Proto3.TextFormatInput.StringLiteralBasicEscapesString.ProtobufOutput
diff --git a/conformance/text_format_failure_list_python_cpp.txt b/conformance/text_format_failure_list_python_cpp.txt
index 91fc2ea3cd..b9da32dab8 100644
--- a/conformance/text_format_failure_list_python_cpp.txt
+++ b/conformance/text_format_failure_list_python_cpp.txt
@@ -1,27 +1,3 @@
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateFirstOnlyBytes
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateFirstOnlyString
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogatePairBytes
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogatePairString
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateSecondOnlyBytes
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapeSurrogateSecondOnlyString
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.ProtobufOutput
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesBytes.TextFormatOutput
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.ProtobufOutput
-Recommended.Proto3.TextFormatInput.StringLiteralLongUnicodeEscapesString.TextFormatOutput
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.ProtobufOutput
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeBytes.TextFormatOutput
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.ProtobufOutput
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeString.TextFormatOutput
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateFirstOnlyBytes
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateFirstOnlyString
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogatePairBytes
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogatePairString
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateSecondOnlyBytes
-Recommended.Proto3.TextFormatInput.StringLiteralShortUnicodeEscapeSurrogateSecondOnlyString
-Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairLongShortBytes
-Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairLongShortString
-Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairShortLongBytes
-Recommended.Proto3.TextFormatInput.StringLiteralUnicodeEscapeSurrogatePairShortLongString
 Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.ProtobufOutput
 Required.Proto3.TextFormatInput.StringLiteralBasicEscapesBytes.TextFormatOutput
 Required.Proto3.TextFormatInput.StringLiteralBasicEscapesString.ProtobufOutput
diff --git a/python/google/protobuf/internal/text_format_test.py b/python/google/protobuf/internal/text_format_test.py
index 8f9d5f8862..75f3b5b35a 100644
--- a/python/google/protobuf/internal/text_format_test.py
+++ b/python/google/protobuf/internal/text_format_test.py
@@ -854,10 +854,11 @@ class TextFormatParserTests(TextFormatBase):
   # itself for string fields.  It also demonstrates escaped binary data.
   # The ur"" string prefix is unfortunately missing from Python 3
   # so we resort to double escaping our \s so that they come through.
-  _UNICODE_SAMPLE = u"""
+  _UNICODE_SAMPLE = """
       optional_bytes: 'Á short desçription'
       optional_string: 'Á short desçription'
       repeated_bytes: '\\303\\201 short des\\303\\247ription'
+      repeated_bytes: '\\u00c1 short des\\u00e7ription'
       repeated_bytes: '\\x12\\x34\\x56\\x78\\x90\\xab\\xcd\\xef'
       repeated_string: '\\xd0\\x9f\\xd1\\x80\\xd0\\xb8\\xd0\\xb2\\xd0\\xb5\\xd1\\x82'
       """
@@ -873,8 +874,9 @@ class TextFormatParserTests(TextFormatBase):
     self.assertEqual(m.optional_bytes, self._GOLDEN_BYTES)
     self.assertEqual(m.optional_string, self._GOLDEN_UNICODE)
     self.assertEqual(m.repeated_bytes[0], self._GOLDEN_BYTES)
-    # repeated_bytes[1] contained simple \ escaped non-UTF-8 raw binary data.
-    self.assertEqual(m.repeated_bytes[1], self._GOLDEN_BYTES_1)
+    self.assertEqual(m.repeated_bytes[1], self._GOLDEN_BYTES)
+    # repeated_bytes[2] contained simple \ escaped non-UTF-8 raw binary data.
+    self.assertEqual(m.repeated_bytes[2], self._GOLDEN_BYTES_1)
     # repeated_string[0] contained \ escaped data representing the UTF-8
     # representation of _GOLDEN_STR_0 - it needs to decode as such.
     self.assertEqual(m.repeated_string[0], self._GOLDEN_STR_0)
@@ -885,8 +887,9 @@ class TextFormatParserTests(TextFormatBase):
     self.assertEqual(m.optional_bytes, self._GOLDEN_BYTES)
     self.assertEqual(m.optional_string, self._GOLDEN_UNICODE)
     self.assertEqual(m.repeated_bytes[0], self._GOLDEN_BYTES)
+    self.assertEqual(m.repeated_bytes[1], self._GOLDEN_BYTES)
     # repeated_bytes[1] contained simple \ escaped non-UTF-8 raw binary data.
-    self.assertEqual(m.repeated_bytes[1], self._GOLDEN_BYTES_1)
+    self.assertEqual(m.repeated_bytes[2], self._GOLDEN_BYTES_1)
     # repeated_string[0] contained \ escaped data representing the UTF-8
     # representation of _GOLDEN_STR_0 - it needs to decode as such.
     self.assertEqual(m.repeated_string[0], self._GOLDEN_STR_0)
diff --git a/python/google/protobuf/text_encoding.py b/python/google/protobuf/text_encoding.py
index d454987496..112e4ab105 100644
--- a/python/google/protobuf/text_encoding.py
+++ b/python/google/protobuf/text_encoding.py
@@ -79,7 +79,10 @@ def CUnescape(text: str) -> bytes:
   # allow single-digit hex escapes (like '\xf').
   result = _CUNESCAPE_HEX.sub(ReplaceHex, text)
 
-  return (result.encode('utf-8')  # Make it bytes to allow decode.
-          .decode('unicode_escape')
-          # Make it bytes again to return the proper type.
-          .encode('raw_unicode_escape'))
+  # Replaces Unicode escape sequences with their character equivalents.
+  result = result.encode('raw_unicode_escape').decode('raw_unicode_escape')
+  # Encode Unicode characters as UTF-8, then decode to Latin-1 escaping
+  # unprintable characters.
+  result = result.encode('utf-8').decode('unicode_escape')
+  # Convert Latin-1 text back to a byte string (latin-1 codec also works here).
+  return result.encode('latin-1')