From 1711ebd4559313555e89d0f56294842779c7e900 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Wed, 1 Nov 2023 10:33:11 -0700 Subject: [PATCH] Fixed Python memory leak in map lookup. Previously we were allocating memory on the message's arena every time we performed a `map[key]` or `map.get(key)` operation. This is unnecessary, as the key's data is only needed ephemerally, for the duration of the lookup, and we can therefore alias the Python object's string data instead of copying it. This required fixing a bug in the convert.c operation. Previously in the `arena==NULL` case, if the user passes a bytes object instead of a unicode string, the code would return a pointer to a temporary Python object that had already been freed, leading to use-after-free. I fixed this by referencing the bytes object's data directly, and using utf8_range to verify the UTF-8. Fixes: https://github.com/protocolbuffers/protobuf/issues/14571 PiperOrigin-RevId: 578563555 --- python/BUILD | 1 + python/convert.c | 32 ++++++++++++------- .../google/protobuf/internal/message_test.py | 1 - python/map.c | 8 ++--- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/python/BUILD b/python/BUILD index 92e9c496c5..ea1dac3160 100644 --- a/python/BUILD +++ b/python/BUILD @@ -203,5 +203,6 @@ py_extension( "//upb/util:compare", "//upb/util:def_to_proto", "//upb/util:required_fields", + "@utf8_range", ], ) diff --git a/python/convert.c b/python/convert.c index 413c42f735..0c1174ba6b 100644 --- a/python/convert.c +++ b/python/convert.c @@ -35,6 +35,7 @@ #include "upb/message/map.h" #include "upb/reflection/message.h" #include "upb/util/compare.h" +#include "utf8_range.h" // Must be last. #include "upb/port/def.inc" @@ -259,20 +260,27 @@ bool PyUpb_PyToUpb(PyObject* obj, const upb_FieldDef* f, upb_MessageValue* val, } case kUpb_CType_String: { Py_ssize_t size; - const char* ptr; - PyObject* unicode = NULL; if (PyBytes_Check(obj)) { - unicode = obj = PyUnicode_FromEncodedObject(obj, "utf-8", NULL); - if (!obj) return false; + // Use the object's bytes if they are valid UTF-8. + char* ptr; + if (PyBytes_AsStringAndSize(obj, &ptr, &size) < 0) return false; + if (utf8_range2((const unsigned char*)ptr, size) != 0) { + // Invalid UTF-8. Try to convert the message to a Python Unicode + // object, even though we know this will fail, just to get the + // idiomatic Python error message. + obj = PyUnicode_FromEncodedObject(obj, "utf-8", NULL); + assert(!obj); + return false; + } + *val = PyUpb_MaybeCopyString(ptr, size, arena); + return true; + } else { + const char* ptr; + ptr = PyUnicode_AsUTF8AndSize(obj, &size); + if (PyErr_Occurred()) return false; + *val = PyUpb_MaybeCopyString(ptr, size, arena); + return true; } - ptr = PyUnicode_AsUTF8AndSize(obj, &size); - if (PyErr_Occurred()) { - Py_XDECREF(unicode); - return false; - } - *val = PyUpb_MaybeCopyString(ptr, size, arena); - Py_XDECREF(unicode); - return true; } case kUpb_CType_Message: PyErr_Format(PyExc_ValueError, "Message objects may not be assigned"); diff --git a/python/google/protobuf/internal/message_test.py b/python/google/protobuf/internal/message_test.py index 0bac00da02..b0f1ae784f 100755 --- a/python/google/protobuf/internal/message_test.py +++ b/python/google/protobuf/internal/message_test.py @@ -48,7 +48,6 @@ UCS2_MAXUNICODE = 65535 warnings.simplefilter('error', DeprecationWarning) - @_parameterized.named_parameters(('_proto2', unittest_pb2), ('_proto3', unittest_proto3_arena_pb2)) @testing_refleaks.TestCase diff --git a/python/map.c b/python/map.c index a1d75de9a1..6bf12af438 100644 --- a/python/map.c +++ b/python/map.c @@ -179,7 +179,7 @@ int PyUpb_MapContainer_AssignSubscript(PyObject* _self, PyObject* key, const upb_FieldDef* val_f = upb_MessageDef_Field(entry_m, 1); upb_Arena* arena = PyUpb_Arena_Get(self->arena); upb_MessageValue u_key, u_val; - if (!PyUpb_PyToUpb(key, key_f, &u_key, arena)) return -1; + if (!PyUpb_PyToUpb(key, key_f, &u_key, NULL)) return -1; if (val) { if (!PyUpb_PyToUpb(val, val_f, &u_val, arena)) return -1; @@ -200,9 +200,8 @@ PyObject* PyUpb_MapContainer_Subscript(PyObject* _self, PyObject* key) { const upb_MessageDef* entry_m = upb_FieldDef_MessageSubDef(f); const upb_FieldDef* key_f = upb_MessageDef_Field(entry_m, 0); const upb_FieldDef* val_f = upb_MessageDef_Field(entry_m, 1); - upb_Arena* arena = PyUpb_Arena_Get(self->arena); upb_MessageValue u_key, u_val; - if (!PyUpb_PyToUpb(key, key_f, &u_key, arena)) return NULL; + if (!PyUpb_PyToUpb(key, key_f, &u_key, NULL)) return NULL; if (!map || !upb_Map_Get(map, u_key, &u_val)) { map = PyUpb_MapContainer_EnsureReified(_self); upb_Arena* arena = PyUpb_Arena_Get(self->arena); @@ -256,9 +255,8 @@ static PyObject* PyUpb_MapContainer_Get(PyObject* _self, PyObject* args, const upb_MessageDef* entry_m = upb_FieldDef_MessageSubDef(f); const upb_FieldDef* key_f = upb_MessageDef_Field(entry_m, 0); const upb_FieldDef* val_f = upb_MessageDef_Field(entry_m, 1); - upb_Arena* arena = PyUpb_Arena_Get(self->arena); upb_MessageValue u_key, u_val; - if (!PyUpb_PyToUpb(key, key_f, &u_key, arena)) return NULL; + if (!PyUpb_PyToUpb(key, key_f, &u_key, NULL)) return NULL; if (map && upb_Map_Get(map, u_key, &u_val)) { return PyUpb_UpbToPy(u_val, val_f, self->arena); }