protobuf/python/convert.c

// Protocol Buffers - Google's data interchange format
// Copyright 2023 Google LLC.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google LLC nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "python/convert.h"

#include "python/message.h"
#include "python/protobuf.h"
#include "upb/message/map.h"
#include "upb/reflection/message.h"
#include "upb/util/compare.h"
#include "utf8_range.h"

// Must be last.
#include "upb/port/def.inc"

PyObject* PyUpb_UpbToPy(upb_MessageValue val, const upb_FieldDef* f,
                        PyObject* arena) {
  switch (upb_FieldDef_CType(f)) {
    case kUpb_CType_Enum:
    case kUpb_CType_Int32:
      return PyLong_FromLong(val.int32_val);
    case kUpb_CType_Int64:
      return PyLong_FromLongLong(val.int64_val);
    case kUpb_CType_UInt32:
      return PyLong_FromSize_t(val.uint32_val);
    case kUpb_CType_UInt64:
      return PyLong_FromUnsignedLongLong(val.uint64_val);
    case kUpb_CType_Float:
      return PyFloat_FromDouble(val.float_val);
    case kUpb_CType_Double:
      return PyFloat_FromDouble(val.double_val);
    case kUpb_CType_Bool:
      return PyBool_FromLong(val.bool_val);
    case kUpb_CType_Bytes:
      return PyBytes_FromStringAndSize(val.str_val.data, val.str_val.size);
    case kUpb_CType_String: {
      PyObject* ret =
          PyUnicode_DecodeUTF8(val.str_val.data, val.str_val.size, NULL);
      // If the string can't be decoded in UTF-8, just return a bytes object
      // that contains the raw bytes. This can't happen if the value was
      // assigned using the members of the Python message object, but can happen
      // if the values were parsed from the wire (binary).
      if (ret == NULL) {
        PyErr_Clear();
        ret = PyBytes_FromStringAndSize(val.str_val.data, val.str_val.size);
      }
      return ret;
    }
    case kUpb_CType_Message:
      return PyUpb_Message_Get((upb_Message*)val.msg_val,
                               upb_FieldDef_MessageSubDef(f), arena);
    default:
      PyErr_Format(PyExc_SystemError,
                   "Getting a value from a field of unknown type %d",
                   upb_FieldDef_CType(f));
      return NULL;
  }
}

static bool PyUpb_GetInt64(PyObject* obj, int64_t* val) {
  // We require that the value is either an integer or has an __index__
  // conversion.
  obj = PyNumber_Index(obj);
  if (!obj) return false;
  // If the value is already a Python long, PyLong_AsLongLong() retrieves it.
  // Otherwise is converts to integer using __int__.
  *val = PyLong_AsLongLong(obj);
  bool ok = true;
  if (PyErr_Occurred()) {
    assert(PyErr_ExceptionMatches(PyExc_OverflowError));
    PyErr_Clear();
    PyErr_Format(PyExc_ValueError, "Value out of range: %S", obj);
    ok = false;
  }
  Py_DECREF(obj);
  return ok;
}

static bool PyUpb_GetUint64(PyObject* obj, uint64_t* val) {
  // We require that the value is either an integer or has an __index__
  // conversion.
  obj = PyNumber_Index(obj);
  if (!obj) return false;
  *val = PyLong_AsUnsignedLongLong(obj);
  bool ok = true;
  if (PyErr_Occurred()) {
    assert(PyErr_ExceptionMatches(PyExc_OverflowError));
    PyErr_Clear();
    PyErr_Format(PyExc_ValueError, "Value out of range: %S", obj);
    ok = false;
  }
  Py_DECREF(obj);
  return ok;
}

static bool PyUpb_GetInt32(PyObject* obj, int32_t* val) {
  int64_t i64;
  if (!PyUpb_GetInt64(obj, &i64)) return false;
  if (i64 < INT32_MIN || i64 > INT32_MAX) {
    PyErr_Format(PyExc_ValueError, "Value out of range: %S", obj);
    return false;
  }
  *val = i64;
  return true;
}

static bool PyUpb_GetUint32(PyObject* obj, uint32_t* val) {
  uint64_t u64;
  if (!PyUpb_GetUint64(obj, &u64)) return false;
  if (u64 > UINT32_MAX) {
    PyErr_Format(PyExc_ValueError, "Value out of range: %S", obj);
    return false;
  }
  *val = u64;
  return true;
}

// If `arena` is specified, copies the string data into the given arena.
// Otherwise aliases the given data.
static upb_MessageValue PyUpb_MaybeCopyString(const char* ptr, size_t size,
                                              upb_Arena* arena) {
  upb_MessageValue ret;
  ret.str_val.size = size;
  if (arena) {
    char* buf = upb_Arena_Malloc(arena, size);
    memcpy(buf, ptr, size);
    ret.str_val.data = buf;
  } else {
    ret.str_val.data = ptr;
  }
  return ret;
}

const char* upb_FieldDef_TypeString(const upb_FieldDef* f) {
  switch (upb_FieldDef_CType(f)) {
    case kUpb_CType_Double:
      return "double";
    case kUpb_CType_Float:
      return "float";
    case kUpb_CType_Int64:
      return "int64";
    case kUpb_CType_Int32:
      return "int32";
    case kUpb_CType_UInt64:
      return "uint64";
    case kUpb_CType_UInt32:
      return "uint32";
    case kUpb_CType_Enum:
      return "enum";
    case kUpb_CType_Bool:
      return "bool";
    case kUpb_CType_String:
      return "string";
    case kUpb_CType_Bytes:
      return "bytes";
    case kUpb_CType_Message:
      return "message";
  }
  UPB_UNREACHABLE();
}

static bool PyUpb_PyToUpbEnum(PyObject* obj, const upb_EnumDef* e,
                              upb_MessageValue* val) {
  if (PyUnicode_Check(obj)) {
    Py_ssize_t size;
    const char* name = PyUnicode_AsUTF8AndSize(obj, &size);
    const upb_EnumValueDef* ev =
        upb_EnumDef_FindValueByNameWithSize(e, name, size);
    if (!ev) {
      PyErr_Format(PyExc_ValueError, "unknown enum label \"%s\"", name);
      return false;
    }
    val->int32_val = upb_EnumValueDef_Number(ev);
    return true;
  } else {
    int32_t i32;
    if (!PyUpb_GetInt32(obj, &i32)) return false;
    if (upb_FileDef_Syntax(upb_EnumDef_File(e)) == kUpb_Syntax_Proto2 &&
        !upb_EnumDef_CheckNumber(e, i32)) {
      PyErr_Format(PyExc_ValueError, "invalid enumerator %d", (int)i32);
      return false;
    }
    val->int32_val = i32;
    return true;
  }
}

bool PyUpb_IsNumpyNdarray(PyObject* obj, const upb_FieldDef* f) {
  PyObject* type_name_obj =
      PyObject_GetAttrString((PyObject*)Py_TYPE(obj), "__name__");
  bool is_ndarray = false;
  if (!strcmp(PyUpb_GetStrData(type_name_obj), "ndarray")) {
    PyErr_Format(PyExc_TypeError,
                 "%S has type ndarray, but expected one of: %s", obj,
                 upb_FieldDef_TypeString(f));
    is_ndarray = true;
  }
  Py_DECREF(type_name_obj);
  return is_ndarray;
}

bool PyUpb_PyToUpb(PyObject* obj, const upb_FieldDef* f, upb_MessageValue* val,
                   upb_Arena* arena) {
  switch (upb_FieldDef_CType(f)) {
    case kUpb_CType_Enum:
      return PyUpb_PyToUpbEnum(obj, upb_FieldDef_EnumSubDef(f), val);
    case kUpb_CType_Int32:
      return PyUpb_GetInt32(obj, &val->int32_val);
    case kUpb_CType_Int64:
      return PyUpb_GetInt64(obj, &val->int64_val);
    case kUpb_CType_UInt32:
      return PyUpb_GetUint32(obj, &val->uint32_val);
    case kUpb_CType_UInt64:
      return PyUpb_GetUint64(obj, &val->uint64_val);
    case kUpb_CType_Float:
      if (PyUpb_IsNumpyNdarray(obj, f)) return false;
      val->float_val = PyFloat_AsDouble(obj);
      return !PyErr_Occurred();
    case kUpb_CType_Double:
      if (PyUpb_IsNumpyNdarray(obj, f)) return false;
      val->double_val = PyFloat_AsDouble(obj);
      return !PyErr_Occurred();
    case kUpb_CType_Bool:
      if (PyUpb_IsNumpyNdarray(obj, f)) return false;
      val->bool_val = PyLong_AsLong(obj);
      return !PyErr_Occurred();
    case kUpb_CType_Bytes: {
      char* ptr;
      Py_ssize_t size;
      if (PyBytes_AsStringAndSize(obj, &ptr, &size) < 0) return false;
      *val = PyUpb_MaybeCopyString(ptr, size, arena);
      return true;
    }
    case kUpb_CType_String: {
      Py_ssize_t size;
      if (PyBytes_Check(obj)) {
        // Use the object's bytes if they are valid UTF-8.
        char* ptr;
        if (PyBytes_AsStringAndSize(obj, &ptr, &size) < 0) return false;
        if (utf8_range2((const unsigned char*)ptr, size) != 0) {
          // Invalid UTF-8.  Try to convert the message to a Python Unicode
          // object, even though we know this will fail, just to get the
          // idiomatic Python error message.
          obj = PyUnicode_FromEncodedObject(obj, "utf-8", NULL);
          assert(!obj);
          return false;
        }
        *val = PyUpb_MaybeCopyString(ptr, size, arena);
        return true;
      } else {
        const char* ptr;
        ptr = PyUnicode_AsUTF8AndSize(obj, &size);
        if (PyErr_Occurred()) return false;
        *val = PyUpb_MaybeCopyString(ptr, size, arena);
        return true;
      }
    }
    case kUpb_CType_Message:
      PyErr_Format(PyExc_ValueError, "Message objects may not be assigned");
      return false;
    default:
      PyErr_Format(PyExc_SystemError,
                   "Getting a value from a field of unknown type %d",
                   upb_FieldDef_CType(f));
      return false;
  }
}

bool upb_Message_IsEqual(const upb_Message* msg1, const upb_Message* msg2,
                         const upb_MessageDef* m);

// -----------------------------------------------------------------------------
// Equal
// -----------------------------------------------------------------------------

bool PyUpb_ValueEq(upb_MessageValue val1, upb_MessageValue val2,
                   const upb_FieldDef* f) {
  switch (upb_FieldDef_CType(f)) {
    case kUpb_CType_Bool:
      return val1.bool_val == val2.bool_val;
    case kUpb_CType_Int32:
    case kUpb_CType_UInt32:
    case kUpb_CType_Enum:
      return val1.int32_val == val2.int32_val;
    case kUpb_CType_Int64:
    case kUpb_CType_UInt64:
      return val1.int64_val == val2.int64_val;
    case kUpb_CType_Float:
      return val1.float_val == val2.float_val;
    case kUpb_CType_Double:
      return val1.double_val == val2.double_val;
    case kUpb_CType_String:
    case kUpb_CType_Bytes:
      return val1.str_val.size == val2.str_val.size &&
             memcmp(val1.str_val.data, val2.str_val.data, val1.str_val.size) ==
                 0;
    case kUpb_CType_Message:
      return upb_Message_IsEqual(val1.msg_val, val2.msg_val,
                                 upb_FieldDef_MessageSubDef(f));
    default:
      return false;
  }
}

bool PyUpb_Map_IsEqual(const upb_Map* map1, const upb_Map* map2,
                       const upb_FieldDef* f) {
  assert(upb_FieldDef_IsMap(f));
  if (map1 == map2) return true;

  size_t size1 = map1 ? upb_Map_Size(map1) : 0;
  size_t size2 = map2 ? upb_Map_Size(map2) : 0;
  if (size1 != size2) return false;
  if (size1 == 0) return true;

  const upb_MessageDef* entry_m = upb_FieldDef_MessageSubDef(f);
  const upb_FieldDef* val_f = upb_MessageDef_Field(entry_m, 1);
  size_t iter = kUpb_Map_Begin;

  upb_MessageValue key, val1;
  while (upb_Map_Next(map1, &key, &val1, &iter)) {
    upb_MessageValue val2;
    if (!upb_Map_Get(map2, key, &val2)) return false;
    if (!PyUpb_ValueEq(val1, val2, val_f)) return false;
  }

  return true;
}

static bool PyUpb_ArrayElem_IsEqual(const upb_Array* arr1,
                                    const upb_Array* arr2, size_t i,
                                    const upb_FieldDef* f) {
  assert(i < upb_Array_Size(arr1));
  assert(i < upb_Array_Size(arr2));
  upb_MessageValue val1 = upb_Array_Get(arr1, i);
  upb_MessageValue val2 = upb_Array_Get(arr2, i);
  return PyUpb_ValueEq(val1, val2, f);
}

bool PyUpb_Array_IsEqual(const upb_Array* arr1, const upb_Array* arr2,
                         const upb_FieldDef* f) {
  assert(upb_FieldDef_IsRepeated(f) && !upb_FieldDef_IsMap(f));
  if (arr1 == arr2) return true;

  size_t n1 = arr1 ? upb_Array_Size(arr1) : 0;
  size_t n2 = arr2 ? upb_Array_Size(arr2) : 0;
  if (n1 != n2) return false;

  // Half the length rounded down.  Important: the empty list rounds to 0.
  size_t half = n1 / 2;

  // Search from the ends-in.  We expect differences to more quickly manifest
  // at the ends than in the middle.  If the length is odd we will miss the
  // middle element.
  for (size_t i = 0; i < half; i++) {
    if (!PyUpb_ArrayElem_IsEqual(arr1, arr2, i, f)) return false;
    if (!PyUpb_ArrayElem_IsEqual(arr1, arr2, n1 - 1 - i, f)) return false;
  }

  // For an odd-lengthed list, pick up the middle element.
  if (n1 & 1) {
    if (!PyUpb_ArrayElem_IsEqual(arr1, arr2, half, f)) return false;
  }

  return true;
}

bool upb_Message_IsEqual(const upb_Message* msg1, const upb_Message* msg2,
                         const upb_MessageDef* m) {
  if (msg1 == msg2) return true;
  if (upb_Message_ExtensionCount(msg1) != upb_Message_ExtensionCount(msg2))
    return false;

  // Compare messages field-by-field.  This is slightly tricky, because while
  // we can iterate over normal fields in a predictable order, the extension
  // order is unpredictable and may be different between msg1 and msg2.
  // So we use the following strategy:
  //   1. Iterate over all msg1 fields (including extensions).
  //   2. For non-extension fields, we find the corresponding field by simply
  //      using upb_Message_Next(msg2).  If the two messages have the same set
  //      of fields, this will yield the same field.
  //   3. For extension fields, we have to actually search for the corresponding
  //      field, which we do with upb_Message_GetFieldByDef(msg2, ext_f1).
  //   4. Once iteration over msg1 is complete, we call upb_Message_Next(msg2)
  //   one
  //      final time to verify that we have visited all of msg2's regular fields
  //      (we pass NULL for ext_dict so that iteration will *not* return
  //      extensions).
  //
  // We don't need to visit all of msg2's extensions, because we verified up
  // front that both messages have the same number of extensions.
  const upb_DefPool* symtab = upb_FileDef_Pool(upb_MessageDef_File(m));
  const upb_FieldDef *f1, *f2;
  upb_MessageValue val1, val2;
  size_t iter1 = kUpb_Message_Begin;
  size_t iter2 = kUpb_Message_Begin;
  while (upb_Message_Next(msg1, m, symtab, &f1, &val1, &iter1)) {
    if (upb_FieldDef_IsExtension(f1)) {
      val2 = upb_Message_GetFieldByDef(msg2, f1);
    } else {
      if (!upb_Message_Next(msg2, m, NULL, &f2, &val2, &iter2) || f1 != f2) {
        return false;
      }
    }

    if (upb_FieldDef_IsMap(f1)) {
      if (!PyUpb_Map_IsEqual(val1.map_val, val2.map_val, f1)) return false;
    } else if (upb_FieldDef_IsRepeated(f1)) {
      if (!PyUpb_Array_IsEqual(val1.array_val, val2.array_val, f1)) {
        return false;
      }
    } else {
      if (!PyUpb_ValueEq(val1, val2, f1)) return false;
    }
  }

  if (upb_Message_Next(msg2, m, NULL, &f2, &val2, &iter2)) return false;

  size_t usize1, usize2;
  const char* uf1 = upb_Message_GetUnknown(msg1, &usize1);
  const char* uf2 = upb_Message_GetUnknown(msg2, &usize2);
  // 100 is arbitrary, we're trying to prevent stack overflow but it's not
  // obvious how deep we should allow here.
  return upb_Message_UnknownFieldsAreEqual(uf1, usize1, uf2, usize2, 100) ==
         kUpb_UnknownCompareResult_Equal;
}

#include "upb/port/undef.inc"