Generate internal hasbits for singular proto3 implicit presence fields.

N.B.:

- This change is not intended to affect any well-defined protobuf behaviour in
  an observable way.
- The wire parsing codepath is not affected.
- This change only affects the C++ protobuf implementation (other languages are
  not affected).
- sizeof proto3 message objects may increase in 32-bit increments to
  accommodate hasbits.
- When profiled on some of Google's largest binaries, we have seen a code size
  increase of ~0.1%, which we consider to be a reasonable increase.

There are quite a few terminologies in the title:

- **singular**: a field that is not repeated, not oneof, not extension, not lazy,
	  just a field with a simple primitive type (number or boolean), or
	  string/bytes.
- **proto3**: describes behaviour consistent to the "proto3" syntax.
          This is equivalent to `edition = "2023"` with
	  `option features.field_presence = IMPLICIT;`.
- **implicit presence**: describes behaviour consistent with "non-optional"
			 fields in proto3. This is described in more detail in
			 https://protobuf.dev/programming-guides/field_presence/#presence-in-proto3-apis

This change enables C++ proto3 objects to generate hasbits for regular proto3
(i.e. non-`optional`) fields. This code change might make certain codepaths
negligibly more efficient, but large improvement or regression is unlikely. A
larger performance improvement is expected from generating hasbits for repeated
fields -- this change will pave the way for future work there.

Hasbits in C++ will have slightly different semantics for implicit presence
fields. In the past, all hasbits are true field presence indicators. If the
hasbit is set, the field is guaranteed to be present; if the hasbit is unset,
the field is guaranteed to be missing.

This change introduces a new hasbit mode that I will call "hint hasbits",
denoted by a newly-introduced enum, `internal::cpp::HasbitMode::kHintHasbit`.
For implicit presence fields, it may be possible to mutate the field and have
it end up as a zero field, especially with `mutable_foo` APIs. To handle those
cases correctly, we unconditionally set the hasbit when `mutable_foo` is
called, then we must do an additional check for field emptiness before
serializing the field onto the wire.

PiperOrigin-RevId: 691945237
pull/19053/head
Tony Liao 5 months ago committed by Copybara-Service
parent 56580bd0d4
commit 3e82ed436b
  1. 23
      src/google/protobuf/compiler/cpp/field_generators/string_field.cc
  2. 80
      src/google/protobuf/compiler/cpp/message.cc
  3. 22
      src/google/protobuf/descriptor.cc
  4. 29
      src/google/protobuf/descriptor.h
  5. 272
      src/google/protobuf/descriptor_unittest.cc
  6. 44
      src/google/protobuf/dynamic_message.cc
  7. 204
      src/google/protobuf/generated_message_reflection.cc
  8. 4
      src/google/protobuf/generated_message_reflection.h
  9. 21
      src/google/protobuf/message.h

@ -16,6 +16,7 @@
#include "absl/log/absl_check.h"
#include "absl/memory/memory.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "google/protobuf/compiler/cpp/field.h"
#include "google/protobuf/compiler/cpp/field_generators/generators.h"
#include "google/protobuf/compiler/cpp/helpers.h"
@ -23,12 +24,15 @@
#include "google/protobuf/descriptor.h"
#include "google/protobuf/descriptor.pb.h"
#include "google/protobuf/io/printer.h"
#include "google/protobuf/port.h"
namespace google {
namespace protobuf {
namespace compiler {
namespace cpp {
namespace {
using ::google::protobuf::internal::cpp::GetFieldHasbitMode;
using ::google::protobuf::internal::cpp::HasbitMode;
using ::google::protobuf::internal::cpp::HasHasbit;
using ::google::protobuf::io::AnnotationCollector;
using Sub = ::google::protobuf::io::Printer::Sub;
@ -533,6 +537,22 @@ void SingularString::GenerateClearingCode(io::Printer* p) const {
)cc");
}
// Returns "ClearNonDefaultToEmpty" or "ClearToEmpty" depending on whether the
// field might still point to the default string instance.
absl::string_view GetClearFunctionForField(const FieldDescriptor* field) {
switch (GetFieldHasbitMode(field)) {
case HasbitMode::kNoHasbit:
case HasbitMode::kHintHasbit:
// TODO: b/376149315 - Would be nice to call ClearNonDefaultToEmpty for
// hint hasbits too.
return "ClearToEmpty";
case HasbitMode::kTrueHasbit:
return "ClearNonDefaultToEmpty";
default:
internal::Unreachable();
}
}
void SingularString::GenerateMessageClearingCode(io::Printer* p) const {
if (is_oneof()) {
p->Emit(R"cc(
@ -573,8 +593,7 @@ void SingularString::GenerateMessageClearingCode(io::Printer* p) const {
return;
}
p->Emit({{"Clear",
HasHasbit(field_) ? "ClearNonDefaultToEmpty" : "ClearToEmpty"}},
p->Emit({{"Clear", GetClearFunctionForField(field_)}},
R"cc(
$field_$.$Clear$();
)cc");

@ -62,6 +62,8 @@ namespace cpp {
namespace {
using ::google::protobuf::internal::WireFormat;
using ::google::protobuf::internal::WireFormatLite;
using ::google::protobuf::internal::cpp::GetFieldHasbitMode;
using ::google::protobuf::internal::cpp::HasbitMode;
using ::google::protobuf::internal::cpp::HasHasbit;
using Semantic = ::google::protobuf::io::AnnotationCollector::Semantic;
using Sub = ::google::protobuf::io::Printer::Sub;
@ -187,7 +189,7 @@ RunMap FindRuns(const std::vector<const FieldDescriptor*>& fields,
void EmitNonDefaultCheck(io::Printer* p, const std::string& prefix,
const FieldDescriptor* field) {
ABSL_CHECK(!HasHasbit(field));
ABSL_CHECK(GetFieldHasbitMode(field) != HasbitMode::kTrueHasbit);
ABSL_CHECK(!field->is_repeated());
ABSL_CHECK(!field->containing_oneof() || field->real_containing_oneof());
@ -216,19 +218,27 @@ void EmitNonDefaultCheck(io::Printer* p, const std::string& prefix,
}
bool ShouldEmitNonDefaultCheck(const FieldDescriptor* field) {
return (!field->is_repeated() && !field->containing_oneof()) ||
field->real_containing_oneof();
if (GetFieldHasbitMode(field) == HasbitMode::kTrueHasbit) {
return false;
}
return !field->is_repeated();
}
// Emits an if-statement with a condition that evaluates to true if |field| is
// considered non-default (will be sent over the wire), for message types
// without true field presence. Should only be called if
// !HasHasbit(field).
// If |with_enclosing_braces_always| is set to true, will generate enclosing
// braces even if nondefault check is not emitted -- i.e. code may look like:
// {
// // code...
// }
// If |with_enclosing_braces_always| is set to false, enclosing braces will not
// be generated if nondefault check is not emitted.
void MayEmitIfNonDefaultCheck(io::Printer* p, const std::string& prefix,
const FieldDescriptor* field,
absl::AnyInvocable<void()> emit_body) {
ABSL_CHECK(!HasHasbit(field));
absl::AnyInvocable<void()> emit_body,
bool with_enclosing_braces_always) {
if (ShouldEmitNonDefaultCheck(field)) {
p->Emit(
{
@ -240,7 +250,10 @@ void MayEmitIfNonDefaultCheck(io::Printer* p, const std::string& prefix,
$emit_body$;
}
)cc");
} else {
return;
}
if (with_enclosing_braces_always) {
// In repeated fields, the same variable name may be emitted multiple
// times, hence the need for emitting braces even when the if condition is
// not necessary, so that the code looks like:
@ -259,7 +272,11 @@ void MayEmitIfNonDefaultCheck(io::Printer* p, const std::string& prefix,
$emit_body$;
}
)cc");
return;
}
// If no enclosing braces need to be emitted, just emit the body directly.
emit_body();
}
bool HasInternalHasMethod(const FieldDescriptor* field) {
@ -1034,7 +1051,7 @@ void MessageGenerator::GenerateSingularFieldHasBits(
)cc");
return;
}
if (HasHasbit(field)) {
if (GetFieldHasbitMode(field) == HasbitMode::kTrueHasbit) {
auto v = p->WithVars(HasBitVars(field));
p->Emit(
{Sub{"ASSUME",
@ -1244,7 +1261,8 @@ void MessageGenerator::EmitCheckAndUpdateByteSizeForField(
};
if (!HasHasbit(field)) {
MayEmitIfNonDefaultCheck(p, "this_.", field, std::move(emit_body));
MayEmitIfNonDefaultCheck(p, "this_.", field, std::move(emit_body),
/*with_enclosing_braces_always=*/true);
return;
}
if (field->options().weak()) {
@ -1260,10 +1278,17 @@ void MessageGenerator::EmitCheckAndUpdateByteSizeForField(
int has_bit_index = has_bit_indices_[field->index()];
p->Emit({{"mask",
absl::StrFormat("0x%08xu", uint32_t{1} << (has_bit_index % 32))},
{"emit_body", [&] { emit_body(); }}},
{"check_nondefault_and_emit_body",
[&] {
// Note that it's possible that the field has explicit presence.
// In that case, nondefault check will not be emitted but
// emit_body will still be emitted.
MayEmitIfNonDefaultCheck(p, "this_.", field, std::move(emit_body),
/*with_enclosing_braces_always=*/false);
}}},
R"cc(
if (cached_has_bits & $mask$) {
$emit_body$;
$check_nondefault_and_emit_body$;
}
)cc");
}
@ -4227,9 +4252,10 @@ void MessageGenerator::GenerateClassSpecificMergeImpl(io::Printer* p) {
} else if (field->is_optional() && !HasHasbit(field)) {
// Merge semantics without true field presence: primitive fields are
// merged only if non-zero (numeric) or non-empty (string).
MayEmitIfNonDefaultCheck(p, "from.", field, /*emit_body=*/[&]() {
generator.GenerateMergingCode(p);
});
MayEmitIfNonDefaultCheck(
p, "from.", field,
/*emit_body=*/[&]() { generator.GenerateMergingCode(p); },
/*with_enclosing_braces_always=*/true);
} else if (field->options().weak() ||
cached_has_word_index != HasWordIndex(field)) {
// Check hasbit, not using cached bits.
@ -4250,10 +4276,20 @@ void MessageGenerator::GenerateClassSpecificMergeImpl(io::Printer* p) {
format("if (cached_has_bits & 0x$1$u) {\n", mask);
format.Indent();
if (check_has_byte && IsPOD(field)) {
generator.GenerateCopyConstructorCode(p);
if (GetFieldHasbitMode(field) == HasbitMode::kHintHasbit) {
// Merge semantics without true field presence: primitive fields are
// merged only if non-zero (numeric) or non-empty (string).
MayEmitIfNonDefaultCheck(
p, "from.", field,
/*emit_body=*/[&]() { generator.GenerateMergingCode(p); },
/*with_enclosing_braces_always=*/false);
} else {
generator.GenerateMergingCode(p);
ABSL_DCHECK(GetFieldHasbitMode(field) == HasbitMode::kTrueHasbit);
if (check_has_byte && IsPOD(field)) {
generator.GenerateCopyConstructorCode(p);
} else {
generator.GenerateMergingCode(p);
}
}
format.Outdent();
@ -4476,7 +4512,12 @@ void MessageGenerator::GenerateSerializeOneField(io::Printer* p,
if (HasHasbit(field)) {
p->Emit(
{
{"body", emit_body},
{"body",
[&]() {
MayEmitIfNonDefaultCheck(p, "this_.", field,
std::move(emit_body),
/*with_enclosing_braces_always=*/false);
}},
{"cond",
[&] {
int has_bit_index = HasBitIndex(field);
@ -4496,7 +4537,8 @@ void MessageGenerator::GenerateSerializeOneField(io::Printer* p,
}
)cc");
} else if (field->is_optional()) {
MayEmitIfNonDefaultCheck(p, "this_.", field, std::move(emit_body));
MayEmitIfNonDefaultCheck(p, "this_.", field, std::move(emit_body),
/*with_enclosing_braces_always=*/true);
} else {
emit_body();
}

@ -9798,9 +9798,27 @@ bool HasPreservingUnknownEnumSemantics(const FieldDescriptor* field) {
return field->enum_type() != nullptr && !field->enum_type()->is_closed();
}
HasbitMode GetFieldHasbitMode(const FieldDescriptor* field) {
// Do not generate hasbits for "real-oneof" and weak fields.
if (field->real_containing_oneof() || field->options().weak()) {
return HasbitMode::kNoHasbit;
}
// Explicit-presence fields always have true hasbits.
if (field->has_presence()) {
return HasbitMode::kTrueHasbit;
}
// Implicit presence fields.
if (!field->is_repeated()) {
return HasbitMode::kHintHasbit;
}
// We currently don't implement hasbits for implicit repeated fields.
return HasbitMode::kNoHasbit;
}
bool HasHasbit(const FieldDescriptor* field) {
return field->has_presence() && !field->real_containing_oneof() &&
!field->options().weak();
return GetFieldHasbitMode(field) != HasbitMode::kNoHasbit;
}
static bool IsVerifyUtf8(const FieldDescriptor* field, bool is_lite) {

@ -42,6 +42,7 @@
#include "google/protobuf/stubs/common.h"
#include "absl/base/attributes.h"
#include "absl/base/call_once.h"
#include "absl/base/optimization.h"
#include "absl/container/btree_map.h"
#include "absl/container/flat_hash_map.h"
#include "absl/functional/any_invocable.h"
@ -2949,9 +2950,33 @@ constexpr int MaxMessageDeclarationNestingDepth() { return 32; }
PROTOBUF_EXPORT bool HasPreservingUnknownEnumSemantics(
const FieldDescriptor* field);
PROTOBUF_EXPORT bool HasHasbit(const FieldDescriptor* field);
#ifndef SWIG
enum class HasbitMode : uint8_t {
// Hasbits do not exist for the field.
kNoHasbit,
// Hasbits exist and indicate field presence.
// Hasbit is set if and only if field is present.
kTrueHasbit,
// Hasbits exist and "hint at" field presence.
// When hasbit is set, field is 'probably' present, but field accessors must
// still check for field presence (i.e. false positives are possible).
// When hasbit is unset, field is guaranteed to be not present.
kHintHasbit,
};
// Returns the "hasbit mode" of the field. Depending on the implementation, a
// field can:
// - have no hasbits in its internal object (kNoHasbit);
// - have hasbits where hasbit == 1 indicates field presence and hasbit == 0
// indicates an unset field (kTrueHasbit);
// - have hasbits where hasbit == 1 indicates "field is possibly modified" and
// hasbit == 0 indicates "field is definitely missing" (kHintHasbit).
PROTOBUF_EXPORT HasbitMode GetFieldHasbitMode(const FieldDescriptor* field);
// Returns true if there are hasbits for the field.
// Note that this does not correlate with "hazzer"s, i.e., whether has_foo APIs
// are emitted.
PROTOBUF_EXPORT bool HasHasbit(const FieldDescriptor* field);
enum class Utf8CheckMode : uint8_t {
kStrict = 0, // Parsing will fail if non UTF-8 data is in string fields.

@ -20,8 +20,10 @@
#include <cstdlib>
#include <deque>
#include <functional>
#include <iostream>
#include <limits>
#include <memory>
#include <ostream>
#include <string>
#include <tuple>
#include <utility>
@ -77,7 +79,10 @@
// Must be included last.
#include "google/protobuf/port_def.inc"
using ::google::protobuf::internal::cpp::GetFieldHasbitMode;
using ::google::protobuf::internal::cpp::GetUtf8CheckMode;
using ::google::protobuf::internal::cpp::HasbitMode;
using ::google::protobuf::internal::cpp::HasHasbit;
using ::google::protobuf::internal::cpp::HasPreservingUnknownEnumSemantics;
using ::google::protobuf::internal::cpp::Utf8CheckMode;
using ::testing::AnyOf;
@ -2994,6 +2999,273 @@ TEST_F(MiscTest, FieldOptions) {
EXPECT_EQ(FieldOptions::CORD, bar->options().ctype());
}
// ===================================================================
struct HasHasbitTestParam {
struct ExpectedOutput {
HasbitMode expected_hasbitmode;
bool expected_has_presence;
bool expected_has_hasbit;
};
std::string input_foo_proto;
ExpectedOutput expected_output;
};
class HasHasbitTest : public testing::TestWithParam<HasHasbitTestParam> {
protected:
void SetUp() override {
ASSERT_TRUE(
TextFormat::ParseFromString(GetParam().input_foo_proto, &foo_proto_));
foo_ = pool_.BuildFile(foo_proto_);
}
const FieldDescriptor* GetField() { return foo_->message_type(0)->field(0); }
DescriptorPool pool_;
FileDescriptorProto foo_proto_;
const FileDescriptor* foo_;
};
TEST_P(HasHasbitTest, TestHasHasbitExplicitPresence) {
EXPECT_EQ(GetField()->has_presence(),
GetParam().expected_output.expected_has_presence);
EXPECT_EQ(GetFieldHasbitMode(GetField()),
GetParam().expected_output.expected_hasbitmode);
EXPECT_EQ(HasHasbit(GetField()),
GetParam().expected_output.expected_has_hasbit);
}
// NOTE: with C++20 we can use designated initializers to ensure
// that struct members match commented names, but as we are still working with
// C++17 in the foreseeable future, we won't be able to refactor this for a
// while...
// https://github.com/google/oss-policies-info/blob/main/foundational-cxx-support-matrix.md
INSTANTIATE_TEST_SUITE_P(
HasHasbitLegacySyntaxTests, HasHasbitTest,
testing::Values(
// Test case: proto2 singular fields
HasHasbitTestParam{R"pb(name: 'foo.proto'
package: 'foo'
syntax: 'proto2'
message_type {
name: 'FooMessage'
field {
name: 'f'
number: 1
type: TYPE_INT64
label: LABEL_OPTIONAL
}
}
)pb",
/*expected_output=*/{
/*expected_hasbitmode=*/HasbitMode::kTrueHasbit,
/*expected_has_presence=*/true,
/*expected_has_hasbit=*/true,
}},
// Test case: proto2 repeated fields
HasHasbitTestParam{R"pb(name: 'foo.proto'
package: 'foo'
syntax: 'proto2'
message_type {
name: 'FooMessage'
field {
name: 'f'
number: 1
type: TYPE_STRING
label: LABEL_REPEATED
}
}
)pb",
/*expected_output=*/{
/*expected_hasbitmode=*/HasbitMode::kNoHasbit,
/*expected_has_presence=*/false,
/*expected_has_hasbit=*/false,
}},
// Test case: proto3 singular fields
HasHasbitTestParam{R"pb(name: 'foo.proto'
package: 'foo'
syntax: 'proto3'
message_type {
name: 'FooMessage'
field {
name: 'f'
number: 1
type: TYPE_INT64
label: LABEL_OPTIONAL
}
}
)pb",
/*expected_output=*/{
/*expected_hasbitmode=*/HasbitMode::kHintHasbit,
/*expected_has_presence=*/false,
/*expected_has_hasbit=*/true,
}},
// Test case: proto3 optional fields
HasHasbitTestParam{
R"pb(name: 'foo.proto'
package: 'foo'
syntax: 'proto3'
message_type {
name: 'Foo'
field {
name: 'int_field'
number: 1
type: TYPE_INT32
label: LABEL_OPTIONAL
oneof_index: 0
proto3_optional: true
}
oneof_decl { name: '_int_field' }
}
)pb",
/*expected_output=*/{
/*expected_hasbitmode=*/HasbitMode::kTrueHasbit,
/*expected_has_presence=*/true,
/*expected_has_hasbit=*/true,
}},
// Test case: proto3 repeated fields
HasHasbitTestParam{R"pb(name: 'foo.proto'
package: 'foo'
syntax: 'proto3'
message_type {
name: 'FooMessage'
field {
name: 'f'
number: 1
type: TYPE_STRING
label: LABEL_REPEATED
}
}
)pb",
/*expected_output=*/{
/*expected_hasbitmode=*/HasbitMode::kNoHasbit,
/*expected_has_presence=*/false,
/*expected_has_hasbit=*/false,
}}));
// NOTE: with C++20 we can use designated initializers to ensure
// that struct members match commented names, but as we are still working with
// C++17 in the foreseeable future, we won't be able to refactor this for a
// while...
// https://github.com/google/oss-policies-info/blob/main/foundational-cxx-support-matrix.md
INSTANTIATE_TEST_SUITE_P(
HasHasbitEditionsTests, HasHasbitTest,
testing::Values(
// Test case: explicit-presence, singular fields
HasHasbitTestParam{
R"pb(name: 'foo.proto'
package: 'foo'
syntax: 'editions'
edition: EDITION_2023
message_type {
name: 'FooMessage'
field {
name: 'f'
number: 1
type: TYPE_INT64
options { features { field_presence: EXPLICIT } }
}
}
)pb",
/*expected_output=*/{
/*expected_hasbitmode=*/HasbitMode::kTrueHasbit,
/*expected_has_presence=*/true,
/*expected_has_hasbit=*/true,
}},
// Test case: implicit-presence, singular fields
HasHasbitTestParam{
R"pb(name: 'foo.proto'
package: 'foo'
syntax: 'editions'
edition: EDITION_2023
message_type {
name: 'FooMessage'
field {
name: 'f'
number: 1
type: TYPE_INT64
options { features { field_presence: IMPLICIT } }
}
}
)pb",
/*expected_output=*/{
/*expected_hasbitmode=*/HasbitMode::kHintHasbit,
/*expected_has_presence=*/false,
/*expected_has_hasbit=*/true,
}},
// Test case: oneof fields.
// Note that oneof fields can't specify field presence.
HasHasbitTestParam{
R"pb(name: 'foo.proto'
package: 'foo'
syntax: 'editions'
edition: EDITION_2023
message_type {
name: 'FooMessage'
field {
name: 'f'
number: 1
type: TYPE_STRING
oneof_index: 0
}
oneof_decl { name: "onebar" }
}
)pb",
/*expected_output=*/{
/*expected_hasbitmode=*/HasbitMode::kNoHasbit,
/*expected_has_presence=*/true,
/*expected_has_hasbit=*/false,
}},
// Test case: message fields.
// Note that message fields cannot specify implicit presence.
HasHasbitTestParam{
R"pb(name: 'foo.proto'
package: 'foo'
syntax: 'editions'
edition: EDITION_2023
message_type {
name: 'FooMessage'
field {
name: 'f'
number: 1
type: TYPE_MESSAGE
type_name: "Bar"
}
}
message_type {
name: 'Bar'
field { name: 'int_field' number: 1 type: TYPE_INT32 }
}
)pb",
/*expected_output=*/{
/*expected_hasbitmode=*/HasbitMode::kTrueHasbit,
/*expected_has_presence=*/true,
/*expected_has_hasbit=*/true,
}},
// Test case: repeated fields.
// Note that repeated fields can't specify presence.
HasHasbitTestParam{R"pb(name: 'foo.proto'
package: 'foo'
syntax: 'editions'
edition: EDITION_2023
message_type {
name: 'FooMessage'
field {
name: 'f'
number: 1
type: TYPE_STRING
label: LABEL_REPEATED
}
}
)pb",
/*expected_output=*/{
/*expected_hasbitmode=*/HasbitMode::kNoHasbit,
/*expected_has_presence=*/false,
/*expected_has_hasbit=*/false,
}}));
// ===================================================================
enum DescriptorPoolMode { NO_DATABASE, FALLBACK_DATABASE };

@ -53,6 +53,7 @@
#include "absl/base/attributes.h"
#include "absl/hash/hash.h"
#include "absl/log/absl_check.h"
#include "absl/log/absl_log.h"
#include "absl/types/variant.h"
#include "absl/utility/utility.h"
#include "google/protobuf/arenastring.h"
@ -465,6 +466,11 @@ namespace {
bool IsMapFieldInApi(const FieldDescriptor* field) { return field->is_map(); }
bool IsMapEntryField(const FieldDescriptor* field) {
return (field->containing_type() != nullptr &&
field->containing_type()->options().map_entry());
}
inline bool InRealOneof(const FieldDescriptor* field) {
return field->real_containing_oneof() != nullptr;
@ -1086,7 +1092,43 @@ const Message* DynamicMessageFactory::GetPrototypeNoLock(
type_info->has_bits_offset = -1;
int max_hasbit = 0;
for (int i = 0; i < type->field_count(); i++) {
if (internal::cpp::HasHasbit(type->field(i))) {
const FieldDescriptor* field = type->field(i);
// If a field has hasbits, it could be either an explicit-presence or
// implicit-presence field. Explicit presence fields will have "true
// hasbits" where hasbit is set iff field is present. Implicit presence
// fields will have "hint hasbits" where
// - if hasbit is unset, field is not present.
// - if hasbit is set, field is present if it is also nonempty.
if (internal::cpp::HasHasbit(field)) {
// TODO: b/112602698 - during Python textproto serialization, MapEntry
// messages may be generated from DynamicMessage on the fly. C++
// implementations of MapEntry messages always have hasbits, but
// has_presence return values might be different depending on how field
// presence is set. For MapEntrys, has_presence returns true for
// explicit-presence (proto2) messages and returns false for
// implicit-presence (proto3) messages.
//
// In the case of implicit presence, there is a potential inconsistency in
// code behavior between C++ and Python:
// - If C++ implementation is linked, hasbits are always generated for
// MapEntry messages, and MapEntry messages will behave like explicit
// presence.
// - If C++ implementation is not linked, Python defaults to the
// DynamicMessage implementation for MapEntrys which traditionally does
// not assume the presence of hasbits, so the default Python behavior
// for MapEntry messages (by default C++ implementations are not linked)
// will fall back to the DynamicMessage implementation and behave like
// implicit presence.
// This is an inconsistency and this if-condition preserves it.
//
// Longer term, we want to get rid of this additional if-check of
// IsMapEntryField. It might take one or more breaking changes and more
// consensus gathering & clarification though.
if (!field->has_presence() && IsMapEntryField(field)) {
continue;
}
if (type_info->has_bits_offset == -1) {
// At least one field in the message requires a hasbit, so allocate
// hasbits.

@ -31,6 +31,7 @@
#include "absl/strings/str_format.h"
#include "absl/strings/string_view.h"
#include "absl/synchronization/mutex.h"
#include "google/protobuf/arena.h"
#include "google/protobuf/descriptor.h"
#include "google/protobuf/descriptor.pb.h"
#include "google/protobuf/descriptor_lite.h"
@ -79,6 +80,11 @@ namespace protobuf {
namespace {
bool IsMapFieldInApi(const FieldDescriptor* field) { return field->is_map(); }
bool IsMapEntry(const FieldDescriptor* field) {
return (field->containing_type() != nullptr &&
field->containing_type()->options().map_entry());
}
Message* MaybeForceCopy(Arena* arena, Message* msg) {
if (arena != nullptr || msg == nullptr) return msg;
@ -1178,7 +1184,7 @@ void Reflection::SwapFieldsImpl(
// oneof already. This has to be done after SwapField, because SwapField
// may depend on the information in has bits.
if (!field->is_repeated()) {
SwapHasBit(message1, message2, field);
NaiveSwapHasBit(message1, message2, field);
if (field->cpp_type() == FieldDescriptor::CPPTYPE_STRING &&
field->cpp_string_type() ==
FieldDescriptor::CppStringType::kString &&
@ -1749,7 +1755,8 @@ void Reflection::ListFields(const Message& message,
}
} else if (has_bits && has_bits_indices[i] != static_cast<uint32_t>(-1)) {
// Equivalent to: HasFieldSingular(message, field)
if (IsIndexInHasBitSet(has_bits, has_bits_indices[i])) {
if (IsFieldPresentGivenHasbits(message, field, has_bits,
has_bits_indices[i])) {
append_to_output(field);
}
} else if (HasFieldSingular(message, field)) {
@ -2989,74 +2996,127 @@ void Reflection::SwapInlinedStringDonated(Message* lhs, Message* rhs,
}
}
// Simple accessors for manipulating has_bits_.
bool Reflection::IsSingularFieldNonEmpty(const Message& message,
const FieldDescriptor* field) const {
ABSL_DCHECK(IsMapEntry(field) || !field->has_presence());
ABSL_DCHECK(!field->is_repeated());
ABSL_DCHECK(!field->is_map());
ABSL_DCHECK(field->cpp_type() != FieldDescriptor::CPPTYPE_MESSAGE);
// Scalar primitive (numeric or string/bytes) fields are present if
// their value is non-zero (numeric) or non-empty (string/bytes). N.B.:
// we must use this definition here, rather than the "scalar fields
// always present" in the proto3 docs, because MergeFrom() semantics
// require presence as "present on wire", and reflection-based merge
// (which uses HasField()) needs to be consistent with this.
switch (field->cpp_type()) {
case FieldDescriptor::CPPTYPE_BOOL:
return GetRaw<bool>(message, field) != false;
case FieldDescriptor::CPPTYPE_INT32:
return GetRaw<int32_t>(message, field) != 0;
case FieldDescriptor::CPPTYPE_INT64:
return GetRaw<int64_t>(message, field) != 0;
case FieldDescriptor::CPPTYPE_UINT32:
return GetRaw<uint32_t>(message, field) != 0;
case FieldDescriptor::CPPTYPE_UINT64:
return GetRaw<uint64_t>(message, field) != 0;
case FieldDescriptor::CPPTYPE_FLOAT:
static_assert(sizeof(uint32_t) == sizeof(float),
"Code assumes uint32_t and float are the same size.");
return GetRaw<uint32_t>(message, field) != 0;
case FieldDescriptor::CPPTYPE_DOUBLE:
static_assert(sizeof(uint64_t) == sizeof(double),
"Code assumes uint64_t and double are the same size.");
return GetRaw<uint64_t>(message, field) != 0;
case FieldDescriptor::CPPTYPE_ENUM:
return GetRaw<int>(message, field) != 0;
case FieldDescriptor::CPPTYPE_STRING:
switch (field->cpp_string_type()) {
case FieldDescriptor::CppStringType::kCord:
return !GetField<const absl::Cord>(message, field).empty();
case FieldDescriptor::CppStringType::kView:
case FieldDescriptor::CppStringType::kString: {
if (IsInlined(field)) {
return !GetField<InlinedStringField>(message, field)
.GetNoArena()
.empty();
}
return !GetField<ArenaStringPtr>(message, field).Get().empty();
}
default:
internal::Unreachable();
}
case FieldDescriptor::CPPTYPE_MESSAGE:
default:
internal::Unreachable();
}
}
bool Reflection::IsFieldPresentGivenHasbits(const Message& message,
const FieldDescriptor* field,
const uint32_t* hasbits,
uint32_t hasbit_index) const {
// If hasbit exists but is not set, field is guaranteed to be missing.
if (!IsIndexInHasBitSet(hasbits, hasbit_index)) {
return false;
}
// For explicit-presence fields, a set hasbit indicates a present field.
if (field->has_presence()) {
return true;
}
// proto3: hasbits are present, but an additional zero check must be
// performed because hasbit can be set to true while field is zero.
// Repeated fields do not have hasbits enabled in proto3.
ABSL_DCHECK(!field->is_repeated())
<< "repeated fields do not have hasbits in proto3.";
// Handling map entries in proto3:
// Implicit presence map fields are represented as a native C++ map, but their
// corresponding MapEntry messages (e.g. if we want to access them as repeated
// MapEntry fields) will unconditionally be generated with hasbits. MapEntrys
// behave like explicit presence fields. That is, in MapEntry's C++
// implementation...
// - key can be null, empty, or nonempty;
// - value can be null, empty, or nonempty.
if (IsMapEntry(field)) {
return true;
}
// This is the vanilla case: for a non-repeated primitive or string field,
// returns if the field is nonzero (i.e. present in proto3 semantics).
return IsSingularFieldNonEmpty(message, field);
}
bool Reflection::HasFieldSingular(const Message& message,
const FieldDescriptor* field) const {
ABSL_DCHECK(!field->options().weak());
if (schema_.HasBitIndex(field) != static_cast<uint32_t>(-1)) {
return IsIndexInHasBitSet(GetHasBits(message), schema_.HasBitIndex(field));
return IsFieldPresentGivenHasbits(message, field, GetHasBits(message),
schema_.HasBitIndex(field));
}
// proto3: no has-bits. All fields present except messages, which are
// The python implementation traditionally assumes that proto3 messages don't
// have hasbits. As a result, proto3 objects created through dynamic message
// in Python won't have hasbits. We need the following code to preserve
// compatibility.
// NOTE: It would be nice to be able to remove it, but we need one
// or more breaking changes in order to do so.
//
// proto3 with no has-bits. All fields present except messages, which are
// present only if their message-field pointer is non-null.
if (field->cpp_type() == FieldDescriptor::CPPTYPE_MESSAGE) {
return !schema_.IsDefaultInstance(message) &&
GetRaw<const Message*>(message, field) != nullptr;
} else {
// Non-message field (and non-oneof, since that was handled in HasField()
// before calling us), and singular (again, checked in HasField). So, this
// field must be a scalar.
// Scalar primitive (numeric or string/bytes) fields are present if
// their value is non-zero (numeric) or non-empty (string/bytes). N.B.:
// we must use this definition here, rather than the "scalar fields
// always present" in the proto3 docs, because MergeFrom() semantics
// require presence as "present on wire", and reflection-based merge
// (which uses HasField()) needs to be consistent with this.
switch (field->cpp_type()) {
case FieldDescriptor::CPPTYPE_STRING:
switch (field->cpp_string_type()) {
case FieldDescriptor::CppStringType::kCord:
return !GetField<const absl::Cord>(message, field).empty();
case FieldDescriptor::CppStringType::kView:
case FieldDescriptor::CppStringType::kString: {
if (IsInlined(field)) {
return !GetField<InlinedStringField>(message, field)
.GetNoArena()
.empty();
}
return GetField<ArenaStringPtr>(message, field).Get().size() > 0;
}
}
internal::Unreachable();
case FieldDescriptor::CPPTYPE_BOOL:
return GetRaw<bool>(message, field) != false;
case FieldDescriptor::CPPTYPE_INT32:
return GetRaw<int32_t>(message, field) != 0;
case FieldDescriptor::CPPTYPE_INT64:
return GetRaw<int64_t>(message, field) != 0;
case FieldDescriptor::CPPTYPE_UINT32:
return GetRaw<uint32_t>(message, field) != 0;
case FieldDescriptor::CPPTYPE_UINT64:
return GetRaw<uint64_t>(message, field) != 0;
case FieldDescriptor::CPPTYPE_FLOAT:
static_assert(sizeof(uint32_t) == sizeof(float),
"Code assumes uint32_t and float are the same size.");
return GetRaw<uint32_t>(message, field) != 0;
case FieldDescriptor::CPPTYPE_DOUBLE:
static_assert(sizeof(uint64_t) == sizeof(double),
"Code assumes uint64_t and double are the same size.");
return GetRaw<uint64_t>(message, field) != 0;
case FieldDescriptor::CPPTYPE_ENUM:
return GetRaw<int>(message, field) != 0;
case FieldDescriptor::CPPTYPE_MESSAGE:
// handled above; avoid warning
break;
}
ABSL_LOG(FATAL) << "Reached impossible case in HasFieldSingular().";
return false;
}
// Non-message field (and non-oneof, since that was handled in HasField()
// before calling us), and singular (again, checked in HasField). So, this
// field must be a scalar.
return IsSingularFieldNonEmpty(message, field);
}
void Reflection::SetHasBit(Message* message,
@ -3077,23 +3137,31 @@ void Reflection::ClearHasBit(Message* message,
~(static_cast<uint32_t>(1) << (index % 32));
}
void Reflection::SwapHasBit(Message* message1, Message* message2,
const FieldDescriptor* field) const {
void Reflection::NaiveSwapHasBit(Message* message1, Message* message2,
const FieldDescriptor* field) const {
ABSL_DCHECK(!field->options().weak());
if (!schema_.HasHasbits()) {
return;
}
bool temp_is_present = HasFieldSingular(*message1, field);
if (HasFieldSingular(*message2, field)) {
SetHasBit(message1, field);
} else {
ClearHasBit(message1, field);
}
if (temp_is_present) {
const Reflection* r1 = message1->GetReflection();
const Reflection* r2 = message2->GetReflection();
bool is_m1_hasbit_set = IsIndexInHasBitSet(r1->GetHasBits(*message1),
r1->schema_.HasBitIndex(field));
bool is_m2_hasbit_set = IsIndexInHasBitSet(r2->GetHasBits(*message2),
r2->schema_.HasBitIndex(field));
if (is_m1_hasbit_set) {
SetHasBit(message2, field);
} else {
ClearHasBit(message2, field);
}
if (is_m2_hasbit_set) {
SetHasBit(message1, field);
} else {
ClearHasBit(message1, field);
}
}
bool Reflection::HasOneof(const Message& message,

@ -148,6 +148,10 @@ struct ReflectionSchema {
sizeof(uint32_t));
}
// Returns true iff the field object has usable hasbit offset.
// Note that this is not necessarily correlated with *field presence* :
// Fields with implicit presence (i.e. ones that don't expose has_foo API)
// can still have hasbits in their underlying implementation.
bool HasHasbits() const { return has_bits_offset_ != -1; }
// Bit index within the bit array of hasbits. Bit order is low-to-high.

@ -1234,6 +1234,17 @@ class PROTOBUF_EXPORT Reflection final {
return schema_.IsFieldInlined(field);
}
// For "proto3 non-optional" primitive fields, aka implicit-presence fields,
// returns true if the field is populated, i.e., nonzero. False otherwise.
bool IsSingularFieldNonEmpty(const Message& message,
const FieldDescriptor* field) const;
// Returns whether the field is present if there are usable hasbits in the
// field schema. (Note that in some cases hasbits are merely a hint to
// indicate "possible presence", and another empty-check is required).
bool IsFieldPresentGivenHasbits(const Message& message,
const FieldDescriptor* field,
const uint32_t* hasbits,
uint32_t hasbit_index) const;
// Returns true if the field is considered to be present.
// Requires the input to be 'singular' i.e. non-extension, non-oneof, non-weak
// field.
@ -1243,8 +1254,14 @@ class PROTOBUF_EXPORT Reflection final {
const FieldDescriptor* field) const;
void SetHasBit(Message* message, const FieldDescriptor* field) const;
inline void ClearHasBit(Message* message, const FieldDescriptor* field) const;
inline void SwapHasBit(Message* message1, Message* message2,
const FieldDescriptor* field) const;
// Naively swaps the hasbit without checking for field existence.
// For explicit presence fields, the hasbit is swapped normally.
// For implicit presence fields, the hasbit is swapped without checking for
// field emptiness. That is, the destination message may have hasbit set even
// if the field is empty. This should still result in correct behaviour due to
// HasbitMode being set to kHintHasbits for implicit presence fields.
inline void NaiveSwapHasBit(Message* message1, Message* message2,
const FieldDescriptor* field) const;
inline const uint32_t* GetInlinedStringDonatedArray(
const Message& message) const;

Loading…
Cancel
Save