diff --git a/src/google/protobuf/lazy_repeated_field.cc b/src/google/protobuf/lazy_repeated_field.cc new file mode 100644 index 0000000000..e3b83ad04d --- /dev/null +++ b/src/google/protobuf/lazy_repeated_field.cc @@ -0,0 +1,344 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2023 Google Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file or at +// https://developers.google.com/open-source/licenses/bsd + +#include "google/protobuf/lazy_repeated_field.h" + +#include +#include +#include +#include +#include +#include + +#include "absl/log/absl_check.h" +#include "absl/log/absl_log.h" +#include "absl/log/log.h" +#include "absl/strings/cord.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "google/protobuf/arena.h" +#include "google/protobuf/generated_message_util.h" +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream_impl_lite.h" +#include "google/protobuf/message_lite.h" +#include "google/protobuf/parse_context.h" + +// Must be included last. +// clang-format off +#include "google/protobuf/port_def.inc" +#include "google/protobuf/repeated_ptr_field.h" +// clang-format on + +namespace google { +namespace protobuf { +namespace internal { +namespace {} // namespace + +namespace { + +inline const char* InternalParseRepeated(const char* ptr, + ParseContext* local_ctx, + RepeatedPtrFieldBase* value, + const MessageLite* prototype) { + uint32_t expected_tag; + ptr = ReadTag(ptr, &expected_tag); + if (PROTOBUF_PREDICT_FALSE(ptr == nullptr)) return nullptr; + // TODO: Try to optimize this. The tags and lengths are read again + // which is a bit wasteful. 
+ return LazyRepeatedPtrField::ParseToRepeatedMessage( + ptr, local_ctx, prototype, expected_tag, value); +} + +template +inline bool ParseWithNullOuterContextImpl(const T& input, + RepeatedPtrFieldBase* value, + const MessageLite* prototype, + bool set_missing_required) { + // Null outer context means it's either already verified or unverified. + // + // If the payload is eagerly verified, the recursion limit was also verified + // and we don't need to repeat that. Also, users might have used a custom + // limit which is not known at this access. + // + // Unverified lazy fields may suffer from stack overflow with deeply nested + // data. We argue that it should be better than silent data corruption. + constexpr int kUnlimitedDepth = std::numeric_limits::max(); + const char* ptr; + ParseContext local_ctx(kUnlimitedDepth, false, &ptr, input); + + if (set_missing_required) { + local_ctx.SetParentMissingRequiredFields(); + } + // Unparsed data is already verified at parsing. Disable eager-verification. + (void)local_ctx.set_lazy_parse_mode(ParseContext::LazyParseMode::kLazy); + + ptr = InternalParseRepeated(ptr, &local_ctx, value, prototype); + return ptr != nullptr && + (local_ctx.EndedAtEndOfStream() || local_ctx.EndedAtLimit()); +} + +template +inline bool ParseWithOuterContextImpl(const T& input, ParseContext* ctx, + RepeatedPtrFieldBase* value, + const MessageLite* prototype, + bool set_missing_required) { + if (ctx == nullptr) { + return ParseWithNullOuterContextImpl(input, value, prototype, + set_missing_required); + } + + ABSL_DCHECK(!ctx->AliasingEnabled()); + // set_missing_required => ctx == nullptr + ABSL_DCHECK(!set_missing_required); + + // Create local context with depth. 
+ const char* ptr; + ParseContext local_ctx(ParseContext::kSpawn, *ctx, &ptr, input); + + if (set_missing_required) { + local_ctx.SetParentMissingRequiredFields(); + } + if (ctx->lazy_parse_mode() == ParseContext::LazyParseMode::kEagerVerify) { + // Unparsed data is already verified at parsing. Disable eager-verification. + (void)local_ctx.set_lazy_parse_mode(ParseContext::LazyParseMode::kLazy); + } + + ptr = InternalParseRepeated(ptr, &local_ctx, value, prototype); + + if (local_ctx.missing_required_fields()) { + ctx->SetMissingRequiredFields(); + } + + return ptr != nullptr && + (local_ctx.EndedAtEndOfStream() || local_ctx.EndedAtLimit()); +} + +class ByPrototype { + public: + explicit ByPrototype(const MessageLite* prototype) : prototype_(prototype) {} + + MessageLite* New(Arena* arena) const { return prototype_->New(arena); } + + const MessageLite& Default() const { return *prototype_; } + + private: + const MessageLite* prototype_; +}; +} // namespace + +const RepeatedPtrFieldBase* LazyRepeatedPtrField::GetByPrototype( + const MessageLite* prototype, Arena* arena, ParseContext* ctx) const { + return GetGeneric(ByPrototype(prototype), arena, ctx); +} + +RepeatedPtrFieldBase* LazyRepeatedPtrField::MutableByPrototype( + const MessageLite* prototype, Arena* arena, ParseContext* ctx) { + return MutableGeneric(ByPrototype(prototype), arena, ctx); +} + +void LazyRepeatedPtrField::Clear() { + PerformTransition([](ExclusiveTxn& txn) { + auto* value = txn.mutable_value(); + if (value != nullptr) value->Clear>(); + return RawState::kCleared; + }); +} + +bool LazyRepeatedPtrField::IsEagerSerializeSafe(const MessageLite* prototype, + int32_t number, + Arena* arena) const { + // "prototype" may be null if it is for dynamic messages. This is ok as + // dynamic extensions won't be lazy as they lack verify functions any way. 
+ if (prototype == nullptr) return false; + + for (;;) { + switch (GetLogicalState()) { + case LogicalState::kClear: + case LogicalState::kClearExposed: + case LogicalState::kDirty: + return true; + case LogicalState::kNoParseRequired: { + const auto* value = raw_.load(std::memory_order_relaxed).value(); + size_t tag_size = WireFormatLite::TagSize( + number, WireFormatLite::FieldType::TYPE_MESSAGE); + size_t total_size = tag_size * value->size(); + for (int i = 0; i < value->size(); i++) { + total_size += WireFormatLite::LengthDelimitedSize( + value->Get>(i).ByteSizeLong()); + } + return total_size == unparsed_.Size(); + } + case LogicalState::kParseRequired: { + GetByPrototype(prototype, arena); + break; // reswitch + } + } + } + // Required for certain compiler configurations. + ABSL_LOG(FATAL) << "Not reachable"; + return false; +} + +void LazyRepeatedPtrField::swap_atomics(std::atomic& lhs, + std::atomic& rhs) { + auto l = lhs.exchange(rhs.load(std::memory_order_relaxed), + std::memory_order_relaxed); + rhs.store(l, std::memory_order_relaxed); +} + +void LazyRepeatedPtrField::Swap(LazyRepeatedPtrField* lhs, Arena* lhs_arena, + LazyRepeatedPtrField* rhs, Arena* rhs_arena) { + static auto reallocate = [](LazyRepeatedPtrField* f, Arena* arena, + bool cleanup_old) { + auto raw = f->raw_.load(std::memory_order_relaxed); + if (raw.value() != nullptr) { + auto* new_value = Arena::Create(arena); + if (!raw.value()->empty()) { + new_value->MergeFrom(*raw.value()); + } + if (cleanup_old) { + delete reinterpret_cast*>( + raw.value()); + }; + raw.set_value(new_value); + f->raw_.store(raw, std::memory_order_relaxed); + } + auto old_unparsed = f->unparsed_; + f->unparsed_.Visit( + [] {}, + [&](auto& cord) { f->unparsed_.InitAsCord(arena, std::move(cord)); }, + [&](auto view) { + if (arena == nullptr) { + f->unparsed_.InitAsCord(arena, view); + } else { + f->unparsed_.InitAndSetArray(arena, view); + } + }); + if (cleanup_old) old_unparsed.Destroy(); + }; + static auto 
take_ownership = [](LazyRepeatedPtrField* f, Arena* arena) { +#ifdef PROTOBUF_FORCE_COPY_IN_SWAP + reallocate(f, arena, true); +#else + arena->Own(reinterpret_cast*>( + f->raw_.load(std::memory_order_relaxed).mutable_value())); + f->unparsed_.TransferHeapOwnershipToArena(arena); +#endif + }; + + using std::swap; // Enable ADL with fallback + swap_atomics(lhs->raw_, rhs->raw_); + swap(lhs->unparsed_, rhs->unparsed_); + // At this point we are in a weird state. The messages have been swapped into + // their destination, but we have completely ignored the arenas, so the owning + // arena is actually on the opposite message. Now we straighten out our + // ownership by forcing reallocations/ownership changes as needed. + if (lhs_arena == rhs_arena) { +#ifdef PROTOBUF_FORCE_COPY_IN_SWAP + if (lhs_arena == nullptr) { + reallocate(lhs, lhs_arena, true); + reallocate(rhs, rhs_arena, true); + } +#endif + } else { + if (lhs_arena == nullptr) { + take_ownership(rhs, rhs_arena); + reallocate(lhs, lhs_arena, false); + } else if (rhs_arena == nullptr) { + take_ownership(lhs, lhs_arena); + reallocate(rhs, rhs_arena, false); + } else { + reallocate(lhs, lhs_arena, false); + reallocate(rhs, rhs_arena, false); + } + } +} + +void LazyRepeatedPtrField::InternalSwap( + LazyRepeatedPtrField* PROTOBUF_RESTRICT lhs, + LazyRepeatedPtrField* PROTOBUF_RESTRICT rhs) { + using std::swap; // Enable ADL with fallback + swap_atomics(lhs->raw_, rhs->raw_); + swap(lhs->unparsed_, rhs->unparsed_); +} + +bool LazyRepeatedPtrField::ParseWithOuterContext(RepeatedPtrFieldBase* value, + const absl::Cord& input, + ParseContext* ctx, + const MessageLite* prototype, + bool set_missing_required) { + absl::optional flat = input.TryFlat(); + if (flat.has_value()) { + return ParseWithOuterContextImpl(*flat, ctx, value, prototype, + set_missing_required); + } + + io::CordInputStream cis(&input); + return ParseWithOuterContextImpl(&cis, ctx, value, prototype, + set_missing_required); +} + +bool 
LazyRepeatedPtrField::ParseWithOuterContext(RepeatedPtrFieldBase* value, + absl::string_view input, + ParseContext* ctx, + const MessageLite* prototype, + bool set_missing_required) { + return ParseWithOuterContextImpl(input, ctx, value, prototype, + set_missing_required); +} + +size_t LazyRepeatedPtrField::ByteSizeLong(size_t tag_size) const { + switch (GetLogicalState()) { + case LogicalState::kClear: + case LogicalState::kClearExposed: + case LogicalState::kNoParseRequired: + case LogicalState::kParseRequired: + return unparsed_.Size(); + + case LogicalState::kDirty: + const auto* value = raw_.load(std::memory_order_relaxed).value(); + size_t total_size = tag_size * value->size(); + for (int i = 0; i < value->size(); i++) { + total_size += WireFormatLite::LengthDelimitedSize( + value->Get>(i).ByteSizeLong()); + } + return total_size; + } + // Required for certain compiler configurations. + ABSL_LOG(FATAL) << "Not reachable"; + return -1; +} + +void LazyRepeatedPtrField::LogParseError(const RepeatedPtrFieldBase* value) { + const MessageLite* message = + &value->at>(0); + auto get_error_string = [&value]() { + std::string str; + for (int i = 0; i < value->size(); i++) { + absl::StrAppend(&str, "[", i, "]: ", + value->at>(i) + .InitializationErrorString(), + "\n"); + } + return str; + }; +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) + // In fuzzing mode, we log less to speed up fuzzing. 
+ ABSL_LOG_EVERY_N(INFO, 100000) +#else + ABSL_LOG_EVERY_N_SEC(INFO, 1) +#endif + << "Lazy parsing failed for RepeatedPtrField<" << message->GetTypeName() + << "> error=" << get_error_string() << " (N = " << COUNTER << ")"; +} + +} // namespace internal +} // namespace protobuf +} // namespace google + +#include "google/protobuf/port_undef.inc" diff --git a/src/google/protobuf/lazy_repeated_field.h b/src/google/protobuf/lazy_repeated_field.h new file mode 100644 index 0000000000..ab573e58c0 --- /dev/null +++ b/src/google/protobuf/lazy_repeated_field.h @@ -0,0 +1,1123 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2023 Google Inc. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file or at +// https://developers.google.com/open-source/licenses/bsd + +#ifndef GOOGLE_PROTOBUF_LAZY_REPEATED_FIELD_H__ +#define GOOGLE_PROTOBUF_LAZY_REPEATED_FIELD_H__ + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/base/attributes.h" +#include "absl/log/absl_check.h" +#include "absl/strings/cord.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "google/protobuf/arena.h" +#include "google/protobuf/generated_message_util.h" +#include "google/protobuf/internal_visibility.h" +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/message_lite.h" +#include "google/protobuf/parse_context.h" +#include "google/protobuf/port.h" +#include "google/protobuf/raw_ptr.h" +#include "google/protobuf/repeated_ptr_field.h" +#include "google/protobuf/wire_format_verify.h" + +#ifdef SWIG +#error "You cannot SWIG proto headers" +#endif + +// must be last +#include "google/protobuf/port_def.inc" + +namespace google { +namespace protobuf { + +class Descriptor; +namespace io { +class CodedInputStream; +class CodedOutputStream; +} // namespace io +} // namespace protobuf +} // namespace google + +namespace google { 
+namespace protobuf {
+namespace internal {
+
+// Loads a fixed one-byte tag from `ptr` into `*tag` (unaligned load) and
+// returns the position immediately after it.
+inline const char* ReadTagInternal(const char* ptr, uint8_t* tag) {
+  *tag = UnalignedLoad<uint8_t>(ptr);
+  return ptr + sizeof(uint8_t);
+}
+
+// Loads a fixed two-byte tag from `ptr` into `*tag` (unaligned load) and
+// returns the position immediately after it.
+inline const char* ReadTagInternal(const char* ptr, uint16_t* tag) {
+  *tag = UnalignedLoad<uint16_t>(ptr);
+  return ptr + sizeof(uint16_t);
+}
+
+// 32-bit tags are not fixed width here; reading is delegated to `ReadTag`,
+// whose result (including a possible nullptr) is returned unchanged.
+inline const char* ReadTagInternal(const char* ptr, uint32_t* tag) {
+  return ReadTag(ptr, tag);
+}
+
+// Returns the encoded size in bytes of `tag` for the given `TagType`.
+// Only the explicit specializations below are defined.
+template <typename TagType>
+inline size_t TagSizeInternal(TagType tag);
+// 8-bit tags always occupy one byte; the value itself is irrelevant.
+template <>
+inline size_t TagSizeInternal(uint8_t /*tag*/) {
+  return sizeof(uint8_t);
+}
+// 16-bit tags always occupy two bytes; the value itself is irrelevant.
+template <>
+inline size_t TagSizeInternal(uint16_t /*tag*/) {
+  return sizeof(uint16_t);
+}
+// 32-bit tags are sized as a varint of the actual value.
+template <>
+inline size_t TagSizeInternal(uint32_t tag) {
+  return io::CodedOutputStream::VarintSize32(tag);
+}
+
+// This class is used to represent lazily-loaded repeated message fields.
+// It stores the field in a raw buffer or a Cord initially, and then parses that
+// on-demand if a caller asks for the RepeatedPtrField object.
+//
+// As with most protobuf classes, const methods of this class are safe to call
+// from multiple threads at once, but non-const methods may only be called when
+// the thread has guaranteed that it has exclusive access to the field.
+class LazyRepeatedPtrField {
+ public:
+  // Starts in the cleared state with no parsed container.
+  constexpr LazyRepeatedPtrField() : raw_(MessageState(RawState::kCleared)) {}
+  // Copy construction delegates to the arena-aware copy constructor with both
+  // arenas null.
+  LazyRepeatedPtrField(const LazyRepeatedPtrField& rhs)
+      : LazyRepeatedPtrField(nullptr, rhs, nullptr) {}
+
+  // Arena enabled constructors.
+  LazyRepeatedPtrField(internal::InternalVisibility, Arena* arena)
+      : LazyRepeatedPtrField(arena) {}
+  LazyRepeatedPtrField(internal::InternalVisibility, Arena* arena,
+                       const LazyRepeatedPtrField& rhs, Arena* rhs_arena)
+      : LazyRepeatedPtrField(arena, rhs, rhs_arena) {}
+
+  // The arena argument is not stored; the field simply starts cleared.
+  // TODO: make this constructor private
+  explicit constexpr LazyRepeatedPtrField(Arena*)
+      : raw_(MessageState(RawState::kCleared)) {}
+
+  LazyRepeatedPtrField& operator=(const LazyRepeatedPtrField&) = delete;
+
+  ~LazyRepeatedPtrField();
+
+  // Returns true when the logical state is kClear or kClearExposed.
+  bool IsClear() const {
+    auto state = GetLogicalState();
+    return state == LogicalState::kClear ||
+           state == LogicalState::kClearExposed;
+  }
+
+  // Get and Mutable trigger parsing.
+  //
+  // Both go through the generic accessors with a `ByTemplate<Element>`
+  // strategy and no ParseContext, then reinterpret the type-erased
+  // RepeatedPtrFieldBase as RepeatedPtrField<Element>.
+  template <typename Element>
+  const RepeatedPtrField<Element>& Get(const Element* default_instance,
+                                       Arena* arena) const {
+    return *reinterpret_cast<const RepeatedPtrField<Element>*>(
+        GetGeneric(ByTemplate<Element>(default_instance), arena, nullptr));
+  }
+
+  template <typename Element>
+  RepeatedPtrField<Element>* Mutable(const Element* default_instance,
+                                     Arena* arena) {
+    return reinterpret_cast<RepeatedPtrField<Element>*>(
+        MutableGeneric(ByTemplate<Element>(default_instance), arena, nullptr));
+  }
+
+  // Returns whether every element of the field is initialized.
+  //
+  // Cleared fields are trivially initialized. Fields that still carry
+  // unparsed bytes are reported initialized unless they were flagged
+  // "maybe uninitialized"; in that case, and for dirty fields, the elements
+  // are obtained via GetByPrototype() and checked one by one.
+  bool IsInitialized(const MessageLite* prototype, Arena* arena) const {
+    switch (GetLogicalState()) {
+      case LogicalState::kClear:
+      case LogicalState::kClearExposed: {
+        return true;
+      }
+      case LogicalState::kParseRequired:
+      case LogicalState::kNoParseRequired: {
+        // Returns true if "unparsed" is not verified to be (maybe)
+        // uninitialized. Otherwise, falls through to next cases to eagerly
+        // parse message and call IsInitialized().
+        if (!MaybeUninitialized()) return true;
+      }
+        ABSL_FALLTHROUGH_INTENDED;
+      case LogicalState::kDirty: {
+        const auto& value = *GetByPrototype(prototype, arena);
+        for (int i = 0; i < value.size(); ++i) {
+          if (!value.Get<GenericTypeHandler<MessageLite>>(i).IsInitialized())
+            return false;
+        }
+        return true;
+      }
+    }
+    // Every LogicalState enumerator returns above. Use the portable helper
+    // (as _InternalParse does) rather than the compiler-specific
+    // __builtin_unreachable(), which is not available on every toolchain.
+    // Required for certain compiler configurations.
+    internal::Unreachable();
+    return false;
+  }
+
+  // Dynamic versions of basic accessors.
+  // NOTE(review): definitions are not visible in this patch section;
+  // presumably the element type is resolved from `type` via `factory` --
+  // confirm against the implementation.
+  const RepeatedPtrFieldBase* GetDynamic(const Descriptor* type,
+                                         MessageFactory* factory,
+                                         Arena* arena) const;
+  RepeatedPtrFieldBase* MutableDynamic(const Descriptor* type,
+                                       MessageFactory* factory, Arena* arena);
+
+  // Basic accessors that use a default instance to create the message.
+  // `ctx` is optional and defaults to nullptr (no outer parse context).
+  const RepeatedPtrFieldBase* GetByPrototype(const MessageLite* prototype,
+                                             Arena* arena,
+                                             ParseContext* ctx = nullptr) const;
+  RepeatedPtrFieldBase* MutableByPrototype(const MessageLite* prototype,
+                                           Arena* arena,
+                                           ParseContext* ctx = nullptr);
+
+  // Clears the parsed container (if one exists) and moves the field to the
+  // cleared state.
+  void Clear();
+
+  // Updates state such that state set in other overwrites this.
+  //
+  // Internal Lazy state transitions are updated as such:
+  //
+  // src\dest | UNINIT | INIT | DIRTY | CLEAR | ERROR
+  // :------- | :----: | :---: | :---: | :-----------: | :---:
+  // UNINIT | DIRTY | DIRTY | DIRTY | UNINIT/DIRTY* | DIRTY
+  // INIT | DIRTY | DIRTY | DIRTY | UNINIT/DIRTY* | UNDEF
+  // DIRTY | DIRTY | DIRTY | DIRTY | UNINIT/DIRTY* | UNDEF
+  // CLEAR | UNINIT | INIT | DIRTY | CLEAR | UNDEF
+  // ERROR | DIRTY | DIRTY | DIRTY | DIRTY | DIRTY
+  // * Depends on if clear was initialized before.
+  // TODO: The state after ERROR should be DIRTY. Also need to make the
+  // change for LazyField.
+  void MergeFrom(const MessageLite* prototype,
+                 const LazyRepeatedPtrField& other, Arena* arena,
+                 Arena* other_arena);
+
+  // Swap() exchanges the two fields and then repairs ownership when the
+  // arenas differ (reallocating or handing heap objects to an arena).
+  // InternalSwap() only exchanges `raw_` and `unparsed_` and performs no
+  // arena fix-up.
+  static void Swap(LazyRepeatedPtrField* lhs, Arena* lhs_arena,
+                   LazyRepeatedPtrField* rhs, Arena* rhs_arena);
+  static void InternalSwap(LazyRepeatedPtrField* lhs,
+                           LazyRepeatedPtrField* rhs);
+
+  // NOTE(review): definition not visible here; presumably returns the parsed
+  // container without triggering a parse, or nullptr -- confirm.
+  const RepeatedPtrFieldBase* TryGetRepeated() const;
+
+  // Returns true when the lazy field has data that has not yet been parsed
+  // (i.e. parsing has been deferred). Once parsing has been attempted, this
+  // returns false. Note that the LazyField object may still contain
+  // the raw unparsed data with parsing errors.
+  bool HasUnparsed() const {
+    return GetLogicalState() == LogicalState::kParseRequired;
+  }
+
+  // Returns true if parsing has been attempted and it failed.
+  bool HasParsingError() const {
+    auto raw = raw_.load(std::memory_order_relaxed);
+    return raw.status() == RawState::kParseError;
+  }
+
+  // APIs that will be used by table-driven parsing.
+  //
+  // `TagType` is passed from table-driven parser. On fast path it's uint8 or
+  // uint16; on slow path it's uint32.
+  //
+  // On entry `ptr` points just past the first element's tag (at its encoded
+  // length). Returns the position after the last consumed element, or nullptr
+  // on failure.
+  template <typename TagType>
+  const char* _InternalParse(const MessageLite* prototype, Arena* arena,
+                             const char* ptr, ParseContext* ctx,
+                             TagType expected_tag) {
+    // If this message is eagerly-verified lazy, kEager mode likely suggests
+    // that previous verification has failed and we fall back to eager-parsing
+    // (either to initialize the message to match eager field or to fix false
+    // errors).
+    //
+    // Lazy parsing does not support aliasing and may result in data copying.
+    // It seems prudent to honor aliasing to avoid any observable gaps between
+    // lazy and eager parsing.
+    if (ctx->lazy_parse_mode() == ParseContext::kEager ||
+        ctx->AliasingEnabled()) {
+      auto* value = MutableByPrototype(prototype, arena, ctx);
+      ptr = ParseToRepeatedMessage(ptr, ctx, prototype, expected_tag, value);
+      return ptr;
+    }
+
+    switch (GetLogicalState()) {
+      case LogicalState::kParseRequired: {
+        return ParseToCord(ptr, ctx, prototype, arena, expected_tag);
+      }
+
+      case LogicalState::kClear: {
+        // Clear/Fresh have empty unparsed data; so this is the equivalent
+        // of setting it to the passed in bytes.
+        return ParseToCord(ptr, ctx, prototype, arena, expected_tag);
+      }
+
+      // Pointers exposed.
+      case LogicalState::kClearExposed:
+      case LogicalState::kNoParseRequired:
+      case LogicalState::kDirty: {
+        // A container has already been handed out, so parse straight into it;
+        // txn.mutable_value() also drops the now-stale unparsed bytes.
+        PerformTransition([&](ExclusiveTxn& txn) {
+          auto* value = txn.mutable_value();
+          ptr = ParseToRepeatedMessage(ptr, ctx, prototype, expected_tag,
+                                       value);
+          return RawState::kIsParsed;
+        });
+        return ptr;
+      }
+    }
+    // Required for certain compiler configurations.
+    internal::Unreachable();
+    return nullptr;
+  }
+
+  // Optionally verifies one element payload (`data`) that was just stored
+  // lazily. Returns `ptr` when the payload is accepted (or verification is
+  // not applicable) and nullptr when the element must be treated as a parse
+  // error. `expected_tag` is currently unused.
+  template <typename TagType>
+  const char* _InternalParseVerify(const MessageLite* prototype, Arena* arena,
+                                   const char* ptr, ParseContext* ctx,
+                                   TagType expected_tag,
+                                   absl::string_view data) {
+    ABSL_DCHECK(ptr != nullptr);
+    // Nothing to verify in pure lazy mode or without a verify function.
+    if (ctx->lazy_parse_mode() == ParseContext::kLazy ||
+        ctx->lazy_eager_verify_func() == nullptr) {
+      return ptr;
+    }
+    VerifyResult res = WireFormatVerifyView(data, ctx);
+    if (res.verified) {
+      if (res.missing_required_fields) {
+        // Unparsed data may be uninitialized and need to be parsed to be sure.
+        SetNeedsParseMaybeUninitialized();
+      }
+      return ptr;
+    }
+
+    // Try eager parsing on potentially malformed wire in case the eager parsing
+    // fixes the issue. For example, a negative int32 encoded as 5B varint can
+    // be parsed correctly.
+    //
+    // Should preserve the old parsing mode because we don't want to
+    // unnecessarily eager-parse other parts of message tree. This can be
+    // especially inefficient if the eager verification results in false
+    // positive errors.
+    ParseContext::LazyParseMode old =
+        ctx->set_lazy_parse_mode(ParseContext::kEager);
+    (void)GetByPrototype(prototype, arena, ctx);
+
+    // If eager parsing still fails, don't bother restoring the parse mode.
+    if (HasParsingError()) return nullptr;
+
+    // Unverified lazy fields may miss parsing errors on eager parsing. If it's
+    // certain, just mark error and return.
+    if (!ctx->treat_eager_parsing_errors_as_errors()) {
+      auto raw = raw_.load(std::memory_order_relaxed);
+      raw.set_status(RawState::kParseError);
+      raw_.store(raw, std::memory_order_relaxed);
+      ABSL_DCHECK(HasParsingError());
+      return nullptr;
+    }
+
+    // We need to transition to dirty to prefer eager serialization as the
+    // unparsed_ has non-canonical wire format.
+    (void)MutableByPrototype(prototype, arena);
+
+    (void)ctx->set_lazy_parse_mode(old);
+    return ptr;
+  }
+
+  // Eagerly parses consecutive elements into `value`. `ptr` must point at the
+  // encoded length of the first element (its tag already consumed). Parsing
+  // continues while the following tag equals `expected_tag`. Returns the
+  // position after the last parsed element (i.e. at the first non-matching
+  // tag, if any), or nullptr on error.
+  template <typename TagType>
+  static const char* ParseToRepeatedMessage(const char* ptr, ParseContext* ctx,
+                                            const MessageLite* prototype,
+                                            TagType expected_tag,
+                                            RepeatedPtrFieldBase* value) {
+    const char* ptr2 = ptr;
+    TagType next_tag;
+    do {
+      MessageLite* submsg = value->AddMessage(prototype);
+      // ptr2 points to the start of the element's encoded length.
+      ptr = ctx->ParseMessage(submsg, ptr2);
+      if (PROTOBUF_PREDICT_FALSE(ptr == nullptr)) return nullptr;
+      if (PROTOBUF_PREDICT_FALSE(!ctx->DataAvailable(ptr))) {
+        if (ctx->Done(&ptr)) {
+          break;
+        }
+      }
+      ptr2 = ReadTagInternal(ptr, &next_tag);
+      if (PROTOBUF_PREDICT_FALSE(ptr2 == nullptr)) return nullptr;
+    } while (next_tag == expected_tag);
+    return ptr;
+  }
+
+  // Lazily stores consecutive elements: each element's tag, length and
+  // payload bytes are appended verbatim to `unparsed_` (arena array while
+  // small and on an arena, Cord otherwise), and each non-empty payload is
+  // passed to _InternalParseVerify(). `ptr` must point at the encoded length
+  // of the first element. Returns the position after the last stored element,
+  // or nullptr on error.
+  template <typename TagType>
+  const char* ParseToCord(const char* ptr, ParseContext* ctx,
+                          const MessageLite* prototype, Arena* arena,
+                          TagType expected_tag) {
+    // ptr2 points to the start of the encoded length.
+    const char* ptr2 = ptr;
+    TagType next_tag;
+    // Move ptr back to the start of the tag.
+    size_t tag_size = TagSizeInternal(expected_tag);
+    ptr -= tag_size;
+    if (ctx->parent_missing_required_fields()) {
+      SetNeedsParseMaybeUninitialized();
+    } else {
+      SetNeedsParse();
+    }
+    do {
+      std::string tmp;
+      // Append the tag.
+      tmp.append(absl::string_view(ptr, ptr2 - ptr));
+      // Size of tag + length prefix; set by the callback below. Initialised
+      // so the DCHECK after the call never reads an indeterminate value.
+      size_t taglen_size = 0;
+      ptr = ctx->ParseLengthDelimitedInlined(
+          ptr2, [&tmp, &taglen_size, ctx, ptr2](const char* p) {
+            // At this moment length is read and p points to the start of
+            // the payload.
+            ABSL_DCHECK(p - ptr2 > 0 && p - ptr2 <= 5) << p - ptr2;
+            // Append the length.
+            tmp.append(absl::string_view(ptr2, p - ptr2));
+            taglen_size = tmp.size();
+            return ctx->AppendString(p, &tmp);
+          });
+      if (PROTOBUF_PREDICT_FALSE(ptr == nullptr)) return nullptr;
+      const auto tmp_size = tmp.size();
+      ABSL_DCHECK_GE(tmp_size, taglen_size);
+      if (unparsed_.IsCord()) {
+        unparsed_.AsCord().Append(tmp);
+      } else if (arena != nullptr &&
+                 unparsed_.Size() + tmp_size <= kMaxArraySize) {
+        if (unparsed_.IsEmpty()) {
+          unparsed_.InitAsArray(arena, 0);
+        }
+        unparsed_.AppendToArray(tmp);
+      } else {
+        unparsed_.UpgradeToCord(arena).Append(tmp);
+      }
+      // Only non-empty payloads are verified.
+      if (tmp_size > taglen_size) {
+        ptr = _InternalParseVerify(
+            prototype, arena, ptr, ctx, expected_tag,
+            absl::string_view(tmp.data() + taglen_size,
+                              tmp_size - taglen_size));
+        if (PROTOBUF_PREDICT_FALSE(ptr == nullptr)) return nullptr;
+      }
+      if (PROTOBUF_PREDICT_FALSE(!ctx->DataAvailable(ptr))) {
+        // `Done` advances the stream to the next buffer chunk.
+        if (ctx->Done(&ptr)) {
+          break;
+        }
+      }
+      // ptr points to the start of the next tag.
+      ptr2 = ReadTagInternal(ptr, &next_tag);
+      // ptr2 points to the start of the next element's encoded length.
+
+      // TODO: Try to remove the following condition for 8 and 16 bits
+      // TagType.
+      if (PROTOBUF_PREDICT_FALSE(ptr2 == nullptr)) return nullptr;
+    } while (next_tag == expected_tag);
+    if (unparsed_.IsArray()) {
+      unparsed_.ZeroOutTailingBytes();
+    }
+    return ptr;
+  }
+
+  uint8_t* InternalWrite(const MessageLite* prototype, int32_t number,
+                         uint8_t* target,
+                         io::EpsCopyOutputStream* stream) const;
+
+  // ByteSize of the repeated ptr field (including the varints of tags and
+  // lengths).
+  size_t ByteSizeLong(size_t tag_size) const;
+  size_t SpaceUsedExcludingSelfLong() const;
+
+  // LogicalState combines the `raw_` and `unparsed_` fields to produce the
+  // current state.
+ // + // This separation allows more easily adding fine-grained states w/o touching + // std::atomics; most state transitions are in a write context and do not + // require subtle atomicity. + // TODO: Deduplicate with LazyField. + enum class LogicalState { + // The serialized data is available and unparsed. + // (kParseRequired, !unparsed.empty(), message = undefined). + kParseRequired, + // The message has been parsed from the serialized data. + // (kIsParsed, !unparsed.empty(), message != nullptr). + kNoParseRequired, + // The field is clear (freshly constructed or cleared): + // - (kCleared, unparsed.empty(), message = nullptr) + kClear, + // The field is clear but previously exposed a pointer. + // - (kCleared, unparsed.empty(), message = !nullptr) + kClearExposed, + // A write operation was done after a parse. + // (kIsParsed, unparsed.empty(), message != nullptr) + kDirty, + }; + LogicalState GetLogicalState() const { + auto raw = raw_.load(std::memory_order_acquire); + switch (raw.status()) { + case RawState::kParseError: + ABSL_DCHECK_NE(raw.value(), nullptr); + return LogicalState::kDirty; + case RawState::kCleared: + ABSL_DCHECK(unparsed_.IsEmpty()); + ABSL_DCHECK(raw.value() == nullptr || raw.value()->empty()) + << (raw.value() == nullptr + ? "nullptr" + : absl::StrCat("non-empty:", raw.value()->size())); + return raw.value() == nullptr ? LogicalState::kClear + : LogicalState::kClearExposed; + case RawState::kNeedsParse: + case RawState::kNeedsParseMaybeUninitialized: + // There is no SetEncoded, so unparsed_ is always from _InternalParse, + // which can't be empty. + ABSL_DCHECK(!unparsed_.IsEmpty()); + ABSL_DCHECK(raw.value() == nullptr || raw.value()->empty()); + return LogicalState::kParseRequired; + default: + ABSL_DCHECK(raw.status() == RawState::kIsParsed || + raw.status() == RawState::kIsParsedMaybeUninitialized); + ABSL_DCHECK(raw.value() != nullptr); + // Only other Initialized state was kParseError which is handled above. 
+ if (unparsed_.IsEmpty()) { + return LogicalState::kDirty; + } + // Non-null message, unparsed exists. + return LogicalState::kNoParseRequired; + } + } + + private: + // Values that can be kept in `MessageState`'s status bits. + // TODO: Deduplicate with LazyField. + enum class RawState { + // `unparsed_` is empty. + // `message_` is either nullptr or an empty container. + kCleared, + + // `unparsed_` contains the canonical field data. + // `message_` points to the result of parsing that data. + // + // NOTE: serializing `message_` may produce different bytes than + // `unparsed_`, so care must be taken around issues of canonical or + // deterministic serialization. Generally, `unparsed_` should be preferred + // if it is not empty, as that is lower overhead. + kIsParsed, + + // IsParsed and may be uninitialized. See + // kNeedsParseMaybeUninitialized for details. + kIsParsedMaybeUninitialized, + + // TODO: add kIsParsedIgnoreUnparsed and + // kIsParsedIgnoreUnparsedMaybeUninitialized. + + // `message_` points to the result of parsing that data, but there was an + // error when parsing. Partially parsed `message_` is considered canonical + // to match eager fields. + kParseError, + + // `unparsed_` contains the field data. + // `message_` is either nullptr or an empty container. + kNeedsParse, + + // kNeedsParse and may be uninitialized. + // + // MaybeUninitialized is flagged in the verification and recorded to trigger + // eager parsing on IsInitialized() to be certain. + // + // Note that unverified data is assumed to be initialized (to support legacy + // cases) and treated as if it's verified to be initialized. Therefore, we + // need "MaybeUninitialized" rather than "Initialized". 
+ kNeedsParseMaybeUninitialized, + + kMaxState = kNeedsParseMaybeUninitialized + }; + + class MessageState { + public: + constexpr explicit MessageState(RawState state) : raw_(ToUint32(state)) {} + MessageState(const RepeatedPtrFieldBase* message, RawState state) + : raw_(reinterpret_cast(message) | ToUint32(state)) { + ABSL_DCHECK_EQ(reinterpret_cast(message) & ToUint32(state), + 0u); + } + + const RepeatedPtrFieldBase* value() const { + return reinterpret_cast(raw_ & ~0b111); + } + + RepeatedPtrFieldBase* mutable_value() const { + return reinterpret_cast(raw_ & ~0b111); + } + + RawState status() const { return ToRawState(raw_ & 0b111); } + + void set_status(RawState status) { + raw_ &= ~0b111; + raw_ |= ToUint32(status); + } + + void set_value(const RepeatedPtrFieldBase* message) { + raw_ &= 0b111; + raw_ |= reinterpret_cast(message); + } + + static inline constexpr uint32_t ToUint32(RawState status) { + return static_cast(status); + } + static inline RawState ToRawState(uint32_t status) { + ABSL_DCHECK_LE(status, ToUint32(RawState::kMaxState)); + return static_cast(status); + } + + bool NeedsParse() const { + // kNeedsParse and kNeedsParseMaybeUninitialized must be 0 and 1 to make + // NeedsParse() check cheap. + static_assert( + RawState::kNeedsParseMaybeUninitialized == RawState::kMaxState, ""); + static_assert(ToUint32(RawState::kNeedsParseMaybeUninitialized) == + ToUint32(RawState::kNeedsParse) + 1, + ""); + return status() >= RawState::kNeedsParse; + } + + private: + uintptr_t raw_; + }; + + // TODO: Deduplicate. + template + class ByTemplate { + public: + // Only `Get()` needs access to the default element, but we don't want to + // force instantiation of `MessageType::default_instance()` because it + // doesn't exist in all configurations. 
+ explicit ByTemplate() : ByTemplate(nullptr) {} + explicit ByTemplate(const MessageType* default_instance) + : default_instance_(default_instance) {} + + MessageLite* New(Arena* arena) const { + return reinterpret_cast( + Arena::DefaultConstruct(arena)); + } + + const MessageLite& Default() const { + ABSL_DCHECK(default_instance_ != nullptr); + return *reinterpret_cast(default_instance_); + } + + private: + const MessageType* default_instance_; + }; + + // Copy constructor on arena. + LazyRepeatedPtrField(Arena* arena, const LazyRepeatedPtrField& rhs, + Arena* rhs_arena); + + // Serialization methods. Note that WriteToCord may override/clear the + // given cord. + template + bool MergeFrom(const MessageLite* prototype, const Input& data, Arena* arena); + + private: + template + MessageState SharedInit(Strategy strategy, Arena* arena, + ParseContext* ctx) const { + auto old_raw = raw_.load(std::memory_order_acquire); + if (!old_raw.NeedsParse()) return old_raw; + MessageState new_raw = + // Transfer MaybeUninitialized state after a state transition. + DoParse(nullptr, strategy.Default(), arena, ctx, + old_raw.status() == RawState::kNeedsParseMaybeUninitialized); + if (raw_.compare_exchange_strong(old_raw, new_raw, + std::memory_order_release, + std::memory_order_acquire)) { + // We won the race. Dispose of the old message (if there was one). + if (arena == nullptr) { + delete reinterpret_cast*>( + old_raw.value()); + } + return new_raw; + } else { + // We lost the race, but someone else will have installed the new + // value. Dispose of the our attempt at installing. 
+ if (arena == nullptr) { + delete reinterpret_cast*>( + new_raw.value()); + } + ABSL_DCHECK(!old_raw.NeedsParse()); + return old_raw; + } + } + + template + MessageState ExclusiveInitWithoutStore(Strategy strategy, Arena* arena, + ParseContext* ctx) { + auto old_raw = raw_.load(std::memory_order_relaxed); + if (!old_raw.NeedsParse() && old_raw.value() != nullptr) return old_raw; + if (old_raw.NeedsParse()) { + // Mutable messages need not transfer MaybeUninitialized. + return DoParse(old_raw.mutable_value(), strategy.Default(), arena, ctx, + false); + } + ABSL_DCHECK(old_raw.value() == nullptr); + return MessageState(Arena::Create(arena), + RawState::kIsParsed); + } + + template + const RepeatedPtrFieldBase* GetGeneric(Strategy strategy, Arena* arena, + ParseContext* ctx) const { + const auto* value = SharedInit(strategy, arena, ctx).value(); + if (value == nullptr) { + return reinterpret_cast(DefaultRawPtr()); + } + return value; + } + + template + RepeatedPtrFieldBase* MutableGeneric(Strategy strategy, Arena* arena, + ParseContext* ctx) { + auto raw = ExclusiveInitWithoutStore(strategy, arena, ctx); + unparsed_.Clear(); + ABSL_DCHECK(raw.value() != nullptr); + raw.set_status(RawState::kIsParsed); + raw_.store(raw, std::memory_order_relaxed); + return raw.mutable_value(); + } + + void SetNeedsParse() { + auto raw = raw_.load(std::memory_order_relaxed); + raw.set_status(RawState::kNeedsParse); + raw_.store(raw, std::memory_order_relaxed); + } + + void SetNeedsParseMaybeUninitialized() { + auto raw = raw_.load(std::memory_order_relaxed); + ABSL_DCHECK(raw.status() == RawState::kNeedsParse || + raw.status() == RawState::kNeedsParseMaybeUninitialized); + raw.set_status(RawState::kNeedsParseMaybeUninitialized); + raw_.store(raw, std::memory_order_relaxed); + } + + void SetParseNotRequiredMaybeUninitialized() { + auto raw = raw_.load(std::memory_order_relaxed); + ABSL_DCHECK(raw.status() == RawState::kIsParsed || + raw.status() == 
RawState::kIsParsedMaybeUninitialized); + raw.set_status(RawState::kIsParsedMaybeUninitialized); + raw_.store(raw, std::memory_order_relaxed); + } + + bool MaybeUninitialized() const { + auto raw = raw_.load(std::memory_order_relaxed); + if (raw.status() == RawState::kNeedsParseMaybeUninitialized) return true; + + // Make sure the logical state matches as well. + return raw.status() == RawState::kIsParsedMaybeUninitialized && + GetLogicalState() == LogicalState::kNoParseRequired; + } + + // Adds MaybeUninitialized state if "other" may be uninitialized. + void MergeMaybeUninitializedState(const LazyRepeatedPtrField& other); + + bool IsEagerSerializeSafe(const MessageLite* prototype, int32_t number, + Arena* arena) const; + + static void swap_atomics(std::atomic& lhs, + std::atomic& rhs); + + // Helper to enforce invariants when exclusive R/M/W access is required. + class ExclusiveTxn { + public: + explicit ExclusiveTxn(LazyRepeatedPtrField& lazy) + : lazy_(lazy), state_(lazy_.raw_.load(std::memory_order_relaxed)) {} + + RepeatedPtrFieldBase* mutable_value() { + // Any write to the message at this point should nuke unparsed_. + lazy_.unparsed_.Clear(); + return state_.mutable_value(); + } + + void Commit(RawState new_status) { + if (state_.status() != new_status) { + state_.set_status(new_status); + lazy_.raw_.store(state_, std::memory_order_relaxed); + } + } + + private: + LazyRepeatedPtrField& lazy_; + MessageState state_; + }; + + template + RawState PerformTransition(Transition fn) { + ExclusiveTxn txn(*this); + RawState new_state = fn(txn); + txn.Commit(new_state); + return new_state; + } + + public: + // Payload abstraction that can hold a raw char array or a Cord depending on + // how much data it needs to hold. + // The caller is responsible for managing the lifetime of the payload. + // TODO: Deduplicate with the LazyField::UnparsedPayload. 
+ class UnparsedPayload { + enum Tag : uintptr_t { + kTagEmpty = 0, + kTagArray = 1, + kTagCord = 2, + + kTagBits = 3, + kRemoveMask = ~kTagBits, + }; + + public: + using ArraySizeType = uint16_t; + + // Visit the payload and calls the respective callback. The signatures are: + // - () for kUnset + // - (Cord&) for kCord + // - (absl::string_view) for kArray + // Returns the value returned by the callback. + template + auto Visit(UnsetF unset_f, CordF cord_f, ViewF view_f) const { + Tag t = tag(); + // Using ternary to allow for common-type implicit conversions. + return t == kTagEmpty ? unset_f() + : t == kTagArray ? view_f(AsStringView()) + : cord_f(AsCord()); + } + + Tag tag() const { return static_cast(value_ & kTagBits); } + + bool IsCord() const { + ABSL_DCHECK_EQ(static_cast(value_ & kTagCord), + static_cast(tag() == kTagCord)); + return (value_ & kTagCord) != 0u; + } + + bool IsArray() const { + ABSL_DCHECK_EQ(static_cast(value_ & kTagArray), + static_cast(tag() == kTagArray)); + return (value_ & kTagArray) != 0u; + } + + // Requires: IsCord() + absl::Cord& AsCord() const { + ABSL_DCHECK(IsCord()); + return *reinterpret_cast(value_ & kRemoveMask); + } + + // Return the payload as Cord regardless of the existing storage. + absl::Cord ForceAsCord() const { + return Visit([] { return absl::Cord(); }, // + [](const auto& c) { return c; }, + [](auto view) { return absl::Cord(view); }); + } + + // Similar to AsCord(), but if the payload is not already a Cord it will + // convert it first, maintaining existing bytes. + absl::Cord& UpgradeToCord(Arena* arena) { + if (IsCord()) return AsCord(); + absl::Cord new_cord(AsStringView()); + return InitAsCord(arena, std::move(new_cord)); + } + + // Requires: input array is the untagged value. 
+ ArraySizeType GetArraySize(const char* array) const { + ABSL_DCHECK_EQ(array, reinterpret_cast(value_ - kTagArray)); + ArraySizeType size; + memcpy(&size, array, sizeof(size)); + return size; + } + + void SetArraySize(void* array, ArraySizeType size) const { + ABSL_DCHECK_EQ(array, reinterpret_cast(value_ - kTagArray)); + memcpy(array, &size, sizeof(ArraySizeType)); + } + + void SetArraySize(ArraySizeType size) const { + void* array = reinterpret_cast(value_ - kTagArray); + memcpy(array, &size, sizeof(ArraySizeType)); + } + + // Requires: !IsCord() + absl::string_view AsStringView() const { + switch (tag()) { + case kTagEmpty: + return {}; + + case kTagArray: { + const char* array = reinterpret_cast(value_ - kTagArray); + auto size = GetArraySize(array); + return absl::string_view(array + sizeof(size), size); + } + + default: + Unreachable(); + } + } + + // Clear the payload. After this call `Size()==0` and `IsEmpty()==true`, but + // it is not necessarily true that `tag()==kTagEmpty`. + // In particular, it keeps the Cord around in case it needs to be reused. + void Clear() { + switch (tag()) { + case kTagEmpty: + case kTagArray: + value_ = 0; + break; + default: + AsCord().Clear(); + break; + } + } + + // Destroys allocated memory if necessary. Does not reset the object. + void Destroy() { + if (IsCord()) delete &AsCord(); + } + + bool IsEmpty() const { + return Visit([] { return true; }, + [](const auto& cord) { return cord.empty(); }, + [](auto view) { + ABSL_DCHECK(!view.empty()); + return false; + }); + } + + size_t Size() const { + return Visit([] { return 0; }, + [](const auto& cord) { return cord.size(); }, + [](auto view) { return view.size(); }); + } + + // Sets the current value as a Cord constructed from `arg`. + // It will clean up the existing value if necessary. + template + void SetCord(Arena* arena, Arg&& arg) { + if (IsCord()) { + // Reuse the existing cord.
+ AsCord() = std::forward(arg); + } else { + absl::Cord* cord = + Arena::Create(arena, std::forward(arg)); + value_ = reinterpret_cast(cord) | kTagCord; + } + } + + // Initialize the value as a Cord constructed from `args...` + // Ignores existing value. + template + absl::Cord& InitAsCord(Arena* arena, Args&&... args) { + auto* cord = + Arena::Create(arena, std::forward(args)...); + value_ = reinterpret_cast(cord) | kTagCord; + return *cord; + } + + // Initialize the value as an array copied from `view`. The trailing bytes + // are set to 0 to avoid UB. + // Ignores existing value. + void InitAndSetArray(Arena* arena, absl::string_view view) { + char* array = InitAsArray(arena, view.size()); + memcpy(array, view.data(), view.size()); + if (view.size() < kMaxArraySize) { + // Memset uninit data to avoid UB later. + memset(array + view.size(), '\0', kMaxArraySize - view.size()); + } + ABSL_DCHECK_EQ(view, AsStringView()); + } + + // Initialize the value as an array copied from `cord`. The trailing bytes + // are set to 0 to avoid UB. + // Ignores existing value. + void InitAndSetArray(Arena* arena, const absl::Cord& cord) { + auto size = cord.size(); + char* array = InitAsArray(arena, size); + cord.CopyToArray(array); + if (size < kMaxArraySize) { + // Memset uninit data to avoid UB later. + memset(array + size, '\0', kMaxArraySize - size); + } + } + + // Initialize the value as an array of size `size`. The payload bytes are + // uninitialized. + // Ignores existing value. + char* InitAsArray(Arena* arena, ArraySizeType size) { + ABSL_DCHECK(arena != nullptr); + // Allocate max allowed capacity. + // TODO: improve this to reduce waste when the size is small.
+ void* c = arena->AllocateAligned(kMaxArraySize + sizeof(ArraySizeType)); + ABSL_DCHECK_EQ(reinterpret_cast(c) & kTagBits, uintptr_t{0}); + value_ = reinterpret_cast(c) | kTagArray; + SetArraySize(c, size); + return static_cast(c) + sizeof(ArraySizeType); + } + + void AppendToArray(absl::string_view view) { + char* array = reinterpret_cast(value_ - kTagArray); + ArraySizeType size = GetArraySize(array); + char* c = array + sizeof(size) + size; + size += view.size(); + SetArraySize(array, size); + memcpy(c, view.data(), view.size()); + } + + void ZeroOutTailingBytes() { + char* array = reinterpret_cast(value_ - kTagArray); + auto size = GetArraySize(array); + if (size < kMaxArraySize) { + memset(array + sizeof(ArraySizeType) + size, '\0', + kMaxArraySize - size); + } + } + + size_t SpaceUsedExcludingSelf() const { + return Visit( + [] { return 0; }, + [](const auto& cord) { return cord.EstimatedMemoryUsage(); }, + [](auto view) { return kMaxArraySize + sizeof(ArraySizeType); }); + } + + void TransferHeapOwnershipToArena(Arena* arena) { + ABSL_DCHECK(tag() == kTagCord || tag() == kTagEmpty); + if (IsCord()) arena->Own(&AsCord()); + } + + private: + uintptr_t value_ = 0; + }; + + public: + static bool ParseWithOuterContext(RepeatedPtrFieldBase* value, + const absl::Cord& input, ParseContext* ctx, + const MessageLite* prototype, + bool set_missing_required); + static bool ParseWithOuterContext(RepeatedPtrFieldBase* value, + absl::string_view input, ParseContext* ctx, + const MessageLite* prototype, + bool set_missing_required); + + private: + // This method has to be below the definition of class UnparsedPayload due to + // the call to `unparsed_.Visit`. + // TODO: Deduplicate with LazyField. + MessageState DoParse(RepeatedPtrFieldBase* old, const MessageLite& prototype, + Arena* arena, ParseContext* ctx, + bool maybe_uninitialized) const { + auto* value = + (old == nullptr) ? 
Arena::Create(arena) : old; + if (!unparsed_.Visit( + [] { return true; }, + [&](const auto& cord) { + return ParseWithOuterContext(value, cord, ctx, &prototype, + maybe_uninitialized); + }, + [&](auto view) { + return ParseWithOuterContext(value, view, ctx, &prototype, + maybe_uninitialized); + })) { + // If this is called by eager verification, ctx != nullptr and logging a + // parsing error in that case is likely redundant because the parsing will + // fail anyway. Users who care about parsing errors would have already + // checked the return value and others may find the error log unexpected. + // + // `ctx == nullptr` means it's not eagerly verified (e.g. unverified lazy) + // and logging in that case makes sense. + if (ctx == nullptr) { + LogParseError(value); + } + return MessageState(value, RawState::kParseError); + } + return MessageState(value, maybe_uninitialized + ? RawState::kIsParsedMaybeUninitialized + : RawState::kIsParsed); + } + + // Mutable because it is initialized lazily. + // A MessageState is a tagged RepeatedPtrFieldBase* + mutable std::atomic raw_; + + // NOT mutable because we keep the payload around until the message changes in + // some way. + UnparsedPayload unparsed_; + // absl::Cord will make copies on anything under this limit, so we might as + // well do the copies into our own buffer instead. + static constexpr size_t kMaxArraySize = 512; + static_assert(kMaxArraySize <= + std::numeric_limits::max()); + friend class ::google::protobuf::Arena; + friend class ::google::protobuf::Reflection; + friend class ExtensionSet; + typedef void InternalArenaConstructable_; + typedef void DestructorSkippable_; + + // Logs a parsing error. + static void LogParseError(const RepeatedPtrFieldBase* value); + + bool IsAllocated() const { + return raw_.load(std::memory_order_acquire).value() != nullptr; + } + + // For testing purposes.
+ friend class LazyRepeatedPtrFieldTest; + friend class LazyRepeatedInMessageTest; + template + void OverwriteForTest(RawState status, const absl::Cord& unparsed, + RepeatedPtrField* value, Arena* arena); +}; + +inline LazyRepeatedPtrField::~LazyRepeatedPtrField() { + const auto* value = raw_.load(std::memory_order_relaxed).value(); + delete reinterpret_cast*>(value); + unparsed_.Destroy(); +} + +// TODO: Deduplicate with LazyField. +inline const RepeatedPtrFieldBase* LazyRepeatedPtrField::TryGetRepeated() + const { + switch (GetLogicalState()) { + case LogicalState::kDirty: + case LogicalState::kNoParseRequired: + case LogicalState::kParseRequired: + return raw_.load(std::memory_order_relaxed).value(); + case LogicalState::kClear: + case LogicalState::kClearExposed: + return nullptr; + } + internal::Unreachable(); + return nullptr; +} + +// ------------------------------------------------------------------- +// Testing stuff. + +// It's in the header due to the template. +// TODO: Deduplicate with LazyField. 
+template +void LazyRepeatedPtrField::OverwriteForTest(RawState status, + const absl::Cord& unparsed, + RepeatedPtrField* value, + Arena* arena) { + auto raw = raw_.load(std::memory_order_relaxed); + if (arena == nullptr) { + delete reinterpret_cast*>(raw.value()); + } + raw.set_value(reinterpret_cast(value)); + raw.set_status(status); + if (!unparsed.empty()) { + if (arena != nullptr && unparsed.size() <= kMaxArraySize) { + unparsed_.InitAndSetArray(arena, unparsed); + } else { + unparsed_.SetCord(arena, unparsed); + } + } + raw_.store(raw, std::memory_order_relaxed); +} + +} // namespace internal +} // namespace protobuf +} // namespace google + +#include "google/protobuf/port_undef.inc" + +#endif // GOOGLE_PROTOBUF_LAZY_REPEATED_FIELD_H__ diff --git a/src/google/protobuf/lazy_repeated_field_heavy.cc b/src/google/protobuf/lazy_repeated_field_heavy.cc new file mode 100644 index 0000000000..1c4e1bf686 --- /dev/null +++ b/src/google/protobuf/lazy_repeated_field_heavy.cc @@ -0,0 +1,401 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2023 Google Inc. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file or at +// https://developers.google.com/open-source/licenses/bsd + +#include +#include +#include +#include +#include + +#include "absl/log/absl_check.h" +#include "absl/log/absl_log.h" +#include "absl/strings/cord.h" +#include "absl/strings/escaping.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_replace.h" +#include "absl/strings/string_view.h" +#include "google/protobuf/arena.h" +#include "google/protobuf/io/zero_copy_stream_impl_lite.h" +#include "google/protobuf/lazy_repeated_field.h" +#include "google/protobuf/message.h" +#include "google/protobuf/message_lite.h" +#include "google/protobuf/repeated_ptr_field.h" +#include "google/protobuf/wire_format_lite.h" + +namespace google { +namespace protobuf { +namespace internal { +namespace { + +class ByFactory { + public: + explicit ByFactory(const Descriptor* type, MessageFactory* factory) + : type_(type), factory_(factory) {} + + Message* New(Arena* arena) const { + return factory_->GetPrototype(type_)->New(arena); + } + + const Message& Default() const { return *factory_->GetPrototype(type_); } + + private: + const Descriptor* type_; + MessageFactory* factory_; +}; + +// Escape C++ trigraphs by escaping question marks to \? +std::string EscapeTrigraphs(absl::string_view to_escape) { + return absl::StrReplaceAll(to_escape, {{"?", "\\?"}}); +} + +std::string EscapeEncoded(absl::string_view encoded) { + std::string out; + out.reserve(encoded.size() * 2); + constexpr size_t kBytesPerLine = 25; + for (size_t i = 0; i < encoded.size(); i += kBytesPerLine) { + absl::StrAppend( + &out, "\"", + EscapeTrigraphs(absl::CEscape(encoded.substr(i, kBytesPerLine))), + "\"\n"); + } + return out; +} + +// Deterministic serialization is required to minimize false positives: e.g. +// ordering, redundant wire format data, etc. Such discrepancies are +// expected and tolerated. 
To prevent this serialization starts yet another +// consistency check, we should skip consistency-check. +std::string DeterministicSerialization(const google::protobuf::MessageLite& m) { + std::string result; + { + google::protobuf::io::StringOutputStream sink(&result); + google::protobuf::io::CodedOutputStream out(&sink); + out.SetSerializationDeterministic(true); + out.SkipCheckConsistency(); + m.SerializePartialToCodedStream(&out); + } + return result; +} + +// If LazyField is initialized, unparsed and message should be consistent. If +// a LazyField is mutated via const_cast, that may break. We should rather fail +// than silently propagate such discrepancy. Note that this aims to detect +// missing/added data. +void VerifyConsistency(LazyRepeatedPtrField::LogicalState state, + const RepeatedPtrFieldBase* value, + const MessageLite* prototype, const absl::Cord& unparsed, + io::EpsCopyOutputStream* stream) { +#ifndef NDEBUG + if (stream != nullptr && !stream->ShouldCheckConsistency()) return; + if (state != LazyRepeatedPtrField::LogicalState::kNoParseRequired) return; + + RepeatedPtrField unparsed_msg; + if (!LazyRepeatedPtrField::ParseWithOuterContext( + reinterpret_cast(&unparsed_msg), unparsed, + nullptr, prototype, /*set_missing_required=*/false)) { + // Bail out on parse failure as it can result in false positive + // inconsistency and ABSL_CHECK failure. Warn instead. + ABSL_LOG(WARNING) + << "Verify skipped due to parse falure: RepeatedPtrField of " + << prototype->GetTypeName(); + return; + } + + const auto* msgs = reinterpret_cast*>(value); + // Eagerly parse all lazy fields to eliminate non-canonical wireformat data. + for (int i = 0; i < msgs->size(); i++) { + // Clone a new one from the original to eagerly parse all lazy + // fields. 
+ const auto& msg = msgs->Get(i); + std::unique_ptr clone(msg.New()); + clone->CopyFrom(msg); + EagerParseLazyFieldIgnoreUnparsed(*clone); + EagerParseLazyFieldIgnoreUnparsed(*unparsed_msg.Mutable(i)); + ABSL_DCHECK_EQ(DeterministicSerialization(*clone), + DeterministicSerialization(unparsed_msg.Get(i))) + << "RepeatedPtrField<" << msg.GetTypeName() << ">(" << i << ")" + << ": likely mutated via getters + const_cast\n" + << "unparsed:\n" + << EscapeEncoded(DeterministicSerialization(unparsed_msg.Get(i))) + << "\nmessage:\n" + << EscapeEncoded(DeterministicSerialization(*clone)); + } +#endif // !NDEBUG +} + +} // namespace + +LazyRepeatedPtrField::LazyRepeatedPtrField(Arena* arena, + const LazyRepeatedPtrField& rhs, + Arena* rhs_arena) + : raw_(MessageState(RawState::kCleared)) { + switch (rhs.GetLogicalState()) { + case LogicalState::kClear: + case LogicalState::kClearExposed: + return; // Leave uninitialized / empty + case LogicalState::kNoParseRequired: + case LogicalState::kParseRequired: { + rhs.unparsed_.Visit( + [] {}, // + [&](const auto& cord) { unparsed_.InitAsCord(arena, cord); }, + [&](auto view) { + if (arena == nullptr) { + unparsed_.InitAsCord(nullptr, view); + } else { + unparsed_.InitAndSetArray(arena, view); + } + }); + raw_.store( + MessageState(nullptr, rhs.MaybeUninitialized() + ? RawState::kNeedsParseMaybeUninitialized + : RawState::kNeedsParse), + std::memory_order_relaxed); + return; + } + case LogicalState::kDirty: { + MessageState state = rhs.raw_.load(std::memory_order_relaxed); + const auto* src = state.value(); + if (src->empty()) { + return; // Leave uninitialized / empty + } + // Retain the existing IsParsed or IsParsedMaybeUninitialized status. + // TODO: use copy construction. + auto new_state = state.status(); + auto* value = Arena::Create(arena); + // MergeFrom calls reserve. 
+ value->MergeFrom(*src); + raw_.store(MessageState(value, new_state), std::memory_order_relaxed); + return; + } + } +} + +const RepeatedPtrFieldBase* LazyRepeatedPtrField::GetDynamic( + const Descriptor* type, MessageFactory* factory, Arena* arena) const { + return GetGeneric(ByFactory(type, factory), arena, nullptr); +} + +RepeatedPtrFieldBase* LazyRepeatedPtrField::MutableDynamic( + const Descriptor* type, MessageFactory* factory, Arena* arena) { + return MutableGeneric(ByFactory(type, factory), arena, nullptr); +} + +size_t LazyRepeatedPtrField::SpaceUsedExcludingSelfLong() const { + // absl::Cord::EstimatedMemoryUsage counts itself that should be excluded + // because sizeof(Cord) is already counted in self. + size_t total_size = unparsed_.SpaceUsedExcludingSelf(); + switch (GetLogicalState()) { + case LogicalState::kClearExposed: + case LogicalState::kNoParseRequired: + case LogicalState::kDirty: { + const auto* value = raw_.load(std::memory_order_relaxed).value(); + total_size += + value->SpaceUsedExcludingSelfLong>(); + } break; + case LogicalState::kClear: + case LogicalState::kParseRequired: + // We may have a `Message*` here, but we cannot safely access it + // because, a racing SharedInit could delete it out from under us. + // Other states in this structure are already passed kSharedInit and are + // thus safe. + break; // Nothing to add. 
+ } + return total_size; +} + +template +bool LazyRepeatedPtrField::MergeFrom(const MessageLite* prototype, + const Input& data, Arena* arena) { + switch (GetLogicalState()) { + case LogicalState::kParseRequired: { + unparsed_.UpgradeToCord(arena).Append(data); + break; + } + case LogicalState::kClear: { + size_t num_bytes = data.size(); + ABSL_DCHECK(num_bytes > 0); + if (arena == nullptr || num_bytes > kMaxArraySize || unparsed_.IsCord()) { + unparsed_.SetCord(arena, data); + } else { + unparsed_.InitAndSetArray(arena, data); + } + SetNeedsParse(); + break; + } + + // Pointer was previously exposed merge into that object. + case LogicalState::kClearExposed: + case LogicalState::kNoParseRequired: + case LogicalState::kDirty: { + auto new_state = PerformTransition([&](ExclusiveTxn& txn) { + auto* value = txn.mutable_value(); + bool res = + ParseWithOuterContext(value, data, /*ctx=*/nullptr, prototype, + /*set_missing_required=*/false); + if (!res) { + LogParseError(value); + return RawState::kParseError; + } else { + return RawState::kIsParsed; + } + }); + return new_state == RawState::kIsParsed; + } + } + return true; +} + +void LazyRepeatedPtrField::MergeMaybeUninitializedState( + const LazyRepeatedPtrField& other) { + if (MaybeUninitialized() || !other.MaybeUninitialized()) return; + + switch (GetLogicalState()) { + case LogicalState::kParseRequired: + SetNeedsParseMaybeUninitialized(); + break; + case LogicalState::kNoParseRequired: + SetParseNotRequiredMaybeUninitialized(); + break; + default: + break; + } +} + +void LazyRepeatedPtrField::MergeFrom(const MessageLite* prototype, + const LazyRepeatedPtrField& other, + Arena* arena, Arena* other_arena) { +#ifndef NDEBUG + VerifyConsistency(other.GetLogicalState(), + other.raw_.load(std::memory_order_relaxed).value(), + prototype, other.unparsed_.ForceAsCord(), nullptr); +#endif // !NDEBUG + switch (other.GetLogicalState()) { + case LogicalState::kClear: + case LogicalState::kClearExposed: + return; // Nothing to 
do. + + case LogicalState::kParseRequired: + case LogicalState::kNoParseRequired: + if (other.unparsed_.IsCord()) { + MergeFrom(prototype, other.unparsed_.AsCord(), arena); + } else { + MergeFrom(prototype, other.unparsed_.AsStringView(), arena); + } + MergeMaybeUninitializedState(other); + return; + + case LogicalState::kDirty: { + const auto* other_value = + other.raw_.load(std::memory_order_relaxed).value(); + if (other_value->empty()) { + return; + } + auto* value = MutableByPrototype(prototype, arena); + value->MergeFrom(*other_value); + // No need to merge uninitialized state. + ABSL_DCHECK(GetLogicalState() == LogicalState::kDirty); + return; + } + } +} + +uint8_t* LazyRepeatedPtrField::InternalWrite( + const MessageLite* prototype, int32_t number, uint8_t* target, + io::EpsCopyOutputStream* stream) const { +#ifndef NDEBUG + VerifyConsistency(GetLogicalState(), + raw_.load(std::memory_order_relaxed).value(), prototype, + unparsed_.ForceAsCord(), stream); +#endif // !NDEBUG + switch (GetLogicalState()) { + case LogicalState::kClear: + case LogicalState::kClearExposed: + case LogicalState::kNoParseRequired: + case LogicalState::kParseRequired: + // If deterministic serialization is enabled then attempt to parse to a + // message which can then be serialized deterministically. (The serialized + // bytes may have been created nondeterministically). + if (stream->IsSerializationDeterministic() && prototype != nullptr) { + RepeatedPtrField value; + // TODO: Test this path. + bool success = unparsed_.Visit( + [] { return true; }, + [&](const auto& cord) { + // `set_missing_required = false` to avoid checking required fields + // (similar to Message::ParsePartial*).
+ return ParseWithOuterContext( + reinterpret_cast(&value), cord, + /*ctx=*/nullptr, prototype, /*set_missing_required=*/false); + }, + [&](auto view) { + return ParseWithOuterContext( + reinterpret_cast(&value), view, + /*ctx=*/nullptr, prototype, false); + }); + if (success) { + size_t tag_size = WireFormatLite::TagSize( + number, WireFormatLite::FieldType::TYPE_MESSAGE); + auto count = tag_size * value.size(); + for (int i = 0; i < value.size(); i++) { + count += WireFormatLite::LengthDelimitedSize( + value.Get(i).ByteSizeLong()); + } + + // Serialization takes place in two phases: + // 1) Figure out the expected number of bytes (e.g. ByteSizeLong() on + // the container message) 2) InternalWrite the bytes. + // + // There is a golden contract that the # of bytes written matches + // the returned value from the first step. + // + // In this case unparsed_ was used as the source of truth for the + // number of bytes. There are some known cases where the number of + // serialized bytes is different than the number of bytes written + // by a message parsed from the serialized bytes. For example if the + // serialized representation contained multiple entries for the same + // non-repeated field the duplicates are removed upon parsing. + // + // If this (relatively rare) case is hit then there is no choice + // but to serialize the original unparsed bytes; otherwise the + // golden contract is broken. + // It's possible for the size to change if the unparsed_ was not + // canonical, for example it can have repeated entries for the same + // tag (this is more common than you would think).
+ if (count == unparsed_.Size()) { + for (int i = 0, n = value.size(); i < n; i++) { + const auto& repfield = value.Get(i); + target = WireFormatLite::InternalWriteMessage( + number, repfield, repfield.GetCachedSize(), target, stream); + } + return target; + } + } + } + return unparsed_.Visit( + [&] { return target; }, + [&](const auto& cord) { return stream->WriteCord(cord, target); }, + [&](auto view) { + return stream->WriteRaw(view.data(), view.size(), target); + }); + case LogicalState::kDirty: { + const auto* value = raw_.load(std::memory_order_relaxed).value(); + for (int i = 0, n = value->size(); i < n; i++) { + const auto& repfield = value->Get>(i); + target = WireFormatLite::InternalWriteMessage( + number, repfield, repfield.GetCachedSize(), target, stream); + } + return target; + } + } + // Required for certain compiler configurations. + ABSL_LOG(FATAL) << "Not reachable"; + return nullptr; +} + +} // namespace internal +} // namespace protobuf +} // namespace google diff --git a/src/google/protobuf/repeated_ptr_field.h b/src/google/protobuf/repeated_ptr_field.h index 9dfa8e8801..fcd99edc1b 100644 --- a/src/google/protobuf/repeated_ptr_field.h +++ b/src/google/protobuf/repeated_ptr_field.h @@ -576,6 +576,7 @@ class PROTOBUF_EXPORT RepeatedPtrFieldBase { // subclass. friend class google::protobuf::Reflection; friend class internal::SwapFieldHelper; + friend class LazyRepeatedPtrField; // Concrete Arena enabled copy function used to copy messages instances. // This follows the `Arena::CopyConstruct` signature so that the compiler