Added field-level control over UTF-8 enforcement, and started respecting `enforce_utf8` in google3.

Prior to this CL, upb MiniDescriptors only allowed per-message control over UTF-8.  This CL adds a new field-level modifier to upb MiniDescriptors that can flip a field to the opposite of the message default, like we already have for packed, allowing per-field control over UTF-8 validation.

PiperOrigin-RevId: 566638331
pull/14134/head
Joshua Haberman 1 year ago committed by Copybara-Service
parent df77c90305
commit 0f02930475
  1. 25
      upb/upb/message/BUILD
  2. 149
      upb/upb/message/utf8_test.cc
  3. 71
      upb/upb/message/utf8_test.proto
  4. 15
      upb/upb/mini_descriptor/decode.c
  5. 17
      upb/upb/mini_descriptor/internal/encode.c
  6. 2
      upb/upb/mini_descriptor/internal/modifiers.h
  7. 1
      upb/upb/mini_descriptor/internal/wire_constants.h
  8. 32
      upb/upb/reflection/field_def.c
  9. 1
      upb/upb/reflection/field_def.h
  10. 25
      upb/upb/reflection/message_def.c

@ -311,6 +311,31 @@ cc_test(
],
)
# Proto definitions used by the UTF-8 validation test (utf8_test.cc).
proto_library(
    name = "utf8_test_proto",
    testonly = 1,
    srcs = ["utf8_test.proto"],
    deps = ["//src/google/protobuf:test_messages_proto3_proto"],
)
# upb bindings generated from :utf8_test_proto.
upb_proto_library(
    name = "utf8_test_upb_proto",
    testonly = 1,
    deps = [":utf8_test_proto"],
)
# Decoder test covering UTF-8 validation of string and bytes fields.
cc_test(
    name = "utf8_test",
    srcs = ["utf8_test.cc"],
    deps = [
        ":utf8_test_upb_proto",
        "//upb:base",
        "//upb:mem",
        "//upb:wire",
        "@com_google_googletest//:gtest_main",
    ],
)
# begin:github_only
filegroup(
name = "source_files",

@ -0,0 +1,149 @@
// Protocol Buffers - Google's data interchange format
// Copyright 2023 Google LLC. All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google LLC nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <stdlib.h>
#include "gtest/gtest.h"
#include "upb/upb/base/string_view.h"
#include "upb/upb/mem/arena.h"
#include "upb/upb/mem/arena.hpp"
#include "upb/upb/message/utf8_test.upb.h"
#include "upb/upb/wire/decode.h"
namespace {
const char bad_utf8[] = "\xff";
// Builds a TestUtf8Bytes message whose `data` field holds the contents of
// `bad_utf8`, serializes it, and returns the encoded buffer.  The buffer
// length is written to `*size`.  `arena` is handed to both the message
// constructor and the serializer.
static char* GetBadUtf8Payload(upb_Arena* arena, size_t* size) {
  upb_test_TestUtf8Bytes* source = upb_test_TestUtf8Bytes_new(arena);
  const upb_StringView invalid_bytes = upb_StringView_FromString(bad_utf8);
  upb_test_TestUtf8Bytes_set_data(source, invalid_bytes);

  char* encoded = upb_test_TestUtf8Bytes_serialize(source, arena, size);
  // Non-fatal check: a null buffer is reported, and still returned as-is.
  EXPECT_TRUE(encoded != nullptr);
  return encoded;
}
// Round-trips the bad-UTF-8 payload through TestUtf8Bytes, whose field 1 is
// declared `bytes`.
TEST(Utf8Test, BytesFieldDoesntValidate) {
  upb::Arena arena;
  size_t payload_size;
  char* payload = GetBadUtf8Payload(arena.ptr(), &payload_size);

  upb_test_TestUtf8Bytes* parsed =
      upb_test_TestUtf8Bytes_parse(payload, payload_size, arena.ptr());

  // The parse is expected to succeed: a bytes field is not UTF-8 validated.
  ASSERT_TRUE(parsed != nullptr);
}
// Decodes the bad-UTF-8 payload as TestUtf8Proto3String, whose field 1 is a
// singular proto3 `string`.
TEST(Utf8Test, Proto3FieldValidates) {
  upb::Arena arena;
  size_t payload_size;
  char* payload = GetBadUtf8Payload(arena.ptr(), &payload_size);

  upb_test_TestUtf8Proto3String* target =
      upb_test_TestUtf8Proto3String_new(arena.ptr());
  const upb_DecodeStatus result = upb_Decode(
      payload, payload_size, target, &upb_test_TestUtf8Proto3String_msg_init,
      nullptr, 0, arena.ptr());

  // The decode is expected to be rejected: proto3 string fields validate
  // UTF-8.
  ASSERT_EQ(kUpb_DecodeStatus_BadUtf8, result);
}
// Decodes the bad-UTF-8 payload as TestUtf8RepeatedProto3String, whose field 1
// is a `repeated string`.
TEST(Utf8Test, RepeatedProto3FieldValidates) {
  upb::Arena arena;
  size_t payload_size;
  char* payload = GetBadUtf8Payload(arena.ptr(), &payload_size);

  upb_test_TestUtf8RepeatedProto3String* target =
      upb_test_TestUtf8RepeatedProto3String_new(arena.ptr());
  const upb_DecodeStatus result =
      upb_Decode(payload, payload_size, target,
                 &upb_test_TestUtf8RepeatedProto3String_msg_init, nullptr, 0,
                 arena.ptr());

  // The decode is expected to be rejected: repeated proto3 string fields
  // validate UTF-8 just like singular ones.
  ASSERT_EQ(kUpb_DecodeStatus_BadUtf8, result);
}
// begin:google_only
// TEST(Utf8Test, Proto3MixedFieldValidates) {
// upb::Arena arena;
// size_t size;
// char* data = GetBadUtf8Payload(arena.ptr(), &size);
//
// upb_test_TestUtf8Proto3StringMixed* msg =
// upb_test_TestUtf8Proto3StringMixed_new(arena.ptr());
//
// upb_DecodeStatus status =
// upb_Decode(data, size, msg, &upb_test_TestUtf8Proto3StringMixed_msg_init,
// nullptr, 0, arena.ptr());
//
// // Parse fails, because proto3 string fields validate UTF-8.
// ASSERT_EQ(kUpb_DecodeStatus_BadUtf8, status);
// }
//
// TEST(Utf8Test, EnforceUtf8Options) {
// upb::Arena arena;
// size_t size;
// char* data = GetBadUtf8Payload(arena.ptr(), &size);
// upb_test_TestUtf8Proto3StringEnforceUtf8False* msg2 =
// upb_test_TestUtf8Proto3StringEnforceUtf8False_parse(data, size,
// arena.ptr());
//
// // Parse succeeds, because enforce_utf8=false inhibits utf-8 validation.
// ASSERT_TRUE(msg2 != nullptr);
// }
//
// TEST(Utf8Test, RepeatedEnforceUtf8Options) {
// upb::Arena arena;
// size_t size;
// char* data = GetBadUtf8Payload(arena.ptr(), &size);
// upb_test_TestUtf8RepeatedProto3StringEnforceUtf8False* msg2 =
// upb_test_TestUtf8RepeatedProto3StringEnforceUtf8False_parse(data, size,
// arena.ptr());
//
// // Parse succeeds, because enforce_utf8=false inhibits utf-8 validation.
// ASSERT_TRUE(msg2 != nullptr);
// }
//
// TEST(Utf8Test, EnforceUtf8OptionsMixed) {
// upb::Arena arena;
// size_t size;
// char* data = GetBadUtf8Payload(arena.ptr(), &size);
// upb_test_TestUtf8Proto3StringEnforceUtf8FalseMixed* msg2 =
// upb_test_TestUtf8Proto3StringEnforceUtf8FalseMixed_parse(data, size,
// arena.ptr());
//
// // Parse succeeds, because enforce_utf8=false inhibits utf-8 validation.
// ASSERT_TRUE(msg2 != nullptr);
// }
// end:google_only
} // namespace

@ -0,0 +1,71 @@
// Protocol Buffers - Google's data interchange format
// Copyright 2023 Google LLC. All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google LLC nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
syntax = "proto3";
package upb_test;
// Field 1 declared as `bytes`.  The test uses this message both to build the
// bad-UTF-8 payload and as the case where parsing is expected to succeed.
message TestUtf8Bytes {
  optional bytes data = 1;
}
// Field 1 declared as a singular proto3 `string`; the test expects decoding
// the bad-UTF-8 payload into this message to fail.
message TestUtf8Proto3String {
  optional string data = 1;
}
// Field 1 declared as a `repeated string`; the test expects decoding the
// bad-UTF-8 payload into this message to fail.
message TestUtf8RepeatedProto3String {
  repeated string data = 1;
}
// begin:google_only
// String field 1 sits next to a second string field, so that a message with
// more than one string field is covered.
message TestUtf8Proto3StringMixed {
  optional string data = 1;

  // Second field, meant to carry a different enforce_utf8 value than `data`
  // so that the mix is handled.
  // NOTE(review): no enforce_utf8 option is spelled out on either field in
  // this file -- confirm where the differing value comes from.
  optional string other_data = 2;
}
// Singular string field 1.
// NOTE(review): the name implies `enforce_utf8 = false` on `data`, but no such
// option appears in this file -- confirm whether it is applied elsewhere.
message TestUtf8Proto3StringEnforceUtf8False {
  optional string data = 1;
}
// Repeated counterpart of TestUtf8Proto3StringEnforceUtf8False.  Field 1 is
// declared `repeated` (matching TestUtf8RepeatedProto3String) so the message
// exercises the repeated-string path that its name promises; a singular field
// here would only duplicate the non-repeated case.
// NOTE(review): the name implies `enforce_utf8 = false` on `data`, but no such
// option appears in this file -- confirm whether it is applied elsewhere.
message TestUtf8RepeatedProto3StringEnforceUtf8False {
  repeated string data = 1;
}
// Like TestUtf8Proto3StringEnforceUtf8False, but with a second string field
// alongside `data`.
// NOTE(review): the name implies `enforce_utf8 = false` on `data`, but no such
// option appears in this file -- confirm whether it is applied elsewhere.
message TestUtf8Proto3StringEnforceUtf8FalseMixed {
  optional string data = 1;

  // Second field, meant to carry a different enforce_utf8 value than `data`
  // so that the mix is handled.
  optional string other_data = 2;
}
// end:google_only

@ -33,12 +33,14 @@
#include <inttypes.h>
#include <stdlib.h>
#include "upb/upb/base/descriptor_constants.h"
#include "upb/upb/base/string_view.h"
#include "upb/upb/mem/arena.h"
#include "upb/upb/mini_descriptor/internal/base92.h"
#include "upb/upb/mini_descriptor/internal/decoder.h"
#include "upb/upb/mini_descriptor/internal/modifiers.h"
#include "upb/upb/mini_descriptor/internal/wire_constants.h"
#include "upb/upb/mini_table/internal/field.h"
// Must be last.
#include "upb/upb/port/def.inc"
@ -219,6 +221,19 @@ static void upb_MtDecoder_ModifyField(upb_MtDecoder* d,
field->mode ^= kUpb_LabelFlags_IsPacked;
}
if (field_modifiers & kUpb_EncodedFieldModifier_FlipValidateUtf8) {
if (field->UPB_PRIVATE(descriptortype) != kUpb_FieldType_Bytes ||
!(field->mode & kUpb_LabelFlags_IsAlternate)) {
upb_MdDecoder_ErrorJmp(
&d->base,
"Cannot flip ValidateUtf8 on field %" PRIu32 ", type=%d, mode=%d",
field->number, (int)field->UPB_PRIVATE(descriptortype),
(int)field->mode);
}
field->UPB_PRIVATE(descriptortype) = kUpb_FieldType_String;
field->mode &= ~kUpb_LabelFlags_IsAlternate;
}
bool singular = field_modifiers & kUpb_EncodedFieldModifier_IsProto3Singular;
bool required = field_modifiers & kUpb_EncodedFieldModifier_IsRequired;

@ -30,6 +30,10 @@
#include "upb/upb/mini_descriptor/internal/encode.h"
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include "upb/upb/base/internal/log2.h"
#include "upb/upb/mini_descriptor/internal/base92.h"
#include "upb/upb/mini_descriptor/internal/modifiers.h"
@ -228,6 +232,19 @@ static char* _upb_MtDataEncoder_MaybePutModifiers(upb_MtDataEncoder* e,
}
}
if (type == kUpb_FieldType_String) {
bool field_validates_utf8 = field_mod & kUpb_FieldModifier_ValidateUtf8;
bool message_validates_utf8 =
in->state.msg_state.msg_modifiers & kUpb_MessageModifier_ValidateUtf8;
if (field_validates_utf8 != message_validates_utf8) {
// Old binaries do not recognize the field modifier. We need the failure
// mode to be too lax rather than too strict. Our caller should have
// handled this (see _upb_MessageDef_ValidateUtf8()).
assert(!message_validates_utf8);
encoded_modifiers |= kUpb_EncodedFieldModifier_FlipValidateUtf8;
}
}
if (field_mod & kUpb_FieldModifier_IsProto3Singular) {
encoded_modifiers |= kUpb_EncodedFieldModifier_IsProto3Singular;
}

@ -40,8 +40,10 @@ typedef enum {
kUpb_FieldModifier_IsClosedEnum = 1 << 2,
kUpb_FieldModifier_IsProto3Singular = 1 << 3,
kUpb_FieldModifier_IsRequired = 1 << 4,
kUpb_FieldModifier_ValidateUtf8 = 1 << 5,
} kUpb_FieldModifier;
// These modifiers are also used on the wire.
typedef enum {
kUpb_MessageModifier_ValidateUtf8 = 1 << 0,
kUpb_MessageModifier_DefaultIsPacked = 1 << 1,

@ -64,6 +64,7 @@ typedef enum {
kUpb_EncodedFieldModifier_FlipPacked = 1 << 0,
kUpb_EncodedFieldModifier_IsRequired = 1 << 1,
kUpb_EncodedFieldModifier_IsProto3Singular = 1 << 2,
kUpb_EncodedFieldModifier_FlipValidateUtf8 = 1 << 3,
} upb_EncodedFieldModifier;
enum {

@ -32,16 +32,16 @@
#include <ctype.h>
#include <errno.h>
#include <stdbool.h>
#include "upb/upb/base/descriptor_constants.h"
#include "upb/upb/mini_descriptor/decode.h"
#include "upb/upb/mini_descriptor/internal/modifiers.h"
#include "upb/upb/reflection/def.h"
#include "upb/upb/reflection/def_pool.h"
#include "upb/upb/reflection/def_type.h"
#include "upb/upb/reflection/internal/def_builder.h"
#include "upb/upb/reflection/internal/desc_state.h"
#include "upb/upb/reflection/internal/enum_def.h"
#include "upb/upb/reflection/internal/enum_value_def.h"
#include "upb/upb/reflection/internal/file_def.h"
#include "upb/upb/reflection/internal/message_def.h"
#include "upb/upb/reflection/internal/oneof_def.h"
@ -270,6 +270,29 @@ bool _upb_FieldDef_IsProto3Optional(const upb_FieldDef* f) {
// Returns the `layout_index` member stored on the field definition.
int _upb_FieldDef_LayoutIndex(const upb_FieldDef* f) {
  return f->layout_index;
}
// begin:google_only
// static bool _upb_FieldDef_EnforceUtf8Option(const upb_FieldDef* f) {
// #if defined(UPB_BOOTSTRAP_STAGE0)
// return true;
// #else
// return UPB_DESC(FieldOptions_enforce_utf8)(f->opts);
// #endif
// }
// end:google_only
// begin:github_only
// github_only variant of the enforce_utf8 lookup: always answers true and
// never looks at the field's options.
static bool _upb_FieldDef_EnforceUtf8Option(const upb_FieldDef* f) {
  (void)f;  // Not consulted in this variant.
  return true;
}
// end:github_only
// Reports whether UTF-8 validation applies to `f`.
//
// Only string-typed fields defined in a proto3-syntax file can validate; for
// those, the answer is whatever _upb_FieldDef_EnforceUtf8Option() returns.
// Every other field yields false.
bool _upb_FieldDef_ValidateUtf8(const upb_FieldDef* f) {
  const bool is_string = upb_FieldDef_Type(f) == kUpb_FieldType_String;
  if (!is_string) return false;

  const bool is_proto3 =
      upb_FileDef_Syntax(upb_FieldDef_File(f)) == kUpb_Syntax_Proto3;
  if (!is_proto3) return false;

  return _upb_FieldDef_EnforceUtf8Option(f);
}
uint64_t _upb_FieldDef_Modifiers(const upb_FieldDef* f) {
uint64_t out = f->is_packed ? kUpb_FieldModifier_IsPacked : 0;
@ -290,6 +313,11 @@ uint64_t _upb_FieldDef_Modifiers(const upb_FieldDef* f) {
if (_upb_FieldDef_IsClosedEnum(f)) {
out |= kUpb_FieldModifier_IsClosedEnum;
}
if (_upb_FieldDef_ValidateUtf8(f)) {
out |= kUpb_FieldModifier_ValidateUtf8;
}
return out;
}

@ -74,6 +74,7 @@ UPB_API bool upb_FieldDef_IsSubMessage(const upb_FieldDef* f);
UPB_API const char* upb_FieldDef_JsonName(const upb_FieldDef* f);
UPB_API upb_Label upb_FieldDef_Label(const upb_FieldDef* f);
UPB_API const upb_MessageDef* upb_FieldDef_MessageSubDef(const upb_FieldDef* f);
bool _upb_FieldDef_ValidateUtf8(const upb_FieldDef* f);
// Creates a mini descriptor string for a field, returns true on success.
bool upb_FieldDef_MiniDescriptorEncode(const upb_FieldDef* f, upb_Arena* a,

@ -30,6 +30,7 @@
#include "upb/upb/reflection/internal/message_def.h"
#include "upb/upb/base/descriptor_constants.h"
#include "upb/upb/hash/int_table.h"
#include "upb/upb/hash/str_table.h"
#include "upb/upb/mini_descriptor/decode.h"
@ -512,15 +513,37 @@ void _upb_MessageDef_LinkMiniTable(upb_DefBuilder* ctx,
#endif
}
// Decides whether the message-level ValidateUtf8 modifier should be set for
// `m`.
//
// Returns true only when `m` has at least one string field and every string
// field validates UTF-8.  Fields of any other type play no part in the
// decision.
static bool _upb_MessageDef_ValidateUtf8(const upb_MessageDef* m) {
  bool has_string = false;
  for (int i = 0; i < m->field_count; i++) {
    const upb_FieldDef* f = upb_MessageDef_Field(m, i);

    // _upb_FieldDef_ValidateUtf8() is false for every non-string field, so
    // those fields must be skipped rather than read as "does not validate".
    // Otherwise a single int/bytes/message field would be enough to keep the
    // message-level modifier from ever being set.
    if (upb_FieldDef_Type(f) != kUpb_FieldType_String) continue;

    // Old binaries do not recognize the field-level "FlipValidateUtf8" wire
    // modifier, so we do not actually have field-level control for old
    // binaries.  Given this, we judge that the better failure mode is to be
    // more lax than intended, rather than more strict.  To achieve this, we
    // only mark the message with the ValidateUtf8 modifier if *all* string
    // fields validate UTF-8.
    if (!_upb_FieldDef_ValidateUtf8(f)) return false;
    has_string = true;
  }
  return has_string;
}
// Computes the kUpb_MessageModifier_* bit set describing message `m`.
//
//   - DefaultIsPacked: set when the message's file uses proto3 syntax.
//   - ValidateUtf8:    set when _upb_MessageDef_ValidateUtf8(m) says so.
//   - IsExtendable:    set when the message declares extension ranges.
static uint64_t _upb_MessageDef_Modifiers(const upb_MessageDef* m) {
  uint64_t out = 0;

  if (upb_FileDef_Syntax(m->file) == kUpb_Syntax_Proto3) {
    out |= kUpb_MessageModifier_DefaultIsPacked;
  }

  // ValidateUtf8 is derived from the fields rather than from the syntax
  // alone.  Setting it unconditionally for proto3 would make this check
  // pointless, and would mark a message as validating even when one of its
  // string fields does not -- a combination the mini-descriptor encoder
  // asserts against when it emits the per-field flip modifier.
  if (_upb_MessageDef_ValidateUtf8(m)) {
    out |= kUpb_MessageModifier_ValidateUtf8;
  }

  if (m->ext_range_count) {
    out |= kUpb_MessageModifier_IsExtendable;
  }

  return out;
}

Loading…
Cancel
Save