Add kUpb_DecodeOption_AlwaysValidateUtf8 decode option, to force UTF-8 validation of proto2 strings.

PiperOrigin-RevId: 597341799
pull/15378/head
Protobuf Team Bot 1 year ago committed by Copybara-Service
parent 686cfc6ab5
commit d4dfb9c4a6
  1. 21
      upb/message/BUILD
  2. 96
      upb/message/utf8_test.cc
  3. 2
      upb/message/utf8_test.proto
  4. 26
      upb/message/utf8_test_proto2.proto
  5. 11
      upb/wire/decode.c
  6. 10
      upb/wire/decode.h

@ -304,7 +304,12 @@ proto_library(
name = "utf8_test_proto",
testonly = 1,
srcs = ["utf8_test.proto"],
deps = ["//src/google/protobuf:test_messages_proto3_proto"],
)
proto_library(
name = "utf8_test_proto2_proto",
testonly = 1,
srcs = ["utf8_test_proto2.proto"],
)
upb_minitable_proto_library(
@ -313,16 +318,30 @@ upb_minitable_proto_library(
deps = [":utf8_test_proto"],
)
upb_minitable_proto_library(
name = "utf8_test_proto2_upb_minitable_proto",
testonly = 1,
deps = [":utf8_test_proto2_proto"],
)
upb_c_proto_library(
name = "utf8_test_upb_proto",
testonly = 1,
deps = [":utf8_test_proto"],
)
upb_c_proto_library(
name = "utf8_test_proto2_upb_proto",
testonly = 1,
deps = [":utf8_test_proto2_proto"],
)
cc_test(
name = "utf8_test",
srcs = ["utf8_test.cc"],
deps = [
":utf8_test_proto2_upb_minitable_proto",
":utf8_test_proto2_upb_proto",
":utf8_test_upb_minitable_proto",
":utf8_test_upb_proto",
"//upb:base",

@ -14,6 +14,8 @@
#include "upb/mem/arena.hpp"
#include "upb/message/utf8_test.upb.h"
#include "upb/message/utf8_test.upb_minitable.h"
#include "upb/message/utf8_test_proto2.upb.h"
#include "upb/message/utf8_test_proto2.upb_minitable.h"
#include "upb/wire/decode.h"
namespace {
@ -72,6 +74,100 @@ TEST(Utf8Test, RepeatedProto3FieldValidates) {
ASSERT_EQ(kUpb_DecodeStatus_BadUtf8, status);
}
TEST(Utf8Test, Proto2BytesValidates) {
upb::Arena arena;
size_t size;
char* data = GetBadUtf8Payload(arena.ptr(), &size);
upb_test_TestUtf8Proto2Bytes* msg =
upb_test_TestUtf8Proto2Bytes_new(arena.ptr());
upb_DecodeStatus status;
status = upb_Decode(data, size, UPB_UPCAST(msg),
&upb_0test__TestUtf8Proto2Bytes_msg_init, nullptr, 0,
arena.ptr());
// Parse succeeds, because proto2 bytes fields don't validate UTF-8.
ASSERT_EQ(kUpb_DecodeStatus_Ok, status);
}
TEST(Utf8Test, Proto2RepeatedBytesValidates) {
upb::Arena arena;
size_t size;
char* data = GetBadUtf8Payload(arena.ptr(), &size);
upb_test_TestUtf8RepeatedProto2Bytes* msg =
upb_test_TestUtf8RepeatedProto2Bytes_new(arena.ptr());
upb_DecodeStatus status;
status = upb_Decode(data, size, UPB_UPCAST(msg),
&upb_0test__TestUtf8RepeatedProto2Bytes_msg_init, nullptr,
0, arena.ptr());
// Parse succeeds, because proto2 bytes fields don't validate UTF-8.
ASSERT_EQ(kUpb_DecodeStatus_Ok, status);
}
TEST(Utf8Test, Proto2StringValidates) {
upb::Arena arena;
size_t size;
char* data = GetBadUtf8Payload(arena.ptr(), &size);
upb_test_TestUtf8Proto2String* msg =
upb_test_TestUtf8Proto2String_new(arena.ptr());
upb_DecodeStatus status;
status = upb_Decode(data, size, UPB_UPCAST(msg),
&upb_0test__TestUtf8Proto2String_msg_init, nullptr, 0,
arena.ptr());
// Parse succeeds, because proto2 string fields don't validate UTF-8.
ASSERT_EQ(kUpb_DecodeStatus_Ok, status);
}
TEST(Utf8Test, Proto2FieldFailsValidation) {
upb::Arena arena;
size_t size;
char* data = GetBadUtf8Payload(arena.ptr(), &size);
upb_test_TestUtf8Proto2String* msg =
upb_test_TestUtf8Proto2String_new(arena.ptr());
upb_DecodeStatus status;
status = upb_Decode(data, size, UPB_UPCAST(msg),
&upb_0test__TestUtf8Proto2String_msg_init, nullptr, 0,
arena.ptr());
// Parse fails, because we pass in kUpb_DecodeOption_AlwaysValidateUtf8 to
// force validation of proto2 string fields.
status = upb_Decode(data, size, UPB_UPCAST(msg),
&upb_0test__TestUtf8Proto2String_msg_init, nullptr,
kUpb_DecodeOption_AlwaysValidateUtf8, arena.ptr());
ASSERT_EQ(kUpb_DecodeStatus_BadUtf8, status);
}
TEST(Utf8Test, Proto2RepeatedFieldFailsValidation) {
upb::Arena arena;
size_t size;
char* data = GetBadUtf8Payload(arena.ptr(), &size);
upb_test_TestUtf8RepeatedProto2String* msg =
upb_test_TestUtf8RepeatedProto2String_new(arena.ptr());
upb_DecodeStatus status;
status = upb_Decode(data, size, UPB_UPCAST(msg),
&upb_0test__TestUtf8RepeatedProto2String_msg_init,
nullptr, 0, arena.ptr());
// Parse fails, because we pass in kUpb_DecodeOption_AlwaysValidateUtf8 to
// force validation of proto2 string fields.
status =
upb_Decode(data, size, UPB_UPCAST(msg),
&upb_0test__TestUtf8RepeatedProto2String_msg_init, nullptr,
kUpb_DecodeOption_AlwaysValidateUtf8, arena.ptr());
ASSERT_EQ(kUpb_DecodeStatus_BadUtf8, status);
}
// begin:google_only
// TEST(Utf8Test, Proto3MixedFieldValidates) {
// upb::Arena arena;

@ -35,7 +35,7 @@ message TestUtf8Proto3StringEnforceUtf8False {
}
message TestUtf8RepeatedProto3StringEnforceUtf8False {
optional string data = 1;
repeated string data = 1;
}
message TestUtf8Proto3StringEnforceUtf8FalseMixed {

@ -0,0 +1,26 @@
// Protocol Buffers - Google's data interchange format
// Copyright 2023 Google LLC. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file or at
// https://developers.google.com/open-source/licenses/bsd
syntax = "proto2";
package upb_test;
message TestUtf8Proto2Bytes {
optional bytes data = 1;
}
message TestUtf8RepeatedProto2Bytes {
optional bytes data = 1;
}
message TestUtf8Proto2String {
optional string data = 1;
}
message TestUtf8RepeatedProto2String {
repeated string data = 1;
}

@ -1034,6 +1034,15 @@ static void _upb_Decoder_CheckUnlinked(upb_Decoder* d, const upb_MiniTable* mt,
*op = kUpb_DecodeOp_UnknownField;
}
UPB_FORCEINLINE
static void _upb_Decoder_MaybeVerifyUtf8(upb_Decoder* d,
const upb_MiniTableField* field,
int* op) {
if ((field->UPB_ONLYBITS(mode) & kUpb_LabelFlags_IsAlternate) &&
UPB_UNLIKELY(d->options & kUpb_DecodeOption_AlwaysValidateUtf8))
*op = kUpb_DecodeOp_String;
}
static int _upb_Decoder_GetDelimitedOp(upb_Decoder* d, const upb_MiniTable* mt,
const upb_MiniTableField* field) {
enum { kRepeatedBase = 19 };
@ -1090,6 +1099,8 @@ static int _upb_Decoder_GetDelimitedOp(upb_Decoder* d, const upb_MiniTable* mt,
if (op == kUpb_DecodeOp_SubMessage) {
_upb_Decoder_CheckUnlinked(d, mt, field, &op);
} else if (op == kUpb_DecodeOp_Bytes) {
_upb_Decoder_MaybeVerifyUtf8(d, field, &op);
}
return op;

@ -83,6 +83,16 @@ enum {
* be created by the parser or the message-copying logic in message/copy.h.
*/
kUpb_DecodeOption_ExperimentalAllowUnlinked = 4,
/* EXPERIMENTAL:
*
* If set, decoding will enforce UTF-8 validation for string fields, even for
* proto2 or fields with `features.utf8_validation = NONE`. Normally, only
* proto3 string fields will be validated for UTF-8. Decoding will return
* kUpb_DecodeStatus_BadUtf8 for non-UTF-8 strings, which is the same behavior
* as non-UTF-8 proto3 string fields.
*/
kUpb_DecodeOption_AlwaysValidateUtf8 = 8,
};
UPB_INLINE uint32_t upb_DecodeOptions_MaxDepth(uint16_t depth) {

Loading…
Cancel
Save