Merge pull request #302 from haberman/verify-utf8

Verify UTF-8 when parsing proto3 string fields.
pull/13171/head
Joshua Haberman 4 years ago committed by GitHub
commit ed86d98f53
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 7
      BUILD
  2. 72
      generated_for_cmake/google/protobuf/descriptor.upb.c
  3. 17
      tests/bindings/lua/test_upb.lua
  4. 56
      upb/decode.c
  5. 7
      upb/def.c
  6. 18
      upbc/generator.cc

@ -765,6 +765,7 @@ cc_test(
"upb/bindings/lua/upb.lua",
":descriptor_proto_lua",
":test_messages_proto3_proto_lua",
":test_messages_proto2_proto_lua",
":test_proto_lua",
"@com_google_protobuf//:conformance_proto",
"@com_google_protobuf//:descriptor_proto",
@ -807,6 +808,12 @@ lua_proto_library(
deps = ["@com_google_protobuf//:test_messages_proto3_proto"],
)
lua_proto_library(
name = "test_messages_proto2_proto_lua",
testonly = 1,
deps = ["@com_google_protobuf//:test_messages_proto2_proto"],
)
# Test the CMake build #########################################################
filegroup(

@ -36,9 +36,9 @@ static const upb_msglayout *const google_protobuf_FileDescriptorProto_submsgs[6]
};
static const upb_msglayout_field google_protobuf_FileDescriptorProto__fields[12] = {
{1, UPB_SIZE(4, 8), 1, 0, 9, 1},
{2, UPB_SIZE(12, 24), 2, 0, 9, 1},
{3, UPB_SIZE(36, 72), 0, 0, 9, 3},
{1, UPB_SIZE(4, 8), 1, 0, 12, 1},
{2, UPB_SIZE(12, 24), 2, 0, 12, 1},
{3, UPB_SIZE(36, 72), 0, 0, 12, 3},
{4, UPB_SIZE(40, 80), 0, 0, 11, 3},
{5, UPB_SIZE(44, 88), 0, 1, 11, 3},
{6, UPB_SIZE(48, 96), 0, 4, 11, 3},
@ -47,7 +47,7 @@ static const upb_msglayout_field google_protobuf_FileDescriptorProto__fields[12]
{9, UPB_SIZE(32, 64), 5, 5, 11, 1},
{10, UPB_SIZE(56, 112), 0, 0, 5, 3},
{11, UPB_SIZE(60, 120), 0, 0, 5, 3},
{12, UPB_SIZE(20, 40), 3, 0, 9, 1},
{12, UPB_SIZE(20, 40), 3, 0, 12, 1},
};
const upb_msglayout google_protobuf_FileDescriptorProto_msginit = {
@ -67,7 +67,7 @@ static const upb_msglayout *const google_protobuf_DescriptorProto_submsgs[8] = {
};
static const upb_msglayout_field google_protobuf_DescriptorProto__fields[10] = {
{1, UPB_SIZE(4, 8), 1, 0, 9, 1},
{1, UPB_SIZE(4, 8), 1, 0, 12, 1},
{2, UPB_SIZE(16, 32), 0, 4, 11, 3},
{3, UPB_SIZE(20, 40), 0, 0, 11, 3},
{4, UPB_SIZE(24, 48), 0, 3, 11, 3},
@ -76,7 +76,7 @@ static const upb_msglayout_field google_protobuf_DescriptorProto__fields[10] = {
{7, UPB_SIZE(12, 24), 2, 5, 11, 1},
{8, UPB_SIZE(36, 72), 0, 6, 11, 3},
{9, UPB_SIZE(40, 80), 0, 2, 11, 3},
{10, UPB_SIZE(44, 88), 0, 0, 9, 3},
{10, UPB_SIZE(44, 88), 0, 0, 12, 3},
};
const upb_msglayout google_protobuf_DescriptorProto_msginit = {
@ -131,16 +131,16 @@ static const upb_msglayout *const google_protobuf_FieldDescriptorProto_submsgs[1
};
static const upb_msglayout_field google_protobuf_FieldDescriptorProto__fields[11] = {
{1, UPB_SIZE(36, 40), 6, 0, 9, 1},
{2, UPB_SIZE(44, 56), 7, 0, 9, 1},
{1, UPB_SIZE(36, 40), 6, 0, 12, 1},
{2, UPB_SIZE(44, 56), 7, 0, 12, 1},
{3, UPB_SIZE(24, 24), 3, 0, 5, 1},
{4, UPB_SIZE(8, 8), 1, 0, 14, 1},
{5, UPB_SIZE(16, 16), 2, 0, 14, 1},
{6, UPB_SIZE(52, 72), 8, 0, 9, 1},
{7, UPB_SIZE(60, 88), 9, 0, 9, 1},
{6, UPB_SIZE(52, 72), 8, 0, 12, 1},
{7, UPB_SIZE(60, 88), 9, 0, 12, 1},
{8, UPB_SIZE(76, 120), 11, 0, 11, 1},
{9, UPB_SIZE(28, 28), 4, 0, 5, 1},
{10, UPB_SIZE(68, 104), 10, 0, 9, 1},
{10, UPB_SIZE(68, 104), 10, 0, 12, 1},
{17, UPB_SIZE(32, 32), 5, 0, 8, 1},
};
@ -155,7 +155,7 @@ static const upb_msglayout *const google_protobuf_OneofDescriptorProto_submsgs[1
};
static const upb_msglayout_field google_protobuf_OneofDescriptorProto__fields[2] = {
{1, UPB_SIZE(4, 8), 1, 0, 9, 1},
{1, UPB_SIZE(4, 8), 1, 0, 12, 1},
{2, UPB_SIZE(12, 24), 2, 0, 11, 1},
};
@ -172,11 +172,11 @@ static const upb_msglayout *const google_protobuf_EnumDescriptorProto_submsgs[3]
};
static const upb_msglayout_field google_protobuf_EnumDescriptorProto__fields[5] = {
{1, UPB_SIZE(4, 8), 1, 0, 9, 1},
{1, UPB_SIZE(4, 8), 1, 0, 12, 1},
{2, UPB_SIZE(16, 32), 0, 2, 11, 3},
{3, UPB_SIZE(12, 24), 2, 1, 11, 1},
{4, UPB_SIZE(20, 40), 0, 0, 11, 3},
{5, UPB_SIZE(24, 48), 0, 0, 9, 3},
{5, UPB_SIZE(24, 48), 0, 0, 12, 3},
};
const upb_msglayout google_protobuf_EnumDescriptorProto_msginit = {
@ -201,7 +201,7 @@ static const upb_msglayout *const google_protobuf_EnumValueDescriptorProto_subms
};
static const upb_msglayout_field google_protobuf_EnumValueDescriptorProto__fields[3] = {
{1, UPB_SIZE(8, 8), 2, 0, 9, 1},
{1, UPB_SIZE(8, 8), 2, 0, 12, 1},
{2, UPB_SIZE(4, 4), 1, 0, 5, 1},
{3, UPB_SIZE(16, 24), 3, 0, 11, 1},
};
@ -218,7 +218,7 @@ static const upb_msglayout *const google_protobuf_ServiceDescriptorProto_submsgs
};
static const upb_msglayout_field google_protobuf_ServiceDescriptorProto__fields[3] = {
{1, UPB_SIZE(4, 8), 1, 0, 9, 1},
{1, UPB_SIZE(4, 8), 1, 0, 12, 1},
{2, UPB_SIZE(16, 32), 0, 0, 11, 3},
{3, UPB_SIZE(12, 24), 2, 1, 11, 1},
};
@ -234,9 +234,9 @@ static const upb_msglayout *const google_protobuf_MethodDescriptorProto_submsgs[
};
static const upb_msglayout_field google_protobuf_MethodDescriptorProto__fields[6] = {
{1, UPB_SIZE(4, 8), 3, 0, 9, 1},
{2, UPB_SIZE(12, 24), 4, 0, 9, 1},
{3, UPB_SIZE(20, 40), 5, 0, 9, 1},
{1, UPB_SIZE(4, 8), 3, 0, 12, 1},
{2, UPB_SIZE(12, 24), 4, 0, 12, 1},
{3, UPB_SIZE(20, 40), 5, 0, 12, 1},
{4, UPB_SIZE(28, 56), 6, 0, 11, 1},
{5, UPB_SIZE(1, 1), 1, 0, 8, 1},
{6, UPB_SIZE(2, 2), 2, 0, 8, 1},
@ -253,11 +253,11 @@ static const upb_msglayout *const google_protobuf_FileOptions_submsgs[1] = {
};
static const upb_msglayout_field google_protobuf_FileOptions__fields[21] = {
{1, UPB_SIZE(28, 32), 11, 0, 9, 1},
{8, UPB_SIZE(36, 48), 12, 0, 9, 1},
{1, UPB_SIZE(28, 32), 11, 0, 12, 1},
{8, UPB_SIZE(36, 48), 12, 0, 12, 1},
{9, UPB_SIZE(8, 8), 1, 0, 14, 1},
{10, UPB_SIZE(16, 16), 2, 0, 8, 1},
{11, UPB_SIZE(44, 64), 13, 0, 9, 1},
{11, UPB_SIZE(44, 64), 13, 0, 12, 1},
{16, UPB_SIZE(17, 17), 3, 0, 8, 1},
{17, UPB_SIZE(18, 18), 4, 0, 8, 1},
{18, UPB_SIZE(19, 19), 5, 0, 8, 1},
@ -265,14 +265,14 @@ static const upb_msglayout_field google_protobuf_FileOptions__fields[21] = {
{23, UPB_SIZE(21, 21), 7, 0, 8, 1},
{27, UPB_SIZE(22, 22), 8, 0, 8, 1},
{31, UPB_SIZE(23, 23), 9, 0, 8, 1},
{36, UPB_SIZE(52, 80), 14, 0, 9, 1},
{37, UPB_SIZE(60, 96), 15, 0, 9, 1},
{39, UPB_SIZE(68, 112), 16, 0, 9, 1},
{40, UPB_SIZE(76, 128), 17, 0, 9, 1},
{41, UPB_SIZE(84, 144), 18, 0, 9, 1},
{36, UPB_SIZE(52, 80), 14, 0, 12, 1},
{37, UPB_SIZE(60, 96), 15, 0, 12, 1},
{39, UPB_SIZE(68, 112), 16, 0, 12, 1},
{40, UPB_SIZE(76, 128), 17, 0, 12, 1},
{41, UPB_SIZE(84, 144), 18, 0, 12, 1},
{42, UPB_SIZE(24, 24), 10, 0, 8, 1},
{44, UPB_SIZE(92, 160), 19, 0, 9, 1},
{45, UPB_SIZE(100, 176), 20, 0, 9, 1},
{44, UPB_SIZE(92, 160), 19, 0, 12, 1},
{45, UPB_SIZE(100, 176), 20, 0, 12, 1},
{999, UPB_SIZE(108, 192), 0, 0, 11, 3},
};
@ -402,12 +402,12 @@ static const upb_msglayout *const google_protobuf_UninterpretedOption_submsgs[1]
static const upb_msglayout_field google_protobuf_UninterpretedOption__fields[7] = {
{2, UPB_SIZE(56, 80), 0, 0, 11, 3},
{3, UPB_SIZE(32, 32), 4, 0, 9, 1},
{3, UPB_SIZE(32, 32), 4, 0, 12, 1},
{4, UPB_SIZE(8, 8), 1, 0, 4, 1},
{5, UPB_SIZE(16, 16), 2, 0, 3, 1},
{6, UPB_SIZE(24, 24), 3, 0, 1, 1},
{7, UPB_SIZE(40, 48), 5, 0, 12, 1},
{8, UPB_SIZE(48, 64), 6, 0, 9, 1},
{8, UPB_SIZE(48, 64), 6, 0, 12, 1},
};
const upb_msglayout google_protobuf_UninterpretedOption_msginit = {
@ -417,7 +417,7 @@ const upb_msglayout google_protobuf_UninterpretedOption_msginit = {
};
static const upb_msglayout_field google_protobuf_UninterpretedOption_NamePart__fields[2] = {
{1, UPB_SIZE(4, 8), 2, 0, 9, 2},
{1, UPB_SIZE(4, 8), 2, 0, 12, 2},
{2, UPB_SIZE(1, 1), 1, 0, 8, 2},
};
@ -444,9 +444,9 @@ const upb_msglayout google_protobuf_SourceCodeInfo_msginit = {
static const upb_msglayout_field google_protobuf_SourceCodeInfo_Location__fields[5] = {
{1, UPB_SIZE(20, 40), 0, 0, 5, _UPB_LABEL_PACKED},
{2, UPB_SIZE(24, 48), 0, 0, 5, _UPB_LABEL_PACKED},
{3, UPB_SIZE(4, 8), 1, 0, 9, 1},
{4, UPB_SIZE(12, 24), 2, 0, 9, 1},
{6, UPB_SIZE(28, 56), 0, 0, 9, 3},
{3, UPB_SIZE(4, 8), 1, 0, 12, 1},
{4, UPB_SIZE(12, 24), 2, 0, 12, 1},
{6, UPB_SIZE(28, 56), 0, 0, 12, 3},
};
const upb_msglayout google_protobuf_SourceCodeInfo_Location_msginit = {
@ -471,7 +471,7 @@ const upb_msglayout google_protobuf_GeneratedCodeInfo_msginit = {
static const upb_msglayout_field google_protobuf_GeneratedCodeInfo_Annotation__fields[4] = {
{1, UPB_SIZE(20, 32), 0, 0, 5, _UPB_LABEL_PACKED},
{2, UPB_SIZE(12, 16), 3, 0, 9, 1},
{2, UPB_SIZE(12, 16), 3, 0, 12, 1},
{3, UPB_SIZE(4, 4), 1, 0, 5, 1},
{4, UPB_SIZE(8, 8), 2, 0, 5, 1},
};

@ -3,6 +3,7 @@ local upb = require "lupb"
local lunit = require "lunit"
local upb_test = require "tests.test_pb"
local test_messages_proto3 = require "google.protobuf.test_messages_proto3_pb"
local test_messages_proto2 = require "google.protobuf.test_messages_proto2_pb"
local descriptor = require "google.protobuf.descriptor_pb"
if _VERSION >= 'Lua 5.2' then
@ -69,6 +70,22 @@ function test_msg_map()
assert_equal(12, msg2.map_int32_int32[6])
end
function test_utf8()
local proto2_msg = test_messages_proto2.TestAllTypesProto2()
proto2_msg.optional_string = "\xff"
local serialized = upb.encode(proto2_msg)
-- Decoding invalid UTF-8 succeeds in proto2.
upb.decode(test_messages_proto2.TestAllTypesProto2, serialized)
-- Decoding invalid UTF-8 fails in proto2.
assert_error_match("Error decoding protobuf", function()
upb.decode(test_messages_proto3.TestAllTypesProto3, serialized)
end)
-- TOOD(haberman): should proto3 accessors also check UTF-8 at set time?
end
function test_string_double_map()
msg = upb_test.MapTest()
msg.map_string_double["one"] = 1.0

@ -62,11 +62,13 @@ static const unsigned fixed64_ok = (1 << UPB_DTYPE_DOUBLE) |
(1 << UPB_DTYPE_SFIXED64);
/* Op: an action to be performed for a wire-type/field-type combination. */
#define OP_SCALAR_LG2(n) (n)
#define OP_FIXPCK_LG2(n) (n + 4)
#define OP_VARPCK_LG2(n) (n + 8)
#define OP_SCALAR_LG2(n) (n) /* n in [0, 2, 3] => op in [0, 2, 3] */
#define OP_STRING 4
#define OP_SUBMSG 5
#define OP_BYTES 5
#define OP_SUBMSG 6
/* Ops above are scalar-only. Repeated fields can use any op. */
#define OP_FIXPCK_LG2(n) (n + 5) /* n in [2, 3] => op in [7, 8] */
#define OP_VARPCK_LG2(n) (n + 9) /* n in [0, 2, 3] => op in [9, 11, 12] */
static const int8_t varint_ops[19] = {
-1, /* field not found */
@ -104,7 +106,7 @@ static const int8_t delim_ops[37] = {
OP_STRING, /* STRING */
-1, /* GROUP */
OP_SUBMSG, /* MESSAGE */
OP_STRING, /* BYTES */
OP_BYTES, /* BYTES */
-1, /* UINT32 */
-1, /* ENUM */
-1, /* SFIXED32 */
@ -123,7 +125,7 @@ static const int8_t delim_ops[37] = {
OP_STRING, /* REPEATED STRING */
OP_SUBMSG, /* REPEATED GROUP */
OP_SUBMSG, /* REPEATED MESSAGE */
OP_STRING, /* REPEATED BYTES */
OP_BYTES, /* REPEATED BYTES */
OP_VARPCK_LG2(2), /* REPEATED UINT32 */
OP_VARPCK_LG2(2), /* REPEATED ENUM */
OP_FIXPCK_LG2(2), /* REPEATED SFIXED32 */
@ -155,6 +157,40 @@ static const char *decode_msg(upb_decstate *d, const char *ptr, upb_msg *msg,
UPB_NORETURN static void decode_err(upb_decstate *d) { longjmp(d->err, 1); }
void decode_verifyutf8(upb_decstate *d, const char *buf, int len) {
static const uint8_t utf8_offset[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0,
};
int i, j;
uint8_t offset;
i = 0;
while (i < len) {
offset = utf8_offset[(uint8_t)buf[i]];
if (offset == 0 || i + offset > len) {
decode_err(d);
}
for (j = i + 1; j < i + offset; j++) {
if ((buf[j] & 0xc0) != 0x80) {
decode_err(d);
}
}
i += offset;
}
if (i != len) decode_err(d);
}
static bool decode_reserve(upb_decstate *d, upb_array *arr, size_t elem) {
bool need_realloc = arr->size - arr->len < elem;
if (need_realloc && !_upb_array_realloc(arr, arr->len + elem, d->arena)) {
@ -300,7 +336,10 @@ static const char *decode_toarray(upb_decstate *d, const char *ptr,
memcpy(mem, &val, 1 << op);
return ptr;
case OP_STRING:
/* Append string. */
decode_verifyutf8(d, val.str_val.data, val.str_val.size);
/* Fallthrough. */
case OP_BYTES:
/* Append bytes. */
mem =
UPB_PTR_AT(_upb_array_ptr(arr), arr->len * sizeof(upb_strview), void);
arr->len++;
@ -434,6 +473,9 @@ static const char *decode_tomsg(upb_decstate *d, const char *ptr, upb_msg *msg,
break;
}
case OP_STRING:
decode_verifyutf8(d, val.str_val.data, val.str_val.size);
/* Fallthrough. */
case OP_BYTES:
memcpy(mem, &val, sizeof(upb_strview));
break;
case OP_SCALAR_LG2(3):

@ -1013,6 +1013,13 @@ static bool make_layout(const upb_symtab *symtab, const upb_msgdef *m) {
field->descriptortype = upb_fielddef_descriptortype(f);
field->label = upb_fielddef_label(f);
if (field->descriptortype == UPB_DTYPE_STRING &&
f->file->syntax == UPB_SYNTAX_PROTO2) {
/* See TableDescriptorType() in upbc/generator.cc for details and
* rationale. */
field->descriptortype = UPB_DTYPE_BYTES;
}
if (upb_fielddef_ismap(f)) {
field->label = _UPB_LABEL_MAP;
} else if (upb_fielddef_packed(f)) {

@ -687,6 +687,22 @@ void WriteHeader(const protobuf::FileDescriptor* file, Output& output) {
ToPreproc(file->name()));
}
int TableDescriptorType(const protobuf::FieldDescriptor* field) {
if (field->file()->syntax() == protobuf::FileDescriptor::SYNTAX_PROTO2 &&
field->type() == protobuf::FieldDescriptor::TYPE_STRING) {
// From the perspective of the binary encoder/decoder, proto2 string fields
// are identical to bytes fields. Only in proto3 do we check UTF-8 for
// string fields at parse time.
//
// If we ever use these tables for JSON encoding/decoding (for example by
// embedding field names on the side) we will have to revisit this, because
// string vs. bytes behavior is not affected by proto2 vs proto3.
return protobuf::FieldDescriptor::TYPE_BYTES;
} else {
return field->type();
}
}
void WriteSource(const protobuf::FileDescriptor* file, Output& output) {
EmitFileWarning(file, output);
@ -781,7 +797,7 @@ void WriteSource(const protobuf::FileDescriptor* file, Output& output) {
GetSizeInit(layout.GetFieldOffset(field)),
presence,
submsg_index,
field->type(),
TableDescriptorType(field),
label);
}
output("};\n\n");

Loading…
Cancel
Save