From a77ea639d5d4ff3abc8106ec977285c16381c09d Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Mon, 6 Jul 2020 11:52:23 -0700 Subject: [PATCH 1/3] Verify UTF-8 when parsing proto3 string fields. --- .../google/protobuf/descriptor.upb.c | 72 +++++++++---------- upb/decode.c | 56 +++++++++++++-- upb/def.c | 7 ++ upbc/generator.cc | 18 ++++- 4 files changed, 109 insertions(+), 44 deletions(-) diff --git a/generated_for_cmake/google/protobuf/descriptor.upb.c b/generated_for_cmake/google/protobuf/descriptor.upb.c index 3c7363922e..44cd3ae2ce 100644 --- a/generated_for_cmake/google/protobuf/descriptor.upb.c +++ b/generated_for_cmake/google/protobuf/descriptor.upb.c @@ -36,9 +36,9 @@ static const upb_msglayout *const google_protobuf_FileDescriptorProto_submsgs[6] }; static const upb_msglayout_field google_protobuf_FileDescriptorProto__fields[12] = { - {1, UPB_SIZE(4, 8), 1, 0, 9, 1}, - {2, UPB_SIZE(12, 24), 2, 0, 9, 1}, - {3, UPB_SIZE(36, 72), 0, 0, 9, 3}, + {1, UPB_SIZE(4, 8), 1, 0, 12, 1}, + {2, UPB_SIZE(12, 24), 2, 0, 12, 1}, + {3, UPB_SIZE(36, 72), 0, 0, 12, 3}, {4, UPB_SIZE(40, 80), 0, 0, 11, 3}, {5, UPB_SIZE(44, 88), 0, 1, 11, 3}, {6, UPB_SIZE(48, 96), 0, 4, 11, 3}, @@ -47,7 +47,7 @@ static const upb_msglayout_field google_protobuf_FileDescriptorProto__fields[12] {9, UPB_SIZE(32, 64), 5, 5, 11, 1}, {10, UPB_SIZE(56, 112), 0, 0, 5, 3}, {11, UPB_SIZE(60, 120), 0, 0, 5, 3}, - {12, UPB_SIZE(20, 40), 3, 0, 9, 1}, + {12, UPB_SIZE(20, 40), 3, 0, 12, 1}, }; const upb_msglayout google_protobuf_FileDescriptorProto_msginit = { @@ -67,7 +67,7 @@ static const upb_msglayout *const google_protobuf_DescriptorProto_submsgs[8] = { }; static const upb_msglayout_field google_protobuf_DescriptorProto__fields[10] = { - {1, UPB_SIZE(4, 8), 1, 0, 9, 1}, + {1, UPB_SIZE(4, 8), 1, 0, 12, 1}, {2, UPB_SIZE(16, 32), 0, 4, 11, 3}, {3, UPB_SIZE(20, 40), 0, 0, 11, 3}, {4, UPB_SIZE(24, 48), 0, 3, 11, 3}, @@ -76,7 +76,7 @@ static const upb_msglayout_field google_protobuf_DescriptorProto__fields[10] = { {7, UPB_SIZE(12, 24), 2, 5, 11, 1}, {8, UPB_SIZE(36, 72), 0, 6, 11, 3}, {9, UPB_SIZE(40, 80), 0, 2, 11, 3}, - {10, UPB_SIZE(44, 88), 0, 0, 9, 3}, + {10, UPB_SIZE(44, 88), 0, 0, 12, 3}, }; const upb_msglayout google_protobuf_DescriptorProto_msginit = { @@ -131,16 +131,16 @@ static const upb_msglayout *const google_protobuf_FieldDescriptorProto_submsgs[1 }; static const upb_msglayout_field google_protobuf_FieldDescriptorProto__fields[11] = { - {1, UPB_SIZE(36, 40), 6, 0, 9, 1}, - {2, UPB_SIZE(44, 56), 7, 0, 9, 1}, + {1, UPB_SIZE(36, 40), 6, 0, 12, 1}, + {2, UPB_SIZE(44, 56), 7, 0, 12, 1}, {3, UPB_SIZE(24, 24), 3, 0, 5, 1}, {4, UPB_SIZE(8, 8), 1, 0, 14, 1}, {5, UPB_SIZE(16, 16), 2, 0, 14, 1}, - {6, UPB_SIZE(52, 72), 8, 0, 9, 1}, - {7, UPB_SIZE(60, 88), 9, 0, 9, 1}, + {6, UPB_SIZE(52, 72), 8, 0, 12, 1}, + {7, UPB_SIZE(60, 88), 9, 0, 12, 1}, {8, UPB_SIZE(76, 120), 11, 0, 11, 1}, {9, UPB_SIZE(28, 28), 4, 0, 5, 1}, - {10, UPB_SIZE(68, 104), 10, 0, 9, 1}, + {10, UPB_SIZE(68, 104), 10, 0, 12, 1}, {17, UPB_SIZE(32, 32), 5, 0, 8, 1}, }; @@ -155,7 +155,7 @@ static const upb_msglayout *const google_protobuf_OneofDescriptorProto_submsgs[1 }; static const upb_msglayout_field google_protobuf_OneofDescriptorProto__fields[2] = { - {1, UPB_SIZE(4, 8), 1, 0, 9, 1}, + {1, UPB_SIZE(4, 8), 1, 0, 12, 1}, {2, UPB_SIZE(12, 24), 2, 0, 11, 1}, }; @@ -172,11 +172,11 @@ static const upb_msglayout *const google_protobuf_EnumDescriptorProto_submsgs[3] }; static const upb_msglayout_field google_protobuf_EnumDescriptorProto__fields[5] = { - {1, UPB_SIZE(4, 8), 1, 0, 9, 1}, + {1, UPB_SIZE(4, 8), 1, 0, 12, 1}, {2, UPB_SIZE(16, 32), 0, 2, 11, 3}, {3, UPB_SIZE(12, 24), 2, 1, 11, 1}, {4, UPB_SIZE(20, 40), 0, 0, 11, 3}, - {5, UPB_SIZE(24, 48), 0, 0, 9, 3}, + {5, UPB_SIZE(24, 48), 0, 0, 12, 3}, }; const upb_msglayout google_protobuf_EnumDescriptorProto_msginit = { @@ -201,7 +201,7 @@ static const upb_msglayout *const google_protobuf_EnumValueDescriptorProto_subms }; static const upb_msglayout_field google_protobuf_EnumValueDescriptorProto__fields[3] = { - {1, UPB_SIZE(8, 8), 2, 0, 9, 1}, + {1, UPB_SIZE(8, 8), 2, 0, 12, 1}, {2, UPB_SIZE(4, 4), 1, 0, 5, 1}, {3, UPB_SIZE(16, 24), 3, 0, 11, 1}, }; @@ -218,7 +218,7 @@ static const upb_msglayout *const google_protobuf_ServiceDescriptorProto_submsgs }; static const upb_msglayout_field google_protobuf_ServiceDescriptorProto__fields[3] = { - {1, UPB_SIZE(4, 8), 1, 0, 9, 1}, + {1, UPB_SIZE(4, 8), 1, 0, 12, 1}, {2, UPB_SIZE(16, 32), 0, 0, 11, 3}, {3, UPB_SIZE(12, 24), 2, 1, 11, 1}, }; @@ -234,9 +234,9 @@ static const upb_msglayout *const google_protobuf_MethodDescriptorProto_submsgs[ }; static const upb_msglayout_field google_protobuf_MethodDescriptorProto__fields[6] = { - {1, UPB_SIZE(4, 8), 3, 0, 9, 1}, - {2, UPB_SIZE(12, 24), 4, 0, 9, 1}, - {3, UPB_SIZE(20, 40), 5, 0, 9, 1}, + {1, UPB_SIZE(4, 8), 3, 0, 12, 1}, + {2, UPB_SIZE(12, 24), 4, 0, 12, 1}, + {3, UPB_SIZE(20, 40), 5, 0, 12, 1}, {4, UPB_SIZE(28, 56), 6, 0, 11, 1}, {5, UPB_SIZE(1, 1), 1, 0, 8, 1}, {6, UPB_SIZE(2, 2), 2, 0, 8, 1}, @@ -253,11 +253,11 @@ static const upb_msglayout *const google_protobuf_FileOptions_submsgs[1] = { }; static const upb_msglayout_field google_protobuf_FileOptions__fields[21] = { - {1, UPB_SIZE(28, 32), 11, 0, 9, 1}, - {8, UPB_SIZE(36, 48), 12, 0, 9, 1}, + {1, UPB_SIZE(28, 32), 11, 0, 12, 1}, + {8, UPB_SIZE(36, 48), 12, 0, 12, 1}, {9, UPB_SIZE(8, 8), 1, 0, 14, 1}, {10, UPB_SIZE(16, 16), 2, 0, 8, 1}, - {11, UPB_SIZE(44, 64), 13, 0, 9, 1}, + {11, UPB_SIZE(44, 64), 13, 0, 12, 1}, {16, UPB_SIZE(17, 17), 3, 0, 8, 1}, {17, UPB_SIZE(18, 18), 4, 0, 8, 1}, {18, UPB_SIZE(19, 19), 5, 0, 8, 1}, @@ -265,14 +265,14 @@ static const upb_msglayout_field google_protobuf_FileOptions__fields[21] = { {23, UPB_SIZE(21, 21), 7, 0, 8, 1}, {27, UPB_SIZE(22, 22), 8, 0, 8, 1}, {31, UPB_SIZE(23, 23), 9, 0, 8, 1}, - {36, UPB_SIZE(52, 80), 14, 0, 9, 1}, - {37, UPB_SIZE(60, 96), 15, 0, 9, 1}, - {39, UPB_SIZE(68, 112), 16, 0, 9, 1}, - {40, UPB_SIZE(76, 128), 17, 0, 9, 1}, - {41, UPB_SIZE(84, 144), 18, 0, 9, 1}, + {36, UPB_SIZE(52, 80), 14, 0, 12, 1}, + {37, UPB_SIZE(60, 96), 15, 0, 12, 1}, + {39, UPB_SIZE(68, 112), 16, 0, 12, 1}, + {40, UPB_SIZE(76, 128), 17, 0, 12, 1}, + {41, UPB_SIZE(84, 144), 18, 0, 12, 1}, {42, UPB_SIZE(24, 24), 10, 0, 8, 1}, - {44, UPB_SIZE(92, 160), 19, 0, 9, 1}, - {45, UPB_SIZE(100, 176), 20, 0, 9, 1}, + {44, UPB_SIZE(92, 160), 19, 0, 12, 1}, + {45, UPB_SIZE(100, 176), 20, 0, 12, 1}, {999, UPB_SIZE(108, 192), 0, 0, 11, 3}, }; @@ -402,12 +402,12 @@ static const upb_msglayout *const google_protobuf_UninterpretedOption_submsgs[1] static const upb_msglayout_field google_protobuf_UninterpretedOption__fields[7] = { {2, UPB_SIZE(56, 80), 0, 0, 11, 3}, - {3, UPB_SIZE(32, 32), 4, 0, 9, 1}, + {3, UPB_SIZE(32, 32), 4, 0, 12, 1}, {4, UPB_SIZE(8, 8), 1, 0, 4, 1}, {5, UPB_SIZE(16, 16), 2, 0, 3, 1}, {6, UPB_SIZE(24, 24), 3, 0, 1, 1}, {7, UPB_SIZE(40, 48), 5, 0, 12, 1}, - {8, UPB_SIZE(48, 64), 6, 0, 9, 1}, + {8, UPB_SIZE(48, 64), 6, 0, 12, 1}, }; const upb_msglayout google_protobuf_UninterpretedOption_msginit = { @@ -417,7 +417,7 @@ const upb_msglayout google_protobuf_UninterpretedOption_msginit = { }; static const upb_msglayout_field google_protobuf_UninterpretedOption_NamePart__fields[2] = { - {1, UPB_SIZE(4, 8), 2, 0, 9, 2}, + {1, UPB_SIZE(4, 8), 2, 0, 12, 2}, {2, UPB_SIZE(1, 1), 1, 0, 8, 2}, }; @@ -444,9 +444,9 @@ const upb_msglayout google_protobuf_SourceCodeInfo_msginit = { static const upb_msglayout_field google_protobuf_SourceCodeInfo_Location__fields[5] = { {1, UPB_SIZE(20, 40), 0, 0, 5, _UPB_LABEL_PACKED}, {2, UPB_SIZE(24, 48), 0, 0, 5, _UPB_LABEL_PACKED}, - {3, UPB_SIZE(4, 8), 1, 0, 9, 1}, - {4, UPB_SIZE(12, 24), 2, 0, 9, 1}, - {6, UPB_SIZE(28, 56), 0, 0, 9, 3}, + {3, UPB_SIZE(4, 8), 1, 0, 12, 1}, + {4, UPB_SIZE(12, 24), 2, 0, 12, 1}, + {6, UPB_SIZE(28, 56), 0, 0, 12, 3}, }; const upb_msglayout google_protobuf_SourceCodeInfo_Location_msginit = { @@ -471,7 +471,7 @@ const upb_msglayout google_protobuf_GeneratedCodeInfo_msginit = { static const upb_msglayout_field google_protobuf_GeneratedCodeInfo_Annotation__fields[4] = { {1, UPB_SIZE(20, 32), 0, 0, 5, _UPB_LABEL_PACKED}, - {2, UPB_SIZE(12, 16), 3, 0, 9, 1}, + {2, UPB_SIZE(12, 16), 3, 0, 12, 1}, {3, UPB_SIZE(4, 4), 1, 0, 5, 1}, {4, UPB_SIZE(8, 8), 2, 0, 5, 1}, }; diff --git a/upb/decode.c b/upb/decode.c index d1eec474df..042dbadfc9 100644 --- a/upb/decode.c +++ b/upb/decode.c @@ -62,11 +62,13 @@ static const unsigned fixed64_ok = (1 << UPB_DTYPE_DOUBLE) | (1 << UPB_DTYPE_SFIXED64); /* Op: an action to be performed for a wire-type/field-type combination. */ -#define OP_SCALAR_LG2(n) (n) -#define OP_FIXPCK_LG2(n) (n + 4) -#define OP_VARPCK_LG2(n) (n + 8) +#define OP_SCALAR_LG2(n) (n) /* n in [0, 2, 3] => op in [0, 2, 3] */ #define OP_STRING 4 -#define OP_SUBMSG 5 +#define OP_BYTES 5 +#define OP_SUBMSG 6 +/* Ops above are scalar-only. Repeated fields can use any op. */ +#define OP_FIXPCK_LG2(n) (n + 5) /* n in [2, 3] => op in [7, 8] */ +#define OP_VARPCK_LG2(n) (n + 9) /* n in [0, 2, 3] => op in [9, 11, 12] */ static const int8_t varint_ops[19] = { -1, /* field not found */ @@ -104,7 +106,7 @@ static const int8_t delim_ops[37] = { OP_STRING, /* STRING */ -1, /* GROUP */ OP_SUBMSG, /* MESSAGE */ - OP_STRING, /* BYTES */ + OP_BYTES, /* BYTES */ -1, /* UINT32 */ -1, /* ENUM */ -1, /* SFIXED32 */ @@ -123,7 +125,7 @@ static const int8_t delim_ops[37] = { OP_STRING, /* REPEATED STRING */ OP_SUBMSG, /* REPEATED GROUP */ OP_SUBMSG, /* REPEATED MESSAGE */ - OP_STRING, /* REPEATED BYTES */ + OP_BYTES, /* REPEATED BYTES */ OP_VARPCK_LG2(2), /* REPEATED UINT32 */ OP_VARPCK_LG2(2), /* REPEATED ENUM */ OP_FIXPCK_LG2(2), /* REPEATED SFIXED32 */ @@ -155,6 +157,40 @@ static const char *decode_msg(upb_decstate *d, const char *ptr, upb_msg *msg, UPB_NORETURN static void decode_err(upb_decstate *d) { longjmp(d->err, 1); } +bool decode_verifyutf8(const char *buf, int len) { + static const uint8_t utf8_offset[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, + }; + + int i, j; + uint8_t offset; + + i = 0; + while (i < len) { + offset = utf8_offset[(uint8_t)buf[i]]; + if (offset == 0 || i + offset > len) { + return false; + } + for (j = i + 1; j < i + offset; j++) { + if ((buf[j] & 0xc0) != 0x80) { + return false; + } + } + i += offset; + } + return i == len; +} + static bool decode_reserve(upb_decstate *d, upb_array *arr, size_t elem) { bool need_realloc = arr->size - arr->len < elem; if (need_realloc && !_upb_array_realloc(arr, arr->len + elem, d->arena)) { @@ -300,7 +336,10 @@ static const char *decode_toarray(upb_decstate *d, const char *ptr, memcpy(mem, &val, 1 << op); return ptr; case OP_STRING: - /* Append string. */ + decode_verifyutf8(val.str_val.data, val.str_val.size); + /* Fallthrough. */ + case OP_BYTES: + /* Append bytes. */ mem = UPB_PTR_AT(_upb_array_ptr(arr), arr->len * sizeof(upb_strview), void); arr->len++; @@ -434,6 +473,9 @@ static const char *decode_tomsg(upb_decstate *d, const char *ptr, upb_msg *msg, break; } case OP_STRING: + decode_verifyutf8(val.str_val.data, val.str_val.size); + /* Fallthrough. */ + case OP_BYTES: memcpy(mem, &val, sizeof(upb_strview)); break; case OP_SCALAR_LG2(3): diff --git a/upb/def.c b/upb/def.c index 0b5c9c5473..a9a32a98b8 100644 --- a/upb/def.c +++ b/upb/def.c @@ -1013,6 +1013,13 @@ static bool make_layout(const upb_symtab *symtab, const upb_msgdef *m) { field->descriptortype = upb_fielddef_descriptortype(f); field->label = upb_fielddef_label(f); + if (field->descriptortype == UPB_DTYPE_STRING && + f->file->syntax == UPB_SYNTAX_PROTO2) { + // See TableDescriptorType() in upbc/generator.cc for details and + // rationale. + field->descriptortype = UPB_DTYPE_BYTES; + } + if (upb_fielddef_ismap(f)) { field->label = _UPB_LABEL_MAP; } else if (upb_fielddef_packed(f)) { diff --git a/upbc/generator.cc b/upbc/generator.cc index 1d0290c634..72df0244aa 100644 --- a/upbc/generator.cc +++ b/upbc/generator.cc @@ -687,6 +687,22 @@ void WriteHeader(const protobuf::FileDescriptor* file, Output& output) { ToPreproc(file->name())); } +int TableDescriptorType(const protobuf::FieldDescriptor* field) { + if (field->file()->syntax() == protobuf::FileDescriptor::SYNTAX_PROTO2 && + field->type() == protobuf::FieldDescriptor::TYPE_STRING) { + // From the perspective of the binary encoder/decoder, proto2 string fields + // are identical to bytes fields. Only in proto3 do we check UTF-8 for + // string fields at parse time. + // + // If we ever use these tables for JSON encoding/decoding (for example by + // embedding field names on the side) we will have to revisit this, because + // string vs. bytes behavior is not affected by proto2 vs proto3. + return protobuf::FieldDescriptor::TYPE_BYTES; + } else { + return field->type(); + } +} + void WriteSource(const protobuf::FileDescriptor* file, Output& output) { EmitFileWarning(file, output); @@ -781,7 +797,7 @@ void WriteSource(const protobuf::FileDescriptor* file, Output& output) { GetSizeInit(layout.GetFieldOffset(field)), presence, submsg_index, - field->type(), + TableDescriptorType(field), label); } output("};\n\n"); From 2c666bc8f665e028884ba8e4118c1460785533bd Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Mon, 6 Jul 2020 12:02:28 -0700 Subject: [PATCH 2/3] Use C-style comment instead of C++. --- upb/def.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/upb/def.c b/upb/def.c index a9a32a98b8..05dea02242 100644 --- a/upb/def.c +++ b/upb/def.c @@ -1015,8 +1015,8 @@ static bool make_layout(const upb_symtab *symtab, const upb_msgdef *m) { if (field->descriptortype == UPB_DTYPE_STRING && f->file->syntax == UPB_SYNTAX_PROTO2) { - // See TableDescriptorType() in upbc/generator.cc for details and - // rationale. + /* See TableDescriptorType() in upbc/generator.cc for details and + * rationale. */ field->descriptortype = UPB_DTYPE_BYTES; } From 8e26a33bcb59b38dad2d4bac6a2ce893fdaf2bdf Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Mon, 6 Jul 2020 12:39:24 -0700 Subject: [PATCH 3/3] Added a test for UTF-8 parse checking and added missing error reporting. --- BUILD | 7 +++++++ tests/bindings/lua/test_upb.lua | 17 +++++++++++++++++ upb/decode.c | 12 ++++++------ 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/BUILD b/BUILD index cfc734a9b4..96229584a4 100644 --- a/BUILD +++ b/BUILD @@ -765,6 +765,7 @@ cc_test( "upb/bindings/lua/upb.lua", ":descriptor_proto_lua", ":test_messages_proto3_proto_lua", + ":test_messages_proto2_proto_lua", ":test_proto_lua", "@com_google_protobuf//:conformance_proto", "@com_google_protobuf//:descriptor_proto", @@ -807,6 +808,12 @@ lua_proto_library( deps = ["@com_google_protobuf//:test_messages_proto3_proto"], ) +lua_proto_library( + name = "test_messages_proto2_proto_lua", + testonly = 1, + deps = ["@com_google_protobuf//:test_messages_proto2_proto"], +) + # Test the CMake build ######################################################### filegroup( diff --git a/tests/bindings/lua/test_upb.lua b/tests/bindings/lua/test_upb.lua index d1c32b1674..4586b34ba7 100644 --- a/tests/bindings/lua/test_upb.lua +++ b/tests/bindings/lua/test_upb.lua @@ -3,6 +3,7 @@ local upb = require "lupb" local lunit = require "lunit" local upb_test = require "tests.test_pb" local test_messages_proto3 = require "google.protobuf.test_messages_proto3_pb" +local test_messages_proto2 = require "google.protobuf.test_messages_proto2_pb" local descriptor = require "google.protobuf.descriptor_pb" if _VERSION >= 'Lua 5.2' then @@ -69,6 +70,22 @@ function test_msg_map() assert_equal(12, msg2.map_int32_int32[6]) end +function test_utf8() + local proto2_msg = test_messages_proto2.TestAllTypesProto2() + proto2_msg.optional_string = "\xff" + local serialized = upb.encode(proto2_msg) + + -- Decoding invalid UTF-8 succeeds in proto2. + upb.decode(test_messages_proto2.TestAllTypesProto2, serialized) + + -- Decoding invalid UTF-8 fails in proto2. + assert_error_match("Error decoding protobuf", function() + upb.decode(test_messages_proto3.TestAllTypesProto3, serialized) + end) + + -- TOOD(haberman): should proto3 accessors also check UTF-8 at set time? +end + function test_string_double_map() msg = upb_test.MapTest() msg.map_string_double["one"] = 1.0 diff --git a/upb/decode.c b/upb/decode.c index 042dbadfc9..fbb9f2d2ff 100644 --- a/upb/decode.c +++ b/upb/decode.c @@ -157,7 +157,7 @@ static const char *decode_msg(upb_decstate *d, const char *ptr, upb_msg *msg, UPB_NORETURN static void decode_err(upb_decstate *d) { longjmp(d->err, 1); } -bool decode_verifyutf8(const char *buf, int len) { +void decode_verifyutf8(upb_decstate *d, const char *buf, int len) { static const uint8_t utf8_offset[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -179,16 +179,16 @@ bool decode_verifyutf8(const char *buf, int len) { while (i < len) { offset = utf8_offset[(uint8_t)buf[i]]; if (offset == 0 || i + offset > len) { - return false; + decode_err(d); } for (j = i + 1; j < i + offset; j++) { if ((buf[j] & 0xc0) != 0x80) { - return false; + decode_err(d); } } i += offset; } - return i == len; + if (i != len) decode_err(d); } static bool decode_reserve(upb_decstate *d, upb_array *arr, size_t elem) { @@ -336,7 +336,7 @@ static const char *decode_toarray(upb_decstate *d, const char *ptr, memcpy(mem, &val, 1 << op); return ptr; case OP_STRING: - decode_verifyutf8(val.str_val.data, val.str_val.size); + decode_verifyutf8(d, val.str_val.data, val.str_val.size); /* Fallthrough. */ case OP_BYTES: /* Append bytes. */ @@ -473,7 +473,7 @@ static const char *decode_tomsg(upb_decstate *d, const char *ptr, upb_msg *msg, break; } case OP_STRING: - decode_verifyutf8(val.str_val.data, val.str_val.size); + decode_verifyutf8(d, val.str_val.data, val.str_val.size); /* Fallthrough. */ case OP_BYTES: memcpy(mem, &val, sizeof(upb_strview));