Added UTF-8 validation for proto3 string fields.

pull/13171/head
Joshua Haberman 4 years ago
parent e8f9eac68c
commit 154f2c25f4
  1. 2
      benchmarks/compare.py
  2. 16
      tests/bindings/lua/test_upb.lua
  3. 23
      tests/test_generated_code.c
  4. 26
      upb/decode.c
  5. 23
      upb/decode.int.h
  6. 93
      upb/decode_fast.c
  7. 17
      upb/decode_fast.h
  8. 8
      upbc/generator.cc

@ -53,7 +53,7 @@ def Benchmark(outbase, bench_cpu=True, runs=12):
baseline = "master"
bench_cpu = False
bench_cpu = True
if len(sys.argv) > 1:
baseline = sys.argv[1]

@ -356,6 +356,22 @@ local numeric_types = {
},
}
function test_utf8()
local invalid_utf8 = "\xff"
local proto2_msg = test_messages_proto2.TestAllTypesProto2{
optional_string = invalid_utf8,
}
-- As proto2, invalid UTF-8 parses and serializes fine.
local serialized = upb.encode(proto2_msg)
local proto2_msg2 = upb.decode(test_messages_proto2.TestAllTypesProto2, serialized)
-- Decoding as proto3 fails.
assert_error(function()
upb.decode(test_messages_proto3.TestAllTypesProto3, serialized)
end)
end
function test_msg_primitives()
local msg = test_messages_proto3.TestAllTypesProto3{
optional_int32 = 10,

@ -68,6 +68,28 @@ static void test_scalars(void) {
upb_arena_free(arena);
}
static void test_utf8(void) {
const char invalid_utf8[] = "\xff";
const upb_strview invalid_utf8_view = upb_strview_make(invalid_utf8, 1);
upb_arena *arena = upb_arena_new();
upb_strview serialized;
protobuf_test_messages_proto3_TestAllTypesProto3 *msg =
protobuf_test_messages_proto3_TestAllTypesProto3_new(arena);
protobuf_test_messages_proto3_TestAllTypesProto3 *msg2;
protobuf_test_messages_proto3_TestAllTypesProto3_set_optional_string(
msg, invalid_utf8_view);
serialized.data = protobuf_test_messages_proto3_TestAllTypesProto3_serialize(
msg, arena, &serialized.size);
msg2 = protobuf_test_messages_proto3_TestAllTypesProto3_parse(
serialized.data, serialized.size, arena);
ASSERT(msg2 == NULL);
upb_arena_free(arena);
}
static void check_string_map_empty(
protobuf_test_messages_proto3_TestAllTypesProto3 *msg) {
size_t iter = UPB_MAP_BEGIN;
@ -390,6 +412,7 @@ void test_status_truncation(void) {
int run_tests(int argc, char *argv[]) {
test_scalars();
test_utf8();
test_string_map();
test_string_double_map();
test_int32_map();

@ -147,7 +147,6 @@ typedef union {
static const char *decode_msg(upb_decstate *d, const char *ptr, upb_msg *msg,
const upb_msglayout *layout);
#include <stdlib.h>
UPB_NORETURN static void decode_err(upb_decstate *d) { longjmp(d->err, 1); }
@ -156,8 +155,7 @@ const char *fastdecode_err(upb_decstate *d) {
return NULL;
}
void decode_verifyutf8(upb_decstate *d, const char *buf, int len) {
static const uint8_t utf8_offset[] = {
const uint8_t upb_utf8_offsets[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@ -171,23 +169,8 @@ void decode_verifyutf8(upb_decstate *d, const char *buf, int len) {
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0,
};
int i, j;
uint8_t offset;
i = 0;
while (i < len) {
offset = utf8_offset[(uint8_t)buf[i]];
if (offset == 0 || i + offset > len) {
decode_err(d);
}
for (j = i + 1; j < i + offset; j++) {
if ((buf[j] & 0xc0) != 0x80) {
decode_err(d);
}
}
i += offset;
}
if (i != len) decode_err(d);
static void decode_verifyutf8(upb_decstate *d, const char *buf, int len) {
if (!decode_verifyutf8_inl(buf, len)) decode_err(d);
}
static bool decode_reserve(upb_decstate *d, upb_array *arr, size_t elem) {
@ -316,6 +299,7 @@ static const char *decode_readstr(upb_decstate *d, const char *ptr, int size,
return ptr + size;
}
UPB_FORCEINLINE
static const char *decode_tosubmsg(upb_decstate *d, const char *ptr,
upb_msg *submsg, const upb_msglayout *layout,
const upb_msglayout_field *field, int size) {
@ -331,6 +315,7 @@ static const char *decode_tosubmsg(upb_decstate *d, const char *ptr,
return ptr;
}
UPB_FORCEINLINE
static const char *decode_group(upb_decstate *d, const char *ptr,
upb_msg *submsg, const upb_msglayout *subl,
uint32_t number) {
@ -345,6 +330,7 @@ static const char *decode_group(upb_decstate *d, const char *ptr,
return ptr;
}
UPB_FORCEINLINE
static const char *decode_togroup(upb_decstate *d, const char *ptr,
upb_msg *submsg, const upb_msglayout *layout,
const upb_msglayout_field *field) {

@ -38,6 +38,29 @@ typedef struct upb_decstate {
* noreturn. */
const char *fastdecode_err(upb_decstate *d);
extern const uint8_t upb_utf8_offsets[];
UPB_INLINE
bool decode_verifyutf8_inl(const char *buf, int len) {
int i, j;
uint8_t offset;
i = 0;
while (i < len) {
offset = upb_utf8_offsets[(uint8_t)buf[i]];
if (offset == 0 || i + offset > len) {
return false;
}
for (j = i + 1; j < i + offset; j++) {
if ((buf[j] & 0xc0) != 0x80) {
return false;
}
}
i += offset;
}
return i == len;
}
/* x86-64 pointers always have the high 16 bits matching. So we can shift
* left 8 and right 8 without loss of information. */
UPB_INLINE intptr_t decode_totable(const upb_msglayout *tablep) {

@ -650,10 +650,21 @@ typedef const char *fastdecode_copystr_func(struct upb_decstate *d,
uint64_t hasbits, upb_strview *dst);
UPB_NOINLINE
static const char *fastdecode_verifyutf8(upb_decstate *d, const char *ptr,
upb_msg *msg, intptr_t table,
uint64_t hasbits, upb_strview *dst) {
if (!decode_verifyutf8_inl(dst->data, dst->size)) {
return fastdecode_err(d);
}
return fastdecode_dispatch(d, ptr, msg, table, hasbits);
}
UPB_FORCEINLINE
static const char *fastdecode_longstring(struct upb_decstate *d,
const char *ptr, upb_msg *msg,
intptr_t table, uint64_t hasbits,
upb_strview *dst) {
upb_strview *dst,
bool validate_utf8) {
int size = (uint8_t)ptr[0]; // Could plumb through hasbits.
ptr++;
if (size & 0x80) {
@ -678,8 +689,29 @@ static const char *fastdecode_longstring(struct upb_decstate *d,
dst->size = size;
}
if (validate_utf8) {
return fastdecode_verifyutf8(d, ptr + size, msg, table, hasbits, dst);
} else {
return fastdecode_dispatch(d, ptr + size, msg, table, hasbits);
}
}
UPB_NOINLINE
static const char *fastdecode_longstring_utf8(struct upb_decstate *d,
const char *ptr, upb_msg *msg,
intptr_t table, uint64_t hasbits,
upb_strview *dst) {
return fastdecode_longstring(d, ptr, msg, table, hasbits, dst, true);
}
UPB_NOINLINE
static const char *fastdecode_longstring_noutf8(struct upb_decstate *d,
const char *ptr, upb_msg *msg,
intptr_t table,
uint64_t hasbits,
upb_strview *dst) {
return fastdecode_longstring(d, ptr, msg, table, hasbits, dst, false);
}
UPB_FORCEINLINE
static void fastdecode_docopy(upb_decstate *d, const char *ptr, uint32_t size,
@ -693,7 +725,7 @@ static void fastdecode_docopy(upb_decstate *d, const char *ptr, uint32_t size,
UPB_FORCEINLINE
static const char *fastdecode_copystring(UPB_PARSE_PARAMS, int tagbytes,
upb_card card) {
upb_card card, bool validate_utf8) {
upb_strview *dst;
fastdecode_arr farr;
int64_t size;
@ -741,6 +773,9 @@ again:
ptr += size;
if (card == CARD_r) {
if (validate_utf8 && !decode_verifyutf8_inl(dst->data, dst->size)) {
return fastdecode_err(d);
}
fastdecode_nextret ret = fastdecode_nextrepeated(
d, dst, &ptr, &farr, data, tagbytes, sizeof(upb_strview));
switch (ret.next) {
@ -754,17 +789,25 @@ again:
}
}
if (card != CARD_r && validate_utf8) {
return fastdecode_verifyutf8(d, ptr, msg, table, hasbits, dst);
}
return fastdecode_dispatch(d, ptr, msg, table, hasbits);
longstr:
ptr--;
return fastdecode_longstring(d, ptr, msg, table, hasbits, dst);
if (validate_utf8) {
return fastdecode_longstring_utf8(d, ptr, msg, table, hasbits, dst);
} else {
return fastdecode_longstring_noutf8(d, ptr, msg, table, hasbits, dst);
}
}
UPB_FORCEINLINE
static const char *fastdecode_string(UPB_PARSE_PARAMS, int tagbytes,
upb_card card,
_upb_field_parser *copyfunc) {
upb_card card, _upb_field_parser *copyfunc,
bool validate_utf8) {
upb_strview *dst;
fastdecode_arr farr;
int64_t size;
@ -792,12 +835,19 @@ again:
if (UPB_UNLIKELY(fastdecode_boundscheck(ptr, size, d->end))) {
ptr--;
return fastdecode_longstring(d, ptr, msg, table, hasbits, dst);
if (validate_utf8) {
return fastdecode_longstring_utf8(d, ptr, msg, table, hasbits, dst);
} else {
return fastdecode_longstring_noutf8(d, ptr, msg, table, hasbits, dst);
}
}
ptr += size;
if (card == CARD_r) {
if (validate_utf8 && !decode_verifyutf8_inl(dst->data, dst->size)) {
return fastdecode_err(d);
}
fastdecode_nextret ret = fastdecode_nextrepeated(
d, dst, &ptr, &farr, data, tagbytes, sizeof(upb_strview));
switch (ret.next) {
@ -817,30 +867,45 @@ again:
}
}
if (card != CARD_r && validate_utf8) {
return fastdecode_verifyutf8(d, ptr, msg, table, hasbits, dst);
}
return fastdecode_dispatch(d, ptr, msg, table, hasbits);
}
/* Generate all combinations:
* {p,c} x {s,o,r} x {1bt,2bt} */
* {p,c} x {s,o,r} x {s, b} x {1bt,2bt} */
#define F(card, tagbytes) \
#define s_VALIDATE true
#define b_VALIDATE false
#define F(card, tagbytes, type) \
UPB_NOINLINE \
const char *upb_c##card##s_##tagbytes##bt(UPB_PARSE_PARAMS) { \
return fastdecode_copystring(UPB_PARSE_ARGS, tagbytes, CARD_##card); \
const char *upb_c##card##type##_##tagbytes##bt(UPB_PARSE_PARAMS) { \
return fastdecode_copystring(UPB_PARSE_ARGS, tagbytes, CARD_##card, \
type##_VALIDATE); \
} \
const char *upb_p##card##s_##tagbytes##bt(UPB_PARSE_PARAMS) { \
const char *upb_p##card##type##_##tagbytes##bt(UPB_PARSE_PARAMS) { \
return fastdecode_string(UPB_PARSE_ARGS, tagbytes, CARD_##card, \
&upb_c##card##s_##tagbytes##bt); \
&upb_c##card##type##_##tagbytes##bt, \
type##_VALIDATE); \
}
#define UTF8(card, tagbytes) \
F(card, tagbytes, s) \
F(card, tagbytes, b)
#define TAGBYTES(card) \
F(card, 1) \
F(card, 2)
UTF8(card, 1) \
UTF8(card, 2)
TAGBYTES(s)
TAGBYTES(o)
TAGBYTES(r)
#undef s_VALIDATE
#undef b_VALIDATE
#undef F
#undef TAGBYTES

@ -25,7 +25,8 @@
// - 'f4' for 4-byte fixed
// - 'f8' for 8-byte fixed
// - 'm' for sub-message
// - 's' for string/bytes
// - 's' for string (validate UTF-8)
// - 'b' for bytes
//
// In position 4 (tag length):
// - '1' for one-byte tags (field numbers 1-15)
@ -77,13 +78,17 @@ TAGBYTES(p)
/* string fields **************************************************************/
#define F(card, tagbytes) \
const char *upb_p##card##s_##tagbytes##bt(UPB_PARSE_PARAMS); \
const char *upb_c##card##s_##tagbytes##bt(UPB_PARSE_PARAMS);
#define F(card, tagbytes, type) \
const char *upb_p##card##type##_##tagbytes##bt(UPB_PARSE_PARAMS); \
const char *upb_c##card##type##_##tagbytes##bt(UPB_PARSE_PARAMS);
#define UTF8(card, tagbytes) \
F(card, tagbytes, s) \
F(card, tagbytes, b)
#define TAGBYTES(card) \
F(card, 1) \
F(card, 2)
UTF8(card, 1) \
UTF8(card, 2)
TAGBYTES(s)
TAGBYTES(o)

@ -798,9 +798,15 @@ bool TryFillTableEntry(const protobuf::Descriptor* message,
type = "z8";
break;
case protobuf::FieldDescriptor::TYPE_STRING:
case protobuf::FieldDescriptor::TYPE_BYTES:
if (field->file()->syntax() == protobuf::FileDescriptor::SYNTAX_PROTO3) {
// Only proto3 validates UTF-8.
type = "s";
break;
}
/* Fallthrough. */
case protobuf::FieldDescriptor::TYPE_BYTES:
type = "b";
break;
case protobuf::FieldDescriptor::TYPE_MESSAGE:
if (field->is_map()) {
return false; // Not supported yet (ever?).

Loading…
Cancel
Save