Updated parser semantics to better support streaming.

pull/13171/head
Joshua Haberman 16 years ago
parent 5fa6912da8
commit 4240e0e598
  1. 15
      src/upb_msg.c
  2. 100
      src/upb_parse.c
  3. 37
      src/upb_parse.h
  4. 6
      src/upb_string.h

@ -291,18 +291,23 @@ static upb_status_t value_cb(void *udata, uint8_t *buf, uint8_t *end,
return UPB_STATUS_OK;
}
static void str_cb(void *udata, struct upb_string *str, void *user_field_desc)
static void str_cb(void *udata, uint8_t *str,
size_t avail_len, size_t total_len,
void *udesc)
{
struct upb_msg_parse_state *s = udata;
struct upb_msg_field *f = user_field_desc;
struct upb_msg_field *f = udesc;
union upb_value_ptr p = get_value_ptr(s->top->data, f);
upb_msg_set(s->top->data, f);
if(avail_len != total_len) abort(); /* TODO: support streaming. */
if(s->byref) {
upb_msg_reuse_strref(p.str);
**p.str = *str;
(*p.str)->ptr = (char*)str;
(*p.str)->byte_len = avail_len;
} else {
upb_msg_reuse_str(p.str, str->byte_len);
upb_strcpy(*p.str, str);
upb_msg_reuse_str(p.str, avail_len);
memcpy((*p.str)->ptr, str, avail_len);
(*p.str)->byte_len = avail_len;
}
//google_protobuf_FieldDescriptorProto *fd = upb_msg_field_descriptor(f, s->top->m);
//upb_text_printfield(&s->p, *fd->name, f->type, upb_deref(p, fd->type), stdout);

@ -39,36 +39,12 @@ upb_status_t upb_get_v_uint64_t_full(uint8_t *restrict buf, uint8_t *end,
uint64_t *restrict val,
uint8_t **outbuf)
{
if(buf + 10 <= end) {
/* >2-byte varint, fast path. */
uint64_t cont = *(uint64_t*)(buf+2) | 0x7f7f7f7f7f7f7f7fULL;
int num_bytes = __builtin_ffsll(~cont) / 8;
uint32_t part0 = 0, part1 = 0, part2 = 0;
switch(num_bytes) {
default: return UPB_ERROR_UNTERMINATED_VARINT;
case 8: part2 |= (buf[9] & 0x7F) << 7;
case 7: part2 |= (buf[8] & 0x7F);
case 6: part1 |= (buf[7] & 0x7F) << 21;
case 5: part1 |= (buf[6] & 0x7F) << 14;
case 4: part1 |= (buf[5] & 0x7F) << 7;
case 3: part1 |= (buf[4] & 0x7F);
case 2: part0 |= (buf[3] & 0x7F) << 21;
case 1: part0 |= (buf[2] & 0x7F) << 14;
part0 |= (buf[1] & 0x7F) << 7;
part0 |= (buf[0] & 0x7F);
}
*val = (uint64_t)part0 | ((uint64_t)part1 << 28) | ((uint64_t)part2 << 56);
*outbuf = buf + num_bytes + 2;
} else {
/* >2-byte varint, slow path. */
uint8_t last = 0x80;
*val = 0;
for(int bitpos = 0; buf < (uint8_t*)end && (last & 0x80); buf++, bitpos += 7)
*val |= ((uint64_t)((last = *buf) & 0x7F)) << bitpos;
if(last & 0x80) return UPB_STATUS_NEED_MORE_DATA;
*outbuf = buf;
}
uint8_t last = 0x80;
*val = 0;
for(int bitpos = 0; buf < (uint8_t*)end && (last & 0x80); buf++, bitpos += 7)
*val |= ((uint64_t)((last = *buf) & 0x7F)) << bitpos;
if(last & 0x80) return UPB_STATUS_NEED_MORE_DATA;
*outbuf = buf;
return UPB_STATUS_OK;
}
@ -201,15 +177,17 @@ upb_status_t upb_parse(struct upb_parse_state *s, void *_buf, size_t len,
uint8_t *buf = _buf;
uint8_t *completed = buf;
uint8_t *const start = buf;
uint8_t *end = buf + len;
uint8_t *submsg_end = buf + (*s->top > 0 ? *s->top : 0);
upb_status_t status = UPB_STATUS_OK;
/* Make local copies so optimizer knows they won't change. */
upb_tag_cb tag_cb = s->tag_cb;
upb_str_cb str_cb = s->str_cb;
upb_value_cb value_cb = s->value_cb;
void *udata = s->udata;
uint8_t *end = buf + len;
uint8_t *submsg_end = buf + (*s->top > 0 ? *s->top : 0);
/* Main loop: parse a tag, then handle the value. */
while(buf < end) {
struct upb_tag tag;
UPB_CHECK(parse_tag(buf, end, &tag, &buf));
@ -218,53 +196,43 @@ upb_status_t upb_parse(struct upb_parse_state *s, void *_buf, size_t len,
completed = buf;
continue;
}
/* Don't handle START_GROUP here, so client can skip group via tag_cb. */
void *user_field_desc;
upb_field_type_t ft = tag_cb(udata, &tag, &user_field_desc);
void *udesc;
upb_field_type_t ft = tag_cb(udata, &tag, &udesc);
if(tag.wire_type == UPB_WIRE_TYPE_DELIMITED) {
int32_t delim_len;
UPB_CHECK(upb_get_INT32(buf, end, &delim_len, &buf));
uint8_t *delim_end = buf + delim_len;
if(delim_end > end) { /* String ends beyond the data we have. */
if(ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE) {
/* Streaming the body of a message is ok. */
} else {
/* String, bytes, and packed arrays must have all data present. */
status = UPB_STATUS_NEED_MORE_DATA;
goto done;
}
}
if(ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE) {
UPB_CHECK(push_stack_frame(s, start, delim_end - start, user_field_desc, &submsg_end));
} else { /* Delimited data for which we require (and have) all data. */
if(ft == 0) {
/* Do nothing -- client has elected to skip. */
} else if(upb_isstringtype(ft)) {
struct upb_string str = {.ptr = (char*)buf, .byte_len = delim_len};
str_cb(udata, &str, user_field_desc);
} else { /* Packed Array. */
while(buf < delim_end)
UPB_CHECK(value_cb(udata, buf, end, user_field_desc, &buf));
}
buf = delim_end;
UPB_CHECK(push_stack_frame(
s, start, delim_end - start, udesc, &submsg_end));
} else {
if(upb_isstringtype(ft))
str_cb(udata, buf, UPB_MIN(delim_end, end) - buf, delim_end - buf, udesc);
else
;/* Set a marker for packed arrays. */
buf = delim_end; /* Note that this could be greater than end. */
}
} else { /* Scalar (non-delimited) value. */
if(ft == 0) /* Client elected to skip. */
UPB_CHECK(skip_wire_value(buf, end, tag.wire_type, &buf));
else if(ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP)
UPB_CHECK(push_stack_frame(s, start, 0, user_field_desc, &submsg_end));
else
UPB_CHECK(value_cb(udata, buf, end, user_field_desc, &buf));
switch(ft) {
case 0: /* Client elected to skip. */
UPB_CHECK(skip_wire_value(buf, end, tag.wire_type, &buf));
break;
case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP:
UPB_CHECK(push_stack_frame(s, start, 0, udesc, &submsg_end));
break;
default:
UPB_CHECK(value_cb(udata, buf, end, udesc, &buf));
break;
}
}
while(buf == submsg_end) submsg_end = pop_stack_frame(s, start);
//while(buf < s->packed_end) /* packed arrays. */
// UPB_CHECK(value_cb(udata, buf, end, udesc, &buf));
completed = buf;
}
done:
*read = (char*)completed - (char*)start;
s->completed_offset += *read;
return status;

@ -72,8 +72,12 @@ typedef upb_field_type_t (*upb_tag_cb)(void *udata,
typedef upb_status_t (*upb_value_cb)(void *udata, uint8_t *buf, uint8_t *end,
void *user_field_desc, uint8_t **outbuf);
/* The callback that is called when a string is parsed. */
typedef void (*upb_str_cb)(void *udata, struct upb_string *str,
/* The callback that is called when a string is parsed. Note that the data
* for the string might not all be available -- we could be streaming, and
* the current buffer might end right in the middle of the string. So we
* pass both the available length and the total length. */
typedef void (*upb_str_cb)(void *udata, uint8_t *str,
size_t avail_len, size_t total_len,
void *user_field_desc);
/* Callbacks that are called when a submessage begins and ends, respectively.
@ -96,9 +100,16 @@ struct upb_parse_state {
};
/* Parses up to len bytes of protobuf data out of buf, calling cb as needed.
* The function returns how many bytes were consumed from buf. Data is parsed
* until no more data can be read from buf, or the callback sets *done=true,
* or an error occurred. Sets *read to the number of bytes consumed. */
* The function returns a status indicating the success of the operation. Data
* is parsed until no more data can be read from buf, or the callback returns an
* error like UPB_STATUS_USER_CANCELLED, or an error occurs.
*
* *read is set to the number of bytes consumed. Note that this can be greater
* than len in the case that a string was recognized that spans beyond the end
* of the currently provided data.
*
* The buffer passed to the next call to upb_parse must begin at buf + *read,
* even in the case that *read > len. */
upb_status_t upb_parse(struct upb_parse_state *s, void *buf, size_t len,
size_t *read);
@ -136,18 +147,12 @@ upb_status_t upb_get_v_uint64_t_full(uint8_t *buf, uint8_t *end, uint64_t *val,
INLINE upb_status_t upb_get_v_uint64_t(uint8_t *buf, uint8_t *end, uint64_t *val,
uint8_t **outbuf)
{
/* We inline these two common cases (short varints), if that fails we
* dispatch to the full (non-inlined) version. */
/* We inline this common case (1-byte varints), if that fails we dispatch to
* the full (non-inlined) version. */
if((*buf & 0x80) == 0) {
/* Single-byte varint -- very common case. */
*val = *buf & 0x7f;
*outbuf = buf + 1;
return UPB_STATUS_OK;
} else if(buf <= end && (*(buf+1) & 0x80) == 0) {
/* Two-byte varint. */
*val = (buf[0] & 0x7f) | ((buf[1] & 0x7f) << 7);
*outbuf = buf + 2;
return UPB_STATUS_OK;
} else {
return upb_get_v_uint64_t_full(buf, end, val, outbuf);
}
@ -174,7 +179,7 @@ INLINE upb_status_t upb_get_f_uint32_t(uint8_t *buf, uint8_t *end,
*val = *(uint32_t*)buf;
#else
#define SHL(val, bits) ((uint32_t)val << bits)
*val = SHL(b[0], 0) | SHL(b[1], 8) | SHL(b[2], 16) | SHL(b[3], 24);
*val = SHL(buf[0], 0) | SHL(buf[1], 8) | SHL(buf[2], 16) | SHL(buf[3], 24);
#undef SHL
#endif
*outbuf = uint32_end;
@ -191,8 +196,8 @@ INLINE upb_status_t upb_get_f_uint64_t(uint8_t *buf, uint8_t *end,
*val = *(uint64_t*)buf;
#else
#define SHL(val, bits) ((uint64_t)val << bits)
*val = SHL(b[0], 0) | SHL(b[1], 8) | SHL(b[2], 16) | SHL(b[3], 24) |
SHL(b[4], 32) | SHL(b[5], 40) | SHL(b[6], 48) | SHL(b[7], 56) |
*val = SHL(buf[0], 0) | SHL(buf[1], 8) | SHL(buf[2], 16) | SHL(buf[3], 24) |
SHL(buf[4], 32) | SHL(buf[5], 40) | SHL(buf[6], 48) | SHL(buf[7], 56);
#undef SHL
#endif
*outbuf = uint64_end;

@ -37,6 +37,8 @@ extern "C" {
#define INLINE static inline
#endif
#define UPB_MIN(x, y) ((x) < (y) ? (x) : (y))
struct upb_string {
/* We expect the data to be 8-bit clean (uint8_t), but char* is such an
* ingrained convention that we follow it. */
@ -44,15 +46,13 @@ struct upb_string {
uint32_t byte_len;
};
INLINE uint32_t min(uint32_t a, uint32_t b) { return a < b ? a : b; }
/* Returns true iff s1 and s2 have the same byte length and their contents
 * compare equal byte-for-byte.  The contents are only inspected when the
 * lengths already match. */
INLINE bool upb_streql(struct upb_string *s1, struct upb_string *s2) {
  if(s1->byte_len != s2->byte_len) return false;
  return memcmp(s1->ptr, s2->ptr, s1->byte_len) == 0;
}
INLINE int upb_strcmp(struct upb_string s1, struct upb_string s2) {
size_t common_length = min(s1.byte_len, s2.byte_len);
size_t common_length = UPB_MIN(s1.byte_len, s2.byte_len);
int common_diff = memcmp(s1.ptr, s2.ptr, common_length);
if(common_diff == 0) return s1.byte_len - s2.byte_len;
else return common_diff;

Loading…
Cancel
Save