Getting closer to a decoder that could actually compile and work.

pull/13171/head
Joshua Haberman 14 years ago
parent 58a70b55c6
commit fe659c8c93
  1. 7
      core/upb_stream.h
  2. 6
      core/upb_string.h
  3. 207
      stream/upb_decoder.c

@ -40,8 +40,11 @@ typedef enum {
UPB_CONTINUE, UPB_CONTINUE,
// Stop processing for now; check status for details. If no status was set, // Stop processing for now; check status for details. If no status was set,
// a generic error will be returned. If the error is resumable, processing // a generic error will be returned. If the error is resumable, it is not
// will resume by delivering this callback again. // (yet) defined where processing will resume -- waiting for real-world
// examples of resumable decoders and resume-requiring clients. upb_src
// implementations that are not capable of resuming will override the return
// status to be non-resumable if a resumable status was set by the handlers.
UPB_BREAK, UPB_BREAK,
// Skips to the end of the current submessage (or if we are at the top // Skips to the end of the current submessage (or if we are at the top

@ -134,6 +134,12 @@ INLINE upb_strlen_t upb_string_len(upb_string *str) { return str->len; }
INLINE const char *upb_string_getrobuf(upb_string *str) { return str->ptr; } INLINE const char *upb_string_getrobuf(upb_string *str) { return str->ptr; }
INLINE void upb_string_endread(upb_string *str) { (void)str; } INLINE void upb_string_endread(upb_string *str) { (void)str; }
// Convenience method for getting the end of the string. Calls
// upb_string_getrobuf() so inherits the caveats of calling that function.
INLINE const char *upb_string_getbufend(upb_string *str) {
return upb_string_getrobuf(str) + upb_string_len(str);
}
// Attempts to recycle the string "str" so it may be reused and have different // Attempts to recycle the string "str" so it may be reused and have different
// data written to it. After the function returns, "str" points to a writable // data written to it. After the function returns, "str" points to a writable
// string, which is either the original string if it had no other references // string, which is either the original string if it had no other references

@ -13,23 +13,24 @@
/* Pure Decoding **************************************************************/ /* Pure Decoding **************************************************************/
// The key fast-path varint-decoding routine. There are a lot of possibilities // The key fast-path varint-decoding routine. Here we can assume we have at
// for optimization/experimentation here. // least UPB_MAX_ENCODED_SIZE bytes available. There are a lot of
INLINE bool upb_decode_varint_fast(uint8_t **buf, uint8_t *end, uint64_t &val, // possibilities for optimization/experimentation here.
INLINE bool upb_decode_varint_fast(uint8_t **ptr, uint64_t &val,
upb_status *status) { upb_status *status) {
*high = 0; *high = 0;
uint32_t b; uint32_t b;
uint8_t *ptr = p->ptr; uint8_t *ptr = p->ptr;
b = *(*buf++); *low = (b & 0x7f) ; if(!(b & 0x80)) goto done; b = *(*ptr++); *low = (b & 0x7f) ; if(!(b & 0x80)) goto done;
b = *(*buf++); *low |= (b & 0x7f) << 7; if(!(b & 0x80)) goto done; b = *(*ptr++); *low |= (b & 0x7f) << 7; if(!(b & 0x80)) goto done;
b = *(*buf++); *low |= (b & 0x7f) << 14; if(!(b & 0x80)) goto done; b = *(*ptr++); *low |= (b & 0x7f) << 14; if(!(b & 0x80)) goto done;
b = *(*buf++); *low |= (b & 0x7f) << 21; if(!(b & 0x80)) goto done; b = *(*ptr++); *low |= (b & 0x7f) << 21; if(!(b & 0x80)) goto done;
b = *(*buf++); *low |= (b & 0x7f) << 28; b = *(*ptr++); *low |= (b & 0x7f) << 28;
*high = (b & 0x7f) >> 3; if(!(b & 0x80)) goto done; *high = (b & 0x7f) >> 3; if(!(b & 0x80)) goto done;
b = *(*buf++); *high |= (b & 0x7f) << 4; if(!(b & 0x80)) goto done; b = *(*ptr++); *high |= (b & 0x7f) << 4; if(!(b & 0x80)) goto done;
b = *(*buf++); *high |= (b & 0x7f) << 11; if(!(b & 0x80)) goto done; b = *(*ptr++); *high |= (b & 0x7f) << 11; if(!(b & 0x80)) goto done;
b = *(*buf++); *high |= (b & 0x7f) << 18; if(!(b & 0x80)) goto done; b = *(*ptr++); *high |= (b & 0x7f) << 18; if(!(b & 0x80)) goto done;
b = *(*buf++); *high |= (b & 0x7f) << 25; if(!(b & 0x80)) goto done; b = *(*ptr++); *high |= (b & 0x7f) << 25; if(!(b & 0x80)) goto done;
upb_seterr(status, UPB_ERROR, "Unterminated varint"); upb_seterr(status, UPB_ERROR, "Unterminated varint");
return false; return false;
@ -71,23 +72,51 @@ struct upb_decoder {
// Current input buffer. // Current input buffer.
upb_string *buf; upb_string *buf;
// Our current offset *within* buf.
upb_strlen_t buf_offset;
// The offset within the overall stream represented by the *beginning* of buf. // The offset within the overall stream represented by the *beginning* of buf.
upb_strlen_t buf_stream_offset; upb_strlen_t buf_stream_offset;
}; };
// Called only from the slow path, this function copies the next "len" bytes // Called only from the slow path, this function copies the next "len" bytes
// from the stream to "data", adjusting "buf" and "end" appropriately. // from the stream to "data", adjusting "buf" and "len" appropriately.
INLINE bool upb_getbuf(upb_decoder *d, void *data, size_t len, static bool upb_getbuf(upb_decoder *d, void *data, size_t bytes_wanted,
uint8_t **buf, uint8_t **end) { uint8_t **ptr, size_t *len) {
while (len > 0) { while (1) {
memcpy(data, *buf, *end-*buf); memcpy(data, *ptr, *len);
len -= (*end-*buf); bytes_wanted -= *len;
if (!upb_bytesrc_getstr(d->bytesrc, d->buf, d->status)) return false; *ptr += *len;
*buf = upb_string_getrobuf(d->buf); if (bytes_wanted == 0) return true;
*end = *buf + upb_string_len(d->buf);
// Did "len" indicate end-of-submessage or end-of-buffer?
size_t buf_offset = d->buf ? (*ptr - upb_string_getrobuf(d->buf)) : 0;
if (d->top->end_offset > 0 &&
d->top->end_offset == d->buf_stream_offset + buf_offset) {
// End-of-submessage.
if (bytes_wanted > 0) {
upb_seterr(d->status, UPB_ERROR, "Bad submessage end.")
return false;
}
if (upb_pop(d) != UPB_CONTINUE) return false;
} else {
// End-of-buffer.
if (d->buf) d->buf_stream_offset += upb_string_len(d->buf);
if (!upb_bytesrc_getstr(d->bytesrc, d->buf, d->status)) return false;
*ptr = upb_string_getrobuf(d->buf);
}
// Wait for end-of-submessage or end-of-buffer, whichever comes first.
size_t offset_in_buf = *ptr - upb_string_getrobuf(d->buf);
size_t buf_remaining = upb_string_getbufend(d->buf) - *ptr;
size_t submsg_remaining =
d->top->end_offset - d->buf_stream_offset - offset_in_buf;
if (d->top->end_offset == UPB_GROUP_END_OFFSET ||
buf_remaining > submsg_remaining) {
*len = buf_remaining;
} else {
// Check that non of our subtraction overflowed.
assert(d->top->end_offset > d->buf_stream_offset);
assert(d->top->end_offset - d->buf_stream_offset > offset_in_buf);
*len = submsg_remaining;
}
} }
} }
@ -112,21 +141,21 @@ static bool upb_decode_varint_slow(upb_decoder *d) {
} }
} }
INLINE bool upb_decode_tag(upb_decoder *d, const uint8_t **_buf, INLINE bool upb_decode_tag(upb_decoder *d, const uint8_t **_ptr,
const uint8_t **end, upb_tag *tag) { const uint8_t **len, upb_tag *tag) {
const uint8_t *buf = *_buf, *end = *_end; const uint8_t *ptr = *_ptr, *len = *_end;
uint32_t tag_int; uint32_t tag_int;
// Nearly all tag varints will be either 1 byte (1-16) or 2 bytes (17-2048). // Nearly all tag varints will be either 1 byte (1-16) or 2 bytes (17-2048).
if (end - buf < 2) goto slow; // unlikely. if (len - ptr < 2) goto slow; // unlikely.
tag_int = *buf & 0x7f; tag_int = *ptr & 0x7f;
if ((*(buf++) & 0x80) == 0) goto done; // predictable if fields are in order if ((*(ptr++) & 0x80) == 0) goto done; // predictable if fields are in order
tag_int |= (*buf & 0x7f) << 7; tag_int |= (*ptr & 0x7f) << 7;
if ((*(buf++) & 0x80) != 0) goto slow; // unlikely. if ((*(ptr++) & 0x80) != 0) goto slow; // unlikely.
slow: slow:
if (!upb_decode_varint_slow(d, _buf, _end)) return false; if (!upb_decode_varint_slow(d, _ptr, _end)) return false;
buf = *_buf; // Trick the next line into not overwriting us. ptr = *_ptr; // Trick the next line into not overwriting us.
done: done:
*_buf = buf; *_ptr = ptr;
tag->wire_type = (upb_wire_type_t)(tag_int & 0x07); tag->wire_type = (upb_wire_type_t)(tag_int & 0x07);
tag->field_number = tag_int >> 3; tag->field_number = tag_int >> 3;
return true; return true;
@ -134,22 +163,22 @@ done:
INLINE bool upb_decode_varint(upb_decoder *d, ptrs *p, INLINE bool upb_decode_varint(upb_decoder *d, ptrs *p,
uint32_t *low, uint32_t *high) { uint32_t *low, uint32_t *high) {
if (p->end - p->ptr >= UPB_MAX_VARINT_ENCODED_SIZE) if (p->len - p->ptr >= UPB_MAX_VARINT_ENCODED_SIZE)
return upb_decode_varint_fast(d); return upb_decode_varint_fast(d);
else else
return upb_decode_varint_slow(d); return upb_decode_varint_slow(d);
} }
INLINE bool upb_decode_fixed(upb_decoder *d, upb_wire_type_t wt, INLINE bool upb_decode_fixed(upb_decoder *d, upb_wire_type_t wt,
uint8_t **buf, uint8_t **end, upb_value *val) { uint8_t **ptr, uint8_t **len, upb_value *val) {
static const char table = {0, 8, 0, 0, 0, 4}; static const char table = {0, 8, 0, 0, 0, 4};
size_t bytes = table[wt]; size_t bytes = table[wt];
if (*end - *buf >= bytes) { if (*len - *ptr >= bytes) {
// Common (fast) case. // Common (fast) case.
memcpy(&val, *buf, bytes); memcpy(&val, *ptr, bytes);
*buf += bytes; *ptr += bytes;
} else { } else {
if (!upb_getbuf(d, &val, bytes, buf, end)) return false; if (!upb_getptr(d, &val, bytes, ptr, len)) return false;
} }
return true; return true;
} }
@ -159,12 +188,12 @@ INLINE bool upb_decode_fixed(upb_decoder *d, upb_wire_type_t wt,
INLINE bool upb_decode_string(upb_decoder *d, upb_value *val, upb_string **str) { INLINE bool upb_decode_string(upb_decoder *d, upb_value *val, upb_string **str) {
upb_string_recycle(str); upb_string_recycle(str);
upb_strlen_t len = upb_valu_getint32(*val); upb_strlen_t len = upb_valu_getint32(*val);
if (*end - *buf >= len) { if (*len - *ptr >= len) {
// Common (fast) case. // Common (fast) case.
upb_string_substr(*str, d->buf, *buf - upb_string_getrobuf(d->buf), len); upb_string_substr(*str, d->buf, *ptr - upb_string_getrobuf(d->buf), len);
*buf += len; *ptr += len;
} else { } else {
if (!upb_getbuf(d, upb_string_getrwbuf(*str, len), len, buf, end)) if (!upb_getbuf(d, upb_string_getrwbuf(*str, len), len, ptr, len))
return false; return false;
} }
return true; return true;
@ -173,19 +202,6 @@ INLINE bool upb_decode_string(upb_decoder *d, upb_value *val, upb_string **str)
/* The main decoding loop *****************************************************/ /* The main decoding loop *****************************************************/
static const void *get_msgend(upb_decoder *d)
{
if(d->top->end_offset > 0)
return upb_string_getrobuf(d->buf) + (d->top->end_offset - d->buf_stream_offset);
else
return (void*)UINTPTR_MAX; // group.
}
static bool isgroup(const void *submsg_end)
{
return submsg_end == (void*)UINTPTR_MAX;
}
extern upb_wire_type_t upb_expected_wire_types[]; extern upb_wire_type_t upb_expected_wire_types[];
// Returns true if wt is the correct on-the-wire type for ft. // Returns true if wt is the correct on-the-wire type for ft.
INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) { INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) {
@ -193,76 +209,78 @@ INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) {
return upb_types[ft].expected_wire_type == wt; return upb_types[ft].expected_wire_type == wt;
} }
static bool upb_push(upb_decoder *d, const uint8_t *start, static upb_flow_t upb_push(upb_decoder *d, upb_fielddef *f,
uint32_t submsg_len, upb_fielddef *f, upb_strlen_t submsg_len, upb_field_type_t type) {
upb_status *status)
{
d->top->field = f; d->top->field = f;
d->top++; d->top++;
if(d->top >= d->limit) { if(d->top >= d->limit) {
upb_seterr(status, UPB_ERROR, "Nesting too deep."); upb_seterr(status, UPB_ERROR, "Nesting too deep.");
return false; return UPB_ERROR;
} }
d->top->end_offset = d->completed_offset + submsg_len; d->top->end_offset = type == UPB_TYPE(GROUP) ?
UPB_GROUP_END_OFFSET : d->completed_offset + submsg_len;
d->top->msgdef = upb_downcast_msgdef(f->def); d->top->msgdef = upb_downcast_msgdef(f->def);
*submsg_end = get_msgend(d); return upb_dispatch_startsubmsg(&d->dispatcher, f);
if (!upb_dispatch_startsubmsg(&d->dispatcher, f)) return false;
return true;
} }
static bool upb_pop(upb_decoder *d, const uint8_t *start, upb_status *status) static upb_flow_t upb_pop(upb_decoder *d) {
{
d->top--; d->top--;
upb_dispatch_endsubmsg(&d->dispatcher); return upb_dispatch_endsubmsg(&d->dispatcher);
*submsg_end = get_msgend(d);
return true;
} }
void upb_decoder_run(upb_src *src, upb_status *status) { void upb_decoder_run(upb_src *src, upb_status *status) {
// buf is our current offset, moves from start to end. // We use stack variables for our frequently used vars so the compiler knows
const uint8_t *buf = (uint8_t*)upb_string_getrobuf(str) + d->buf_offset; // they can't be changed by external code (like when we dispatch a callback).
const uint8_t *end = (uint8_t*)upb_string_getrobuf(str) + upb_string_len(str);
const uint8_t *submsg_end = get_msgend(d, start); // Our current position in the data buffer.
upb_msgdef *msgdef = d->top->msgdef; uint8_t *ptr = NULL;
// Number of bytes available at ptr, until either end-of-buf or
// end-of-submessage (whichever is smaller).
size_t len = 0;
upb_string *str = NULL; upb_string *str = NULL;
upb_dispatch_startmsg(&d->dispatcher); // TODO: handle UPB_SKIPSUBMSG
#define CHECK_FLOW(expr) if ((expr) != UPB_CONTINUE) goto err
#define CHECK(expr) if (!expr) goto err;
CHECK_FLOW(upb_dispatch_startmsg(&d->dispatcher));
// Main loop: executed once per tag/field pair. // Main loop: executed once per tag/field pair.
while(1) { while(1) {
// Parse/handle tag. // Parse/handle tag.
upb_tag tag; upb_tag tag;
CHECK(upb_decode_tag(d, &buf, &end, &tag)); CHECK(upb_decode_tag(d, &ptr, &len, &tag));
// Decode wire data. Hopefully this branch will predict pretty well // Decode wire data. Hopefully this branch will predict pretty well
// since most types will read a varint here. // since most types will read a varint here.
upb_value val; upb_value val;
switch (tag.wire_type) { switch (tag.wire_type) {
case UPB_WIRE_TYPE_END_GROUP: case UPB_WIRE_TYPE_END_GROUP:
if(!isgroup(submsg_end)) { if(d->top->end_offset != UPB_GROUP_END_OFFSET)
upb_seterr(status, UPB_ERROR, "Unexpected END_GROUP tag."); upb_seterr(status, UPB_ERROR, "Unexpected END_GROUP tag.");
goto err; goto err;
} }
CHECK(upb_pop(d, start, status, &msgdef, &submsg_end)); CHECK_FLOW(upb_pop(d));
goto check_msgend; // We have no value to dispatch. continue; // We have no value to dispatch.
case UPB_WIRE_TYPE_VARINT: case UPB_WIRE_TYPE_VARINT:
case UPB_WIRE_TYPE_DELIMITED: case UPB_WIRE_TYPE_DELIMITED:
// For the delimited case we are parsing the length. // For the delimited case we are parsing the length.
CHECK(upb_decode_varint(d, &buf, &end, &val)); CHECK(upb_decode_varint(d, &ptr, &len, &val));
break; break;
case UPB_WIRE_TYPE_32BIT: case UPB_WIRE_TYPE_32BIT:
case UPB_WIRE_TYPE_64BIT: case UPB_WIRE_TYPE_64BIT:
CHECK(upb_decode_fixed(d, tag.wire_type, &buf, &end, &val)); CHECK(upb_decode_fixed(d, tag.wire_type, &ptr, &len, &val));
break; break;
} }
// Look up field by tag number. // Look up field by tag number.
upb_fielddef *f = upb_msg_itof(msgdef, tag.field_number); upb_fielddef *f = upb_msg_itof(d->top->msgdef, tag.field_number);
if (!f) { if (!f) {
if (tag.wire_type == UPB_WIRE_TYPE_DELIMITED) if (tag.wire_type == UPB_WIRE_TYPE_DELIMITED)
CHECK(upb_decode_string(d, &val, &str)); CHECK(upb_decode_string(d, &val, &str));
CHECK(upb_dispatch_unknownval(d, tag.field_number, val)); CHECK_FLOW(upb_dispatch_unknownval(d, tag.field_number, val));
} else if (!upb_check_type(tag.wire_type, f->type)) { } else if (!upb_check_type(tag.wire_type, f->type)) {
// TODO: put more details in this error msg. // TODO: put more details in this error msg.
upb_seterr(status, UPB_ERROR, "Field had incorrect type."); upb_seterr(status, UPB_ERROR, "Field had incorrect type.");
@ -280,8 +298,8 @@ void upb_decoder_run(upb_src *src, upb_status *status) {
switch (f->type) { switch (f->type) {
case UPB_TYPE(MESSAGE): case UPB_TYPE(MESSAGE):
case UPB_TYPE(GROUP): case UPB_TYPE(GROUP):
CHECK(upb_push(d, start, upb_value_getint32(val), f, status, &msgdef)); CHECK_FLOW(upb_push(d, start, upb_value_getint32(val), f, status, &msgdef));
goto check_msgend; // We have no value to dispatch. continue; // We have no value to dispatch.
case UPB_TYPE(STRING): case UPB_TYPE(STRING):
case UPB_TYPE(BYTES): case UPB_TYPE(BYTES):
CHECK(upb_decode_string(d, &val, &str)); CHECK(upb_decode_string(d, &val, &str));
@ -295,19 +313,10 @@ void upb_decoder_run(upb_src *src, upb_status *status) {
default: default:
break; // Other types need no further processing at this point. break; // Other types need no further processing at this point.
} }
CHECK(upb_dispatch_value(d->sink, f, val, status)); CHECK_FLOW(upb_dispatch_value(d->sink, f, val, status));
check_msgend:
while(buf >= submsg_end) {
if(buf > submsg_end) {
upb_seterr(status, UPB_ERROR, "Bad submessage end.")
goto err;
}
CHECK(upb_pop(d, start, status, &msgdef, &submsg_end));
}
} }
CHECK(upb_dispatch_endmsg(&d->dispatcher)); CHECK_FLOW(upb_dispatch_endmsg(&d->dispatcher));
return; return;
err: err:

Loading…
Cancel
Save