diff --git a/stream/upb_decoder.c b/stream/upb_decoder.c index c35212e035..b820b08194 100644 --- a/stream/upb_decoder.c +++ b/stream/upb_decoder.c @@ -9,587 +9,384 @@ #include #include #include +#include "upb_def.h" -#define UPB_GROUP_END_OFFSET UINT32_MAX - -// Returns true if the give wire type and field type combination is valid, -// taking into account both packed and non-packed encodings. -static bool upb_check_type(upb_wire_type_t wt, upb_fielddef *f) { - // TODO: need to take into account the label; only repeated fields are - // allowed to use packed encoding. - return (1 << wt) & upb_types[f->type].allowed_wire_types; -} - -// Performs zig-zag decoding, which is used by sint32 and sint64. -static int32_t upb_zzdec_32(uint32_t n) { return (n >> 1) ^ -(int32_t)(n & 1); } -static int64_t upb_zzdec_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); } - - -/* upb_decoder ****************************************************************/ - -// The decoder keeps a stack with one entry per level of recursion. -// upb_decoder_frame is one frame of that stack. -typedef struct { - upb_msgdef *msgdef; - upb_strlen_t end_offset; // For groups, UPB_GROUP_END_OFFSET. -} upb_decoder_frame; - -struct upb_decoder { - upb_src src; // upb_decoder is a upb_src. - - upb_msgdef *toplevel_msgdef; - upb_bytesrc *bytesrc; - - // The buffer of input data. NULL is equivalent to the empty string. - upb_string *buf; - - // Holds residual bytes when fewer than UPB_MAX_ENCODED_SIZE bytes remain. - uint8_t tmpbuf[UPB_MAX_ENCODED_SIZE]; - - // The number of bytes we have yet to consume from "buf" or tmpbuf. This is - // always >= 0 unless we were just reset or are eof. - int32_t buf_bytesleft; - - // The offset within "buf" from where we are currently reading. This can be - // <0 if we are reading some residual bytes from the previous buffer, which - // are stored in tmpbuf and combined with bytes from "buf". - int32_t buf_offset; - - // The overall stream offset of the beginning of "buf". - uint32_t buf_stream_offset; - - // Wire type of the key we just read. - upb_wire_type_t wire_type; - - // Delimited length of the string field we are reading. - upb_strlen_t delimited_len; - - upb_strlen_t packed_end_offset; - - // Fielddef for the key we just read. - upb_fielddef *field; - - // We keep a stack of messages we have recursed into. - upb_decoder_frame *top, *limit, stack[UPB_MAX_NESTING]; -}; +/* Functions to read wire values. *********************************************/ +// These functions are internal to the decode, but might be moved into an +// internal header file if we at some point in the future opt to do code +// generation, because the generated code would want to inline these functions. +// The same applies to the functions to read .proto values below. -/* upb_decoder buffering. *****************************************************/ +const uint8_t *upb_get_v_uint64_t_full(const uint8_t *buf, const uint8_t *end, + uint64_t *val, upb_status *status); -static upb_strlen_t upb_decoder_offset(upb_decoder *d) +// Gets a varint (wire type: UPB_WIRE_TYPE_VARINT). +INLINE const uint8_t *upb_get_v_uint64_t(const uint8_t *buf, const uint8_t *end, + uint64_t *val, upb_status *status) { - return d->buf_stream_offset + d->buf_offset; -} - -static bool upb_decoder_nextbuf(upb_decoder *d) -{ - assert(d->buf_bytesleft < UPB_MAX_ENCODED_SIZE); - - // Copy residual bytes to temporary buffer. - if(d->buf_bytesleft > 0) { - memcpy(d->tmpbuf, upb_string_getrobuf(d->buf) + d->buf_offset, - d->buf_bytesleft); - } - - // Recycle old buffer. - if(d->buf) { - d->buf_offset -= upb_string_len(d->buf); - d->buf_stream_offset += upb_string_len(d->buf); - } - d->buf = upb_string_tryrecycle(d->buf); - - // Pull next buffer. - if(upb_bytesrc_get(d->bytesrc, d->buf, UPB_MAX_ENCODED_SIZE)) { - d->buf_bytesleft += upb_string_len(d->buf); - return true; + // We inline this common case (1-byte varints), if that fails we dispatch to + // the full (non-inlined) version. + if((*buf & 0x80) == 0) { + *val = *buf & 0x7f; + return buf + 1; } else { - return false; + return upb_get_v_uint64_t_full(buf, end, val, status); } } -static const uint8_t *upb_decoder_getbuf_full(upb_decoder *d, uint32_t *bytes) +// Gets a varint -- called when we only need 32 bits of it. Note that a 32-bit +// varint is not a true wire type. +INLINE const uint8_t *upb_get_v_uint32_t(const uint8_t *buf, const uint8_t *end, + uint32_t *val, upb_status *status) { - if(d->buf_bytesleft < UPB_MAX_ENCODED_SIZE && !upb_bytesrc_eof(d->bytesrc)) - upb_decoder_nextbuf(d); - - if(d->buf_bytesleft < UPB_MAX_ENCODED_SIZE) { - if(upb_bytesrc_eof(d->bytesrc) && d->buf_bytesleft > 0) { - // We're working through the last few bytes of the buffer. - } else if(upb_bytesrc_eof(d->bytesrc)) { - // End of stream, no more bytes left. - assert(d->buf_bytesleft == 0); - d->src.eof = true; - return NULL; - } else { - // We are short of bytes even though the bytesrc isn't EOF; must be error. - upb_copyerr(&d->src.status, upb_bytesrc_status(d->bytesrc)); - return NULL; - } - } - - if(d->buf_offset >= 0) { - // Common case: the main buffer contains at least UPB_MAX_ENCODED_SIZE - // contiguous bytes, so we can read directly out of it. - *bytes = d->buf_bytesleft; - return (uint8_t*)upb_string_getrobuf(d->buf) + d->buf_offset; - } else { - // We need to accumulate UPB_MAX_ENCODED_SIZE bytes; len is how many we - // have so far. - upb_strlen_t len = -d->buf_offset; - if(d->buf) { - upb_strlen_t to_copy = - UPB_MIN(UPB_MAX_ENCODED_SIZE - len, upb_string_len(d->buf)); - memcpy(d->tmpbuf + len, upb_string_getrobuf(d->buf), to_copy); - len += to_copy; - } - // Pad the buffer out to UPB_MAX_ENCODED_SIZE. - memset(d->tmpbuf + len, 0x80, UPB_MAX_ENCODED_SIZE - len); - *bytes = len; - return d->tmpbuf; - } + uint64_t val64; + const uint8_t *ret = upb_get_v_uint64_t(buf, end, &val64, status); + *val = (uint32_t)val64; // Discard the high bits. + return ret; } -// Returns a pointer to a buffer of data that is at least UPB_MAX_ENCODED_SIZE -// bytes long. This buffer contains the next bytes in the stream (even if -// those bytes span multiple buffers). *bytes is set to the number of actual -// stream bytes that are available in the returned buffer. If -// *bytes < UPB_MAX_ENCODED_SIZE, the buffer is padded with 0x80 bytes. -// -// After the data has been read, upb_decoder_consume() should be called to -// indicate how many bytes were consumed. -static const uint8_t *upb_decoder_getbuf(upb_decoder *d, uint32_t *bytes) +// Gets a fixed-length 32-bit integer (wire type: UPB_WIRE_TYPE_32BIT). +INLINE const uint8_t *upb_get_f_uint32_t(const uint8_t *buf, const uint8_t *end, + uint32_t *val, upb_status *status) { - if(d->buf_bytesleft >= UPB_MAX_ENCODED_SIZE && d->buf_offset >= 0) { - // Common case: the main buffer contains at least UPB_MAX_ENCODED_SIZE - // contiguous bytes, so we can read directly out of it. - *bytes = d->buf_bytesleft; - return (uint8_t*)upb_string_getrobuf(d->buf) + d->buf_offset; - } else { - return upb_decoder_getbuf_full(d, bytes); + const uint8_t *uint32_end = buf + sizeof(uint32_t); + if(uint32_end > end) { + status->code = UPB_STATUS_NEED_MORE_DATA; + return end; } + memcpy(val, buf, sizeof(uint32_t)); + return uint32_end; } -static bool upb_decoder_consume(upb_decoder *d, uint32_t bytes) +// Gets a fixed-length 64-bit integer (wire type: UPB_WIRE_TYPE_64BIT). +INLINE const uint8_t *upb_get_f_uint64_t(const uint8_t *buf, const uint8_t *end, + uint64_t *val, upb_status *status) { - assert(bytes <= UPB_MAX_ENCODED_SIZE); - d->buf_offset += bytes; - d->buf_bytesleft -= bytes; - if(d->buf_offset < 0) { - // We still have residual bytes we have not consumed. - memmove(d->tmpbuf, d->tmpbuf + bytes, -d->buf_offset); - } - assert(d->buf_bytesleft >= 0); - - // Detect end-of-submessage. - if(upb_decoder_offset(d) >= d->top->end_offset) { - d->src.eof = true; + const uint8_t *uint64_end = buf + sizeof(uint64_t); + if(uint64_end > end) { + status->code = UPB_STATUS_NEED_MORE_DATA; + return end; } - - return true; + memcpy(val, buf, sizeof(uint64_t)); + return uint64_end; } -static bool upb_decoder_skipbytes(upb_decoder *d, int32_t bytes) +INLINE const uint8_t *upb_skip_v_uint64_t(const uint8_t *buf, + const uint8_t *end, + upb_status *status) { - d->buf_offset += bytes; - d->buf_bytesleft -= bytes; - while(d->buf_bytesleft < 0) { - if(!upb_decoder_nextbuf(d)) return false; - } - - // Detect end-of-submessage. - if(upb_decoder_offset(d) >= d->top->end_offset) { - d->src.eof = true; + const uint8_t *const maxend = buf + 10; + uint8_t last = 0x80; + for(; buf < (uint8_t*)end && (last & 0x80); buf++) + last = *buf; + + if(buf >= end && buf <= maxend && (last & 0x80)) { + status->code = UPB_STATUS_NEED_MORE_DATA; + buf = end; + } else if(buf > maxend) { + status->code = UPB_ERROR_UNTERMINATED_VARINT; + buf = end; } - - return true; + return buf; } - -/* Functions to read wire values. *********************************************/ - -// Parses remining bytes of a 64-bit varint that has already had its first byte -// parsed. -INLINE bool upb_decoder_readv64(upb_decoder *d, uint32_t *low, uint32_t *high) +INLINE const uint8_t *upb_skip_f_uint32_t(const uint8_t *buf, + const uint8_t *end, + upb_status *status) { - upb_strlen_t bytes_available; - const uint8_t *buf = upb_decoder_getbuf(d, &bytes_available); - const uint8_t *start = buf; - if(!buf) return false; - - *high = 0; - uint32_t b; - b = *(buf++); *low = (b & 0x7f) ; if(!(b & 0x80)) goto done; - b = *(buf++); *low |= (b & 0x7f) << 7; if(!(b & 0x80)) goto done; - b = *(buf++); *low |= (b & 0x7f) << 14; if(!(b & 0x80)) goto done; - b = *(buf++); *low |= (b & 0x7f) << 21; if(!(b & 0x80)) goto done; - b = *(buf++); *low |= (b & 0x7f) << 28; - *high = (b & 0x7f) >> 3; if(!(b & 0x80)) goto done; - b = *(buf++); *high |= (b & 0x7f) << 4; if(!(b & 0x80)) goto done; - b = *(buf++); *high |= (b & 0x7f) << 11; if(!(b & 0x80)) goto done; - b = *(buf++); *high |= (b & 0x7f) << 18; if(!(b & 0x80)) goto done; - b = *(buf++); *high |= (b & 0x7f) << 25; if(!(b & 0x80)) goto done; - - if(bytes_available >= 10) { - upb_seterr(&d->src.status, UPB_STATUS_ERROR, "Varint was unterminated " - "after 10 bytes, stream offset: %u", upb_decoder_offset(d)); - } else { - upb_seterr(&d->src.status, UPB_STATUS_ERROR, "Stream ended in the middle " - "of a varint, stream offset: %u", upb_decoder_offset(d)); + const uint8_t *uint32_end = buf + sizeof(uint32_t); + if(uint32_end > end) { + status->code = UPB_STATUS_NEED_MORE_DATA; + return end; } - return false; - -done: - return upb_decoder_consume(d, buf - start); + return uint32_end; } -// Gets a varint -- called when we only need 32 bits of it. Note that a 32-bit -// varint is not a true wire type. -static bool upb_decoder_readv32(upb_decoder *d, uint32_t *val) +INLINE const uint8_t *upb_skip_f_uint64_t(const uint8_t *buf, + const uint8_t *end, + upb_status *status) { - uint32_t high; - if(!upb_decoder_readv64(d, val, &high)) return false; - - // We expect the high bits to be zero, except that signed 32-bit values are - // first sign-extended to be wire-compatible with 64 bits, in which case we - // expect the high bits to be all one. - // - // We could perform a slightly more sophisticated check by having the caller - // indicate whether a signed or unsigned value is being read. We could check - // that the high bits are all zeros for unsigned, and properly sign-extended - // for signed. - if(high != 0 && ~high != 0) { - upb_seterr(&d->src.status, UPB_STATUS_ERROR, "Read a 32-bit varint, but " - "the high bits contained data we should not truncate: " - "%ux, stream offset: %u", high, upb_decoder_offset(d)); - return false; + const uint8_t *uint64_end = buf + sizeof(uint64_t); + if(uint64_end > end) { + status->code = UPB_STATUS_NEED_MORE_DATA; + return end; } - return true; + return uint64_end; } -// Gets a fixed-length 32-bit integer (wire type: UPB_WIRE_TYPE_32BIT). Caller -// promises that 4 bytes are available at buf. -static bool upb_decoder_readf32(upb_decoder *d, uint32_t *val) -{ - upb_strlen_t bytes_available; - const uint8_t *buf = upb_decoder_getbuf(d, &bytes_available); - if(!buf) return false; - if(bytes_available < 4) { - upb_seterr(&d->src.status, UPB_STATUS_ERROR, - "Stream ended in the middle of a 32-bit value"); - return false; - } - memcpy(val, buf, 4); - // TODO: byte swap if big-endian. - return upb_decoder_consume(d, 4); -} +/* Functions to read .proto values. *******************************************/ -// Gets a fixed-length 64-bit integer (wire type: UPB_WIRE_TYPE_64BIT). Caller -// promises that 8 bytes are available at buf. -static bool upb_decoder_readf64(upb_decoder *d, uint64_t *val) -{ - upb_strlen_t bytes_available; - const uint8_t *buf = upb_decoder_getbuf(d, &bytes_available); - if(!buf) return false; - if(bytes_available < 8) { - upb_seterr(&d->src.status, UPB_STATUS_ERROR, - "Stream ended in the middle of a 64-bit value"); - return false; - } - memcpy(val, buf, 8); - // TODO: byte swap if big-endian. - return upb_decoder_consume(d, 8); -} +// Performs zig-zag decoding, which is used by sint32 and sint64. +INLINE int32_t upb_zzdec_32(uint32_t n) { return (n >> 1) ^ -(int32_t)(n & 1); } +INLINE int64_t upb_zzdec_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); } -// Returns the length of a varint (wire type: UPB_WIRE_TYPE_VARINT), allowing -// it to be easily skipped. Caller promises that 10 bytes are available at -// "buf". The function will return a maximum of 11 bytes before quitting. -static uint8_t upb_decoder_skipv64(upb_decoder *d) +// Parses a tag, places the result in *tag. +INLINE const uint8_t *decode_tag(const uint8_t *buf, const uint8_t *end, + upb_tag *tag, upb_status *status) { - uint32_t bytes_available; - const uint8_t *buf = upb_decoder_getbuf(d, &bytes_available); - if(!buf) return false; - uint8_t i; - for(i = 0; i < 10 && buf[i] & 0x80; i++) - ; // empty loop body. - if(i > 10) { - upb_seterr(&d->src.status, UPB_STATUS_ERROR, "Unterminated varint."); - return false; - } - return upb_decoder_consume(d, i); + uint32_t tag_int; + const uint8_t *ret = upb_get_v_uint32_t(buf, end, &tag_int, status); + tag->wire_type = (upb_wire_type_t)(tag_int & 0x07); + tag->field_number = tag_int >> 3; + return ret; } +// The decoder keeps a stack with one entry per level of recursion. +// upb_decoder_frame is one frame of that stack. +typedef struct { + upb_msgdef *msgdef; + upb_fielddef *field; + size_t end_offset; // For groups, 0. +} upb_decoder_frame; -/* upb_src implementation for upb_decoder. ************************************/ +struct upb_decoder { + // Immutable state of the decoder. + upb_src src; + upb_dispatcher dispatcher; + upb_msgdef *toplevel_msgdef; + upb_decoder_frame stack[UPB_MAX_NESTING]; -bool upb_decoder_skipval(upb_decoder *d); + // Mutable state of the decoder. -upb_fielddef *upb_decoder_getdef(upb_decoder *d) -{ - if (d->src.eof) return NULL; - // Handles the packed field case. - if(d->field) { - return d->field; - } + // Where we will store any errors that occur. + upb_status *status; + + // Stack entries store the offset where the submsg ends (for groups, 0). + upb_decoder_frame *top, *limit; + + // Current input buffer. + upb_string *buf; + + // The offset within the overall stream represented by the *beginning* of buf. + upb_strlen_t buf_stream_offset; + + // Our current offset *within* buf. Will be negative if we are buffering + // from previous buffers in tmpbuf. + upb_strlen_t buf_offset; + + // Holds any bytes we have from previous buffers. The number of bytes we + // have encoded here is -buf_offset, if buf_offset<0, 0 otherwise. + uint8_t tmpbuf[UPB_MAX_ENCODED_SIZE]; +}; + +upb_flow_t upb_decode_varint(upb_decoder *d, ptrs *p, + uint32_t *low, uint32_t *high) { + if (p->end - p->ptr > UPB_MAX_ENCODED_SIZE) { + // Fast path; we know we have a complete varint in our existing buffer. + *high = 0; + uint32_t b; + uint8_t *ptr = p->ptr; + b = *(buf++); *low = (b & 0x7f) ; if(!(b & 0x80)) goto done; + b = *(buf++); *low |= (b & 0x7f) << 7; if(!(b & 0x80)) goto done; + b = *(buf++); *low |= (b & 0x7f) << 14; if(!(b & 0x80)) goto done; + b = *(buf++); *low |= (b & 0x7f) << 21; if(!(b & 0x80)) goto done; + b = *(buf++); *low |= (b & 0x7f) << 28; + *high = (b & 0x7f) >> 3; if(!(b & 0x80)) goto done; + b = *(buf++); *high |= (b & 0x7f) << 4; if(!(b & 0x80)) goto done; + b = *(buf++); *high |= (b & 0x7f) << 11; if(!(b & 0x80)) goto done; + b = *(buf++); *high |= (b & 0x7f) << 18; if(!(b & 0x80)) goto done; + b = *(buf++); *high |= (b & 0x7f) << 25; if(!(b & 0x80)) goto done; + + if(bytes_available >= 10) { + upb_seterr(&d->src.status, UPB_STATUS_ERROR, "Varint was unterminated " + "after 10 bytes, stream offset: %u", upb_decoder_offset(d)); + return false; + } - uint32_t key = 0; -again: - if(!upb_decoder_readv32(d, &key)) return NULL; - upb_wire_type_t wire_type = key & 0x7; - int32_t field_number = key >> 3; - - if(wire_type == UPB_WIRE_TYPE_DELIMITED) { - // For delimited wire values we parse the length now, since we need it in - // all cases. - if(!upb_decoder_readv32(d, &d->delimited_len)) return NULL; - } else if(wire_type == UPB_WIRE_TYPE_END_GROUP) { - if(d->top->end_offset == UPB_GROUP_END_OFFSET) { - d->src.eof = true; + done: + p->ptr = ptr; + } else { + // Slow path: we may have to combine one or more buffers to get a whole + // varint worth of data. + uint8_t buf[UPB_MAX_ENCODED_SIZE]; + uint8_t *p = buf, *end = buf + sizeof(buf); + for(ing bitpos = 0; p < end && getbyte(d, p) && (last & 0x80); p++, bitpos += 7) + *val |= ((uint64_t)((last = *p) & 0x7F)) << bitpos; + + if(d->status->code == UPB_EOF && (last & 0x80)) { + upb_seterr(status, UPB_ERROR, + "Provided data ended in the middle of a varint.\n"); + } else if(buf == maxend) { + upb_seterr(status, UPB_ERROR, + "Varint was unterminated after 10 bytes.\n"); } else { - upb_seterr(&d->src.status, UPB_STATUS_ERROR, "End group seen but current " - "message is not a group, byte offset: %zd", - upb_decoder_offset(d)); + // Success. + return; } - return NULL; + ungetbytes(d, buf, p - buf); } +} - // Look up field by tag number. - upb_fielddef *f = upb_msgdef_itof(d->top->msgdef, field_number); - - if (!f) { - // Unknown field. If/when the upb_src interface supports reporting - // unknown fields we will implement that here. - upb_decoder_skipval(d); - goto again; - } else if (!upb_check_type(wire_type, f)) { - // This is a recoverable error condition. We skip the value but also - // return NULL and report the error. - upb_decoder_skipval(d); - // TODO: better error message. - upb_seterr(&d->src.status, UPB_STATUS_ERROR, "Incorrect wire type.\n"); - return NULL; - } - d->field = f; - d->wire_type = wire_type; - return f; +static const void *get_msgend(upb_decoder *d) +{ + if(d->top->end_offset > 0) + return upb_string_getrobuf(d->buf) + (d->top->end_offset - d->buf_stream_offset); + else + return (void*)UINTPTR_MAX; // group. } -bool upb_decoder_getval(upb_decoder *d, upb_valueptr val) +static bool isgroup(const void *submsg_end) { - switch(upb_types[d->field->type].native_wire_type) { - case UPB_WIRE_TYPE_VARINT: { - uint32_t low, high; - if(!upb_decoder_readv64(d, &low, &high)) return false; - uint64_t u64 = ((uint64_t)high << 32) | low; - if(d->field->type == UPB_TYPE(SINT64)) - *val.int64 = upb_zzdec_64(u64); - else - *val.uint64 = u64; - break; - } - case UPB_WIRE_TYPE_32BIT_VARINT: { - uint32_t u32; - if(!upb_decoder_readv32(d, &u32)) return false; - if(d->field->type == UPB_TYPE(SINT32)) - *val.int32 = upb_zzdec_32(u32); - else - *val.uint32 = u32; - break; - } - case UPB_WIRE_TYPE_64BIT: - if(!upb_decoder_readf64(d, val.uint64)) return false; - break; - case UPB_WIRE_TYPE_32BIT: - if(!upb_decoder_readf32(d, val.uint32)) return false; - break; - default: - upb_seterr(&d->src.status, UPB_STATUS_ERROR, - "Attempted to call getval on a group."); - return false; - } - // For a packed field where we have not reached the end, we leave the field - // in the decoder so we will return it again without parsing a key. - if(d->wire_type != UPB_WIRE_TYPE_DELIMITED || - upb_decoder_offset(d) >= d->packed_end_offset) { - d->field = NULL; - } - return true; + return submsg_end == (void*)UINTPTR_MAX; } -bool upb_decoder_getstr(upb_decoder *d, upb_string *str) { - // A string, bytes, or a length-delimited submessage. The latter isn't - // technically a string, but can be gotten as one to perform lazy parsing. - const int32_t total_len = d->delimited_len; - if (d->buf_offset >= 0 && (int32_t)total_len <= d->buf_bytesleft) { - // The entire string is inside our current buffer, so we can just - // return a substring of the buffer without copying. - upb_string_substr(str, d->buf, - upb_string_len(d->buf) - d->buf_bytesleft, - total_len); - upb_decoder_skipbytes(d, total_len); - } else { - // The string spans buffers, so we must copy from the residual buffer - // (if any bytes are there), then the buffer, and finally from the bytesrc. - uint8_t *ptr = (uint8_t*)upb_string_getrwbuf( - str, UPB_MIN(total_len, d->buf_bytesleft)); - int32_t len = 0; - if(d->buf_offset < 0) { - // Residual bytes we need to copy from tmpbuf. - memcpy(ptr, d->tmpbuf, -d->buf_offset); - len += -d->buf_offset; - } - if(d->buf) { - // Bytes from the buffer. - memcpy(ptr + len, upb_string_getrobuf(d->buf) + d->buf_offset, - upb_string_len(str) - len); - } - upb_decoder_skipbytes(d, upb_string_len(str)); - if(len < total_len) { - // Bytes from the bytesrc. - if(!upb_bytesrc_append(d->bytesrc, str, total_len - len)) { - upb_copyerr(&d->src.status, upb_bytesrc_status(d->bytesrc)); - return false; - } - // Have to advance this since the buffering layer of the decoder will - // never see these bytes. - d->buf_stream_offset += total_len - len; - } - } - d->field = NULL; - return true; +extern upb_wire_type_t upb_expected_wire_types[]; +// Returns true if wt is the correct on-the-wire type for ft. +INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) { + // This doesn't currently support packed arrays. + return upb_types[ft].expected_wire_type == wt; } -static bool upb_decoder_skipgroup(upb_decoder *d); -bool upb_decoder_startmsg(upb_decoder *d) { - if(++d->top >= d->limit) { - upb_seterr(&d->src.status, UPB_ERROR_MAX_NESTING_EXCEEDED, +// Pushes a new stack frame for a submessage with the given len (which will +// be zero if the submessage is a group). +static const uint8_t *push(upb_decoder *d, const uint8_t *start, + uint32_t submsg_len, upb_fielddef *f, + upb_status *status) +{ + d->top->field = f; + d->top++; + if(d->top >= d->limit) { + upb_seterr(status, UPB_ERROR_MAX_NESTING_EXCEEDED, "Nesting exceeded maximum (%d levels)\n", UPB_MAX_NESTING); - return false; + return NULL; } upb_decoder_frame *frame = d->top; - if(d->field->type == UPB_TYPE(GROUP)) { - frame->end_offset = UPB_GROUP_END_OFFSET; - } else if (d->field->type == UPB_TYPE(MESSAGE)) { - frame->end_offset = upb_decoder_offset(d) + d->delimited_len; - } else { - upb_seterr(&d->src.status, UPB_STATUS_ERROR, - "Tried to startmsg a non-msg field."); - } - frame->msgdef = upb_downcast_msgdef(d->field->def); - d->field = NULL; - return true; -} - -bool upb_decoder_endmsg(upb_decoder *d) { - if(d->top > d->stack) { - --d->top; - if(!d->src.eof) { - if(d->top->end_offset == UPB_GROUP_END_OFFSET) - upb_decoder_skipgroup(d); - else - upb_decoder_skipbytes(d, d->top->end_offset - upb_decoder_offset(d)); - } - // Detect end-of-submessage. - d->src.eof = upb_decoder_offset(d) >= d->top->end_offset; - return true; - } else { - return false; - } -} + frame->end_offset = d->completed_offset + submsg_len; + frame->msgdef = upb_downcast_msgdef(f->def); -bool upb_decoder_skipval(upb_decoder *d) { - upb_strlen_t bytes_to_skip; - d->field = NULL; - switch(d->wire_type) { - case UPB_WIRE_TYPE_VARINT: { - return upb_decoder_skipv64(d); - } - case UPB_WIRE_TYPE_START_GROUP: - if(!upb_decoder_startmsg(d)) return false; - if(!upb_decoder_skipgroup(d)) return false; - if(!upb_decoder_endmsg(d)) return false; - return true; - default: - // Including UPB_WIRE_TYPE_END_GROUP. - assert(false); - upb_seterr(&d->src.status, UPB_STATUS_ERROR, "Tried to skip an end group"); - return false; - case UPB_WIRE_TYPE_64BIT: - bytes_to_skip = 8; - break; - case UPB_WIRE_TYPE_32BIT: - bytes_to_skip = 4; - break; - case UPB_WIRE_TYPE_DELIMITED: - // Works for both string/bytes *and* submessages. - bytes_to_skip = d->delimited_len; - break; - } - return upb_decoder_skipbytes(d, bytes_to_skip); + upb_dispatch_startsubmsg(&d->dispatcher, f); + return get_msgend(d); } -static bool upb_decoder_skipgroup(upb_decoder *d) +// Pops a stack frame, returning a pointer for where the next submsg should +// end (or a pointer that is out of range for a group). +static const void *pop(upb_decoder *d, const uint8_t *start, upb_status *status) { - // This will be mututally recursive with upb_decoder_skipval() if the group - // has sub-groups. If we wanted to handle EAGAIN in the future, this - // approach would not work; we would need to track the group depth - // explicitly. - while(upb_decoder_getdef(d)) { - if(!upb_decoder_skipval(d)) return false; - } - // If we are at the end of the group like we want to be, then - // upb_decoder_getdef() returned NULL because of eof, not error. - if(!&d->src.eof) return false; - return true; + d->top--; + upb_dispatch_endsubmsg(&d->dispatcher); + return get_msgend(d); } -upb_src_vtable upb_decoder_src_vtbl = { - (upb_src_getdef_fptr)&upb_decoder_getdef, - (upb_src_getval_fptr)&upb_decoder_getval, - (upb_src_getstr_fptr)&upb_decoder_getstr, - (upb_src_skipval_fptr)&upb_decoder_skipval, - (upb_src_startmsg_fptr)&upb_decoder_startmsg, - (upb_src_endmsg_fptr)&upb_decoder_endmsg, -}; +void upb_decoder_run(upb_src *src, upb_status *status) { + // buf is our current offset, moves from start to end. + const uint8_t *buf = (uint8_t*)upb_string_getrobuf(str) + d->buf_offset; + const uint8_t *end = (uint8_t*)upb_string_getrobuf(str) + upb_string_len(str); + const uint8_t *submsg_end = get_msgend(d, start); + upb_msgdef *msgdef = d->top->msgdef; + upb_string *str = NULL; + + // Main loop: executed once per tag/field pair. + while(1) { + // Parse/handle tag. + upb_tag tag; + CHECK(decode_tag(d, &buf, &end, &tag)); + + // Decode wire data. Hopefully this branch will predict pretty well + // since most types will read a varint here. + upb_value val; + switch (tag.wire_type) { + case UPB_WIRE_TYPE_END_GROUP: + if(!isgroup(submsg_end)) { + upb_seterr(status, UPB_STATUS_ERROR, "End group seen but current " + "message is not a group, byte offset: %zd", + d->completed_offset + (completed - start)); + goto err; + } + submsg_end = pop(d, start, status, &msgdef); + completed = buf; + goto check_msgend; + case UPB_WIRE_TYPE_VARINT: + case UPB_WIRE_TYPE_DELIMITED: + // For the delimited case we are parsing the length. + CHECK(upb_decode_varint(d, &buf, &end, &val)); + break; + case UPB_WIRE_TYPE_32BIT: + CHECK(upb_decode_32bit(d, &buf, &end, &val)); + break; + case UPB_WIRE_TYPE_64BIT: + CHECK(upb_decode_64bit(d, &buf, &end, &val)); + break; + } + // Look up field by tag number. + upb_fielddef *f = upb_msg_itof(msgdef, tag.field_number); -/* upb_decoder construction/destruction. **************************************/ + if (!f) { + // Unknown field. + } else if (!upb_check_type(tag.wire_type, f->type)) { + // Field has incorrect type. + } -upb_decoder *upb_decoder_new(upb_msgdef *msgdef) -{ - upb_decoder *d = malloc(sizeof(*d)); - d->toplevel_msgdef = msgdef; - d->limit = &d->stack[UPB_MAX_NESTING]; - d->buf = NULL; - upb_src_init(&d->src, &upb_decoder_src_vtbl); - return d; -} + // Perform any further massaging of the data now that we have the fielddef. + // Now we can distinguish strings from submessages, and we know about + // zig-zag-encoded types. + // TODO: handle packed encoding. + switch (f->type) { + case UPB_TYPE(MESSAGE): + case UPB_TYPE(GROUP): + CHECK(push(d, start, upb_value_getint32(val), f, status, &msgdef)); + goto check_msgend; + case UPB_TYPE(STRING): + case UPB_TYPE(BYTES): + CHECK(upb_decode_string(d, str, upb_value_getint32(val))); + upb_value_setstr(&val, str); + break; + case UPB_TYPE(SINT32): + upb_value_setint32(&val, upb_zzdec_32(upb_value_getint32(val))); + break; + case UPB_TYPE(SINT64): + upb_value_setint64(&val, upb_zzdec_64(upb_value_getint64(val))); + break; + default: + // Other types need no further processing at this point. + } + CHECK(upb_dispatch_value(d->sink, f, val, status)); + +check_msgend: + while(buf >= submsg_end) { + if(buf > submsg_end) { + upb_seterr(status, UPB_ERROR, "Expected submsg end offset " + "did not lie on a tag/value boundary."); + goto err; + } + submsg_end = pop(d, start, status, &msgdef); + } + completed = buf; + } -void upb_decoder_free(upb_decoder *d) -{ - upb_string_unref(d->buf); - free(d); +err: + read = (char*)completed - (char*)start; + d->completed_offset += read; + return read; } -void upb_decoder_reset(upb_decoder *d, upb_bytesrc *bytesrc) -{ - upb_string_unref(d->buf); +void upb_decoder_sethandlers(upb_src *src, upb_handlers *handlers) { + upb_decoder *d = (upb_decoder*)src; + upb_dispatcher_reset(&d->dispatcher, handlers); d->top = d->stack; + d->completed_offset = 0; d->top->msgdef = d->toplevel_msgdef; // The top-level message is not delimited (we can keep receiving data for it - // indefinitely), so we set the end offset as high as possible, but not equal - // to UINT32_MAX so it doesn't equal UPB_GROUP_END_OFFSET. - d->top->end_offset = UINT32_MAX - 1; - d->src.eof = false; - d->bytesrc = bytesrc; - d->field = NULL; - d->buf = NULL; - d->buf_bytesleft = 0; - d->buf_stream_offset = 0; - d->buf_offset = 0; + // indefinitely), so we treat it like a group. + d->top->end_offset = 0; } -upb_src *upb_decoder_src(upb_decoder *d) { - return &d->src; +upb_decoder *upb_decoder_new(upb_msgdef *msgdef) { + static upb_src_vtbl vtbl = { + &upb_decoder_sethandlers, + &upb_decoder_run, + }; + upb_decoder *d = malloc(sizeof(*d)); + upb_src_init(&d->src, &vtbl); + upb_dispatcher_init(&d->dispatcher); + d->toplevel_msgdef = msgdef; + d->limit = &d->stack[UPB_MAX_NESTING]; + return d; +} + +void upb_decoder_free(upb_decoder *d) { + free(d); }