More work on the decoder.

pull/13171/head
Joshua Haberman 15 years ago
parent cfe0ef08c1
commit fbc57ee488
  1. 4
      LICENSE
  2. 179
      src/upb_decoder.c
  3. 27
      src/upb_decoder.h
  4. 3
      src/upb_srcsink.h

@ -1,6 +1,6 @@
Copyright (c) 2009, Joshua Haberman
Copyright (c) 2009, Google Inc.
Copyright (c) 2009-2010, Joshua Haberman
Copyright (c) 2009-2010, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without

@ -18,7 +18,7 @@ const int8_t upb_get_v_uint64_full(const uint8_t *buf, uint64_t *val);
// Gets a varint (wire type: UPB_WIRE_TYPE_VARINT). Caller promises that >=10
// bytes are available at buf. Returns the number of bytes consumed, or 11 if
// the varint was unterminated after 10 bytes.
INLINE int8_t upb_get_v_uint64(const uint8_t *buf, uint64_t *val)
INLINE uint8_t upb_get_v_uint64(const uint8_t *buf, uint64_t *val)
{
// We inline this common case (1-byte varints); if that fails we dispatch to
// the full (non-inlined) version.
@ -33,7 +33,7 @@ INLINE int8_t upb_get_v_uint64(const uint8_t *buf, uint64_t *val)
// Gets a varint -- called when we only need 32 bits of it. Note that a 32-bit
// varint is not a true wire type.
INLINE int8_t upb_get_v_uint32(const uint8_t *buf, uint32_t *val)
INLINE uint8_t upb_get_v_uint32(const uint8_t *buf, uint32_t *val)
{
uint64_t val64;
int8_t ret = upb_get_v_uint64(buf, end, &val64, status);
@ -54,7 +54,8 @@ INLINE void upb_get_f_uint32(const uint8_t *buf, uint32_t *val)
#endif
}
// Gets a fixed-length 64-bit integer (wire type: UPB_WIRE_TYPE_64BIT).
// Gets a fixed-length 64-bit integer (wire type: UPB_WIRE_TYPE_64BIT). Caller
// promises that 8 bytes are available at buf.
INLINE void upb_get_f_uint64(const uint8_t *buf uint64_t *val)
{
#if UPB_UNALIGNED_READS_OK
@ -67,9 +68,10 @@ INLINE void upb_get_f_uint64(const uint8_t *buf uint64_t *val)
#endif
}
INLINE const uint8_t *upb_skip_v_uint64(const uint8_t *buf,
const uint8_t *end,
upb_status *status)
// Skips a varint (wire type: UPB_WIRE_TYPE_VARINT). Caller promises that 10
// bytes are available at "buf". Returns the number of bytes that were
// skipped.
INLINE const uint8_t *upb_skip_v_uint64(const uint8_t *buf)
{
const uint8_t *const maxend = buf + 10;
uint8_t last = 0x80;
@ -82,7 +84,7 @@ INLINE const uint8_t *upb_skip_v_uint64(const uint8_t *buf,
// Parses a 64-bit varint that is known to be >= 2 bytes (the inline version
// handles 1 and 2 byte varints).
const int8_t upb_get_v_uint64_full(const uint8_t *buf uint64_t *val)
const uint8_t upb_get_v_uint64_full(const uint8_t *buf uint64_t *val)
{
const uint8_t *const maxend = buf + 9;
uint8_t last = 0x80;
@ -102,7 +104,7 @@ INLINE int32_t upb_zzdec_32(uint32_t n) { return (n >> 1) ^ -(int32_t)(n & 1); }
INLINE int64_t upb_zzdec_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); }
/* Functions to read .proto values. *******************************************/
/* upb_decoder ****************************************************************/
// The decoder keeps a stack with one entry per level of recursion.
// upb_decoder_frame is one frame of that stack.
@ -113,36 +115,30 @@ typedef struct {
} upb_decoder_frame;
struct upb_decoder {
// Immutable state of the decoder.
upb_src src; // upb_decoder is a upb_src.
upb_msgdef *toplevel_msgdef;
upb_bytesrc *bytesrc;
// State pertaining to a particular decode (resettable).
// Stack entries store the offset where the submsg ends (for groups, 0).
// We keep a stack of messages we have recursed into.
upb_decoder_frame stack[UPB_MAX_NESTING], *top, *limit;
// The current buffer.
// The buffers of input data. See buffering code below for details.
upb_string *buf;
upb_string *nextbuf;
uint8_t tmpbuf[UPB_MAX_ENCODED_SIZE]; // Used to bridge buf and nextbuf.
// The overflow buffer. When fewer than UPB_MAX_ENCODED_SIZE bytes are left
// in a buffer, the remaining bytes are copied here along with the bytes
// from the next buffer (or 0x80 bytes if the byte stream is EOF).
uint8_t overflow_buf[UPB_MAX_ENCODED_SIZE];
// The number of bytes we have yet to consume from this buffer.
int32_t buf_bytes_remaining;
// The number of bytes we have yet to consume from "buf". This can be
// negative if we have skipped more bytes than are in the buffer, or if we
// have started to consume bytes from "nextbuf".
int32_t buf_bytesleft;
// The overall stream offset of the beginning of this buffer.
// The overall stream offset of the end of "buf". If "buf" is NULL, it is as
// if "buf" was the empty string.
uint32_t buf_stream_offset;
// Indicates that we are in the middle of skipping bytes or groups (or both).
// If both are set, the byte-skipping needs to happen first.
uint8_t skip_groups;
uint32_t skip_bytes;
bool eof;
};
/* upb_decoder construction/destruction. **************************************/
upb_decoder *upb_decoder_new(upb_msgdef *msgdef)
@ -169,12 +165,13 @@ void upb_decoder_reset(upb_decoder *d, upb_sink *sink)
d->top->end_offset = 0;
}
/* upb_decoder buffering. *****************************************************/
// Discards the current buffer if we are done with it, and makes the next
// buffer current if there is one.
static void upb_decoder_advancebuf(upb_decoder *d)
{
// Discard the current buffer if we are done with it, make the next buffer
// current if there is one.
if(d->buf_bytes_remaining <= 0) {
if(d->buf) upb_bytesrc_recycle(d->bytesrc, d->buf);
d->buf = d->nextbuf;
@ -185,13 +182,9 @@ static void upb_decoder_advancebuf(upb_decoder *d)
static void upb_decoder_pullnextbuf(upb_decoder *d)
{
if(!d->nextbuf && !upb_bytesrc_eof(d->bytesrc)) { // Need another buffer?
// We test the eof flag both before and after the get; checking it
// before lets us short-circuit the get if we are already at eof,
// checking it after makes sure we don't report an error if the get only
// failed because of eof.
if(!(d->nextbuf = upb_bytesrc_get(d->bytesrc)) &&
!upb_bytesrc_eof(d->bytesrc)) {
if(!d->nextbuf) {
d->nextbuf = upb_bytesrc_get(d->bytesrc);
if(!d->nextbuf && !upb_bytesrc_eof(d->bytesrc)) {
// There was an error in the byte stream, halt the decoder.
upb_copyerr(&d->status, upb_bytesrc_status(d->bytesrc));
return;
@ -202,7 +195,10 @@ static void upb_decoder_pullnextbuf(upb_decoder *d)
static void upb_decoder_skipbytes(upb_decoder *d, int32_t bytes)
{
d->buf_bytes_remaining -= bytes;
while(d->buf_bytes_remaining < 0) upb_decoder_getbuf(d);
while(d->buf_bytes_remaining <= 0) {
upb_decoder_pullnextbuf(d);
upb_decoder_advancebuf(d);
}
}
static void upb_decoder_skipgroup(upb_decoder *d)
@ -213,31 +209,29 @@ static void upb_decoder_skipgroup(upb_decoder *d)
while(upb_decoder_getdef(d)) upb_decoder_skipval(d);
}
static const uint8_t *upb_decoder_getbuf(upb_decoder *d, int32_t *bytes)
static const uint8_t *upb_decoder_getbuf_full(upb_decoder *d, int32_t *bytes)
{
if(d->buf_bytes_remaining < 10) {
upb_strlen_t total = 0;
if(d->buf) {
upb_strlen_t len = upb_string_len(d->buf);
memcpy(d->overflow_buf, upb_string_getrobuf(d->buf), len);
total += len;
if(d->nextbuf) {
len = upb_string_len(d->nextbuf);
if(total + len > 10) len = 10 - total;
memcpy(d->overflow_buf + total, upb_string_getrobuf(d->nextbuf, len));
total += len;
}
}
memset(d->overflow_buf + total, 0x80, 10 - total);
} else {
upb_decoder_pullnextbuf(d);
upb_decoder_advancebuf(d);
if(d->buf_bytes_remaining >= UPB_MAX_ENCODED_SIZE) {
return upb_string_getrobuf(d->buf) + upb_string_len(d->buf) -
d->buf_bytes_remaining;
} else {
upb_strlen_t total = 0;
if(d->buf) total += upb_decoder_append(d->buf, total);
if(d->nextbuf) total += upb_decoder_append(d->nextbuf, total);
memset(d->overflow_buf + total, 0x80, UPB_MAX_ENCODED_SIZE - total);
}
}
// Returns a pointer to a buffer of data that is at least UPB_MAX_ENCODED_SIZE
// bytes long. This buffer contains the next bytes in the stream (even if
// those bytes span multiple buffers). *bytes is set to the number of actual
// stream bytes that are available in the returned buffer. If
// *bytes < UPB_MAX_ENCODED_SIZE, the buffer is padded with 0x80 bytes.
INLINE static const uint8_t *upb_decoder_getbuf(upb_decoder *d, int32_t *bytes)
{
if(d->buf_bytes_remaining >= 10) {
if(d->buf_bytes_remaining >= UPB_MAX_ENCODED_SIZE) {
*bytes = d->buf_bytes_remaining;
return upb_string_getrobuf(d->buf) + upb_string_len(d->buf) -
d->buf_bytes_remaining;
@ -246,25 +240,31 @@ INLINE static const uint8_t *upb_decoder_getbuf(upb_decoder *d, int32_t *bytes)
}
}
/* upb_src implementation for upb_decoder. ************************************/
upb_fielddef *upb_decoder_getdef(upb_decoder *d)
{
// Detect end-of-submessage.
if(offset >= d->top->end_offset) {
if(upb_decoder_offset(d) >= d->top->end_offset) {
d->eof = true;
return NULL;
}
// Handles the packed field case.
if(d->field) return d->field;
if(d->eof) return NULL;
again:
uint32_t key;
if(!upb_decoder_get_v_uint32(d, &key)) return NULL;
if(upb_wiretype_from_key(key) == UPB_WIRE_TYPE_END_GROUP) {
if(!upb_decoder_get_v_uint32(d, &key)) {
return NULL;
if(d->key.wire_type == UPB_WIRE_TYPE_DELIMITED) {
// For delimited wire values we parse the length now, since we need it in
// all cases.
if(!upb_decoder_get_v_uint32(d, &d->delim_len)) return NULL;
} else if(upb_wiretype_from_key(key) == UPB_WIRE_TYPE_END_GROUP) {
if(isgroup(d->top->submsg_end)) {
d->eof = true;
d->status->code = UPB_STATUS_EOF;
} else {
upb_seterr(d->status, UPB_STATUS_ERROR, "End group seen but current "
"message is not a group, byte offset: %zd",
@ -273,59 +273,66 @@ again:
return NULL;
}
// For delimited wire values we parse the length now, since we need it in all
// cases.
if(d->key.wire_type == UPB_WIRE_TYPE_DELIMITED) {
if(!upb_decoder_get_v_uint32(d, &d->delim_len)) return NULL;
}
// Look up field by tag number.
upb_fielddef *f = upb_msg_itof(d->top->msgdef, upb_fieldnum_from_key(key));
if (!f || !upb_check_type(upb_wiretype_from_key(key), f->type)) {
// Unknown field or incorrect wire type. In the future these cases may be
// separated, like if we want to give the client unknown fields but not
// incorrect fields.
if (!f) {
// Unknown field. If/when the upb_src interface supports reporting
// unknown fields we will implement that here.
upb_decoder_skipval(d);
goto again;
} else if (!upb_check_type(upb_wiretype_from_key(key), f->type)) {
// This is a recoverable error condition. We skip the value but also
// return NULL and report the error.
upb_decoder_skipval(d);
// TODO: better error message.
upb_seterr(&d->status, UPB_STATUS_ERROR, "Incorrect wire type.\n");
return NULL;
}
d->field = f;
return f;
}
bool upb_decoder_getval(upb_decoder *d, upb_valueptr val)
{
uint32_t bytes;
if(expected_type_for_field == UPB_DELIMITED) {
// A string, bytes, or a length-delimited submessage. The latter isn't
// technically a string, but can be gotten as one to perform lazy parsing.
d->str = upb_string_tryrecycle(d->str);
if (d->delimited_len <= d->buf_bytes_remaining) {
const upb_strlen_t total_len = d->delimited_len;
if (total_len <= d->buf_bytes_remaining) {
// The entire string is inside our current buffer, so we can just
// return a substring of the buffer without copying.
upb_string_substr(d->str, d->buf,
upb_string_len(d->buf) - d->buf_bytes_remaining,
d->delimited_len);
d->buf_bytes_remaining -= d->delimited_len;
total_len);
d->buf_bytes_remaining -= total_len
*val.str = d->str;
} else {
// The string spans buffers, so we must copy.
memcpy(upb_string_getrwbuf(d->str, len),
upb_string_getrobuf(d->buf) + upb_string_len(d->buf),
bar);
if(!upb_bytesrc_append(d->bytesrc, d->str, len)) goto err;
// The string spans buffers, so we must copy from the current buffer,
// the next buffer (if we have one), and finally from the bytesrc.
char *str = upb_string_getrwbuf(d->str, d->);
upb_strlen_t len = 0;
len += upb_decoder_append(d->buf, len, total_len);
if(!upb_decoder_advancebuf(d)) goto err;
if(d->buf) len += upb_decoder_append(d->buf, len, total_len);
if(len < total_len)
if(!upb_bytesrc_append(d->bytesrc, d->str, len - bytes)) goto err;
}
d->field = NULL;
} else {
// For all of the integer types we need the bytes to be in a single
// contiguous buffer.
uint32_t bytes;
const uint8_t *buf = upb_decoder_getbuf(d, &bytes)
switch(expected_type_for_field) {
case UPB_32BIT_VARINT:
case UPB_64BIT_VARINT:
if(upb_get_v_uint32(buf, val.uint32) > 10) goto err;
if(f->type == UPB_TYPE(SINT32)) *val.int32 = upb_zzdec_32(*val.int32);
if(f->type == UPB_TYPE(SINT64)) *val.int64 = upb_zzdec_64(*val.int64);
break;
case UPB_64BIT_VARINT: {
case UPB_32BIT_VARINT:
if(upb_get_v_uint64(buf, val.uint64) > 5) goto err;
if(f->type == UPB_TYPE(SINT64)) *val.int64 = upb_zzdec_64(*val.int64);
if(f->type == UPB_TYPE(SINT32)) *val.int32 = upb_zzdec_32(*val.int32);
break;
case UPB_64BIT_FIXED:
if(bytes < 8) goto err;
@ -338,9 +345,12 @@ bool upb_decoder_getval(upb_decoder *d, upb_valueptr val)
default:
// Including start/end group.
goto err;
}
if(wire_type != UPB_WIRE_TYPE_DELIMITED ||
upb_decoder_offset(d) >= d->packed_end_offset) {
d->field = NULL;
}
}
if(non-packed field || packed field that is done)
d->field = NULL;
return true;
err:
}
@ -356,6 +366,7 @@ bool upb_decoder_skipval(upb_decoder *d) {
case UPB_WIRE_TYPE_START_GROUP:
return upb_skip_groups(1);
case UPB_WIRE_TYPE_DELIMITED:
// Works for both string/bytes *and* submessages.
return upb_skip_bytes(d->delimited_len);
default:
// Including UPB_WIRE_TYPE_END_GROUP.

@ -1,15 +1,16 @@
/*
* upb - a minimalist implementation of protocol buffers.
*
* upb_decoder implements a high performance, callback-based, stream-oriented
* decoder (comparable to the SAX model in XML parsers). For parsing protobufs
* into in-memory messages (a more DOM-like model), see the routines in
* upb_msg.h, which are layered on top of this decoder.
* upb_decoder implements a high performance, streaming decoder for protobuf
* data that works by implementing upb_src and getting its data from a
* upb_bytesrc.
*
* TODO: the decoder currently does not support returning unknown values. This
* can easily be added when it is needed.
* The decoder does not currently support non-blocking I/O, in the sense that
* if the bytesrc returns UPB_STATUS_TRYAGAIN it is not possible to resume the
* decoder when data becomes available again. Support for this could be added,
* but it would add complexity and perhaps cost efficiency also.
*
* Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
* Copyright (c) 2009-2010 Joshua Haberman. See LICENSE for details.
*/
#ifndef UPB_DECODER_H_
@ -17,8 +18,8 @@
#include <stdbool.h>
#include <stdint.h>
#include "upb.h"
#include "descriptor.h"
#include "upb_def.h"
#include "upb_srcsink.h"
#ifdef __cplusplus
extern "C" {
@ -33,17 +34,17 @@ typedef struct upb_decoder upb_decoder;
// Allocates and frees a upb_decoder, respectively.
upb_decoder *upb_decoder_new(upb_msgdef *md);
void upb_decoder_free(upb_decoder *p);
void upb_decoder_free(upb_decoder *d);
// Resets the internal state of an already-allocated decoder. This puts it in a
// state where it has not seen any data, and expects the next data to be from
// the beginning of a new protobuf. Decoders must be reset before they can be
// used. A decoder can be reset multiple times.
void upb_decoder_reset(upb_decoder *p, upb_bytesrc *bytesrc);
void upb_decoder_reset(upb_decoder *d, upb_bytesrc *bytesrc);
// Returns a upb_src pointer by which the decoder can be used. The returned
// upb_src is invalidated by upb_decoder_reset().
upb_src *upb_decoder_getsrc(upb_decoder *p);
// upb_src is invalidated by upb_decoder_reset() or upb_decoder_free().
upb_src *upb_decoder_getsrc(upb_decoder *d);
#ifdef __cplusplus
} /* extern "C" */

@ -71,7 +71,7 @@ upb_status *upb_sink_status(upb_sink *sink);
/* upb_bytesrc ****************************************************************/
// Returns the next string in the stream. NULL is returned on error or eof.
// The string must be at least "minlen" bytes long.
// The string must be at least "minlen" bytes long unless the stream is eof.
//
// A ref is passed to the caller, though the caller is encouraged to pass the
// ref back to the bytesrc with upb_bytesrc_recycle(). This can help reduce
@ -140,6 +140,7 @@ typedef struct {
typedef struct {
upb_src_vtable *vtbl;
upb_status status;
bool eof;
#ifndef NDEBUG
int state; // For debug-mode checking of API usage.
#endif

Loading…
Cancel
Save