High-level parsing interface written (not yet tested).

pull/13171/head
Joshua Haberman 16 years ago
parent 0f50bcbceb
commit d1aa095cb3
  1. 1
      test_table.cc
  2. 32
      upb.h
  3. 133
      upb_parse.c
  4. 75
      upb_parse.h
  5. 40
      upb_struct.h

@ -189,4 +189,3 @@ int main()
test(keys4, 64);
delete[] keys4;
}

32
upb.h

@ -37,7 +37,7 @@ enum upb_wire_type {
UPB_WIRE_TYPE_END_GROUP = 4,
UPB_WIRE_TYPE_32BIT = 5
};
typedef int8_t upb_wire_type_t;
typedef uint8_t upb_wire_type_t;
/* A value as it is encoded on-the-wire, except delimited, which is handled
* separately. */
@ -48,8 +48,10 @@ union upb_wire_value {
};
/* Value type as defined in a .proto file. The values of this are defined by
* google_protobuf_FieldDescriptorProto_Type (from descriptor.proto). */
typedef int32_t upb_field_type_t;
* google_protobuf_FieldDescriptorProto_Type (from descriptor.proto).
* Note that descriptor.proto reserves "0" for errors, and we use it to
* represent exceptional circumstances. */
typedef uint8_t upb_field_type_t;
/* A value as described in a .proto file, except delimited, which is handled
* separately. */
@ -76,35 +78,21 @@ struct upb_tag {
/* Status codes used as a return value. */
typedef enum upb_status {
UPB_STATUS_OK = 0,
UPB_STATUS_SUBMESSAGE_END = 1,
/** FATAL ERRORS: these indicate corruption, and cannot be recovered. */
// A varint did not terminate before hitting 64 bits.
UPB_ERROR_UNTERMINATED_VARINT = -1,
// A submessage ended in the middle of data.
// A submessage or packed array ended in the middle of data.
UPB_ERROR_BAD_SUBMESSAGE_END = -2,
// Encountered a "group" on the wire (deprecated and unsupported).
UPB_ERROR_GROUP = -3,
// Input was nested more than UPB_MAX_NESTING deep.
UPB_ERROR_STACK_OVERFLOW = -4,
UPB_ERROR_STACK_OVERFLOW = -3,
// The input data caused the pb's offset (a size_t) to overflow.
UPB_ERROR_OVERFLOW = -5,
// Generic error.
UPB_ERROR = -6,
/** NONFATAL ERRORS: the input was invalid, but we can continue if desired. */
// A value was encountered that was not defined in the .proto file.
UPB_ERROR_UNKNOWN_VALUE = 2,
UPB_ERROR_OVERFLOW = -4,
// A field was encoded with the wrong wire type.
UPB_ERROR_MISMATCHED_TYPE = 3
// An "end group" tag was encountered in an inappropriate place.
UPB_ERROR_SPURIOUS_END_GROUP = -5
} upb_status_t;
#ifdef __cplusplus

@ -29,7 +29,7 @@
* To avoid branches, none of these do bounds checking. So we force clients
* to overallocate their buffers by >=9 bytes. */
static upb_status_t get_v_uint64_t(uint8_t *restrict *buf,
static upb_status_t get_v_uint64_t(void *restrict *buf,
uint64_t *restrict val)
{
uint8_t *ptr = *buf, b;
@ -59,7 +59,7 @@ done:
#if 0
/* The no-branching version. */
static upb_status_t get_v_uint64_t(uint8_t *restrict *buf,
static upb_status_t get_v_uint64_t(void *restrict *buf,
uint64_t *restrict val)
{
uint8_t *b = *buf;
@ -95,7 +95,7 @@ static upb_status_t get_v_uint64_t(uint8_t *restrict *buf,
}
/* The single-branch version. */
static upb_status_t get_v_uint64_t(uint8_t *restrict *buf,
static upb_status_t get_v_uint64_t(void *restrict *buf,
uint64_t *restrict val)
{
/* Endian-specific! */
@ -127,7 +127,7 @@ static upb_status_t get_v_uint64_t(uint8_t *restrict *buf,
}
#endif
static upb_status_t skip_v_uint64_t(uint8_t **buf)
static upb_status_t skip_v_uint64_t(void **buf)
{
uint8_t *ptr = *buf, b;
b = *(ptr++); if (!(b & 0x80)) goto done;
@ -147,7 +147,7 @@ done:
return UPB_STATUS_OK;
}
static upb_status_t get_v_uint32_t(uint8_t *restrict *buf,
static upb_status_t get_v_uint32_t(void *restrict *buf,
uint32_t *restrict val)
{
uint8_t *ptr = *buf, b;
@ -167,24 +167,25 @@ done:
return UPB_STATUS_OK;
}
static upb_status_t get_f_uint32_t(uint8_t *restrict *buf,
static upb_status_t get_f_uint32_t(void *restrict *buf,
uint32_t *restrict val)
{
uint8_t *b = *buf;
#define SHL(val, bits) ((uint32_t)val << bits)
*val = SHL(b[0], 0) | SHL(b[1], 8) | SHL(b[2], 16) | SHL(b[3], 24);
#undef SHL
*buf += sizeof(uint32_t);
b += sizeof(uint32_t);
*buf = b;
return UPB_STATUS_OK;
}
static upb_status_t skip_f_uint32_t(uint8_t **buf)
static upb_status_t skip_f_uint32_t(void **buf)
{
*buf += sizeof(uint32_t);
*buf = (char*)*buf + sizeof(uint32_t);
return UPB_STATUS_OK;
}
static upb_status_t get_f_uint64_t(uint8_t *restrict *buf,
static upb_status_t get_f_uint64_t(void *restrict *buf,
uint64_t *restrict val)
{
uint8_t *b = *buf;
@ -193,13 +194,14 @@ static upb_status_t get_f_uint64_t(uint8_t *restrict *buf,
*val = SHL(b[0], 0) | SHL(b[1], 8) | SHL(b[2], 16) | SHL(b[3], 24) |
SHL(b[4], 32) | SHL(b[5], 40) | SHL(b[6], 48) | SHL(b[7], 56);
#undef SHL
*buf += sizeof(uint64_t);
b += sizeof(uint64_t);
*buf = b;
return UPB_STATUS_OK;
}
static upb_status_t skip_f_uint64_t(uint8_t **buf)
static upb_status_t skip_f_uint64_t(void **buf)
{
*buf += sizeof(uint64_t);
*buf = (char*)*buf + sizeof(uint64_t);
return UPB_STATUS_OK;
}
@ -213,7 +215,7 @@ static int64_t zz_decode_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); }
static void wvtov_ ## type(wire_t s, val_t *d)
#define GET(type, v_or_f, wire_t, val_t, member_name) \
static upb_status_t get_ ## type(uint8_t **buf, union upb_value *d) { \
static upb_status_t get_ ## type(void **buf, union upb_value *d) { \
wire_t tmp; \
CHECK(get_ ## v_or_f ## _ ## wire_t(buf, &tmp)); \
wvtov_ ## type(tmp, &d->member_name); \
@ -264,7 +266,7 @@ upb_wire_type_t upb_expected_wire_types[] = {
[GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_SINT64] = UPB_WIRE_TYPE_VARINT,
};
upb_status_t parse_tag(uint8_t **buf, struct upb_tag *tag)
upb_status_t upb_parse_tag(void **buf, struct upb_tag *tag)
{
uint32_t tag_int;
CHECK(get_v_uint32_t(buf, &tag_int));
@ -273,12 +275,12 @@ upb_status_t parse_tag(uint8_t **buf, struct upb_tag *tag)
return UPB_STATUS_OK;
}
upb_status_t parse_wire_value(uint8_t *buf, size_t *offset,
upb_wire_type_t wt,
union upb_wire_value *wv)
upb_status_t upb_parse_wire_value(void *buf, size_t *offset,
upb_wire_type_t wt,
union upb_wire_value *wv)
{
#define READ(expr) CHECK(expr); *offset += (b-buf)
uint8_t *b = buf;
#define READ(expr) CHECK(expr); *offset += ((char*)b-(char*)buf)
void *b = buf;
switch(wt) {
case UPB_WIRE_TYPE_VARINT: READ(get_v_uint64_t(&b, &wv->varint)); break;
case UPB_WIRE_TYPE_64BIT: READ(get_f_uint64_t(&b, &wv->_64bit)); break;
@ -290,15 +292,15 @@ upb_status_t parse_wire_value(uint8_t *buf, size_t *offset,
*offset += new_offset;
break;
case UPB_WIRE_TYPE_START_GROUP:
case UPB_WIRE_TYPE_END_GROUP: return UPB_ERROR_GROUP; /* TODO */
case UPB_WIRE_TYPE_END_GROUP: break;
}
return UPB_STATUS_OK;
}
upb_status_t skip_wire_value(uint8_t *buf, size_t *offset,
upb_wire_type_t wt)
upb_status_t upb_skip_wire_value(void *buf, size_t *offset,
upb_wire_type_t wt)
{
uint8_t *b = buf;
void *b = buf;
switch(wt) {
case UPB_WIRE_TYPE_VARINT: READ(skip_v_uint64_t(&b)); break;
case UPB_WIRE_TYPE_64BIT: READ(skip_f_uint64_t(&b)); break;
@ -312,14 +314,14 @@ upb_status_t skip_wire_value(uint8_t *buf, size_t *offset,
*offset += new_offset;
break;
}
case UPB_WIRE_TYPE_START_GROUP:
case UPB_WIRE_TYPE_END_GROUP: return UPB_ERROR_GROUP; /* TODO */
case UPB_WIRE_TYPE_START_GROUP: /* TODO: skip to matching end group. */
case UPB_WIRE_TYPE_END_GROUP: break;
}
return UPB_STATUS_OK;
#undef READ
}
upb_status_t upb_parse_value(uint8_t **b, upb_field_type_t ft,
upb_status_t upb_parse_value(void **b, upb_field_type_t ft,
union upb_value *v)
{
#define CASE(t) \
@ -332,7 +334,82 @@ upb_status_t upb_parse_value(uint8_t **b, upb_field_type_t ft,
case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_STRING:
case GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE:
return get_UINT32(b, v);
default: return UPB_ERROR; /* Including GROUP. */
default: return 0; /* Including GROUP -- groups have no value. */
}
#undef CASE
}
static void pop_stack_frame(struct upb_parse_state *s)
{
s->submsg_end_cb(s);
s->top--;
s->top = (struct upb_parse_stack_frame*)((char*)s->top - s->udata_size);
}
static upb_status_t push_stack_frame(struct upb_parse_state *s, size_t end,
void *user_field_desc)
{
s->top++;
s->top = (struct upb_parse_stack_frame*)((char*)s->top + s->udata_size);
if(unlikely(s->top > s->limit)) return UPB_ERROR_STACK_OVERFLOW;
s->top->end_offset = end;
s->submsg_start_cb(s, user_field_desc);
return UPB_STATUS_OK;
}
/* Parses up to len bytes of protobuf data from buf, driving the callbacks
 * stored in s.
 *
 * s:    parse state; s->offset, the frame stack and the packed-array fields
 *       are updated as data is consumed.
 * buf:  start of the data to parse.  The low-level readers do no bounds
 *       checking, so the buffer must be over-allocated as they require.
 * len:  number of bytes of input available at buf.
 * read: on success, receives the number of bytes consumed.
 *
 * Parsing stops when len bytes have been consumed, when a callback sets
 * s->done, or when a low-level routine reports an error.  On error the status
 * is returned immediately and *read is not written.
 *
 * The read position and s->offset are advanced together after every item
 * that is consumed, so the next tag is always read from the byte that
 * s->offset describes. */
upb_status_t upb_parse(struct upb_parse_state *s, void *buf, size_t len,
                       size_t *read)
{
  size_t start_offset = s->offset;
  size_t end_offset = start_offset + len;
  char *pos = buf;  /* Current read position; tracks s->offset. */

  while(!s->done && s->offset < end_offset) {
    /* Close every length-delimited submessage that ends at this offset.
     * Group frames carry end_offset == 0 and are closed only by an explicit
     * "end group" tag, so they must not be popped here. */
    while(s->top->end_offset != 0 && s->offset >= s->top->end_offset)
      pop_stack_frame(s);

    /* NOTE(review): packed-array entries are not implemented yet.  This loop
     * has an empty body and will not terminate once packed_end_offset is
     * ahead of the current offset.  Completing it also requires remembering
     * the field's user_field_desc, which the state does not store. */
    while(s->packed_end_offset > s->offset) {
      /* Parse a packed field entry. */
    }

    struct upb_tag tag;
    void *b = pos;
    CHECK(upb_parse_tag(&b, &tag));
    s->offset += (size_t)((char*)b - pos);
    pos = b;

    if(unlikely(tag.wire_type == UPB_WIRE_TYPE_END_GROUP)) {
      /* Only a group frame (end_offset == 0) may be closed by this tag. */
      if(unlikely(s->top->end_offset != 0)) return UPB_ERROR_SPURIOUS_END_GROUP;
      pop_stack_frame(s);
      continue;
    }

    /* tag_cb is allowed to leave this unset, so give it a defined value. */
    void *user_field_desc = NULL;
    upb_field_type_t ft = s->tag_cb(s, &tag, &user_field_desc);

    if(ft == 0) {
      /* The client does not want this field.  upb_skip_wire_value() adds the
       * number of skipped bytes to s->offset; move the read position by the
       * same amount. */
      size_t before = s->offset;
      CHECK(upb_skip_wire_value(pos, &s->offset, tag.wire_type));
      pos += s->offset - before;
    } else if(ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_GROUP) {
      /* No length specified, an "end group" tag will mark the end. */
      CHECK(push_stack_frame(s, 0, user_field_desc));
    } else {
      /* For all other cases we parse the next value (exactly once). */
      union upb_value v;
      b = pos;
      CHECK(upb_parse_value(&b, ft, &v));
      s->offset += (size_t)((char*)b - pos);
      pos = b;

      if(ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE) {
        /* The value we parsed is the length of the submessage, which starts
         * at the (already advanced) current offset. */
        CHECK(push_stack_frame(s, s->offset + v.delim_len, user_field_desc));
      } else if(ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_STRING ||
                ft == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_BYTES) {
        /* pos points at the first byte of the string data. */
        s->value_cb(s, &v, pos, user_field_desc);
        pos += v.delim_len;
        s->offset += v.delim_len;
      } else if(tag.wire_type == UPB_WIRE_TYPE_DELIMITED) {
        /* Delimited data which is not a string, bytes, or a submessage.
         * It must be a packed array.
         * NOTE(review): the length prefix was read with upb_parse_value()
         * using the element type ft; confirm that this yields delim_len for
         * every packable type. */
        s->packed_type = ft;
        s->packed_end_offset = s->offset + v.delim_len;
      } else {
        /* The common case: a simple value, already parsed above. */
        s->value_cb(s, &v, pos, user_field_desc);
      }
    }
  }

  *read = s->offset - start_offset;
  return UPB_STATUS_OK;
}

@ -18,12 +18,79 @@
extern "C" {
#endif
/* High-level parsing interface. **********************************************/
struct upb_parse_state;
/* Initialize and free (respectively) the given parse state, which must have
* been previously allocated. udata_size specifies how many bytes will be
* available at upb_parse_stack_frame.user_data in each frame for user data. */
void upb_parse_state_init(struct upb_parse_state *state, size_t udata_size);
void upb_parse_state_free(struct upb_parse_state *state);
/* The callback that is called immediately after a tag has been parsed. The
* client uses it to decide if it wants to process this value or skip it. If
* it wants to process it, it must determine its specific .proto type (at this
* point we only know its wire type) and verify that it matches the wire type.
* The client will then return the .proto type. To skip the value, the client
* should return 0 (which is not a valid .proto type).
*
* The client can set user_field_desc to a record describing this field -- this
* pointer will be supplied to the value callback (for simple values) or the
* submsg_start callback (for submessages). */
typedef upb_field_type_t (*upb_tag_cb)(struct upb_parse_state *s,
struct upb_tag *tag,
void **user_field_desc);
/* The callback that is called for individual values. This callback is only
* called when the previously invoked tag_cb has returned nonzero. It receives
* the parsed and converted value as well as the user_field_desc that was set
* by the tag_cb. Note that this function can be called several times in a row
* (ie. with no intervening tag_cb) in the case of packed arrays. For string
* data (bytes and string) str points to the beginning of the string. */
typedef void (*upb_value_cb)(struct upb_parse_state *s, union upb_value *v,
void *str, void *user_field_desc);
/* Callbacks that are called when a submessage begins and ends, respectively.
* Both are called with the submessage's stack frame at the top of the stack. */
typedef void (*upb_submsg_start_cb)(struct upb_parse_state *s,
void *user_field_desc);
typedef void (*upb_submsg_end_cb)(struct upb_parse_state *s);
/* Each stack frame (one for each level of submessages/groups) has this format,
* where user_data has as many bytes allocated as specified when initialized.
*
* Frames are variable-sized: push_stack_frame() and pop_stack_frame() move
* between neighbouring frames by one struct upb_parse_stack_frame plus the
* parse state's udata_size bytes, rather than by plain array indexing.
*
* NOTE(review): udata_size presumably has to keep each following frame
* suitably aligned for size_t -- confirm where the stack is allocated. */
struct upb_parse_stack_frame {
/* Offset at which this submessage ends; the parser pops the frame once the
* parse state's offset reaches it. */
size_t end_offset; /* 0 indicates that this is a group. */
/* Client-owned storage (flexible array member). */
char user_data[];
};
/* State of the high-level parser, carried between calls to upb_parse(). */
struct upb_parse_state {
/* Running offset: upb_parse() adds the bytes it consumes to this and compares
* it against frame end offsets. */
size_t offset;
/* top is the frame of the submessage/group currently being parsed; pushing a
* frame beyond limit fails with UPB_ERROR_STACK_OVERFLOW.
* NOTE(review): stack is presumably the start of the frame storage -- it is
* not referenced by the parsing code shown here; confirm in the init code. */
struct upb_parse_stack_frame *stack, *top, *limit;
size_t udata_size; /* How many bytes the user gets in each frame. */
bool done; /* Any callback can abort processing by setting done=true. */
/* These are only set if we're in the middle of a packed array. */
size_t packed_end_offset; /* 0 if not in a packed array. */
upb_field_type_t packed_type; /* Field type of the packed array's elements. */
/* Client callbacks. tag_cb runs after each tag is parsed, value_cb for each
* value the client chose to process, submsg_start_cb after a frame is pushed
* and submsg_end_cb just before a frame is popped. */
upb_tag_cb tag_cb;
upb_value_cb value_cb;
upb_submsg_start_cb submsg_start_cb;
upb_submsg_end_cb submsg_end_cb;
};
/* Parses up to len bytes of protobuf data out of buf, invoking the callbacks
* registered in s as needed. Data is parsed until no more data can be read
* from buf, or a callback sets s->done to true, or an error occurs. Returns a
* status code; on success, sets *read to the number of bytes consumed. */
upb_status_t upb_parse(struct upb_parse_state *s, void *buf, size_t len,
size_t *read);
/* Low-level parsing functions. ***********************************************/
/* Parses a single tag from the character data starting at buf, and updates
* buf to point one past the bytes that were consumed. buf will be incremented
* by at most ten bytes. */
upb_status_t parse_tag(uint8_t **buf, struct upb_tag *tag);
upb_status_t upb_parse_tag(void **buf, struct upb_tag *tag);
extern upb_wire_type_t upb_expected_wire_types[];
/* Returns true if wt is the correct on-the-wire type for ft. */
@ -35,19 +102,19 @@ INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) {
* caller must have previously checked that the wire type is appropriate for
* this field type. For delimited data, buf is advanced to the beginning of
* the delimited data, not the end. */
upb_status_t upb_parse_value(uint8_t **buf, upb_field_type_t ft,
upb_status_t upb_parse_value(void **buf, upb_field_type_t ft,
union upb_value *value);
/* Parses a wire value with the given type (which must have been obtained from
* a tag that was just parsed) and adds the number of bytes that were consumed
* to *offset. For delimited types, offset is advanced past the delimited
* data. */
upb_status_t upb_parse_wire_value(uint8_t *buf, size_t *offset,
upb_status_t upb_parse_wire_value(void *buf, size_t *offset,
upb_wire_type_t wt,
union upb_wire_value *wv);
/* Like the above, but discards the wire value instead of saving it. */
upb_status_t upb_skip_wire_value(uint8_t *buf, size_t *offset,
upb_status_t upb_skip_wire_value(void *buf, size_t *offset,
upb_wire_type_t wt);
#ifdef __cplusplus

@ -70,14 +70,14 @@ struct upb_struct_field *upb_struct_find_field_by_number(
/* Represents a string or bytes. */
struct upb_string {
size_t byte_len;
uint8_t *data;
void *data;
};
/* Represents an array (a repeated field) of any type. The interpretation of
* the data in the array depends on the type. */
struct upb_array {
size_t len; /* Measured in elements. */
uint8_t *data; /* Size of individual elements is based on type. */
void *data; /* Size of individual elements is based on type. */
};
/* A generic array of structs, using void* instead of specific types. */
@ -121,9 +121,9 @@ UPB_DEFINE_PRIMITIVE_ARRAY(bool, bool)
/* For each primitive type we define a set of six functions:
*
* // For fetching out of a struct (s points to the raw struct data).
* int32_t *upb_struct_get_int32_ptr(uint8_t *s, struct upb_struct_field *f);
* int32_t upb_struct_get_int32(uint8_t *s, struct upb_struct_field *f);
* void upb_struct_set_int32(uint8_t *s, struct upb_struct_field *f, int32_t val);
* int32_t *upb_struct_get_int32_ptr(void *s, struct upb_struct_field *f);
* int32_t upb_struct_get_int32(void *s, struct upb_struct_field *f);
* void upb_struct_set_int32(void *s, struct upb_struct_field *f, int32_t val);
*
* // For fetching out of an array.
* int32_t *upb_array_get_int32_ptr(struct upb_array *a, int n);
@ -137,15 +137,15 @@ UPB_DEFINE_PRIMITIVE_ARRAY(bool, bool)
#define UPB_DEFINE_ACCESSORS(ctype, name, INLINE) \
INLINE ctype *upb_struct_get_ ## name ## _ptr( \
uint8_t *s, struct upb_struct_field *f) { \
return (ctype*)(s + f->byte_offset); \
void *s, struct upb_struct_field *f) { \
return (ctype*)((char*)s + f->byte_offset); \
} \
INLINE ctype upb_struct_get_ ## name( \
uint8_t *s, struct upb_struct_field *f) { \
void *s, struct upb_struct_field *f) { \
return *upb_struct_get_ ## name ## _ptr(s, f); \
} \
INLINE void upb_struct_set_ ## name( \
uint8_t *s, struct upb_struct_field *f, ctype val) { \
void *s, struct upb_struct_field *f, ctype val) { \
*upb_struct_get_ ## name ## _ptr(s, f) = val; \
}
@ -173,42 +173,42 @@ UPB_DEFINE_ALL_ACCESSORS(uint64_t, uint64, INLINE)
UPB_DEFINE_ALL_ACCESSORS(bool, bool, INLINE)
UPB_DEFINE_ALL_ACCESSORS(struct upb_struct_delimited*, bytes, INLINE)
UPB_DEFINE_ALL_ACCESSORS(struct upb_struct_delimited*, string, INLINE)
UPB_DEFINE_ALL_ACCESSORS(uint8_t*, substruct, INLINE)
UPB_DEFINE_ALL_ACCESSORS(void*, substruct, INLINE)
UPB_DEFINE_ACCESSORS(struct upb_array*, array, INLINE)
/* Functions for reading and writing the "set" flags in the pbstruct. Note
* that these do not perform any memory management associated with any dynamic
* memory these fields may be referencing; that is the client's responsibility.
* These *only* set and test the flags. */
INLINE void upb_struct_set(uint8_t *s, struct upb_struct_field *f)
INLINE void upb_struct_set(void *s, struct upb_struct_field *f)
{
s[f->isset_byte_offset] |= f->isset_byte_mask;
((char*)s)[f->isset_byte_offset] |= f->isset_byte_mask;
}
INLINE void upb_struct_unset(uint8_t *s, struct upb_struct_field *f)
INLINE void upb_struct_unset(void *s, struct upb_struct_field *f)
{
s[f->isset_byte_offset] &= ~f->isset_byte_mask;
((char*)s)[f->isset_byte_offset] &= ~f->isset_byte_mask;
}
INLINE bool upb_struct_is_set(uint8_t *s, struct upb_struct_field *f)
INLINE bool upb_struct_is_set(void *s, struct upb_struct_field *f)
{
return s[f->isset_byte_offset] & f->isset_byte_mask;
return ((char*)s)[f->isset_byte_offset] & f->isset_byte_mask;
}
INLINE bool upb_struct_all_required_fields_set(
uint8_t *s, struct upb_struct_definition *d)
void *s, struct upb_struct_definition *d)
{
int num_fields = d->num_required_fields;
int i = 0;
while(num_fields > 8) {
if(s[i++] != 0xFF) return false;
if(((uint8_t*)s)[i++] != 0xFF) return false;
num_fields -= 8;
}
if(s[i] != (1 << num_fields) - 1) return false;
if(((uint8_t*)s)[i] != (1 << num_fields) - 1) return false;
return true;
}
INLINE void upb_struct_clear(uint8_t *s, struct upb_struct_definition *d)
INLINE void upb_struct_clear(void *s, struct upb_struct_definition *d)
{
memset(s, 0, d->set_flags_bytes);
}

Loading…
Cancel
Save