Changed parse API to know about msgdefs.

This should make it both easier to use and easier to
optimize, in exchange for a small amount of generality.
In practice, any remotely normal case is still very
natural.
pull/13171/head
Joshua Haberman 15 years ago
parent 7cde43ea0a
commit 868f118797
  1. 7
      Makefile
  2. 11
      src/upb.h
  3. 38
      src/upb_msg.c
  4. 189
      src/upb_parse.c
  5. 107
      src/upb_parse.h
  6. 22
      tests/tests.c
  7. 5
      tools/upbc.c

@ -42,6 +42,7 @@ clean:
rm -rf benchmark/google_messages.proto.pb benchmark/google_messages.pb.* benchmarks/b.* benchmarks/*.pb*
rm -rf tests/tests tests/t.* tests/test_table
rm -rf descriptor/descriptor.proto.pb
rm -rf tools/upbc deps
cd lang_ext/python && python setup.py clean --all
# The core library (src/libupb.a)
@ -86,8 +87,10 @@ test: tests
# Needs to be rewritten to separate the benchmark.
# valgrind --error-exitcode=1 ./tests/test_table
@for test in tests/t.* ; do \
echo $(VALGRIND) ./$$test; \
$(VALGRIND) ./$$test; \
if [ -f ./$$test ] ; then \
echo $(VALGRIND) ./$$test: \\c; \
$(VALGRIND) ./$$test; \
fi \
done;
tests/t.test_vs_proto2.googlemessage1 \

@ -140,13 +140,10 @@ union upb_value_ptr {
void *_void;
};
// Unfortunately there is no way to define this so that it can be used as a
// generic expression, a la:
// foo(UPB_VALUE_ADDROF(bar));
// ...you have to use it as the initializer of a upb_value_ptr:
// union upb_value_ptr p = UPB_VALUE_ADDROF(bar);
// foo(p);
#define UPB_VALUE_ADDROF(val) {(void*)&val._double}
INLINE union upb_value_ptr upb_value_addrof(union upb_value *val) {
union upb_value_ptr ptr = {&val->_double};
return ptr;
}
/**
* Converts upb_value_ptr -> upb_value by reading from the pointer. We need to

@ -50,35 +50,24 @@ static union upb_value_ptr get_value_ptr(struct upb_msg *msg,
/* Callbacks for the stream parser. */
static upb_field_type_t tag_cb(void *udata, struct upb_tag *tag,
void **user_field_desc)
static bool value_cb(void *udata, struct upb_msgdef *msgdef,
struct upb_fielddef *f, union upb_value val)
{
(void)msgdef;
struct upb_msgparser *mp = udata;
struct upb_fielddef *f =
upb_msg_fieldbynum(mp->top->msg->def, tag->field_number);
if(!f || !upb_check_type(tag->wire_type, f->type))
return 0; /* Skip unknown or fields of the wrong type. */
*user_field_desc = f;
return f->type;
}
static void *value_cb(void *udata, uint8_t *buf, uint8_t *end,
void *user_field_desc, struct upb_status *status)
{
struct upb_msgparser *mp = udata;
struct upb_fielddef *f = user_field_desc;
struct upb_msg *msg = mp->top->msg;
union upb_value_ptr p = get_value_ptr(msg, f);
upb_msg_set(msg, f);
return upb_parse_value(buf, end, f->type, p, status);
upb_value_write(p, val, f->type);
return true;
}
static void str_cb(void *udata, uint8_t *str,
size_t avail_len, size_t total_len,
void *udesc)
static bool str_cb(void *udata, struct upb_msgdef *msgdef,
struct upb_fielddef *f, uint8_t *str, size_t avail_len,
size_t total_len)
{
(void)msgdef;
struct upb_msgparser *mp = udata;
struct upb_fielddef *f = udesc;
struct upb_msg *msg = mp->top->msg;
union upb_value_ptr p = get_value_ptr(msg, f);
upb_msg_set(msg, f);
@ -98,12 +87,12 @@ static void str_cb(void *udata, uint8_t *str,
memcpy((*p.str)->ptr, str, avail_len);
(*p.str)->byte_len = avail_len;
//}
return true;
}
static void start_cb(void *udata, void *user_field_desc)
static void start_cb(void *udata, struct upb_fielddef *f)
{
struct upb_msgparser *mp = udata;
struct upb_fielddef *f = user_field_desc;
struct upb_msg *oldmsg = mp->top->msg;
union upb_value_ptr p = get_value_ptr(oldmsg, f);
@ -131,15 +120,14 @@ static void end_cb(void *udata)
struct upb_msgparser *upb_msgparser_new(struct upb_msgdef *def)
{
(void)def; // Not used atm.
struct upb_msgparser *mp = malloc(sizeof(struct upb_msgparser));
mp->s = upb_cbparser_new();
mp->s = upb_cbparser_new(def, value_cb, str_cb, start_cb, end_cb);
return mp;
}
void upb_msgparser_reset(struct upb_msgparser *s, struct upb_msg *msg, bool byref)
{
upb_cbparser_reset(s->s, s, tag_cb, value_cb, str_cb, start_cb, end_cb);
upb_cbparser_reset(s->s, s);
s->byref = byref;
s->top = s->stack;
s->top->msg = msg;

@ -6,8 +6,10 @@
#include "upb_parse.h"
#include <inttypes.h>
#include <stddef.h>
#include <stdlib.h>
#include "upb_def.h"
/* Functions to read wire values. *********************************************/
@ -297,21 +299,38 @@ uint8_t *upb_parse_value(uint8_t *buf, uint8_t *end, upb_field_type_t ft,
#undef CASE
}
// One entry of the parser's nesting stack: describes the message (top-level or
// submessage) being parsed at that depth.
struct upb_cbparser_frame {
// Definition of the message at this depth; the parser looks fields up in it.
struct upb_msgdef *msgdef;
// Offset at which this message ends, compared against the parser's
// completed_offset.  Zero means "no fixed end".
size_t end_offset; // For groups, 0.
};
struct upb_cbparser {
// Stack entries store the offset where the submsg ends (for groups, 0).
size_t stack[UPB_MAX_NESTING], *top, *limit;
size_t completed_offset;
void *udata;
upb_tag_cb tag_cb;
// Immutable state of the parser.
struct upb_msgdef *toplevel_msgdef;
upb_value_cb value_cb;
upb_str_cb str_cb;
upb_start_cb start_cb;
upb_end_cb end_cb;
// State pertaining to a particular parse (resettable).
// Stack entries store the offset where the submsg ends (for groups, 0).
struct upb_cbparser_frame stack[UPB_MAX_NESTING], *top, *limit;
size_t completed_offset;
void *udata;
};
struct upb_cbparser *upb_cbparser_new(void)
struct upb_cbparser *upb_cbparser_new(struct upb_msgdef *msgdef,
upb_value_cb valuecb, upb_str_cb strcb,
upb_start_cb startcb, upb_end_cb endcb)
{
return malloc(sizeof(struct upb_cbparser));
struct upb_cbparser *p = malloc(sizeof(struct upb_cbparser));
p->toplevel_msgdef = msgdef;
p->value_cb = valuecb;
p->str_cb = strcb;
p->start_cb = startcb;
p->end_cb = endcb;
p->limit = &p->stack[UPB_MAX_NESTING];
return p;
}
void upb_cbparser_free(struct upb_cbparser *p)
@ -319,145 +338,165 @@ void upb_cbparser_free(struct upb_cbparser *p)
free(p);
}
void upb_cbparser_reset(struct upb_cbparser *p, void *udata,
upb_tag_cb tagcb,
upb_value_cb valuecb,
upb_str_cb strcb,
upb_start_cb startcb,
upb_end_cb endcb)
void upb_cbparser_reset(struct upb_cbparser *p, void *udata)
{
p->top = p->stack;
p->limit = &p->stack[UPB_MAX_NESTING];
p->completed_offset = 0;
p->udata = udata;
p->tag_cb = tagcb;
p->value_cb = valuecb;
p->str_cb = strcb;
p->start_cb = startcb;
p->end_cb = endcb;
p->top->msgdef = p->toplevel_msgdef;
// The top-level message is not delimited (we can keep receiving data for it
// indefinitely), so we treat it like a group.
*p->top = 0;
p->top->end_offset = 0;
}
// Returns where the message on top of the stack ends, as a pointer relative to
// "start" (the buffer position that corresponds to p->completed_offset).
// A frame whose end_offset is zero has no fixed end (a group), in which case
// the sentinel (void*)UINTPTR_MAX is returned instead.
static void *get_msgend(struct upb_cbparser *p, uint8_t *start)
{
  const size_t frame_end = p->top->end_offset;
  if(frame_end == 0) {
    return (void*)UINTPTR_MAX;  // group.
  }
  return start + (frame_end - p->completed_offset);
}
// Tells whether a submessage-end pointer is the sentinel value
// (void*)UINTPTR_MAX, which marks a message that has no fixed end (a group).
static bool isgroup(void *submsg_end)
{
  void *const group_sentinel = (void*)UINTPTR_MAX;
  return group_sentinel == submsg_end;
}
extern upb_wire_type_t upb_expected_wire_types[];
// Reports whether wire type "wt" matches the wire type that upb_type_info
// lists as expected for field type "ft".
// Packed arrays are not currently supported by this check.
INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) {
  return wt == upb_type_info[ft].expected_wire_type;
}
/**
* Pushes a new stack frame for a submessage with the given len (which will
* be zero if the submessage is a group).
*/
static uint8_t *push(struct upb_cbparser *s, uint8_t *start,
uint32_t submsg_len, void *user_field_desc,
static uint8_t *push(struct upb_cbparser *p, uint8_t *start,
uint32_t submsg_len, struct upb_fielddef *f,
struct upb_status *status)
{
s->top++;
if(s->top >= s->limit) {
p->top++;
if(p->top >= p->limit) {
upb_seterr(status, UPB_STATUS_ERROR,
"Nesting exceeded maximum (%d levels)\n",
UPB_MAX_NESTING);
return NULL;
}
*s->top = s->completed_offset + submsg_len;
struct upb_cbparser_frame *frame = p->top;
frame->end_offset = p->completed_offset + submsg_len;
frame->msgdef = f->ref.msg;
if(s->start_cb)
s->start_cb(s->udata, user_field_desc);
if(*s->top > 0)
return start + (*s->top - s->completed_offset);
else
return (void*)UINTPTR_MAX;
if(p->start_cb) p->start_cb(p->udata, f);
return get_msgend(p, start);
}
/**
* Pops a stack frame, returning a pointer for where the next submsg should
* end (or a pointer that is out of range for a group).
*/
static void *pop(struct upb_cbparser *s, uint8_t *start)
static void *pop(struct upb_cbparser *p, uint8_t *start)
{
if(s->end_cb)
s->end_cb(s->udata);
s->top--;
if(*s->top > 0)
return (char*)start + (*s->top - s->completed_offset);
else
return (void*)UINTPTR_MAX; // group.
if(p->end_cb) p->end_cb(p->udata);
p->top--;
return get_msgend(p, start);
}
size_t upb_cbparser_parse(struct upb_cbparser *s, void *_buf, size_t len,
size_t upb_cbparser_parse(struct upb_cbparser *p, void *_buf, size_t len,
struct upb_status *status)
{
// buf is our current offset, moves from start to end.
uint8_t *buf = _buf;
uint8_t *completed = buf;
uint8_t *const start = buf; // ptr equivalent of s->completed_offset
uint8_t *const start = buf; // ptr equivalent of p->completed_offset
uint8_t *end = buf + len;
uint8_t *submsg_end = *s->top > 0 ? buf + *s->top : (uint8_t*)UINTPTR_MAX;
// When we have fully parsed a tag/value pair, we advance this.
uint8_t *completed = buf;
uint8_t *submsg_end = get_msgend(p, start);
struct upb_msgdef *msgdef = p->top->msgdef;
bool keep_going = true;
// Make local copies so optimizer knows they won't change.
upb_tag_cb tag_cb = s->tag_cb;
upb_str_cb str_cb = s->str_cb;
upb_value_cb value_cb = s->value_cb;
void *udata = s->udata;
upb_str_cb str_cb = p->str_cb;
upb_value_cb value_cb = p->value_cb;
void *udata = p->udata;
// We need to check the status of operations that can fail, but we do so as
// late as possible to avoid introducing branches that have to wait on
// (status->code) which must be loaded from memory.
#define CHECK_STATUS() do { if(!upb_ok(status)) goto err; } while(0)
// Main loop: parse a tag, then handle the value.
while(buf < end) {
// Main loop: parse a tag, find the appropriate fielddef.
while(keep_going && buf < end) {
struct upb_tag tag;
buf = parse_tag(buf, end, &tag, status);
if(tag.wire_type == UPB_WIRE_TYPE_END_GROUP) {
CHECK_STATUS();
submsg_end = pop(s, start);
if(!isgroup(submsg_end)) {
upb_seterr(status, UPB_STATUS_ERROR, "End group seen but current "
"message is not a group, byte offset: %zd",
p->completed_offset + (completed - start));
goto err;
}
submsg_end = pop(p, start);
msgdef = p->top->msgdef;
completed = buf;
continue;
}
void *udesc;
upb_field_type_t ft = tag_cb(udata, &tag, &udesc);
struct upb_fielddef *f = upb_msg_fieldbynum(msgdef, tag.field_number);
if(tag.wire_type == UPB_WIRE_TYPE_DELIMITED) {
int32_t delim_len;
buf = upb_get_INT32(buf, end, &delim_len, status);
CHECK_STATUS();
uint8_t *delim_end = buf + delim_len;
if(ft == UPB_TYPENUM(MESSAGE)) {
submsg_end = push(s, start, delim_end - start, udesc, status);
if(f && f->type == UPB_TYPENUM(MESSAGE)) {
submsg_end = push(p, start, delim_end - start, f, status);
msgdef = p->top->msgdef;
} else {
if(upb_isstringtype(ft)) {
if(f && upb_isstringtype(f->type)) {
size_t avail_len = UPB_MIN(delim_end, end) - buf;
str_cb(udata, buf, avail_len, delim_end - buf, udesc);
keep_going =
str_cb(udata, msgdef, f, buf, avail_len, delim_end - buf);
} // else { TODO: packed arrays }
// If field was not found, it is skipped silently.
buf = delim_end; // Could be >end.
}
} else {
// Scalar (non-delimited) value.
switch(ft) {
case 0: // Client elected to skip.
buf = skip_wire_value(buf, end, tag.wire_type, status);
break;
case UPB_TYPENUM(GROUP):
submsg_end = push(s, start, 0, udesc, status);
break;
default:
buf = value_cb(udata, buf, end, udesc, status);
break;
if(!f || !upb_check_type(tag.wire_type, f->type)) {
buf = skip_wire_value(buf, end, tag.wire_type, status);
} else if (f->type == UPB_TYPENUM(GROUP)) {
submsg_end = push(p, start, 0, f, status);
msgdef = p->top->msgdef;
} else {
union upb_value val;
buf = upb_parse_value(buf, end, f->type, upb_value_addrof(&val),
status);
keep_going = value_cb(udata, msgdef, f, val);
}
}
CHECK_STATUS();
while(buf >= submsg_end) {
if(buf > submsg_end) {
return UPB_STATUS_ERROR; // Bad submessage end.
upb_seterr(status, UPB_STATUS_ERROR, "Expected submsg end offset "
"did not lie on a tag/value boundary.");
goto err;
}
submsg_end = pop(s, start);
submsg_end = pop(p, start);
msgdef = p->top->msgdef;
}
// while(buf < s->packed_end) { TODO: packed arrays }
// while(buf < p->packed_end) { TODO: packed arrays }
completed = buf;
}
size_t read;
err:
read = (char*)completed - (char*)start;
s->completed_offset += read;
p->completed_offset += read;
return read;
}

@ -6,6 +6,9 @@
* into in-memory messages (a more DOM-like model), see the routines in
* upb_msg.h, which are layered on top of this parser.
*
* TODO: the parser currently does not support returning unknown values. This
* can easily be added when it is needed.
*
* Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
*/
@ -23,105 +26,63 @@ extern "C" {
/* Event Callbacks. ***********************************************************/
// The tag callback is called immediately after a tag has been parsed. The
// client should determine whether it wants to parse or skip the corresponding
// value. If it wants to parse it, it must discover and return the correct
// .proto type (the tag only contains the wire type) and check that the wire
// type is appropriate for the .proto type. Returning a type for which
// upb_check_type(tag->wire_type, type) == false invokes undefined behavior.
//
// To skip the value (which means skipping all submessages, in the case of a
// submessage), the callback should return zero.
//
// The client can store a void* in *user_field_desc; this will be passed to
// the value callback or the string callback.
typedef upb_field_type_t (*upb_tag_cb)(void *udata, struct upb_tag *tag,
void **user_field_desc);
// The value callback is called when a regular value (ie. not a string or
// submessage) is encountered which the client has opted to parse (by not
// returning 0 from the tag_cb). The client must parse the value by calling
// upb_parse_value(), returning success or failure accordingly.
// submessage) is encountered which was defined in the upb_msgdef. The client
// returns true to continue the parse or false to halt it.
//
// Note that this callback can be called several times in a row for a single
// call to tag_cb in the case of packed arrays.
typedef void *(*upb_value_cb)(void *udata, uint8_t *buf, uint8_t *end,
void *user_field_desc, struct upb_status *status);
typedef bool (*upb_value_cb)(void *udata, struct upb_msgdef *msgdef,
struct upb_fielddef *f, union upb_value val);
// The string callback is called when a string is parsed. avail_len is the
// number of bytes that are currently available at str. If the client is
// streaming and the current buffer ends in the middle of the string, this
// number could be less than total_len.
typedef void (*upb_str_cb)(void *udata, uint8_t *str, size_t avail_len,
size_t total_len, void *user_field_desc);
// The string callback is called when a string that was defined in the
// upb_msgdef is parsed. avail_len is the number of bytes that are currently
// available at str. If the client is streaming and the current buffer ends in
// the middle of the string, this number could be less than total_len.
typedef bool (*upb_str_cb)(void *udata, struct upb_msgdef *msgdef,
struct upb_fielddef *f, uint8_t *str,
size_t avail_len, size_t total_len);
// The start and end callbacks are called when a submessage begins and ends,
// respectively.
typedef void (*upb_start_cb)(void *udata, void *user_field_desc);
typedef void (*upb_start_cb)(void *udata, struct upb_fielddef *f);
typedef void (*upb_end_cb)(void *udata);
/* Callback parser interface. *************************************************/
// Allocates and frees a upb_cbparser, respectively.
struct upb_cbparser *upb_cbparser_new(void);
// Allocates and frees a upb_cbparser, respectively. Callbacks may be NULL,
// in which case they will be skipped.
struct upb_cbparser *upb_cbparser_new(struct upb_msgdef *md,
upb_value_cb valuecb, upb_str_cb strcb,
upb_start_cb startcb, upb_end_cb endcb);
void upb_cbparser_free(struct upb_cbparser *p);
// Resets the internal state of an already-allocated parser. Parsers must be
// reset before they can be used. A parser can be reset multiple times. udata
// will be passed as the first argument to callbacks.
//
// tagcb must be set, but all other callbacks can be NULL, in which case they
// will just be skipped.
void upb_cbparser_reset(struct upb_cbparser *p, void *udata,
upb_tag_cb tagcb,
upb_value_cb valuecb,
upb_str_cb strcb,
upb_start_cb startcb,
upb_end_cb endcb);
// Resets the internal state of an already-allocated parser. This puts it in a
// state where it has not seen any data, and expects the next data to be from
// the beginning of a new protobuf. Parsers must be reset before they can be
// used. A parser can be reset multiple times. udata will be passed as the
// first argument to callbacks.
void upb_cbparser_reset(struct upb_cbparser *p, void *udata);
// Parses up to len bytes of protobuf data out of buf, calling the appropriate
// callbacks as values are parsed.
//
// Success or failure is reported through *status (check it with upb_ok()). Data
// is parsed until no more data can be read from buf, or the callback returns an
// error like UPB_STATUS_USER_CANCELLED, or an error occurs.
// is parsed until no more data can be read from buf, or a user callback
// returns false, or an error occurs.
//
// *read is set to the number of bytes consumed. Note that this can be greater
// than len in the case that a string was recognized that spans beyond the end
// of the currently provided data.
// The function returns the number of bytes consumed. Note that this can be
// greater than len in the case that a string was recognized that spans beyond
// the end of the currently provided data.
//
// The next call to upb_parse must be the first byte after buf + *read, even in
// the case that *read > len.
// The buffer passed to the next call to upb_cbparser_parse must begin with the
// byte at buf + retval, even in the case that retval > len.
//
// TODO: see if we can provide the following guarantee efficiently:
// *read will always be >= len. */
// retval will always be >= len.
size_t upb_cbparser_parse(struct upb_cbparser *p, void *buf, size_t len,
struct upb_status *status);
extern upb_wire_type_t upb_expected_wire_types[];
// Reports whether wire type "wt" matches the wire type that upb_type_info
// lists as expected for field type "ft".
// Packed arrays are not currently supported by this check.
INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) {
  return wt == upb_type_info[ft].expected_wire_type;
}
/* Data-consuming functions (to be called from value cb). *********************/
// Parses and converts a value from the character data starting at buf (but not
// past end). Returns a pointer that is one past the data that was read. The
// caller must have previously checked that the wire type is appropriate for
// this field type.
uint8_t *upb_parse_value(uint8_t *buf, uint8_t *end, upb_field_type_t ft,
union upb_value_ptr v, struct upb_status *status);
// Parses a wire value with the given type (which must have been obtained from
// a tag that was just parsed) and returns a pointer to one past the data that
// was read.
uint8_t *upb_parse_wire_value(uint8_t *buf, uint8_t *end, upb_wire_type_t wt,
union upb_wire_value *wv,
struct upb_status *status);
#ifdef __cplusplus
} /* extern "C" */
#endif

@ -234,27 +234,6 @@ static void test_upb_context() {
}
// Stub tag callback for the parser smoke test: ignores all of its arguments
// and always returns 0.
static upb_field_type_t tag_cb(void *udata, struct upb_tag *tag,
                               void **user_field_desc)
{
  // None of the arguments are needed; the casts silence unused warnings.
  (void)user_field_desc;
  (void)tag;
  (void)udata;
  return 0;
}
// Smoke test for the callback parser: a freshly reset parser that is handed an
// empty buffer must report success and consume zero bytes.
static void test_cbparser()
{
  struct upb_cbparser *p = upb_cbparser_new();
  ASSERT(p);
  upb_cbparser_reset(p, NULL, tag_cb, NULL, NULL, NULL, NULL);

  struct upb_status status = UPB_STATUS_INIT;
  size_t read = upb_cbparser_parse(p, NULL, 0, &status);
  ASSERT(upb_ok(&status));
  ASSERT(read == 0);

  // Release the parser; it was previously left allocated when the test ended.
  upb_cbparser_free(p);
}
int main()
{
#define TEST(func) do { \
@ -269,7 +248,6 @@ int main()
TEST(test_skip_v_uint64_t);
TEST(test_get_f_uint32_t);
TEST(test_upb_context);
TEST(test_cbparser);
printf("All tests passed (%d assertions).\n", num_assertions);
return 0;
}

@ -79,7 +79,7 @@ static void write_const_h(struct upb_symtab_entry *entries[], int num_entries,
for(int i = 0; i < num_entries; i++) { /* Foreach enum */
if(entries[i]->type != UPB_SYM_ENUM) continue;
struct upb_symtab_entry *entry = entries[i];
struct upb_enum *e = entry->ref._enum;
struct upb_enumdef *e = entry->ref._enum;
google_protobuf_EnumDescriptorProto *ed = e->descriptor;
/* We use entry->e.key (the fully qualified name) instead of ed->name. */
struct upb_string *enum_name = upb_strdup(&entry->e.key);
@ -476,8 +476,7 @@ static void write_message_c(void *data, struct upb_msgdef *m,
.type = GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE,
.ref = {.msg = m}
};
union upb_value_ptr p = UPB_VALUE_ADDROF(val);
add_value(p, &fake_field, &types);
add_value(upb_value_addrof(&val), &fake_field, &types);
add_submsgs(data, m, &types);
/* Emit forward declarations for all msgs of all types, and define arrays. */

Loading…
Cancel
Save