upb_parser -> upb_decoder

pull/13171/head
Joshua Haberman 15 years ago
parent 57d6353a3c
commit 9116c697f8
  1. 2
      Makefile
  2. 18
      benchmarks/parsetostruct.upb_table.c
  3. 14
      src/upb_data.c
  4. 4
      src/upb_data.h
  5. 134
      src/upb_decoder.c
  6. 57
      src/upb_decoder.h
  7. 2
      src/upb_def.c
  8. 57
      src/upb_parse.h
  9. 4
      tests/test_vs_proto2.cc
  10. 2
      tests/tests.c
  11. 2
      tools/upbc.c

@ -46,7 +46,7 @@ clean:
cd lang_ext/python && python setup.py clean --all
# The core library (src/libupb.a)
SRC=src/upb.c src/upb_parse.c src/upb_table.c src/upb_def.c src/upb_data.c \
SRC=src/upb.c src/upb_decoder.c src/upb_table.c src/upb_def.c src/upb_data.c \
src/upb_encoder.c descriptor/descriptor.c src/upb_text.c
# Override the optimization level for upb_def.o, because it is not in the
# critical path but gets very large when -O3 is used.

@ -3,18 +3,18 @@
#include "upb_data.h"
#include "upb_def.h"
#include "upb_parse.h"
#include "upb_decoder.h"
static struct upb_symtab *s;
static upb_strptr str;
static struct upb_msgdef *def;
static upb_msg *msgs[NUM_MESSAGES];
static upb_parser *parser;
static upb_decoder *decoder;
static upb_msgsink *sink;
static bool initialize()
{
// Initialize upb state, parse descriptor.
// Initialize upb state, decode descriptor.
struct upb_status status = UPB_STATUS_INIT;
s = upb_symtab_new();
upb_strptr fds = upb_strreadfile(MESSAGE_DESCRIPTOR_FILE);
@ -49,7 +49,7 @@ static bool initialize()
fprintf(stderr, "Error reading " MESSAGE_FILE "\n");
return false;
}
parser = upb_parser_new(def);
decoder = upb_decoder_new(def);
sink = upb_msgsink_new(def);
return true;
}
@ -60,7 +60,7 @@ static void cleanup()
upb_msg_unref(msgs[i], def);
upb_string_unref(str);
upb_symtab_unref(s);
upb_parser_free(parser);
upb_decoder_free(decoder);
upb_msgsink_free(sink);
}
@ -69,11 +69,11 @@ static size_t run(int i)
struct upb_status status = UPB_STATUS_INIT;
upb_msg *msg = msgs[i%NUM_MESSAGES];
upb_msgsink_reset(sink, msg);
upb_parser_reset(parser, upb_msgsink_sink(sink));
upb_decoder_reset(decoder, upb_msgsink_sink(sink));
upb_msg_clear(msg, def);
size_t parsed = upb_parser_parse(parser, str, &status);
if(!upb_ok(&status) || parsed != upb_strlen(str)) {
fprintf(stderr, "Parse error: %s\n", status.msg);
size_t decoded = upb_decoder_decode(decoder, str, &status);
if(!upb_ok(&status) || decoded != upb_strlen(str)) {
fprintf(stderr, "Decode error: %s\n", status.msg);
return 0;
}
return upb_strlen(str);

@ -6,8 +6,8 @@
#include <stdlib.h>
#include "upb_data.h"
#include "upb_decoder.h"
#include "upb_def.h"
#include "upb_parse.h"
static uint32_t round_up_to_pow2(uint32_t v)
{
@ -282,18 +282,18 @@ void _upb_msg_free(upb_msg *msg, struct upb_msgdef *md)
free(msg);
}
void upb_msg_parsestr(upb_msg *msg, struct upb_msgdef *md, upb_strptr str,
struct upb_status *status)
void upb_msg_decodestr(upb_msg *msg, struct upb_msgdef *md, upb_strptr str,
struct upb_status *status)
{
upb_parser *p = upb_parser_new(md);
upb_decoder *d = upb_decoder_new(md);
upb_msgsink *s = upb_msgsink_new(md);
upb_msgsink_reset(s, msg);
upb_parser_reset(p, upb_msgsink_sink(s));
upb_decoder_reset(d, upb_msgsink_sink(s));
upb_msg_clear(msg, md);
upb_parser_parse(p, str, status);
upb_decoder_decode(d, str, status);
upb_parser_free(p);
upb_decoder_free(d);
upb_msgsink_free(s);
}

@ -516,8 +516,8 @@ INLINE void upb_msg_clear(upb_msg *msg, struct upb_msgdef *md) {
// A convenience function for parsing an entire protobuf all at once, without
// having to worry about setting up the appropriate objects.
void upb_msg_parsestr(upb_msg *msg, struct upb_msgdef *md, upb_strptr str,
struct upb_status *status);
void upb_msg_decodestr(upb_msg *msg, struct upb_msgdef *md, upb_strptr str,
struct upb_status *status);
/* upb_msgsrc *****************************************************************/

@ -4,7 +4,7 @@
* Copyright (c) 2008-2009 Joshua Haberman. See LICENSE for details.
*/
#include "upb_parse.h"
#include "upb_decoder.h"
#include <inttypes.h>
#include <stddef.h>
@ -13,7 +13,7 @@
/* Functions to read wire values. *********************************************/
// These functions are internal to the parser, but might be moved into an
// These functions are internal to the decoder, but might be moved into an
// internal header file if we at some point in the future opt to do code
// generation, because the generated code would want to inline these functions.
// The same applies to the functions to read .proto values below.
@ -200,8 +200,8 @@ T(FLOAT, f, uint32_t, float, _float) {
#undef T
// Decodes a tag, places the result in *tag.
INLINE const uint8_t *parse_tag(const uint8_t *buf, const uint8_t *end,
struct upb_tag *tag, struct upb_status *status)
INLINE const uint8_t *decode_tag(const uint8_t *buf, const uint8_t *end,
struct upb_tag *tag, struct upb_status *status)
{
uint32_t tag_int;
const uint8_t *ret = upb_get_v_uint32_t(buf, end, &tag_int, status);
@ -239,10 +239,10 @@ const uint8_t *upb_get_v_uint64_t_full(const uint8_t *buf, const uint8_t *end,
return buf;
}
const uint8_t *upb_parse_wire_value(uint8_t *buf, uint8_t *end,
upb_wire_type_t wt,
union upb_wire_value *wv,
struct upb_status *status)
const uint8_t *upb_decode_wire_value(uint8_t *buf, uint8_t *end,
upb_wire_type_t wt,
union upb_wire_value *wv,
struct upb_status *status)
{
switch(wt) {
case UPB_WIRE_TYPE_VARINT:
@ -282,10 +282,10 @@ static const uint8_t *skip_wire_value(const uint8_t *buf, const uint8_t *end,
}
}
static const uint8_t *upb_parse_value(const uint8_t *buf, const uint8_t *end,
upb_field_type_t ft,
union upb_value_ptr v,
struct upb_status *status)
static const uint8_t *upb_decode_value(const uint8_t *buf, const uint8_t *end,
upb_field_type_t ft,
union upb_value_ptr v,
struct upb_status *status)
{
#define CASE(t, member_name) \
case UPB_TYPE(t): return upb_get_ ## t(buf, end, v.member_name, status);
@ -311,52 +311,52 @@ static const uint8_t *upb_parse_value(const uint8_t *buf, const uint8_t *end,
#undef CASE
}
struct upb_parser_frame {
struct upb_decoder_frame {
struct upb_msgdef *msgdef;
struct upb_fielddef *field;
size_t end_offset; // For groups, 0.
};
struct upb_parser {
// Immutable state of the parser.
struct upb_decoder {
// Immutable state of the decoder.
struct upb_msgdef *toplevel_msgdef;
upb_sink *sink;
// State pertaining to a particular parse (resettable).
// State pertaining to a particular decode (resettable).
// Stack entries store the offset where the submsg ends (for groups, 0).
struct upb_parser_frame stack[UPB_MAX_NESTING], *top, *limit;
struct upb_decoder_frame stack[UPB_MAX_NESTING], *top, *limit;
size_t completed_offset;
void *udata;
};
upb_parser *upb_parser_new(struct upb_msgdef *msgdef)
upb_decoder *upb_decoder_new(struct upb_msgdef *msgdef)
{
upb_parser *p = malloc(sizeof(*p));
p->toplevel_msgdef = msgdef;
p->limit = &p->stack[UPB_MAX_NESTING];
return p;
upb_decoder *d = malloc(sizeof(*d));
d->toplevel_msgdef = msgdef;
d->limit = &d->stack[UPB_MAX_NESTING];
return d;
}
void upb_parser_free(upb_parser *p)
void upb_decoder_free(upb_decoder *d)
{
free(p);
free(d);
}
void upb_parser_reset(upb_parser *p, upb_sink *sink)
void upb_decoder_reset(upb_decoder *d, upb_sink *sink)
{
p->top = p->stack;
p->completed_offset = 0;
p->sink = sink;
p->top->msgdef = p->toplevel_msgdef;
d->top = d->stack;
d->completed_offset = 0;
d->sink = sink;
d->top->msgdef = d->toplevel_msgdef;
// The top-level message is not delimited (we can keep receiving data for it
// indefinitely), so we treat it like a group.
p->top->end_offset = 0;
d->top->end_offset = 0;
}
static const void *get_msgend(upb_parser *p, const uint8_t *start)
static const void *get_msgend(upb_decoder *d, const uint8_t *start)
{
if(p->top->end_offset > 0)
return start + (p->top->end_offset - p->completed_offset);
if(d->top->end_offset > 0)
return start + (d->top->end_offset - d->completed_offset);
else
return (void*)UINTPTR_MAX; // group.
}
@ -378,50 +378,50 @@ INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) {
* Pushes a new stack frame for a submessage with the given len (which will
* be zero if the submessage is a group).
*/
static const uint8_t *push(upb_parser *p, const uint8_t *start,
static const uint8_t *push(upb_decoder *d, const uint8_t *start,
uint32_t submsg_len, struct upb_fielddef *f,
struct upb_status *status)
{
p->top->field = f;
p->top++;
if(p->top >= p->limit) {
d->top->field = f;
d->top++;
if(d->top >= d->limit) {
upb_seterr(status, UPB_STATUS_ERROR,
"Nesting exceeded maximum (%d levels)\n",
UPB_MAX_NESTING);
return NULL;
}
struct upb_parser_frame *frame = p->top;
frame->end_offset = p->completed_offset + submsg_len;
struct upb_decoder_frame *frame = d->top;
frame->end_offset = d->completed_offset + submsg_len;
frame->msgdef = upb_downcast_msgdef(f->def);
upb_sink_onstart(p->sink, f);
return get_msgend(p, start);
upb_sink_onstart(d->sink, f);
return get_msgend(d, start);
}
/**
* Pops a stack frame, returning a pointer for where the next submsg should
* end (or a pointer that is out of range for a group).
*/
static const void *pop(upb_parser *p, const uint8_t *start)
static const void *pop(upb_decoder *d, const uint8_t *start)
{
p->top--;
upb_sink_onend(p->sink, p->top->field);
return get_msgend(p, start);
d->top--;
upb_sink_onend(d->sink, d->top->field);
return get_msgend(d, start);
}
size_t upb_parser_parse(upb_parser *p, upb_strptr str, struct upb_status *status)
size_t upb_decoder_decode(upb_decoder *d, upb_strptr str, struct upb_status *status)
{
// buf is our current offset, moves from start to end.
const uint8_t *buf = (uint8_t*)upb_string_getrobuf(str);
const uint8_t *const start = buf; // ptr equivalent of p->completed_offset
const uint8_t *const start = buf; // ptr equivalent of d->completed_offset
const uint8_t *const end = buf + upb_strlen(str);
// When we have fully parsed a tag/value pair, we advance this.
// When we have fully decoded a tag/value pair, we advance this.
const uint8_t *completed = buf;
const uint8_t *submsg_end = get_msgend(p, start);
struct upb_msgdef *msgdef = p->top->msgdef;
const uint8_t *submsg_end = get_msgend(d, start);
struct upb_msgdef *msgdef = d->top->msgdef;
upb_sink_status sink_status = UPB_SINK_CONTINUE;
// We need to check the status of operations that can fail, but we do so as
@ -434,17 +434,17 @@ size_t upb_parser_parse(upb_parser *p, upb_strptr str, struct upb_status *status
while(sink_status == UPB_SINK_CONTINUE && buf < end) {
// Decode/handle tag.
struct upb_tag tag;
buf = parse_tag(buf, end, &tag, status);
buf = decode_tag(buf, end, &tag, status);
if(tag.wire_type == UPB_WIRE_TYPE_END_GROUP) {
CHECK_STATUS();
if(!isgroup(submsg_end)) {
upb_seterr(status, UPB_STATUS_ERROR, "End group seen but current "
"message is not a group, byte offset: %zd",
p->completed_offset + (completed - start));
d->completed_offset + (completed - start));
goto err;
}
submsg_end = pop(p, start);
msgdef = p->top->msgdef;
submsg_end = pop(d, start);
msgdef = d->top->msgdef;
completed = buf;
continue;
}
@ -456,16 +456,16 @@ size_t upb_parser_parse(upb_parser *p, upb_strptr str, struct upb_status *status
if(tag.wire_type == UPB_WIRE_TYPE_DELIMITED) {
int32_t delim_len;
buf = upb_get_INT32(buf, end, &delim_len, status);
CHECK_STATUS(); // Checking parse_tag() and upb_get_INT32().
CHECK_STATUS(); // Checking decode_tag() and upb_get_INT32().
const uint8_t *delim_end = buf + delim_len;
if(f && f->type == UPB_TYPE(MESSAGE)) {
submsg_end = push(p, start, delim_end - start, f, status);
msgdef = p->top->msgdef;
submsg_end = push(d, start, delim_end - start, f, status);
msgdef = d->top->msgdef;
} else {
if(f && upb_isstringtype(f->type)) {
int32_t str_start = buf - start;
sink_status =
upb_sink_onstr(p->sink, f, str, str_start, str_start + delim_len);
upb_sink_onstr(d->sink, f, str, str_start, str_start + delim_len);
} // else { TODO: packed arrays }
// If field was not found, it is skipped silently.
buf = delim_end; // Could be >end.
@ -474,14 +474,14 @@ size_t upb_parser_parse(upb_parser *p, upb_strptr str, struct upb_status *status
if(!f || !upb_check_type(tag.wire_type, f->type)) {
buf = skip_wire_value(buf, end, tag.wire_type, status);
} else if (f->type == UPB_TYPE(GROUP)) {
submsg_end = push(p, start, 0, f, status);
msgdef = p->top->msgdef;
submsg_end = push(d, start, 0, f, status);
msgdef = d->top->msgdef;
} else {
union upb_value val;
buf = upb_parse_value(buf, end, f->type, upb_value_addrof(&val),
buf = upb_decode_value(buf, end, f->type, upb_value_addrof(&val),
status);
CHECK_STATUS(); // Checking upb_parse_value().
sink_status = upb_sink_onvalue(p->sink, f, val);
CHECK_STATUS(); // Checking upb_decode_value().
sink_status = upb_sink_onvalue(d->sink, f, val);
}
}
CHECK_STATUS();
@ -492,16 +492,16 @@ size_t upb_parser_parse(upb_parser *p, upb_strptr str, struct upb_status *status
"did not lie on a tag/value boundary.");
goto err;
}
submsg_end = pop(p, start);
msgdef = p->top->msgdef;
submsg_end = pop(d, start);
msgdef = d->top->msgdef;
}
// while(buf < p->packed_end) { TODO: packed arrays }
// while(buf < d->packed_end) { TODO: packed arrays }
completed = buf;
}
size_t read;
err:
read = (char*)completed - (char*)start;
p->completed_offset += read;
d->completed_offset += read;
return read;
}

@ -0,0 +1,57 @@
/*
* upb - a minimalist implementation of protocol buffers.
*
* upb_decoder implements a high performance, callback-based, stream-oriented
* decoder (comparable to the SAX model in XML parsers). For parsing protobufs
* into in-memory messages (a more DOM-like model), see the routines in
* upb_msg.h, which are layered on top of this decoder.
*
* TODO: the decoder currently does not support returning unknown values. This
* can easily be added when it is needed.
*
* Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
*/
#ifndef UPB_DECODER_H_
#define UPB_DECODER_H_
#include <stdbool.h>
#include <stdint.h>
#include "upb.h"
#include "descriptor.h"
#ifdef __cplusplus
extern "C" {
#endif
/* upb_decoder *****************************************************************/
// A upb_decoder decodes the binary protocol buffer format, writing the data it
// decodes to a upb_sink.
struct upb_decoder;
typedef struct upb_decoder upb_decoder;
// Allocates and frees a upb_decoder, respectively.
upb_decoder *upb_decoder_new(struct upb_msgdef *md);
void upb_decoder_free(upb_decoder *p);
// Resets the internal state of an already-allocated decoder. This puts it in a
// state where it has not seen any data, and expects the next data to be from
// the beginning of a new protobuf. Decoders must be reset before they can be
// used. A decoder can be reset multiple times.
void upb_decoder_reset(upb_decoder *p, upb_sink *sink);
// Decodes protobuf data out of str, returning how much data was decoded. The
// next call to upb_decoder_decode should begin with the first byte that was
// not decoded. "status" indicates whether an error occurred.
//
// TODO: provide the following guarantee:
// retval will always be >= len.
size_t upb_decoder_decode(upb_decoder *p, upb_strptr str,
struct upb_status *status);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* UPB_DECODER_H_ */

@ -829,7 +829,7 @@ void upb_symtab_add_desc(struct upb_symtab *s, upb_strptr desc,
struct upb_status *status)
{
upb_msg *fds = upb_msg_new(s->fds_msgdef);
upb_msg_parsestr(fds, s->fds_msgdef, desc, status);
upb_msg_decodestr(fds, s->fds_msgdef, desc, status);
if(!upb_ok(status)) return;
upb_symtab_addfds(s, (google_protobuf_FileDescriptorSet*)fds, status);
upb_msg_unref(fds, s->fds_msgdef);

@ -1,57 +0,0 @@
/*
* upb - a minimalist implementation of protocol buffers.
*
* upb_parse implements a high performance, callback-based, stream-oriented
* parser (comparable to the SAX model in XML parsers). For parsing protobufs
* into in-memory messages (a more DOM-like model), see the routines in
* upb_msg.h, which are layered on top of this parser.
*
* TODO: the parser currently does not support returning unknown values. This
* can easily be added when it is needed.
*
* Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
*/
#ifndef UPB_PARSE_H_
#define UPB_PARSE_H_
#include <stdbool.h>
#include <stdint.h>
#include "upb.h"
#include "descriptor.h"
#ifdef __cplusplus
extern "C" {
#endif
/* upb_parser *****************************************************************/
// A upb_parser parses the binary protocol buffer format, writing the data it
// parses to a upb_sink.
struct upb_parser;
typedef struct upb_parser upb_parser;
// Allocates and frees a upb_parser, respectively.
upb_parser *upb_parser_new(struct upb_msgdef *md);
void upb_parser_free(upb_parser *p);
// Resets the internal state of an already-allocated parser. This puts it in a
// state where it has not seen any data, and expects the next data to be from
// the beginning of a new protobuf. Parsers must be reset before they can be
// used. A parser can be reset multiple times.
void upb_parser_reset(upb_parser *p, upb_sink *sink);
// Parses protobuf data out of str, returning how much data was parsed. The
// next call to upb_parser_parse should begin with the first byte that was
// not parsed. "status" indicates whether an error occurred.
//
// TODO: provide the following guarantee:
// retval will always be >= len. */
size_t upb_parser_parse(upb_parser *p, upb_strptr str,
struct upb_status *status);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* UPB_PARSE_H_ */

@ -6,7 +6,7 @@
#include <google/protobuf/descriptor.h>
#include "upb_data.h"
#include "upb_def.h"
#include "upb_parse.h"
#include "upb_decoder.h"
int num_assertions = 0;
#define ASSERT(expr) do { \
@ -171,7 +171,7 @@ void parse_and_compare(MESSAGE_CIDENT *proto2_msg,
// Parse to both proto2 and upb.
ASSERT(proto2_msg->ParseFromArray(upb_string_getrobuf(str), upb_strlen(str)));
struct upb_status status = UPB_STATUS_INIT;
upb_msg_parsestr(upb_msg, upb_md, str, &status);
upb_msg_decodestr(upb_msg, upb_md, str, &status);
ASSERT(upb_ok(&status));
compare(*proto2_msg, upb_msg, upb_md);
}

@ -3,7 +3,7 @@
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include "upb_parse.c"
#include "upb_decoder.c"
#include "upb_def.h"
int num_assertions = 0;

@ -673,7 +673,7 @@ int main(int argc, char *argv[])
struct upb_symtab *s = upb_symtab_new();
upb_msg *fds_msg = upb_msg_new(s->fds_msgdef);
struct upb_status status = UPB_STATUS_INIT;
upb_msg_parsestr(fds_msg, s->fds_msgdef, descriptor, &status);
upb_msg_decodestr(fds_msg, s->fds_msgdef, descriptor, &status);
if(!upb_ok(&status))
error("Failed to parse input file descriptor: %s", status.msg);
google_protobuf_FileDescriptorSet *fds = (void*)fds_msg;

Loading…
Cancel
Save