A flurry of activity. Doesn't compile yet.

- a descriptor.c that describes the data structures in
  descriptor.proto using the data structures in descriptor.h.
- everything renamed pbstream -> upb.
- modularization rethought.
- Doesn't compile yet, but should once things settle back down.
pull/13171/head
Joshua Haberman 16 years ago
parent fdcefd68b1
commit 0c80c38475
  1. 1
      LICENSE
  2. 17
      Makefile
  3. 6
      README
  4. 1513
      descriptor.c
  5. 189
      pbstream.h
  6. 51
      pbstream_lowlevel.h
  7. 68
      upb.h
  8. 50
      upb_fieldmap.c
  9. 53
      upb_fieldmap.h
  10. 257
      upb_parse.c
  11. 137
      upb_parse.h

@ -1,5 +1,6 @@
Copyright (c) 2009, Joshua Haberman
Copyright (c) 2009, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without

@ -1,12 +1,19 @@
.PHONY: all clean
CFLAGS=-std=c99 -O3 -Wall -Wextra -pedantic
all: pbstream.o tests
OBJ=upb_parse.o upb_fieldmap.o upb_struct.o
all: $(OBJ) tests
clean:
rm -f pbstream.o pbstruct.o tests
rm -f $(OBJ) tests
pbstream.o: pbstream.c pbstream.h
gcc $(CFLAGS) -o pbstream.o -c pbstream.c
upb_parse.o: upb_parse.c upb_parse.h
gcc $(CFLAGS) -o upb_parse.o -c upb_parse.c
tests: tests.c pbstream.c pbstream.h
upb_fieldmap.o: upb_fieldmap.c upb_fieldmap.h
gcc $(CFLAGS) -o upb_fieldmap.o -c upb_fieldmap.c
upb_struct.o: upb_struct.c upb_struct.h
gcc $(CFLAGS) -o upb_struct.o -c upb_struct.c
tests: tests.c upb_parse.c upb_parse.h
gcc $(CFLAGS) -o tests tests.c

@ -1,9 +1,9 @@
pbstream: a stream-oriented implementation of protocol buffers.
upb - a minimalist implementation of protocol buffers.
- For API documentation, see the header file.
- For API documentation, see the header files.
- To build type "make".
Author: Joshua Haberman <joshua@reverberate.org>
Author: Joshua Haberman (joshua@reverberate.org, haberman@google.com)
See LICENSE for copyright information.

File diff suppressed because it is too large Load Diff

@ -1,189 +0,0 @@
/*
* pbstream - a stream-oriented implementation of protocol buffers.
*
* Copyright (c) 2008 Joshua Haberman. See LICENSE for details.
*/
#ifndef PBSTREAM_H_
#define PBSTREAM_H_
#include <stdint.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
/* The maximum that any submessages can be nested. Matches proto2's limit. */
#define PBSTREAM_MAX_STACK 64
/* A list of types as they can appear in a .proto file. */
typedef enum pbstream_type {
PBSTREAM_TYPE_DOUBLE = 0,
PBSTREAM_TYPE_FLOAT,
PBSTREAM_TYPE_INT32,
PBSTREAM_TYPE_INT64,
PBSTREAM_TYPE_UINT32,
PBSTREAM_TYPE_UINT64,
PBSTREAM_TYPE_SINT32,
PBSTREAM_TYPE_SINT64,
PBSTREAM_TYPE_FIXED32,
PBSTREAM_TYPE_FIXED64,
PBSTREAM_TYPE_SFIXED32,
PBSTREAM_TYPE_SFIXED64,
PBSTREAM_TYPE_BOOL,
PBSTREAM_TYPE_STRING,
PBSTREAM_TYPE_BYTES,
PBSTREAM_TYPE_ENUM,
PBSTREAM_TYPE_MESSAGE
} pbstream_type_t;
/* A list of types as they are encoded on-the-wire. */
typedef enum pbstream_wire_type {
PBSTREAM_WIRE_TYPE_VARINT = 0,
PBSTREAM_WIRE_TYPE_64BIT = 1,
PBSTREAM_WIRE_TYPE_DELIMITED = 2,
PBSTREAM_WIRE_TYPE_START_GROUP = 3,
PBSTREAM_WIRE_TYPE_END_GROUP = 4,
PBSTREAM_WIRE_TYPE_32BIT = 5,
} pbstream_wire_type_t;
typedef int32_t pbstream_field_number_t;
/* A deserialized value as described in a .proto file. */
struct pbstream_tagged_value {
struct pbstream_field *field;
union pbstream_value {
double _double;
float _float;
int32_t int32;
int64_t int64;
uint32_t uint32;
uint64_t uint64;
bool _bool;
struct pbstream_delimited {
size_t offset; /* relative to the beginning of the stream. */
uint32_t len;
} delimited;
} v;
};
/* A value as it is encoded on-the-wire, before it has been interpreted as
* any particular .proto type. */
struct pbstream_tagged_wire_value {
pbstream_wire_type_t type;
union pbstream_wire_value {
uint64_t varint;
uint64_t _64bit;
struct {
size_t offset; /* relative to the beginning of the stream. */
uint32_t len;
} delimited;
uint32_t _32bit;
} v;
};
/* Definition of a single field in a message. Note that this does not include
* nearly all of the information that can be specified about a field in a
* .proto file. For example, we don't even know the field's name. We keep
* only the information necessary to parse the field. */
struct pbstream_field {
pbstream_field_number_t field_number;
pbstream_type_t type;
struct pbstream_fieldset *fieldset; /* if type == MESSAGE */
};
/* A fieldset is a data structure that supports fast lookup of fields by number.
* It is logically a map of {field_number -> struct pbstream_field*}. Fast
* lookup is important, because it is in the critical path of parsing. */
struct pbstream_fieldset {
int num_fields;
struct pbstream_field *fields;
int array_size;
struct pbstream_field **array;
/* TODO: the hashtable part. */
};
/* Takes an array of num_fields fields and builds an optimized table for fast
* lookup of fields by number. The input fields need not be sorted. This
* fieldset must be freed with pbstream_free_fieldset(). */
void pbstream_init_fieldset(struct pbstream_fieldset *fieldset,
struct pbstream_field *fields,
int num_fields);
void pbstream_free_fieldset(struct pbstream_fieldset *fieldset);
struct pbstream_parse_stack_frame {
struct pbstream_fieldset *fieldset;
size_t end_offset; /* unknown for the top frame, so we set to SIZE_MAX */
};
/* The stream parser's state. */
struct pbstream_parse_state {
size_t offset;
struct pbstream_parse_stack_frame stack[PBSTREAM_MAX_STACK];
struct pbstream_parse_stack_frame *top, *limit;
};
/* Call this once before parsing to initialize the data structures.
* message_type can be NULL, in which case all fields will be reported as
* unknown. */
void pbstream_init_parser(
struct pbstream_parse_state *state,
struct pbstream_fieldset *toplevel_fieldset);
/* Status as returned by pbstream_parse_field(). Status codes <0 are fatal errors
* that cannot be recovered. Status codes >0 are unusual but nonfatal events,
* which nonetheless must be handled differently since they do not return data
* in val. */
typedef enum pbstream_status {
PBSTREAM_STATUS_OK = 0,
PBSTREAM_STATUS_SUBMESSAGE_END = 1, // No data is stored in val or wv.
/** FATAL ERRORS: these indicate corruption, and cannot be recovered. */
// A varint did not terminate before hitting 64 bits.
PBSTREAM_ERROR_UNTERMINATED_VARINT = -1,
// A submessage ended in the middle of data.
PBSTREAM_ERROR_BAD_SUBMESSAGE_END = -2,
// Encountered a "group" on the wire (deprecated and unsupported).
PBSTREAM_ERROR_GROUP = -3,
// Input was nested more than PBSTREAM_MAX_STACK deep.
PBSTREAM_ERROR_STACK_OVERFLOW = -4,
// The input data caused the pb's offset (a size_t) to overflow.
PBSTREAM_ERROR_OVERFLOW = -5,
/** NONFATAL ERRORS: the input was invalid, but we can continue if desired. */
// A value was encountered that was not defined in the .proto file. The
// unknown value is stored in wv.
PBSTREAM_ERROR_UNKNOWN_VALUE = 2,
// A field was encoded with the wrong wire type. The wire value is stored in
// wv.
PBSTREAM_ERROR_MISMATCHED_TYPE = 3,
} pbstream_status_t;
struct pbstream_parse_state;
/* The main parsing function. Parses the next value from buf, storing the
* parsed value in val. If val is of type PBSTREAM_TYPE_MESSAGE, then a
* submessage was entered.
*
* IMPORTANT NOTE: for efficiency, the parsing routines do not do bounds checks,
* and may read as far as buf+10. So the caller must ensure that buf is
* not within 10 bytes of unmapped memory, or the program will segfault. Clients
* are encouraged to overallocate their buffers by ten bytes to compensate. */
pbstream_status_t pbstream_parse_field(struct pbstream_parse_state *s,
uint8_t *buf,
pbstream_field_number_t *fieldnum,
struct pbstream_tagged_value *val,
struct pbstream_tagged_wire_value *wv);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* PBSTREAM_H_ */

@ -1,51 +0,0 @@
/*
* pbstream - a stream-oriented implementation of protocol buffers.
*
* Copyright (c) 2008 Joshua Haberman. See LICENSE for details.
*
* The structures and functions in this file offer more control than what is
* offered in pbstream.h. These can be used for more specialized/optimized
* parsing applications. */
#ifndef PBSTREAM_LOWLEVEL_H_
#define PBSTREAM_LOWLEVEL_H_
#include "pbstream.h"
#ifdef __cplusplus
extern "C" {
#endif
/* A tag occurs before each value on-the-wire. */
struct pbstream_tag {
pbstream_field_number_t field_number;
pbstream_wire_type_t wire_type;
};
/* Parses a single tag from the character data starting at buf, and updates
* buf to point one past the bytes that were consumed. buf will be incremented
* by at most ten bytes. */
pbstream_status_t parse_tag(uint8_t **buf, struct pbstream_tag *tag);
/* Parses a wire value with the given type (which must have been obtained from
* a tag that was just parsed) and adds the number of bytes that were consumed
* to *offset. */
pbstream_status_t parse_wire_value(uint8_t *buf, size_t *offset,
pbstream_wire_type_t wt,
union pbstream_wire_value *wv);
/* Like the above, but discards the wire value instead of saving it. */
pbstream_status_t skip_wire_value(uint8_t *buf, size_t *offset,
pbstream_wire_type_t wt);
/* Looks the given field number up in the fieldset, and returns the
* corresponding pbstream_field definition (or NULL if this field number
* does not exist in this fieldset). */
struct pbstream_field *pbstream_find_field(struct pbstream_fieldset *fs,
pbstream_field_number_t num);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* PBSTREAM_LOWLEVEL_H_ */

68
upb.h

@ -0,0 +1,68 @@
/*
* upb - a minimalist implementation of protocol buffers.
*
* Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
*/
#ifndef UPB_H_
#define UPB_H_
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h> /* for size_t. */
#ifdef __cplusplus
extern "C" {
#endif
/* The maximum that any submessages can be nested. Matches proto2's limit. */
#define UPB_MAX_NESTING 64
/* A list of types as they are encoded on-the-wire. The numeric value is what
 * the parser extracts from the low three bits of an encoded tag. */
typedef enum upb_wire_type {
UPB_WIRE_TYPE_VARINT = 0, /* variable-length integer */
UPB_WIRE_TYPE_64BIT = 1, /* fixed-width 8-byte value */
UPB_WIRE_TYPE_DELIMITED = 2, /* varint length followed by that many bytes */
UPB_WIRE_TYPE_START_GROUP = 3, /* groups: rejected by the parser */
UPB_WIRE_TYPE_END_GROUP = 4, /* groups: rejected by the parser */
UPB_WIRE_TYPE_32BIT = 5, /* fixed-width 4-byte value */
} upb_wire_type_t;
/* Describes a length-delimited value by its position in the stream instead of
 * by a copy of its bytes. */
struct upb_delimited {
size_t offset; /* relative to the beginning of the stream. */
uint32_t len; /* number of bytes in the delimited data. */
};
/* A value as it is encoded on-the-wire. Which member is meaningful depends on
 * the wire type of the tag that preceded the value. */
union upb_wire_value {
uint64_t varint; /* UPB_WIRE_TYPE_VARINT */
uint64_t _64bit; /* UPB_WIRE_TYPE_64BIT */
uint32_t _32bit; /* UPB_WIRE_TYPE_32BIT */
struct upb_delimited delimited; /* UPB_WIRE_TYPE_DELIMITED */
};
/* A value as described in a .proto file. Only one member is meaningful at a
 * time; which one depends on the declared type of the field the value
 * belongs to. */
union upb_value {
double _double;
float _float;
int32_t int32;
int64_t int64;
uint32_t uint32;
uint64_t uint64;
bool _bool;
struct upb_delimited delimited; /* strings, bytes and submessages. */
};
/* The number that identifies a field within its message. */
typedef int32_t upb_field_number_t;
/* A tag occurs before each value on-the-wire. It is encoded as a single
 * varint: the low three bits hold the wire type and the remaining bits hold
 * the field number. */
struct upb_tag {
upb_field_number_t field_number;
upb_wire_type_t wire_type;
};
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* UPB_H_ */

@ -0,0 +1,50 @@
/*
* upb - a minimalist implementation of protocol buffers.
*
* Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
*/
#include "upb_fieldmap.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
void pbstream_init_fieldmap(struct pbstream_fieldmap *fieldmap,
struct pbstream_field *fields,
int num_fields)
{
qsort(fields, num_fields, sizeof(*fields), compare_fields);
/* Find the largest n for which at least half the fieldnums <n are used.
* Start at 8 to avoid noise of small numbers. */
pbstream_field_number_t n = 0, maybe_n;
for(int i = 0; i < num_fields; i++) {
maybe_n = fields[i].field_number;
if(maybe_n > 8 && maybe_n/(i+1) >= 2) break;
n = maybe_n;
}
fieldmap->num_fields = num_fields;
fieldmap->fields = malloc(sizeof(*fieldmap->fields)*num_fields);
memcpy(fieldmap->fields, fields, sizeof(*fields)*num_fields);
fieldmap->array_size = n;
fieldmap->array = malloc(sizeof(*fieldmap->array)*n);
memset(fieldmap->array, 0, sizeof(*fieldmap->array)*n);
for (int i = 0; i < num_fields && fields[i].field_number <= n; i++)
fieldmap->array[fields[i].field_number-1] = &fieldmap->fields[i];
/* Until we support the hashtable part... */
assert(n == fields[num_fields-1].field_number);
}
/* Releases the two heap buffers referenced by the fieldmap.  The fieldmap
 * structure itself is not freed. */
void pbstream_free_fieldmap(struct pbstream_fieldmap *fieldmap)
{
  free(fieldmap->array);
  free(fieldmap->fields);
}
/* Emit definition for inline function. */
extern void *upb_fieldmap_find(struct upb_fieldmap *fm,
pbstream_field_number_t num,
size_t info_size);

@ -0,0 +1,53 @@
/*
* upb - a minimalist implementation of protocol buffers.
*
* Copyright (c) 2009 Joshua Haberman. See LICENSE for details.
*
* A fieldmap is a data structure that supports fast lookup of fields by
* number. It is logically a map of {field_number -> <field info>}, where
* <field info> is any struct that begins with the field number. Fast lookup
* is important, because it is in the critical path of parsing. */
#ifndef UPB_FIELDMAP_H_
#define UPB_FIELDMAP_H_
#include "upb.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Lookup table from field number to field info.  Only the array part exists
 * so far. */
struct upb_fieldmap {
int array_size; /* lookups consult array only for field numbers below this. */
void *array; /* untyped storage, indexed in strides of a caller-supplied entry size. */
/* TODO: the hashtable part. */
};
/* Takes an array of num_fields fields and builds an optimized table for fast
 * lookup of fields by number. The input fields need not be sorted. This
 * fieldmap must be freed with upb_free_fieldmap().
 *
 * fieldmap:   the map to initialize.
 * fields:     the field info records; per the file comment each record is a
 *             struct that begins with its field number.
 * num_fields: number of records in fields.
 * field_size: presumably the size in bytes of one record -- TODO confirm once
 *             the implementation exists. */
void upb_init_fieldmap(struct upb_fieldmap *fieldmap,
void *fields,
int num_fields,
int field_size);
/* Releases the storage owned by a fieldmap set up by upb_init_fieldmap(). */
void upb_free_fieldmap(struct upb_fieldmap *fieldmap);
/* Looks the given field number up in the fieldmap and returns a pointer to
 * the corresponding entry, or NULL if the number falls outside the array part
 * of the fieldmap.
 *
 * fm:        the fieldmap to search.
 * num:       the field number to look up.
 * info_size: size in bytes of one entry in fm->array; the entry for num lives
 *            at byte offset num*info_size. */
inline void *upb_fieldmap_find(struct upb_fieldmap *fm,
                               upb_field_number_t num,
                               size_t info_size)
{
  /* Reject negative numbers too: they would address memory before the array. */
  if (num >= 0 && num < fm->array_size) {
    return (char*)fm->array + ((size_t)num*info_size);
  }
  /* TODO: the hashtable part. */
  return NULL;
}
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* UPB_FIELDMAP_H_ */

@ -1,14 +1,13 @@
/*
* pbstream - a stream-oriented implementation of protocol buffers.
* upb - a minimalist implementation of protocol buffers.
*
* Copyright (c) 2008-2009 Joshua Haberman. See LICENSE for details.
*/
#include "upb_parse.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "pbstream.h"
#include "pbstream_lowlevel.h"
/* Branch prediction hints for GCC. */
#ifdef __GNUC__
@ -20,16 +19,16 @@
#endif
#define CHECK(func) do { \
pbstream_status_t status = func; \
if(status != PBSTREAM_STATUS_OK) return status; \
upb_status_t status = func; \
if(status != UPB_STATUS_OK) return status; \
} while (0)
/* Lowest-level functions -- these read integers from the input buffer.
* To avoid branches, none of these do bounds checking. So we force clients
* to overallocate their buffers by >=9 bytes. */
static pbstream_status_t get_v_uint64_t(uint8_t *restrict *buf,
uint64_t *restrict val)
static upb_status_t get_v_uint64_t(uint8_t *restrict *buf,
uint64_t *restrict val)
{
uint8_t *ptr = *buf, b;
uint32_t part0 = 0, part1 = 0, part2 = 0;
@ -45,15 +44,15 @@ static pbstream_status_t get_v_uint64_t(uint8_t *restrict *buf,
b = *(ptr++); part1 |= (b & 0x7F) << 21; if (!(b & 0x80)) goto done;
b = *(ptr++); part2 = (b & 0x7F) ; if (!(b & 0x80)) goto done;
b = *(ptr++); part2 |= (b & 0x7F) << 7; if (!(b & 0x80)) goto done;
return PBSTREAM_ERROR_UNTERMINATED_VARINT;
return UPB_ERROR_UNTERMINATED_VARINT;
done:
*buf = ptr;
*val = (uint64_t)part0 | ((uint64_t)part1 << 28) | ((uint64_t)part2 << 56);
return PBSTREAM_STATUS_OK;
return UPB_STATUS_OK;
}
static pbstream_status_t skip_v_uint64_t(uint8_t **buf)
static upb_status_t skip_v_uint64_t(uint8_t **buf)
{
uint8_t *ptr = *buf, b;
b = *(ptr++); if (!(b & 0x80)) goto done;
@ -66,15 +65,15 @@ static pbstream_status_t skip_v_uint64_t(uint8_t **buf)
b = *(ptr++); if (!(b & 0x80)) goto done;
b = *(ptr++); if (!(b & 0x80)) goto done;
b = *(ptr++); if (!(b & 0x80)) goto done;
return PBSTREAM_ERROR_UNTERMINATED_VARINT;
return UPB_ERROR_UNTERMINATED_VARINT;
done:
*buf = (uint8_t*)ptr;
return PBSTREAM_STATUS_OK;
return UPB_STATUS_OK;
}
static pbstream_status_t get_v_uint32_t(uint8_t *restrict *buf,
uint32_t *restrict val)
static upb_status_t get_v_uint32_t(uint8_t *restrict *buf,
uint32_t *restrict val)
{
uint8_t *ptr = *buf, b;
uint32_t result;
@ -85,33 +84,33 @@ static pbstream_status_t get_v_uint32_t(uint8_t *restrict *buf,
b = *(ptr++); result |= (b & 0x7F) << 14; if (!(b & 0x80)) goto done;
b = *(ptr++); result |= (b & 0x7F) << 21; if (!(b & 0x80)) goto done;
b = *(ptr++); result = (b & 0x7F) << 28; if (!(b & 0x80)) goto done;
return PBSTREAM_ERROR_UNTERMINATED_VARINT;
return UPB_ERROR_UNTERMINATED_VARINT;
done:
*buf = ptr;
*val = result;
return PBSTREAM_STATUS_OK;
return UPB_STATUS_OK;
}
static pbstream_status_t get_f_uint32_t(uint8_t *restrict *buf,
uint32_t *restrict val)
static upb_status_t get_f_uint32_t(uint8_t *restrict *buf,
uint32_t *restrict val)
{
uint8_t *b = *buf;
#define SHL(val, bits) ((uint32_t)val << bits)
*val = SHL(b[0], 0) | SHL(b[1], 8) | SHL(b[2], 16) | SHL(b[3], 24);
#undef SHL
*buf += sizeof(uint32_t);
return PBSTREAM_STATUS_OK;
return UPB_STATUS_OK;
}
static pbstream_status_t skip_f_uint32_t(uint8_t **buf)
static upb_status_t skip_f_uint32_t(uint8_t **buf)
{
*buf += sizeof(uint32_t);
return PBSTREAM_STATUS_OK;
return UPB_STATUS_OK;
}
static pbstream_status_t get_f_uint64_t(uint8_t *restrict *buf,
uint64_t *restrict val)
static upb_status_t get_f_uint64_t(uint8_t *restrict *buf,
uint64_t *restrict val)
{
uint8_t *b = *buf;
/* TODO: is this worth 32/64 specializing? */
@ -120,13 +119,13 @@ static pbstream_status_t get_f_uint64_t(uint8_t *restrict *buf,
SHL(b[4], 32) | SHL(b[5], 40) | SHL(b[6], 48) | SHL(b[7], 56);
#undef SHL
*buf += sizeof(uint64_t);
return PBSTREAM_STATUS_OK;
return UPB_STATUS_OK;
}
static pbstream_status_t skip_f_uint64_t(uint8_t **buf)
static upb_status_t skip_f_uint64_t(uint8_t **buf)
{
*buf += sizeof(uint64_t);
return PBSTREAM_STATUS_OK;
return UPB_STATUS_OK;
}
static int32_t zz_decode_32(uint32_t n) { return (n >> 1) ^ -(int32_t)(n & 1); }
@ -139,15 +138,15 @@ static int64_t zz_decode_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); }
static void wvtov_ ## type(wire_t s, val_t *d)
#define GET(type, v_or_f, wire_t, val_t, member_name) \
static pbstream_status_t get_ ## type(struct pbstream_parse_state *s, \
static upb_status_t get_ ## type(struct upb_parse_state *s, \
uint8_t *buf, \
struct pbstream_tagged_value *d) { \
struct upb_tagged_value *d) { \
wire_t tmp; \
uint8_t *b = buf; \
CHECK(get_ ## v_or_f ## _ ## wire_t(&b, &tmp)); \
wvtov_ ## type(tmp, &d->v.member_name); \
s->offset += (b-buf); \
return PBSTREAM_STATUS_OK; \
return UPB_STATUS_OK; \
}
#define T(type, v_or_f, wire_t, val_t, member_name) \
@ -173,28 +172,28 @@ T(ENUM, v, uint32_t, int32_t, int32) { *d = (int32_t)s; }
#undef GET
#undef T
static void wvtov_delimited(uint32_t s, struct pbstream_delimited *d, size_t o)
static void wvtov_delimited(uint32_t s, struct upb_delimited *d, size_t o)
{
d->offset = o;
d->len = s;
}
/* Use BYTES version for both STRING and BYTES, leave UTF-8 checks to client. */
static pbstream_status_t get_BYTES(struct pbstream_parse_state *s, uint8_t *buf,
struct pbstream_tagged_value *d) {
static upb_status_t get_BYTES(struct upb_parse_state *s, uint8_t *buf,
struct upb_tagged_value *d) {
uint32_t tmp;
uint8_t *b = buf;
CHECK(get_v_uint32_t(&b, &tmp));
s->offset += (b-buf); /* advance past length varint. */
wvtov_delimited(tmp, &d->v.delimited, s->offset);
size_t new_offset = s->offset + d->v.delimited.len; /* skip bytes */
if (unlikely(new_offset < s->offset)) return PBSTREAM_ERROR_OVERFLOW;
if (unlikely(new_offset < s->offset)) return UPB_ERROR_OVERFLOW;
s->offset = new_offset;
return PBSTREAM_STATUS_OK;
return UPB_STATUS_OK;
}
static pbstream_status_t get_MESSAGE(struct pbstream_parse_state *s, uint8_t *buf,
struct pbstream_tagged_value *d) {
static upb_status_t get_MESSAGE(struct upb_parse_state *s, uint8_t *buf,
struct upb_tagged_value *d) {
/* We're entering a sub-message. */
uint32_t tmp;
uint8_t *b = buf;
@ -202,146 +201,139 @@ static pbstream_status_t get_MESSAGE(struct pbstream_parse_state *s, uint8_t *bu
s->offset += (b-buf); /* advance past length varint. */
wvtov_delimited(tmp, &d->v.delimited, s->offset);
/* Unlike STRING and BYTES, we *don't* advance past delimited here. */
if (unlikely(++s->top == s->limit)) return PBSTREAM_ERROR_STACK_OVERFLOW;
if (unlikely(++s->top == s->limit)) return UPB_ERROR_STACK_OVERFLOW;
s->top->fieldset = d->field->fieldset;
s->top->end_offset = d->v.delimited.offset + d->v.delimited.len;
if (unlikely(s->top->end_offset < s->offset)) return PBSTREAM_ERROR_OVERFLOW;
return PBSTREAM_STATUS_OK;
if (unlikely(s->top->end_offset < s->offset)) return UPB_ERROR_OVERFLOW;
return UPB_STATUS_OK;
}
struct pbstream_type_info {
pbstream_wire_type_t expected_wire_type;
pbstream_status_t (*get)(struct pbstream_parse_state *s, uint8_t *buf,
struct pbstream_tagged_value *d);
struct upb_type_info {
upb_wire_type_t expected_wire_type;
upb_status_t (*get)(struct upb_parse_state *s, uint8_t *buf,
struct upb_tagged_value *d);
};
static struct pbstream_type_info type_info[] = {
{PBSTREAM_WIRE_TYPE_64BIT, get_DOUBLE},
{PBSTREAM_WIRE_TYPE_32BIT, get_FLOAT},
{PBSTREAM_WIRE_TYPE_VARINT, get_INT32},
{PBSTREAM_WIRE_TYPE_VARINT, get_INT64},
{PBSTREAM_WIRE_TYPE_VARINT, get_UINT32},
{PBSTREAM_WIRE_TYPE_VARINT, get_UINT64},
{PBSTREAM_WIRE_TYPE_VARINT, get_SINT32},
{PBSTREAM_WIRE_TYPE_VARINT, get_SINT64},
{PBSTREAM_WIRE_TYPE_32BIT, get_FIXED32},
{PBSTREAM_WIRE_TYPE_64BIT, get_FIXED64},
{PBSTREAM_WIRE_TYPE_32BIT, get_SFIXED32},
{PBSTREAM_WIRE_TYPE_64BIT, get_SFIXED64},
{PBSTREAM_WIRE_TYPE_VARINT, get_BOOL},
{PBSTREAM_WIRE_TYPE_DELIMITED, get_BYTES},
{PBSTREAM_WIRE_TYPE_DELIMITED, get_BYTES},
{PBSTREAM_WIRE_TYPE_VARINT, get_ENUM},
{PBSTREAM_WIRE_TYPE_DELIMITED, get_MESSAGE}
static struct upb_type_info type_info[] = {
{UPB_WIRE_TYPE_64BIT, get_DOUBLE},
{UPB_WIRE_TYPE_32BIT, get_FLOAT},
{UPB_WIRE_TYPE_VARINT, get_INT32},
{UPB_WIRE_TYPE_VARINT, get_INT64},
{UPB_WIRE_TYPE_VARINT, get_UINT32},
{UPB_WIRE_TYPE_VARINT, get_UINT64},
{UPB_WIRE_TYPE_VARINT, get_SINT32},
{UPB_WIRE_TYPE_VARINT, get_SINT64},
{UPB_WIRE_TYPE_32BIT, get_FIXED32},
{UPB_WIRE_TYPE_64BIT, get_FIXED64},
{UPB_WIRE_TYPE_32BIT, get_SFIXED32},
{UPB_WIRE_TYPE_64BIT, get_SFIXED64},
{UPB_WIRE_TYPE_VARINT, get_BOOL},
{UPB_WIRE_TYPE_DELIMITED, get_BYTES},
{UPB_WIRE_TYPE_DELIMITED, get_BYTES},
{UPB_WIRE_TYPE_VARINT, get_ENUM},
{UPB_WIRE_TYPE_DELIMITED, get_MESSAGE}
};
pbstream_status_t parse_tag(uint8_t **buf, struct pbstream_tag *tag)
upb_status_t parse_tag(uint8_t **buf, struct upb_tag *tag)
{
uint32_t tag_int;
CHECK(get_v_uint32_t(buf, &tag_int));
tag->wire_type = (pbstream_wire_type_t)(tag_int & 0x07);
tag->wire_type = (upb_wire_type_t)(tag_int & 0x07);
tag->field_number = tag_int >> 3;
return PBSTREAM_STATUS_OK;
return UPB_STATUS_OK;
}
pbstream_status_t parse_wire_value(uint8_t *buf, size_t *offset,
pbstream_wire_type_t wt,
union pbstream_wire_value *wv)
upb_status_t parse_wire_value(uint8_t *buf, size_t *offset,
upb_wire_type_t wt,
union upb_wire_value *wv)
{
#define READ(expr) CHECK(expr); *offset += (b-buf)
uint8_t *b = buf;
switch(wt) {
case PBSTREAM_WIRE_TYPE_VARINT:
case UPB_WIRE_TYPE_VARINT:
READ(get_v_uint64_t(&b, &wv->varint)); break;
case PBSTREAM_WIRE_TYPE_64BIT:
case UPB_WIRE_TYPE_64BIT:
READ(get_f_uint64_t(&b, &wv->_64bit)); break;
case PBSTREAM_WIRE_TYPE_32BIT:
case UPB_WIRE_TYPE_32BIT:
READ(get_f_uint32_t(&b, &wv->_32bit)); break;
case PBSTREAM_WIRE_TYPE_DELIMITED:
case UPB_WIRE_TYPE_DELIMITED:
wv->delimited.offset = *offset;
READ(get_v_uint32_t(&b, &wv->delimited.len));
size_t new_offset = *offset + wv->delimited.len;
if (new_offset < *offset) return PBSTREAM_ERROR_OVERFLOW;
if (new_offset < *offset) return UPB_ERROR_OVERFLOW;
*offset += new_offset;
break;
case PBSTREAM_WIRE_TYPE_START_GROUP:
case PBSTREAM_WIRE_TYPE_END_GROUP:
return PBSTREAM_ERROR_GROUP; /* deprecated, no plans to support. */
case UPB_WIRE_TYPE_START_GROUP:
case UPB_WIRE_TYPE_END_GROUP:
return UPB_ERROR_GROUP; /* deprecated, no plans to support. */
}
return PBSTREAM_STATUS_OK;
return UPB_STATUS_OK;
}
pbstream_status_t skip_wire_value(uint8_t *buf, size_t *offset,
pbstream_wire_type_t wt)
upb_status_t skip_wire_value(uint8_t *buf, size_t *offset,
upb_wire_type_t wt)
{
uint8_t *b = buf;
switch(wt) {
case PBSTREAM_WIRE_TYPE_VARINT:
case UPB_WIRE_TYPE_VARINT:
READ(skip_v_uint64_t(&b)); break;
case PBSTREAM_WIRE_TYPE_64BIT:
case UPB_WIRE_TYPE_64BIT:
READ(skip_f_uint64_t(&b)); break;
case PBSTREAM_WIRE_TYPE_32BIT:
case UPB_WIRE_TYPE_32BIT:
READ(skip_f_uint32_t(&b)); break;
case PBSTREAM_WIRE_TYPE_DELIMITED: {
case UPB_WIRE_TYPE_DELIMITED: {
/* Have to get (not skip) the length to skip the bytes. */
uint32_t len;
READ(get_v_uint32_t(&b, &len));
size_t new_offset = *offset + len;
if (new_offset < *offset) return PBSTREAM_ERROR_OVERFLOW;
if (new_offset < *offset) return UPB_ERROR_OVERFLOW;
*offset += new_offset;
break;
}
case PBSTREAM_WIRE_TYPE_START_GROUP:
case PBSTREAM_WIRE_TYPE_END_GROUP:
return PBSTREAM_ERROR_GROUP; /* deprecated, no plans to support. */
case UPB_WIRE_TYPE_START_GROUP:
case UPB_WIRE_TYPE_END_GROUP:
return UPB_ERROR_GROUP; /* deprecated, no plans to support. */
}
return PBSTREAM_STATUS_OK;
return UPB_STATUS_OK;
#undef READ
}
struct pbstream_field *pbstream_find_field(struct pbstream_fieldset* fs,
pbstream_field_number_t num)
{
/* TODO: the hashtable part. */
return fs->array[num-1];
}
/* Parses and processes the next value from buf. */
pbstream_status_t pbstream_parse_field(struct pbstream_parse_state *s,
uint8_t *buf,
pbstream_field_number_t *fieldnum,
struct pbstream_tagged_value *val,
struct pbstream_tagged_wire_value *wv)
upb_status_t upb_parse_field(struct upb_parse_state *s,
uint8_t *buf,
upb_field_number_t *fieldnum,
struct upb_tagged_value *val,
struct upb_tagged_wire_value *wv)
{
/* Check for end-of-message at the current stack depth. */
if(unlikely(s->offset >= s->top->end_offset)) {
/* If the end offset isn't an exact field boundary, the pb is corrupt. */
if(unlikely(s->offset != s->top->end_offset))
return PBSTREAM_ERROR_BAD_SUBMESSAGE_END;
return UPB_ERROR_BAD_SUBMESSAGE_END;
s->top--;
return PBSTREAM_STATUS_SUBMESSAGE_END;
return UPB_STATUS_SUBMESSAGE_END;
}
struct pbstream_tag tag;
struct upb_tag tag;
uint8_t *b = buf;
CHECK(parse_tag(&b, &tag));
s->offset += (b-buf);
struct pbstream_field *fd = pbstream_find_field(s->top->fieldset,
struct upb_field *fd = upb_find_field(s->top->fieldset,
tag.field_number);
pbstream_status_t unknown_value_status;
upb_status_t unknown_value_status;
if(unlikely(!fd)) {
unknown_value_status = PBSTREAM_ERROR_UNKNOWN_VALUE;
unknown_value_status = UPB_ERROR_UNKNOWN_VALUE;
goto unknown_value;
}
struct pbstream_type_info *info = &type_info[fd->type];
struct upb_type_info *info = &type_info[fd->type];
if(unlikely(tag.wire_type != info->expected_wire_type)) {
unknown_value_status = PBSTREAM_ERROR_MISMATCHED_TYPE;
unknown_value_status = UPB_ERROR_MISMATCHED_TYPE;
goto unknown_value;
}
*fieldnum = tag.field_number;
val->field = fd;
CHECK(info->get(s, b, val));
return PBSTREAM_STATUS_OK;
return UPB_STATUS_OK;
unknown_value:
wv->type = tag.wire_type;
@ -349,55 +341,20 @@ unknown_value:
return unknown_value_status;
}
void pbstream_init_parser(
struct pbstream_parse_state *state,
struct pbstream_fieldset *toplevel_fieldset)
void upb_init_parser(
struct upb_parse_state *state,
struct upb_fieldset *toplevel_fieldset)
{
state->offset = 0;
state->top = state->stack;
state->limit = state->top + PBSTREAM_MAX_STACK;
state->limit = state->top + UPB_MAX_STACK;
state->top->fieldset = toplevel_fieldset;
state->top->end_offset = SIZE_MAX;
}
static int compare_fields(const void *f1, const void *f2)
{
return ((struct pbstream_field*)f1)->field_number -
((struct pbstream_field*)f2)->field_number;
}
void pbstream_init_fieldset(struct pbstream_fieldset *fieldset,
struct pbstream_field *fields,
int num_fields)
{
qsort(fields, num_fields, sizeof(*fields), compare_fields);
/* Find the largest n for which at least half the fieldnums <n are used.
* Start at 8 to avoid noise of small numbers. */
pbstream_field_number_t n = 0, maybe_n;
for(int i = 0; i < num_fields; i++) {
maybe_n = fields[i].field_number;
if(maybe_n > 8 && maybe_n/(i+1) >= 2) break;
n = maybe_n;
}
fieldset->num_fields = num_fields;
fieldset->fields = malloc(sizeof(*fieldset->fields)*num_fields);
memcpy(fieldset->fields, fields, sizeof(*fields)*num_fields);
fieldset->array_size = n;
fieldset->array = malloc(sizeof(*fieldset->array)*n);
memset(fieldset->array, 0, sizeof(*fieldset->array)*n);
for (int i = 0; i < num_fields && fields[i].field_number <= n; i++)
fieldset->array[fields[i].field_number-1] = &fieldset->fields[i];
/* Until we support the hashtable part... */
assert(n == fields[num_fields-1].field_number);
return ((struct upb_field*)f1)->field_number -
((struct upb_field*)f2)->field_number;
}
void pbstream_free_fieldset(struct pbstream_fieldset *fieldset)
{
free(fieldset->fields);
free(fieldset->array);
}

@ -0,0 +1,137 @@
/*
* upb - a minimalist implementation of protocol buffers.
*
* This file contains parsing routines; both stream-oriented and tree-oriented
* models are supported.
*
* Copyright (c) 2008 Joshua Haberman. See LICENSE for details.
*/
#ifndef UPB_PARSE_H_
#define UPB_PARSE_H_
#include <stdint.h>
#include <stdbool.h>
#include "upb.h"
#ifdef __cplusplus
extern "C" {
#endif
/* A deserialized value as described in a .proto file, together with the
 * definition of the field it was parsed for. */
struct upb_tagged_value {
struct upb_field *field; /* definition of the field this value belongs to. */
union upb_value v; /* the value; the valid member depends on the field's type. */
};
/* A value as it is encoded on-the-wire, before it has been interpreted as
 * any particular .proto type. */
struct upb_tagged_wire_value {
upb_wire_type_t type; /* wire type taken from the value's tag. */
union upb_wire_value v; /* the raw value; the valid member depends on type. */
};
/* Definition of a single field in a message, for the purposes of the parser's
 * fieldmap. Note that this does not include nearly all of the information
 * that can be specified about a field in a .proto file. For example, we don't
 * even know the field's name. We keep only the information necessary to parse
 * the field. */
struct upb_field {
/* Kept as the first member: the fieldmap expects field info structs to begin
 * with the field number. */
upb_field_number_t field_number;
/* The parser uses this value directly as an index into its per-type table. */
int32_t type; /* google_protobuf_FieldDescriptorProto_Type */
/* Fields of the submessage; becomes the active fieldset when the parser
 * enters the submessage. */
struct upb_fieldset *fieldset; /* if type == MESSAGE */
};
/* One level of message nesting: which fields are valid at this level and
 * where in the stream this level ends. */
struct upb_parse_stack_frame {
struct upb_fieldset *fieldset; /* fields of the message being parsed at this level. */
size_t end_offset; /* unknown for the top frame, so we set to SIZE_MAX */
};
/* The stream parser's state. */
struct upb_parse_state {
size_t offset;
struct upb_parse_stack_frame stack[UPB_MAX_STACK];
struct upb_parse_stack_frame *top, *limit;
};
/* Call this once before parsing to initialize the data structures.
 * toplevel_fieldset describes the fields of the top-level message.
 * NOTE(review): this comment used to say that passing NULL (for a parameter
 * it called message_type) makes all fields be reported as unknown -- confirm
 * that the field lookup actually tolerates a NULL fieldset. */
void upb_init_parser(struct upb_parse_state *state,
struct upb_fieldset *toplevel_fieldset);
/* Status as returned by upb_parse_field(). Status codes <0 are fatal errors
 * that cannot be recovered. Status codes >0 are unusual but nonfatal events,
 * which nonetheless must be handled differently since they do not return data
 * in val. Note that two of the positive codes carry the UPB_ERROR_ prefix
 * even though they are nonfatal. */
typedef enum upb_status {
// A value was parsed successfully.
UPB_STATUS_OK = 0,
UPB_STATUS_SUBMESSAGE_END = 1, // No data is stored in val or wv.
/** FATAL ERRORS: these indicate corruption, and cannot be recovered. */
// A varint did not terminate before hitting 64 bits.
UPB_ERROR_UNTERMINATED_VARINT = -1,
// A submessage ended in the middle of data.
UPB_ERROR_BAD_SUBMESSAGE_END = -2,
// Encountered a "group" on the wire (deprecated and unsupported).
UPB_ERROR_GROUP = -3,
// Input was nested more than UPB_MAX_NESTING deep.
UPB_ERROR_STACK_OVERFLOW = -4,
// The input data caused the pb's offset (a size_t) to overflow.
UPB_ERROR_OVERFLOW = -5,
/** NONFATAL ERRORS: the input was invalid, but we can continue if desired. */
// A value was encountered that was not defined in the .proto file. The
// unknown value is stored in wv.
UPB_ERROR_UNKNOWN_VALUE = 2,
// A field was encoded with the wrong wire type. The wire value is stored in
// wv.
UPB_ERROR_MISMATCHED_TYPE = 3,
} upb_status_t;
struct upb_parse_state;
/* The main parsing function. Parses the next value from buf, storing the
 * parsed value in val. If val is of type UPB_TYPE_MESSAGE, then a
 * submessage was entered.
 *
 * s:        parser state set up by upb_init_parser().
 * buf:      the bytes to parse the next value from.
 * fieldnum: receives the field number when a known field is parsed.
 * val:      receives the field definition and the parsed value on
 *           UPB_STATUS_OK.
 * wv:       receives the raw wire value for the nonfatal error statuses (see
 *           upb_status_t).
 *
 * IMPORTANT NOTE: for efficiency, the parsing routines do not do bounds checks,
 * and may read as far as buf+10. So the caller must ensure that buf is
 * not within 10 bytes of unmapped memory, or the program will segfault. Clients
 * are encouraged to overallocate their buffers by ten bytes to compensate. */
upb_status_t upb_parse_field(struct upb_parse_state *s,
uint8_t *buf,
upb_field_number_t *fieldnum,
struct upb_tagged_value *val,
struct upb_tagged_wire_value *wv);
/* Low-level parsing functions. ***********************************************/
/* Parses a single tag from the character data starting at buf, and updates
 * buf to point one past the bytes that were consumed. buf will be incremented
 * by at most ten bytes. */
upb_status_t parse_tag(uint8_t **buf, struct upb_tag *tag);
/* Parses a wire value with the given type (which must have been obtained from
 * a tag that was just parsed) and adds the number of bytes that were consumed
 * to *offset. For delimited types, offset is advanced past the delimited
 * data.
 * NOTE(review): upb_parse.c defines this function as parse_wire_value (no
 * upb_ prefix) -- the two names need to be reconciled. */
upb_status_t upb_parse_wire_value(uint8_t *buf, size_t *offset,
upb_wire_type_t wt,
union upb_wire_value *wv);
/* Like the above, but discards the wire value instead of saving it. */
upb_status_t skip_wire_value(uint8_t *buf, size_t *offset,
upb_wire_type_t wt);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* UPB_PARSE_H_ */
Loading…
Cancel
Save