Lots of documentation, cleanup, and fixed memory leaks.

16 years ago · 5235966ed5
parent dd2094537a
commit 5235966ed5
8 changed files with 352 additions and 241 deletions
--- a/2
+++ b/2
@ -3,7 +3,7 @@
 CC=gcc
 CXX=g++
 CFLAGS=-std=c99
-CPPFLAGS=-O0 -Wall -Wextra -pedantic -g -DUPB_UNALIGNED_READS_OK -fomit-frame-pointer
+CPPFLAGS=-Wall -Wextra -pedantic -g -DUPB_UNALIGNED_READS_OK -fomit-frame-pointer
 OBJ=upb_parse.o upb_table.o upb_msg.o upb_enum.o upb_context.o descriptor.o
 all: $(OBJ) test_table tests upbc
 clean:
--- a/upb.h
+++ b/upb.h
@ -1,7 +1,9 @@
 /*
 * upb - a minimalist implementation of protocol buffers.
- *
+
 * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
+ *
+ * This file contains shared definitions that are widely used across upb.
 */

 #ifndef UPB_H_
@ -36,29 +38,14 @@ extern "C" {
 /* The maximum number of fields that any one .proto type can have. */
 #define UPB_MAX_FIELDS (1<<16)

+/* Nested type names are separated by periods. */
+#define UPB_SYMBOL_SEPARATOR '.'
+#define UPB_SYMBOL_MAX_LENGTH 256
+
 #define UPB_INDEX(base, i, m) (void*)((char*)(base) + ((i)*(m)))

 INLINE uint32_t max(uint32_t a, uint32_t b) { return a > b ? a : b; }

-/* A list of types as they are encoded on-the-wire. */
-enum upb_wire_type {
-  UPB_WIRE_TYPE_VARINT      = 0,
-  UPB_WIRE_TYPE_64BIT       = 1,
-  UPB_WIRE_TYPE_DELIMITED   = 2,
-  UPB_WIRE_TYPE_START_GROUP = 3,
-  UPB_WIRE_TYPE_END_GROUP   = 4,
-  UPB_WIRE_TYPE_32BIT       = 5
-};
-typedef uint8_t upb_wire_type_t;
-
-/* A value as it is encoded on-the-wire, except delimited, which is handled
- * separately. */
-union upb_wire_value {
-  uint64_t varint;
-  uint64_t _64bit;
-  uint32_t _32bit;
-};
-
 /* Value type as defined in a .proto file.  The values of this are defined by
 * google_protobuf_FieldDescriptorProto_Type (from descriptor.proto).
 * Note that descriptor.proto reserves "0" for errors, and we use it to
@ -76,48 +63,36 @@ struct upb_type_info {
  uint8_t expected_wire_type;
 };

-/* This array is indexed by upb_field_type_t. */
+/* Contains information for all .proto types.  Indexed by upb_field_type_t. */
 extern struct upb_type_info upb_type_info[];

-/* A scalar value as described in a .proto file */
+/* A single .proto value.  Scalars are stored directly; strings, arrays, and
+ * messages are stored as pointers.  The owner must have an out-of-band way of
+ * knowing the type, so it knows which union member to use. */
 union upb_value {
-  double _double;
-  float  _float;
-  int32_t int32;
-  int64_t int64;
+  double   _double;
+  float    _float;
+  int32_t  int32;
+  int64_t  int64;
  uint32_t uint32;
  uint64_t uint64;
-  bool _bool;
+  bool     _bool;
+  struct upb_string **string;
+  struct upb_array **array;
+  void     *message;
 };

 union upb_value_ptr {
-  double *_double;
-  float  *_float;
-  int32_t *int32;
-  int64_t *int64;
+  double   *_double;
+  float    *_float;
+  int32_t  *int32;
+  int64_t  *int64;
  uint32_t *uint32;
  uint64_t *uint64;
-  bool *_bool;
+  bool     *_bool;
  struct upb_string **string;
  struct upb_array **array;
-  void **message;
-  void *_void;
-};
-
-/* The number of a field, eg. "optional string foo = 3". */
-typedef int32_t upb_field_number_t;
-
-/* A tag occurs before each value on-the-wire. */
-struct upb_tag {
-  upb_field_number_t field_number;
-  upb_wire_type_t wire_type;
-};
-
-enum upb_symbol_type {
-  UPB_SYM_MESSAGE,
-  UPB_SYM_ENUM,
-  UPB_SYM_SERVICE,
-  UPB_SYM_EXTENSION
+  void     **message;
+  void     *_void;
 };

 union upb_symbol_ref {
@ -126,7 +101,11 @@ union upb_symbol_ref {
  struct upb_svc *svc;
 };

-/* Status codes used as a return value. */
+/* The number of a field, eg. "optional string foo = 3". */
+typedef int32_t upb_field_number_t;
+
+/* Status codes used as a return value.  Codes >0 are not fatal and can be
+ * resumed. */
 typedef enum upb_status {
  UPB_STATUS_OK = 0,

--- a/upb_context.c
+++ b/upb_context.c
@ -58,8 +58,9 @@ static void free_symtab(struct upb_strtable *t)
 void upb_context_free(struct upb_context *c)
 {
  free_symtab(&c->symtab);
+  for(size_t i = 0; i < c->fds_len; i++)
+    upb_msgdata_free(c->fds[i], c->fds_msg, true);
  free_symtab(&c->psymtab);
-  for(size_t i = 0; i < c->fds_len; i++) free(c->fds[i]);
  free(c->fds);
 }

@ -73,23 +74,23 @@ static struct upb_symtab_entry *resolve(struct upb_strtable *t,
                                        struct upb_string *base,
                                        struct upb_string *symbol)
 {
-  if(base->byte_len + symbol->byte_len + 1 >= UPB_SYM_MAX_LENGTH ||
+  if(base->byte_len + symbol->byte_len + 1 >= UPB_SYMBOL_MAX_LENGTH ||
     symbol->byte_len == 0) return NULL;

-  if(symbol->ptr[0] == UPB_CONTEXT_SEPARATOR) {
+  if(symbol->ptr[0] == UPB_SYMBOL_SEPARATOR) {
    /* Symbols starting with '.' are absolute, so we do a single lookup. */
    struct upb_string sym_str = {.ptr = symbol->ptr+1,
                                 .byte_len = symbol->byte_len-1};
    return upb_strtable_lookup(t, &sym_str);
  } else {
    /* Remove components from base until we find an entry or run out. */
-    char sym[UPB_SYM_MAX_LENGTH+1];
+    char sym[UPB_SYMBOL_MAX_LENGTH+1];
    struct upb_string sym_str = {.ptr = sym};
    int baselen = base->byte_len;
    while(1) {
-      /* sym_str = base[0...base_len] + UPB_CONTEXT_SEPARATOR + symbol */
+      /* sym_str = base[0...base_len] + UPB_SYMBOL_SEPARATOR + symbol */
      memcpy(sym, base->ptr, baselen);
-      sym[baselen] = UPB_CONTEXT_SEPARATOR;
+      sym[baselen] = UPB_SYMBOL_SEPARATOR;
      memcpy(sym + baselen + 1, symbol->ptr, symbol->byte_len);
      sym_str.byte_len = baselen + symbol->byte_len + 1;

@ -97,7 +98,7 @@ static struct upb_symtab_entry *resolve(struct upb_strtable *t,
      if (e) return e;
      else if(baselen == 0) return NULL;  /* No more scopes to try. */

-      baselen = memrchr(base->ptr, UPB_CONTEXT_SEPARATOR, baselen);
+      baselen = memrchr(base->ptr, UPB_SYMBOL_SEPARATOR, baselen);
    }
  }
 }
@ -130,7 +131,7 @@ static struct upb_string join(struct upb_string *base, struct upb_string *name)
  if(base->byte_len > 0) {
    /* nested_base = base + '.' +  d->name */
    memcpy(joined.ptr, base->ptr, base->byte_len);
-    joined.ptr[base->byte_len] = UPB_CONTEXT_SEPARATOR;
+    joined.ptr[base->byte_len] = UPB_SYMBOL_SEPARATOR;
    memcpy(&joined.ptr[base->byte_len+1], name->ptr, name->byte_len);
  } else {
    memcpy(joined.ptr, name->ptr, name->byte_len);
--- a/upb_context.h
+++ b/upb_context.h
@ -2,8 +2,7 @@
 * upb - a minimalist implementation of protocol buffers.
 *
 * A context represents a namespace of proto definitions, sort of like an
- * interpreter's symbol table.  It is empty when first constructed, with the
- * exception of built-in types (those defined in descriptor.proto).  Clients
+ * interpreter's symbol table.  It is empty when first constructed.  Clients
 * add definitions to the context by supplying unserialized or serialized
 * descriptors (as defined in descriptor.proto).
 *
@ -22,6 +21,16 @@ struct google_protobuf_FileDescriptorProto;
 extern "C" {
 #endif

+/* Definitions. ***************************************************************/
+
+/* The symbol table maps names to various kinds of symbols. */
+enum upb_symbol_type {
+  UPB_SYM_MESSAGE,
+  UPB_SYM_ENUM,
+  UPB_SYM_SERVICE,
+  UPB_SYM_EXTENSION
+};
+
 struct upb_symtab_entry {
  struct upb_strtable_entry e;
  enum upb_symbol_type type;
@ -39,17 +48,12 @@ struct upb_context {
  struct google_protobuf_FileDescriptorSet **fds;
 };

-/* Initializes and frees a upb_context, respectively.  Newly initialized
- * contexts will always have the types in descriptor.proto defined. */
+/* Initializes and frees a upb_context, respectively. */
 bool upb_context_init(struct upb_context *c);
 void upb_context_free(struct upb_context *c);

 /* Looking up symbols. ********************************************************/

-/* Nested type names are separated by periods. */
-#define UPB_CONTEXT_SEPARATOR '.'
-#define UPB_SYM_MAX_LENGTH 256
-
 /* Resolves the given symbol using the rules described in descriptor.proto,
 * namely:
 *
--- a/upb_msg.c
+++ b/upb_msg.c
@ -9,6 +9,7 @@
 #include "upb_msg.h"
 #include "upb_parse.h"

+/* Rounds p up to the next multiple of t. */
 #define ALIGN_UP(p, t) ((p) % (t) == 0 ? (p) : (p) + ((t) - ((p) % (t))))

 static int div_round_up(int numerator, int denominator) {
@ -55,6 +56,7 @@ bool upb_msg_init(struct upb_msg *m, struct google_protobuf_DescriptorProto *d)
    /* We count on the caller to keep this pointer alive. */
    m->field_descriptors[i] = d->field->elements[i];
  }
+  /* TODO: re-enable proper sorting once the compiler is sorted out. */
  //qsort(m->field_descriptors, m->num_fields, sizeof(void*), compare_fields);

  size_t max_align = 0;
@ -77,7 +79,7 @@ bool upb_msg_init(struct upb_msg *m, struct google_protobuf_DescriptorProto *d)

    /* Insert into the tables.  Note that f->ref will be uninitialized, even in
     * the tables' copies of *f, which is why we must update them separately
-     * when the references are resolved. */
+     * in upb_msg_ref() below. */
    struct upb_fieldsbynum_entry nument = {.e = {.key = fd->number}, .f = *f};
    struct upb_fieldsbyname_entry strent = {.e = {.key = *fd->name}, .f = *f};
    upb_inttable_insert(&m->fields_by_num, &nument.e);
@ -96,15 +98,6 @@ void upb_msg_free(struct upb_msg *m)
  free(m->field_descriptors);
 }

-void *upb_msg_new(struct upb_msg *m)
-{
-  void *msg = malloc(m->size);
-  memset(msg, 0, m->size);  /* Clear all pointers, values, and set bits. */
-  return msg;
-}
-
-//void upb_msg_free(void *msg, struct upb_msg *m, bool free_submsgs);
-
 void upb_msg_ref(struct upb_msg *m, struct upb_msg_field *f,
                 union upb_symbol_ref ref) {
  struct google_protobuf_FieldDescriptorProto *d =
@ -119,23 +112,35 @@ void upb_msg_ref(struct upb_msg *m, struct upb_msg_field *f,
  str_e->f.ref = ref;
 }

+/* Memory management  *********************************************************/
+
+/* Our memory management scheme is as follows:
+ *
+ * All pointers to dynamic memory (strings, arrays, and submessages) are
+ * expected to be good pointers if they are non-zero, *regardless* of whether
+ * that field's bit is set!  That way we can reuse the memory even if the field
+ * is unset and then set later. */
+
+/* For our memory-managed strings and arrays we store extra information
+ * (compared to a plain upb_string or upb_array).  But the data starts with
+ * a upb_string and upb_array, so we can overlay onto the regular types. */
 struct mm_upb_string {
  struct upb_string s;
+  /* Track the allocated size, so we know when we need to reallocate. */
  uint32_t size;
+  /* Our allocated data.  Stored separately so that clients can point s.ptr to
+   * a referenced string, but we can reuse this data later. */
  char *data;
 };

 struct mm_upb_array {
  struct upb_array a;
+  /* Track the allocated size, so we know when we need to reallocate. */
  uint32_t size;
-  char *data;
 };

 static uint32_t round_up_to_pow2(uint32_t v)
 {
-#if 0 // __GNUC__
-  return (1U<<31) >> (__builtin_clz(v-1)+1);
-#else
  /* cf. http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */
  v--;
  v |= v >> 1;
@ -145,7 +150,54 @@ static uint32_t round_up_to_pow2(uint32_t v)
  v |= v >> 16;
  v++;
  return v;
-#endif
+}
+
+void *upb_msgdata_new(struct upb_msg *m)
+{
+  /* Allocates m->size bytes of message data with every byte zeroed, which
+   * clears all pointers, values, and set bits.  Returns NULL if the
+   * allocation fails (calloc never hands unzeroed or NULL memory to a
+   * memset, unlike a bare malloc+memset pair). */
+  return calloc(1, m->size);
+}
+
+static void free_value(union upb_value_ptr p, struct upb_msg_field *f,
+                       bool free_submsgs)
+{
+  /* Releases the dynamic memory referenced by the single value at p:
+   *  - string/bytes: the mm_upb_string's data buffer and the string itself.
+   *  - message: the submessage, but only when free_submsgs is true.
+   *  - anything else: nothing to release. */
+  if(f->type == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_STRING ||
+     f->type == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_BYTES) {
+    struct mm_upb_string *owned = (void*)*p.string;
+    if(!owned) return;  /* Never allocated. */
+    free(owned->data);
+    free(owned);
+  } else if(f->type == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_TYPE_MESSAGE) {
+    if(!free_submsgs) return;
+    upb_msgdata_free(*p.message, f->ref.msg, free_submsgs);
+  }
+}
+
+void upb_msgdata_free(void *data, struct upb_msg *m, bool free_submsgs)
+{
+  /* Frees message data of type m along with the strings and arrays its fields
+   * point to; submessages are freed only when free_submsgs is true.  Like
+   * free(), passing NULL data is a no-op. */
+  if(data == NULL) return;
+  for(unsigned int idx = 0; idx < m->num_fields; idx++) {
+    struct upb_msg_field *field = &m->fields[idx];
+    union upb_value_ptr slot = upb_msg_getptr(data, field);
+    if(field->label != GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REPEATED) {
+      free_value(slot, field, free_submsgs);
+      continue;
+    }
+    /* Repeated field: free each element in [0, len), then the element
+     * storage, then the array struct itself. */
+    struct upb_array *arr = *slot.array;
+    if(arr == NULL) continue;
+    for(uint32_t n = 0; n < arr->len; n++) {
+      free_value(upb_array_getelementptr(arr, n, field->type),
+                 field, free_submsgs);
+    }
+    free(arr->elements._void);
+    free(arr);
+  }
+  free(data);
+}

 void upb_msg_reuse_str(struct upb_string **str, uint32_t size)
@ -185,11 +237,11 @@ void upb_msg_reuse_strref(struct upb_string **str) { upb_msg_reuse_str(str, 0);

 void upb_msg_reuse_submsg(void **msg, struct upb_msg *m)
 {
-  if(!*msg) *msg = upb_msg_new(m);
+  if(!*msg) *msg = upb_msgdata_new(m);
  else upb_msg_clear(*msg, m); /* Clears set bits, leaves pointers. */
 }

-/* Parser. */
+/* Serialization/Deserialization.  ********************************************/

 struct parse_frame_data {
  struct upb_msg *m;
@ -217,7 +269,7 @@ static upb_field_type_t tag_cb(struct upb_parse_state *s, struct upb_tag *tag,

 static union upb_value_ptr get_value_ptr(void *data, struct upb_msg_field *f)
 {
-  union upb_value_ptr p = upb_msg_get_ptr(data, f);
+  union upb_value_ptr p = upb_msg_getptr(data, f);
  if(f->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REPEATED) {
    size_t len = upb_msg_is_set(data, f) ? (*p.array)->len : 0;
    upb_msg_reuse_array(p.array, len+1, f->type);
@ -270,25 +322,18 @@ static void submsg_start_cb(struct upb_parse_state *_s, void *user_field_desc)
  if(!s->merge) upb_msg_clear(frame->data, f->ref.msg);
 }

-static void submsg_end_cb(struct upb_parse_state *s)
-{
-  struct parse_frame_data *frame = (void*)&s->top->user_data;
-}
-
-
 void upb_msg_parse_init(struct upb_msg_parse_state *s, void *msg,
                        struct upb_msg *m, bool merge, bool byref)
 {
  upb_parse_init(&s->s, sizeof(struct parse_frame_data));
  s->merge = merge;
  s->byref = byref;
-  if(!merge && msg == NULL) msg = upb_msg_new(m);
+  if(!merge && msg == NULL) msg = upb_msgdata_new(m);
  set_frame_data(&s->s, m, msg);
  s->s.tag_cb = tag_cb;
  s->s.value_cb = value_cb;
  s->s.str_cb = str_cb;
  s->s.submsg_start_cb = submsg_start_cb;
-  s->s.submsg_end_cb = submsg_end_cb;
 }

 void upb_msg_parse_free(struct upb_msg_parse_state *s)
@ -305,10 +350,11 @@ upb_status_t upb_msg_parse(struct upb_msg_parse_state *s,
 void *upb_alloc_and_parse(struct upb_msg *m, struct upb_string *str, bool byref)
 {
  struct upb_msg_parse_state s;
-  void *msg = upb_msg_new(m);
+  void *msg = upb_msgdata_new(m);
  upb_msg_parse_init(&s, msg, m, false, byref);
  size_t read;
  upb_status_t status = upb_msg_parse(&s, str->ptr, str->byte_len, &read);
+  upb_msg_parse_free(&s);
  if(status == UPB_STATUS_OK && read == str->byte_len) {
    return msg;
  } else {
@ -370,7 +416,7 @@ void upb_msg_print(void *data, struct upb_msg *m, FILE *stream)
    if(upb_msg_is_set(data, f)) fputs(" (set): ", stream);
    else fputs(" (NOT set): ", stream);

-    union upb_value_ptr p = upb_msg_get_ptr(data, f);
+    union upb_value_ptr p = upb_msg_getptr(data, f);
    if(f->label == GOOGLE_PROTOBUF_FIELDDESCRIPTORPROTO_LABEL_REPEATED) {
      if(*p.array) {
        fputc('[', stream);
--- a/upb_msg.h
+++ b/upb_msg.h
@ -3,46 +3,100 @@
 *
 * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
 *
- * upb_msg contains a full description of a message as defined in a .proto file.
- * It supports many features and operations for dealing with proto messages:
+ * A upb_msg provides a full description of a message as defined in a .proto
+ * file.  It supports many features and operations for dealing with proto
+ * messages:
 * - reflection over .proto types at runtime (list fields, get names, etc).
 * - an in-memory byte-level format for efficiently storing and accessing msgs.
 * - serializing and deserializing from the in-memory format to a protobuf.
 * - optional memory management for handling strings, arrays, and submessages.
 *
+ * Throughout this file, the following convention is used:
+ * - "struct upb_msg *m" describes a message type (name, list of fields, etc).
+ * - "void *data" is an actual message stored using the in-memory format.
+ *
 * The in-memory format is very much like a C struct that you can define at
 * run-time, but also supports reflection.  Like C structs it supports
 * offset-based access, as opposed to the much slower name-based lookup.  The
- * format represents both the values themselves and bits describing whether each
- * field is set or not.
+ * format stores both the values themselves and bits describing whether each
+ * field is set or not.  For example:
+ *
+ * parsed message Foo {
+ *   optional bool a = 1;
+ *   repeated uint32 b = 2;
+ *   optional Bar c = 3;
+ * }
+ *
+ * The in-memory layout for this message on a 32-bit machine will be something
+ * like:
+ *
+ *  Foo
+ * +------------------------+
+ * | set_flags a:1, b:1, c:1|
+ * +------------------------+
+ * | bool a (1 byte)        |
+ * +------------------------+
+ * | padding (3 bytes)      |
+ * +------------------------+         upb_array
+ * | upb_array* b (4 bytes) | ---->  +----------------------------+
+ * +------------------------+        | uint32* elements (4 bytes) | ---+
+ * | Bar* c (4 bytes)       |        +----------------------------+    |
+ * +------------------------+        | uint32 len (4 bytes)       |    |
+ *                                   +----------------------------+    |
+ *                                                                     |
+ *    -----------------------------------------------------------------+
+ *    |
+ *    V
+ *  uint32 array
+ * +----+----+----+----+----+----+
+ * | e1 | e2 | e3 | e4 | e5 | e6 |
+ * +----+----+----+----+----+----+
+ *
+ * And the corresponding C structure (as emitted by the proto compiler) would be:
 *
- * The upb compiler emits C structs that mimic this definition exactly, so that
- * you can access the same hunk of memory using either this run-time
- * reflection-supporting interface or a C struct that was generated by the upb
- * compiler.
+ * struct Foo {
+ *   union {
+ *     uint8_t bytes[1];
+ *     struct {
+ *       bool a:1;
+ *       bool b:1;
+ *       bool c:1;
+ *     } has;
+ *   } set_flags;
+ *   bool a;
+ *   upb_uint32_array *b;
+ *   Bar *c;
+ * }
 *
- * Like C structs the format depends on the endianness of the host machine, so
- * it is not suitable for exchanging across machines of differing endianness.
- * But there is no reason to do that -- the protobuf serialization format is
- * designed already for serialization/deserialization, and is more compact than
- * this format.  This format is designed to allow the fastest possible random
- * access of individual fields.
+ * Because the C struct emitted by the upb compiler uses exactly the same
+ * byte-level format as the reflection interface, you can access the same hunk
+ * of memory either way.  The C struct provides maximum performance and static
+ * type safety; upb_msg provides flexibility.
 *
- * Note that clients need not use the memory management facilities defined here.
- * They are for convenience only -- clients wishing to do their own memory
- * management may do so (allowing clients to perform advanced techniques like
- * reference-counting, garbage collection, and string references).  Different
+ * The in-memory format has no interoperability guarantees whatsoever, except
+ * that a single version of upb will interoperate with itself.  Don't even
+ * think about persisting the in-memory format or sending it anywhere.  That's
+ * what serialized protobufs are for!  The in-memory format is just that -- an
+ * in-memory representation that allows for fast access.
+ *
+ * The in-memory format is carefully designed to *not* mandate any particular
+ * memory management scheme.  This should make it easier to integrate with
+ * existing memory management schemes, or to perform advanced techniques like
+ * reference counting, garbage collection, and string references.  Different
 * clients can read each other's messages regardless of what memory management
 * scheme each is using.
+ *
+ * A memory management scheme is provided for convenience, and it is used by
+ * default by the stock message parser.  Clients can substitute their own
+ * memory management scheme into this parser without any loss of generality
+ * or performance.
 */

 #ifndef UPB_MSG_H_
 #define UPB_MSG_H_

 #include <stdbool.h>
-#include <stddef.h>
 #include <stdint.h>
-#include <string.h>

 #include "upb.h"
 #include "upb_table.h"
@ -59,7 +113,10 @@ struct google_protobuf_FieldDescriptorProto;
 /* Message definition. ********************************************************/

 /* Structure that describes a single field in a message.  This structure is very
- * consciously designed to fit into 12/16 bytes (32/64 bit, respectively). */
+ * consciously designed to fit into 12/16 bytes (32/64 bit, respectively),
+ * because copies of this struct are in the hash table that is read in the
+ * critical path of parsing.  Minimizing the size of this struct increases
+ * cache-friendliness. */
 struct upb_msg_field {
  union upb_symbol_ref ref;
  uint32_t byte_offset;     /* Where to find the data. */
@ -102,7 +159,7 @@ INLINE struct google_protobuf_FieldDescriptorProto *upb_msg_field_descriptor(
  return m->field_descriptors[f->field_index];
 }

-/* Initialize and free a upb_msg.  Caller retains ownership of d, but the msg
+/* Initializes/frees a upb_msg.  Caller retains ownership of d, but the msg
 * will contain references to it, so it must outlive the msg.  Note that init
 * does not resolve upb_msg_field.ref -- the caller should do that
 * post-initialization by calling upb_msg_ref() below. */
@ -114,9 +171,9 @@ void upb_msg_free(struct upb_msg *m);
 * mutually-recursive ways, this step must be separated from initialization. */
 void upb_msg_ref(struct upb_msg *m, struct upb_msg_field *f, union upb_symbol_ref ref);

-/* While these are written to be as fast as possible, it will still be faster
- * to cache the results of this lookup if possible.  These return NULL if no
- * such field is found. */
+/* Looks up a field by name or number.  While these are written to be as fast
+ * as possible, it will still be faster to cache the results of this lookup if
+ * possible.  These return NULL if no such field is found. */
 INLINE struct upb_msg_field *upb_msg_fieldbynum(struct upb_msg *m,
                                                uint32_t number) {
  struct upb_fieldsbynum_entry *e = upb_inttable_lookup(
@ -130,33 +187,69 @@ INLINE struct upb_msg_field *upb_msg_fieldbyname(struct upb_msg *m,
  return e ? &e->f : NULL;
 }

+/* "Set" flag reading and writing.  *******************************************/
+
+INLINE size_t upb_isset_offset(uint32_t field_index) {
+  /* Eight flags per byte: byte offset of the flag for this field index. */
+  return field_index >> 3;
+}
+
+INLINE uint8_t upb_isset_mask(uint32_t field_index) {
+  /* Single-bit mask selecting this field's flag within its byte. */
+  const uint32_t bit = field_index & 7;
+  return (uint8_t)(1u << bit);
+}
+
+/* Functions for reading and writing the "set" flags in the msg.  Note that
+ * these do not perform memory management associated with any dynamic memory
+ * these fields may be referencing. These *only* set and test the flags. */
+INLINE void upb_msg_set(void *s, struct upb_msg_field *f)
+{
+  /* Turns on the "set" flag for f in message data s; touches nothing else. */
+  char *flag_byte = (char*)s + upb_isset_offset(f->field_index);
+  *flag_byte |= upb_isset_mask(f->field_index);
+}
+
+INLINE void upb_msg_unset(void *s, struct upb_msg_field *f)
+{
+  /* Turns off the "set" flag for f in message data s; touches nothing else. */
+  char *flag_byte = (char*)s + upb_isset_offset(f->field_index);
+  *flag_byte &= ~upb_isset_mask(f->field_index);
+}
+
+INLINE bool upb_msg_is_set(void *s, struct upb_msg_field *f)
+{
+  /* Reports whether the "set" flag for f is on in message data s. */
+  const char *flag_byte = (char*)s + upb_isset_offset(f->field_index);
+  return *flag_byte & upb_isset_mask(f->field_index);
+}
+
+/* Returns true if the "set" flags for the first m->num_required_fields field
+ * indexes are all on in message data s.  Flags belonging to other fields are
+ * ignored, so a set optional field that shares a flag byte with the last
+ * required field no longer causes a false negative, and a message type with
+ * no required fields is always complete.
+ * NOTE(review): this relies on required fields occupying the lowest field
+ * indexes -- confirm against the field ordering done in upb_msg_init(). */
+INLINE bool upb_msg_all_required_fields_set(void *s, struct upb_msg *m)
+{
+  const uint8_t *flags = (const uint8_t*)s;
+  int num_fields = m->num_required_fields;
+  int i = 0;
+  /* Whole bytes of required flags must be completely set. */
+  while(num_fields > 8) {
+    if(flags[i++] != 0xFF) return false;
+    num_fields -= 8;
+  }
+  /* Nothing left to check; also avoids reading a flag byte needlessly. */
+  if(num_fields == 0) return true;
+  /* Test only the low num_fields bits of the final byte. */
+  const uint8_t mask = (uint8_t)((1 << num_fields) - 1);
+  return (flags[i] & mask) == mask;
+}
+
+INLINE void upb_msg_clear(void *s, struct upb_msg *m)
+{
+  /* Zeroes the leading m->set_flags_bytes bytes of s (the "set" flags);
+   * the rest of the message data is left as it was. */
+  const size_t flag_bytes = m->set_flags_bytes;
+  memset(s, 0, flag_bytes);
+}
+
+/* Scalar (non-array) data access. ********************************************/
+
+/* Returns a pointer to a specific field in a message. */
+INLINE union upb_value_ptr upb_msg_getptr(void *data, struct upb_msg_field *f) {
+  /* The field lives f->byte_offset bytes past the start of the message data;
+   * the caller picks the union member matching the field's type. */
+  union upb_value_ptr field_ptr;
+  field_ptr._void = (char*)data + f->byte_offset;
+  return field_ptr;
+}
+
 /* Arrays. ********************************************************************/

 /* Represents an array (a repeated field) of any type.  The interpretation of
 * the data in the array depends on the type. */
 struct upb_array {
-  union {
-    double   *_double;
-    float    *_float;
-    int32_t  *int32;
-    int64_t  *int64;
-    uint32_t *uint32;
-    uint64_t *uint64;
-    bool     *_bool;
-    struct upb_string **string;
-    void     **submsg;
-    void     *_void;
-  } elements;
+  union upb_value_ptr elements;
  uint32_t len;     /* Measured in elements. */
 };

-/* These are all overlays on upb_array, pointers between them can be cast. */
-#define UPB_DEFINE_ARRAY_TYPE(name, type) \
-  struct name ## _array { \
-    type *elements; \
-    uint32_t len; \
-  };
-
+/* Returns a pointer to an array element. */
 INLINE union upb_value_ptr upb_array_getelementptr(
    struct upb_array *arr, uint32_t n, upb_field_type_t type)
 {
@ -166,6 +259,13 @@ INLINE union upb_value_ptr upb_array_getelementptr(
  return ptr;
 }

+/* These are all overlays on upb_array, pointers between them can be cast. */
+#define UPB_DEFINE_ARRAY_TYPE(name, type) \
+  struct name ## _array { \
+    type *elements; \
+    uint32_t len; \
+  };
+
 UPB_DEFINE_ARRAY_TYPE(upb_double, double)
 UPB_DEFINE_ARRAY_TYPE(upb_float,  float)
 UPB_DEFINE_ARRAY_TYPE(upb_int32,  int32_t)
@ -175,6 +275,7 @@ UPB_DEFINE_ARRAY_TYPE(upb_uint64, uint64_t)
 UPB_DEFINE_ARRAY_TYPE(upb_bool,   bool)
 UPB_DEFINE_ARRAY_TYPE(upb_string, struct upb_string*)

+/* Defines an array of a specific message type. */
 #define UPB_MSG_ARRAY(msg_type) struct msg_type ## _array
 #define UPB_DEFINE_MSG_ARRAY(msg_type) \
  UPB_MSG_ARRAY(msg_type) { \
@ -182,52 +283,42 @@ UPB_DEFINE_ARRAY_TYPE(upb_string, struct upb_string*)
    uint32_t len; \
  };

-/* Accessors for primitive types.  ********************************************/
+/* Memory management  *********************************************************/

-/* For each primitive type we define a set of three functions:
- *
- *  // For fetching out of a msg (s points to the raw msg data).
- *  int32_t *upb_msg_get_int32_ptr(void *s, struct upb_msg_field *f);
- *  int32_t upb_msg_get_int32(void *s, struct upb_msg_field *f);
- *  void upb_msg_set_int32(void *s, struct upb_msg_field *f, int32_t val);
- *
- * These do no existence checks, bounds checks, or type checks. */
-
-#define UPB_DEFINE_ACCESSORS(INLINE, name, ctype) \
-  INLINE ctype *upb_msg_get_ ## name ## _ptr( \
-      void *s, struct upb_msg_field *f) { \
-    return (ctype*)((char*)s + f->byte_offset); \
-  } \
-  INLINE ctype upb_msg_get_ ## name( \
-      void *s, struct upb_msg_field *f) { \
-    return *upb_msg_get_ ## name ## _ptr(s, f); \
-  } \
-  INLINE void upb_msg_set_ ## name( \
-      void *s, struct upb_msg_field *f, ctype val) { \
-    *upb_msg_get_ ## name ## _ptr(s, f) = val; \
-  }
+/* One important note about these memory management routines: they must be used
+ * completely or not at all (for each message).  In other words, you can't
+ * allocate your own message and then free it with upb_msgdata_free.  As
+ * another example, you can't point a field to your own string and then call
+ * upb_msg_reuse_str. */
+
+/* Allocates and frees message data, respectively.  Newly allocated data is
+ * initialized to empty.  Freeing a message always frees string data, but
+ * the client can decide whether or not submessages should be deleted. */
+void *upb_msgdata_new(struct upb_msg *m);
+void upb_msgdata_free(void *data, struct upb_msg *m, bool free_submsgs);
+
+/* Given a pointer to the appropriate field of the message or array, these
+ * functions will lazily allocate memory for a string, array, or submessage.
+ * If the previously allocated memory is big enough, it will reuse it without
+ * re-allocating.  See upb_msg.c for example usage. */
+
+/* Reuse a string of at least the given size. */
+void upb_msg_reuse_str(struct upb_string **str, uint32_t size);
+/* Like the previous, but assumes that the string will be by reference, so
+ * doesn't allocate memory for the string itself. */
+void upb_msg_reuse_strref(struct upb_string **str);

-UPB_DEFINE_ACCESSORS(INLINE, double, double)
-UPB_DEFINE_ACCESSORS(INLINE, float,  float)
-UPB_DEFINE_ACCESSORS(INLINE, int32,  int32_t)
-UPB_DEFINE_ACCESSORS(INLINE, int64,  int64_t)
-UPB_DEFINE_ACCESSORS(INLINE, uint32, uint32_t)
-UPB_DEFINE_ACCESSORS(INLINE, uint64, uint64_t)
-UPB_DEFINE_ACCESSORS(INLINE, bool,   bool)
-UPB_DEFINE_ACCESSORS(INLINE, bytes,  struct upb_string*)
-UPB_DEFINE_ACCESSORS(INLINE, string, struct upb_string*)
-UPB_DEFINE_ACCESSORS(INLINE, submsg, void*)
-UPB_DEFINE_ACCESSORS(INLINE, array,  struct upb_array*)
-
-INLINE union upb_value_ptr upb_msg_get_ptr(
-    void *data, struct upb_msg_field *f) {
-  union upb_value_ptr p = {._void = ((char*)data + f->byte_offset)};
-  return p;
-}
+/* Reuse an array of at least the given size, with the given type. */
+void upb_msg_reuse_array(struct upb_array **arr, uint32_t size,
+                         upb_field_type_t t);

-/* Memory management  *********************************************************/
+/* Reuse a submessage of the given type. */
+void upb_msg_reuse_submsg(void **msg, struct upb_msg *m);

-void *upb_msg_new(struct upb_msg *m);
+/* Serialization/Deserialization.  ********************************************/
+
+/* This is all just a layer on top of the stream-oriented facility in
+ * upb_parse.h. */

 struct upb_msg_parse_state {
  struct upb_parse_state s;
@ -236,70 +327,32 @@ struct upb_msg_parse_state {
  struct upb_msg *m;
 };

-void upb_msg_parse_init(struct upb_msg_parse_state *s, void *msg,
+/* Initializes/frees a message parser.  The parser will write the data to the
+ * message data "data", which the caller must have previously allocated (the
+ * parser will allocate submsgs, strings, and arrays as needed, however).
+ *
+ * "merge" controls whether the parser will append to data instead of
+ * overwriting.  Merging concatenates arrays and merges submessages instead
+ * of clearing both.
+ *
+ * "byref" controls whether the new message data copies or references strings
+ * it encounters.  If byref == true, then all strings supplied to upb_msg_parse
+ * must remain unchanged and must outlive data. */
+void upb_msg_parse_init(struct upb_msg_parse_state *s, void *data,
                        struct upb_msg *m, bool merge, bool byref);
 void upb_msg_parse_free(struct upb_msg_parse_state *s);
+
+/* Parses a protobuf fragment, writing the data to the message that was passed
+ * to upb_msg_parse_init.  This function can be called multiple times as more
+ * data becomes available. */
 upb_status_t upb_msg_parse(struct upb_msg_parse_state *s,
                           void *data, size_t len, size_t *read);

+/* Parses the protobuf in s (which is expected to be complete) and allocates
+ * new message data to hold it.  This is an alternative to the streaming API
+ * above.  "byref" works as in upb_msg_parse_init(). */
 void *upb_alloc_and_parse(struct upb_msg *m, struct upb_string *s, bool byref);

-/* Note!  These two may not be use on a upb_string* that was initialized by
- * means other than these functions. */
-void upb_msg_reuse_str(struct upb_string **str, uint32_t len);
-void upb_msg_reuse_array(struct upb_array **arr, uint32_t n, upb_field_type_t t);
-void upb_msg_reuse_strref(struct upb_string **str);
-void upb_msg_reuse_submsg(void **msg, struct upb_msg *m);
-
-/* "Set" flag reading and writing.  *******************************************/
-
-INLINE size_t upb_isset_offset(uint32_t field_index) {
-  return field_index / 8;
-}
-
-INLINE uint8_t upb_isset_mask(uint32_t field_index) {
-  return 1 << (field_index % 8);
-}
-
-/* Functions for reading and writing the "set" flags in the msg.  Note that
- * these do not perform memory management associated with any dynamic memory
- * these fields may be referencing. These *only* set and test the flags. */
-INLINE void upb_msg_set(void *s, struct upb_msg_field *f)
-{
-  ((char*)s)[upb_isset_offset(f->field_index)] |= upb_isset_mask(f->field_index);
-}
-
-INLINE void upb_msg_unset(void *s, struct upb_msg_field *f)
-{
-  ((char*)s)[upb_isset_offset(f->field_index)] &= ~upb_isset_mask(f->field_index);
-}
-
-INLINE bool upb_msg_is_set(void *s, struct upb_msg_field *f)
-{
-  return ((char*)s)[upb_isset_offset(f->field_index)] & upb_isset_mask(f->field_index);
-}
-
-INLINE bool upb_msg_all_required_fields_set(void *s, struct upb_msg *m)
-{
-  int num_fields = m->num_required_fields;
-  int i = 0;
-  while(num_fields > 8) {
-    if(((uint8_t*)s)[i++] != 0xFF) return false;
-    num_fields -= 8;
-  }
-  if(((uint8_t*)s)[i] != (1 << num_fields) - 1) return false;
-  return true;
-}
-
-INLINE void upb_msg_clear(void *s, struct upb_msg *m)
-{
-  memset(s, 0, m->set_flags_bytes);
-}
-
-/* Serialization/Deserialization.  ********************************************/
-
-/* Parses the string data in s according to the message description in m. */
-upb_status_t upb_msg_merge(void *data, struct upb_msg *m, struct upb_string *s);

 /* Text dump  *****************************************************************/

--- a/upb_parse.h
+++ b/upb_parse.h
@ -18,6 +18,33 @@
 extern "C" {
 #endif

+/* Definitions. ***************************************************************/
+
+/* A list of types as they are encoded on-the-wire. */
+enum upb_wire_type {
+  UPB_WIRE_TYPE_VARINT      = 0,
+  UPB_WIRE_TYPE_64BIT       = 1,
+  UPB_WIRE_TYPE_DELIMITED   = 2,
+  UPB_WIRE_TYPE_START_GROUP = 3,
+  UPB_WIRE_TYPE_END_GROUP   = 4,
+  UPB_WIRE_TYPE_32BIT       = 5
+};
+typedef uint8_t upb_wire_type_t;
+
+/* A value as it is encoded on-the-wire, except delimited, which is handled
+ * separately. */
+union upb_wire_value {
+  uint64_t varint;
+  uint64_t _64bit;
+  uint32_t _32bit;
+};
+
+/* A tag occurs before each value on-the-wire. */
+struct upb_tag {
+  upb_field_number_t field_number;
+  upb_wire_type_t wire_type;
+};
+
 /* High-level parsing interface. **********************************************/

 struct upb_parse_state;
--- a/upbc.c
+++ b/upbc.c
@ -65,7 +65,7 @@ static void write_header(struct upb_symtab_entry entries[], int num_entries,

    struct upb_string enum_val_prefix = upb_strdup(entry->e.key);
    enum_val_prefix.byte_len = memrchr(enum_val_prefix.ptr,
-                                       UPB_CONTEXT_SEPARATOR,
+                                       UPB_SYMBOL_SEPARATOR,
                                       enum_val_prefix.byte_len);
    enum_val_prefix.byte_len++;
    to_preproc(enum_val_prefix);
@ -135,7 +135,7 @@ static void write_header(struct upb_symtab_entry entries[], int num_entries,
        /* Submessages get special treatment, since we have to use the message
         * name directly. */
        struct upb_string type_name_ref = *fd->type_name;
-        if(type_name_ref.ptr[0] == UPB_CONTEXT_SEPARATOR) {
+        if(type_name_ref.ptr[0] == UPB_SYMBOL_SEPARATOR) {
          /* Omit leading '.'. */
          type_name_ref.ptr++;
          type_name_ref.byte_len--;
@ -207,5 +207,6 @@ int main()
  struct upb_string name = UPB_STRLIT("descriptor.proto");
  write_header(entries, symcount, name, stdout);
  upb_context_free(&c);
+  upb_strfree(fds);
 }