Flesh out implementation of upb_sizebuilder.

15 years ago · 036fe6bb06
parent 611afe9c69
commit 036fe6bb06
5 changed files with 189 additions and 59 deletions
--- a/src/upb.h
+++ b/src/upb.h
@@ -272,14 +272,14 @@ enum upb_status_code {
  // The input byte stream ended in the middle of a record.
  UPB_STATUS_NEED_MORE_DATA = 1,

-  // The user value callback opted to stop parsing.
-  UPB_STATUS_USER_CANCELLED = 2,
-
  // An unrecoverable error occurred.
  UPB_STATUS_ERROR = -1,

  // A varint went for 10 bytes without terminating.
-  UPB_ERROR_UNTERMINATED_VARINT = -2
+  UPB_ERROR_UNTERMINATED_VARINT = -2,
+
+  // The max nesting level (UPB_MAX_NESTING) was exceeded.
+  UPB_ERROR_MAX_NESTING_EXCEEDED = -3
 };

 #define UPB_ERRORMSG_MAXLEN 256
--- a/src/upb_decoder.c
+++ b/src/upb_decoder.c
@@ -207,10 +207,8 @@ INLINE const uint8_t *decode_tag(const uint8_t *buf, const uint8_t *end,
 }


-/**
- * Parses a 64-bit varint that is known to be >= 2 bytes (the inline version
- * handles 1 and 2 byte varints).
- */
+// Parses a 64-bit varint that is known to be >= 2 bytes (the inline version
+// handles 1 and 2 byte varints).
 const uint8_t *upb_get_v_uint64_t_full(const uint8_t *buf, const uint8_t *end,
                                       uint64_t *val, upb_status *status)
 {
@@ -367,10 +365,8 @@ INLINE bool upb_check_type(upb_wire_type_t wt, upb_field_type_t ft) {
 }


-/**
- * Pushes a new stack frame for a submessage with the given len (which will
- * be zero if the submessage is a group).
- */
+// Pushes a new stack frame for a submessage with the given len (which will
+// be zero if the submessage is a group).
 static const uint8_t *push(upb_decoder *d, const uint8_t *start,
                           uint32_t submsg_len, upb_fielddef *f,
                           upb_status *status)
@@ -378,7 +374,7 @@ static const uint8_t *push(upb_decoder *d, const uint8_t *start,
  d->top->field = f;
  d->top++;
  if(d->top >= d->limit) {
-    upb_seterr(status, UPB_STATUS_ERROR,
+    upb_seterr(status, UPB_ERROR_MAX_NESTING_EXCEEDED,
               "Nesting exceeded maximum (%d levels)\n",
               UPB_MAX_NESTING);
    return NULL;
@@ -391,10 +387,8 @@ static const uint8_t *push(upb_decoder *d, const uint8_t *start,
  return get_msgend(d, start);
 }

-/**
- * Pops a stack frame, returning a pointer for where the next submsg should
- * end (or a pointer that is out of range for a group).
- */
+// Pops a stack frame, returning a pointer for where the next submsg should
+// end (or a pointer that is out of range for a group).
 static const void *pop(upb_decoder *d, const uint8_t *start)
 {
  d->top--;
--- a/src/upb_encoder.c
+++ b/src/upb_encoder.c
@@ -5,9 +5,11 @@
 */

 #include "upb_encoder.h"
+
+#include <stdlib.h>
 #include "descriptor.h"

-/* Functions for calculating sizes. *******************************************/
+/* Functions for calculating sizes of wire values. ****************************/

 static size_t upb_v_uint64_t_size(uint64_t val) {
 #ifdef __GNUC__
@@ -103,9 +105,9 @@ static uint8_t *upb_put_f_uint64_t(uint8_t *buf, uint64_t val)
  return uint64_end;
 }

-/* Functions to write .proto values. ******************************************/
+/* Functions to write and calculate sizes for .proto values. ******************/

-/* Performs zig-zag encoding, which is used by sint32 and sint64. */
+// Performs zig-zag encoding, which is used by sint32 and sint64.
 static uint32_t upb_zzenc_32(int32_t n) { return (n << 1) ^ (n >> 31); }
 static uint64_t upb_zzenc_64(int64_t n) { return (n << 1) ^ (n >> 63); }

@@ -167,7 +169,7 @@ T(FLOAT,    f, uint32_t, float,    _float)  {
 #undef PUT
 #undef T

-uint8_t *upb_encode_value(uint8_t *buf, upb_field_type_t ft, upb_value v)
+static uint8_t *upb_encode_value(uint8_t *buf, upb_field_type_t ft, upb_value v)
 {
 #define CASE(t, member_name) \
  case UPB_TYPE(t): return upb_put_ ## t(buf, v.member_name);
@@ -191,11 +193,127 @@ uint8_t *upb_encode_value(uint8_t *buf, upb_field_type_t ft, upb_value v)
 #undef CASE
 }

-uint8_t *_upb_put_tag(uint8_t *buf, upb_field_number_t fn, upb_wire_type_t wt)
+static uint32_t _upb_get_value_size(upb_field_type_t ft, upb_value v)
+{
+#define CASE(t, member_name) \
+  case UPB_TYPE(t): return upb_get_ ## t ## _size(v.member_name);
+  switch(ft) {
+    CASE(DOUBLE,   _double)
+    CASE(FLOAT,    _float)
+    CASE(INT32,    int32)
+    CASE(INT64,    int64)
+    CASE(UINT32,   uint32)
+    CASE(UINT64,   uint64)
+    CASE(SINT32,   int32)
+    CASE(SINT64,   int64)
+    CASE(FIXED32,  uint32)
+    CASE(FIXED64,  uint64)
+    CASE(SFIXED32, int32)
+    CASE(SFIXED64, int64)
+    CASE(BOOL,     _bool)
+    CASE(ENUM,     int32)
+    default: assert(false); return 0;
+  }
+#undef CASE
+}
+
+static uint8_t *_upb_put_tag(uint8_t *buf, upb_field_number_t num,
+                             upb_wire_type_t wt)
+{
+  return upb_put_UINT32(buf, wt | (num << 3));
+}
+
+static uint32_t _upb_get_tag_size(upb_field_number_t num)
+{
+  return upb_get_UINT32_size(num << 3);
+}
+
+
+/* upb_sizebuilder ************************************************************/
+
+struct upb_sizebuilder {
+  // Accumulating size for the current level.
+  uint32_t size;
+
+  // Stack of sizes for our current nesting.
+  uint32_t stack[UPB_MAX_NESTING], *top, *limit;
+
+  // Vector of sizes.
+  uint32_t *sizes;
+  int sizes_len;
+  int sizes_size;
+
+  upb_status status;
+};
+
+// upb_sink callbacks.
+static upb_sink_status _upb_sizebuilder_valuecb(upb_sink *sink, upb_fielddef *f,
+                                                upb_value val)
+{
+  upb_sizebuilder *sb = (upb_sizebuilder*)sink;
+  uint32_t size = 0;
+  size += _upb_get_tag_size(f->number);
+  size += _upb_get_value_size(f->type, val);
+  sb->size += size;
+  return UPB_SINK_CONTINUE;
+}
+
+static upb_sink_status _upb_sizebuilder_strcb(upb_sink *sink, upb_fielddef *f,
+                                              upb_strptr str,
+                                              int32_t start, uint32_t end)
+{
+  (void)str;   // String data itself is not used.
+  upb_sizebuilder *sb = (upb_sizebuilder*)sink;
+  if(start >= 0) {
+    uint32_t size = 0;
+    size += _upb_get_tag_size(f->number);
+    size += upb_get_UINT32_size(end - start);
+    sb->size += size;
+  }
+  return UPB_SINK_CONTINUE;
+}
+
+static upb_sink_status _upb_sizebuilder_startcb(upb_sink *sink, upb_fielddef *f)
+{
+  (void)f;  // Unused (we calculate tag size and delimiter in endcb).
+  upb_sizebuilder *sb = (upb_sizebuilder*)sink;
+  *sb->top = sb->size;
+  sb->top++;
+  sb->size = 0;
+  if(sb->top == sb->limit) {
+    upb_seterr(&sb->status, UPB_ERROR_MAX_NESTING_EXCEEDED,
+               "Nesting exceeded maximum (%d levels)\n",
+               UPB_MAX_NESTING);
+    return UPB_SINK_STOP;
+  }
+  return UPB_SINK_CONTINUE;
+}
+
+static upb_sink_status _upb_sizebuilder_endcb(upb_sink *sink, upb_fielddef *f)
 {
-  return upb_put_UINT32(buf, wt | (fn << 3));
+  upb_sizebuilder *sb = (upb_sizebuilder*)sink;
+  if(sb->sizes_len == sb->sizes_size) {
+    sb->sizes_size *= 2;
+    sb->sizes = realloc(sb->sizes, sb->sizes_size * sizeof(*sb->sizes));
+  }
+  sb->sizes[sb->sizes_len++] = sb->size;
+  sb->top--;
+  // The size according to the parent includes the tag size and delimiter of
+  // the submessage.
+  sb->size += upb_get_UINT32_size(sb->size);
+  sb->size += _upb_get_tag_size(f->number);
+  // Include size accumulated in parent before child began.
+  sb->size += *sb->top;
+  return UPB_SINK_CONTINUE;
 }

+upb_sink_callbacks _upb_sizebuilder_sink_vtbl = {
+  _upb_sizebuilder_valuecb,
+  _upb_sizebuilder_strcb,
+  _upb_sizebuilder_startcb,
+  _upb_sizebuilder_endcb
+};
+

 /* upb_sink callbacks *********************************************************/

@@ -283,34 +401,3 @@ upb_sink_callbacks _upb_encoder_sink_vtbl = {
  _upb_encoder_endcb
 };

-
-/* Public Interface ***********************************************************/
-
-size_t upb_get_encoded_size(upb_value v, upb_fielddef *f)
-{
-#define CASE(t, member_name) \
-  case UPB_TYPE(t): return upb_get_ ## t ## _size(v.member_name);
-  switch(f->type) {
-    CASE(DOUBLE,   _double)
-    CASE(FLOAT,    _float)
-    CASE(INT32,    int32)
-    CASE(INT64,    int64)
-    CASE(UINT32,   uint32)
-    CASE(UINT64,   uint64)
-    CASE(SINT32,   int32)
-    CASE(SINT64,   int64)
-    CASE(FIXED32,  uint32)
-    CASE(FIXED64,  uint64)
-    CASE(SFIXED32, int32)
-    CASE(SFIXED64, int64)
-    CASE(BOOL,     _bool)
-    CASE(ENUM,     int32)
-    default: assert(false); return 0;
-  }
-#undef CASE
-}
-
-size_t upb_get_encoded_tag_size(uint32_t fieldnum) {
-  return upb_v_uint64_t_size((uint64_t)fieldnum << 3);
-}
-
--- a/src/upb_encoder.h
+++ b/src/upb_encoder.h
@@ -20,13 +20,48 @@
 extern "C" {
 #endif

+/* upb_sizebuilder ************************************************************/
+
+// A upb_sizebuilder performs a pre-pass on data to be serialized that gathers
+// the sizes of submessages.  This size data is required for serialization,
+// because we have to know at the beginning of a submessage how many encoded
+// bytes the submessage will represent.
+struct upb_sizebuilder;
+typedef struct upb_sizebuilder upb_sizebuilder;
+
+upb_sizebuilder *upb_sizebuilder_new();
+void upb_sizebuilder_free(upb_sizebuilder *sb);
+
+// Returns a sink that must be used to perform the pre-pass.  Note that the
+// pre-pass *must* occur in the opposite order from the actual encode that
+// follows, and the data *must* be identical both times (except for the
+// reversed order).
+upb_sink *upb_sizebuilder_sink(upb_sizebuilder *sb);
+
+
+/* upb_encoder ****************************************************************/
+
+// A upb_encoder is a upb_sink that emits data to a upb_bytesink in the protocol
+// buffer binary wire format.
 struct upb_encoder;
 typedef struct upb_encoder upb_encoder;

 upb_encoder *upb_encoder_new();
 void upb_encoder_free(upb_encoder *s);

-void upb_encoder_reset(upb_encoder *s, uint32_t *sizes);
+// Resets the given upb_encoder such that it is ready to begin encoding.  The
+// upb_sizebuilder "sb" is used to determine submessage sizes; it must have
+// previously been initialized by feeding it the same data in reverse order.
+// "sb" may be null if and only if the data contains no submessages; groups
+// are ok and do not require sizes to be precalculated.  The upb_bytesink
+// "out" is where the encoded output data will be sent.
+//
+// Both "sb" and "out" must live until the encoder is either reset or freed.
+void upb_encoder_reset(upb_encoder *s, upb_sizebuilder *sb, upb_bytesink *out);
+
+// The upb_sink to which data can be sent to be encoded.  Note that this data
+// must be identical to the data that was previously given to the sizebuilder
+// (if any).
 upb_sink *upb_encoder_sink(upb_encoder *s);

 #ifdef __cplusplus
--- a/src/upb_sink.h
+++ b/src/upb_sink.h
@@ -98,9 +98,15 @@ typedef struct upb_sink_callbacks {
  upb_end_cb end_cb;
 } upb_sink_callbacks;

-// We could potentially define these later to also be capable of calling a C++
-// virtual method instead of doing the virtual dispatch manually.  This would
-// make it possible to write C++ sinks in a more natural style without loss of
+// These macros implement a mini virtual function dispatch for upb_sink instances.
+// This allows functions that call upb_sinks to just write:
+//
+//   upb_sink_onvalue(sink, field, val);
+//
+// The macro will handle the virtual function lookup and dispatch.  We could
+// potentially define these later to also be capable of calling a C++ virtual
+// method instead of doing the virtual dispatch manually.  This would make it
+// possible to write C++ sinks in a more natural style without loss of
 // efficiency.  We could have a flag in upb_sink defining whether it is a C
 // sink or a C++ one.
 #define upb_sink_onvalue(s, f, val) s->vtbl->value_cb(s, f, val)
@@ -123,10 +129,18 @@ INLINE void upb_sink_init(upb_sink *s, upb_sink_callbacks *vtbl) {
 //
 // The two simplest kinds of sinks are "write to string" and "write to FILE*".

+// A forward declaration solely for the benefit of declaring upb_byte_cb below.
+// Always prefer upb_bytesink (without the "struct" keyword) instead.
+struct _upb_bytesink;
+
 // The single bytesink callback; it takes the bytes to be written and returns
 // how many were successfully written.  If zero is returned, it indicates that
 // no more bytes can be accepted right now.
-//typedef size_t (*upb_byte_cb)(upb_bytesink *s, upb_strptr str);
+typedef size_t (*upb_byte_cb)(struct _upb_bytesink *s, upb_strptr str);
+
+typedef struct _upb_bytesink {
+  upb_byte_cb *cb;
+} upb_bytesink;

 #ifdef __cplusplus
 }  /* extern "C" */