protobuf/upb/pb/decoder.h

/*
 * upb - a minimalist implementation of protocol buffers.
 *
 * Copyright (c) 2009-2010 Google Inc.  See LICENSE for details.
 * Author: Josh Haberman <jhaberman@gmail.com>
 *
 * upb_decoder implements a high performance, streaming decoder for protobuf
 * data that works by getting its input data from a upb_byteregion and calling
 * into a upb_handlers.
 */

#ifndef UPB_DECODER_H_
#define UPB_DECODER_H_

#include <setjmp.h>
#include "upb/bytestream.h"
#include "upb/sink.h"

#ifdef __cplusplus
extern "C" {
#endif

/* upb_decoderplan ************************************************************/

// A decoderplan contains whatever data structures and generated (JIT-ted) code
// are necessary to decode protobuf data of a specific type to a specific set
// of handlers.  By generating the plan ahead of time, we avoid having to
// redo this work every time we decode.
//
// A decoderplan is threadsafe, meaning that it can be used concurrently by
// different upb_decoders in different threads.  However, the upb_decoders are
// *not* thread-safe.
//
// Only a forward declaration is given here; the struct's members are spelled
// out under "Implementation details" near the end of this header.
struct _upb_decoderplan;
typedef struct _upb_decoderplan upb_decoderplan;

// Constructor and reference release for decoder plans.
//
// upb_decoderplan_new() builds a plan that decodes into the handlers "h".
// "allowjit" requests JIT-ted code; whether any was actually generated can be
// queried afterwards with upb_decoderplan_hasjitcode().
//
// NOTE(review): presumably the returned plan carries one reference owned by
// the caller, and upb_decoderplan_unref() drops a reference and frees the plan
// once none remain -- confirm against the implementation.
//
// TODO(haberman):
// - add support for letting any message in the plan be at the top level.
// - make this object a handlers instead (when bytesrc/bytesink are merged
//   into handlers).
// - add support for sharing code with previously-built plans/handlers.
upb_decoderplan *upb_decoderplan_new(const upb_handlers *h, bool allowjit);
void upb_decoderplan_unref(upb_decoderplan *p);

// Returns true if the plan contains JIT-ted code.  This may not be the same as
// the "allowjit" parameter to the constructor if support for JIT-ting was not
// compiled in.
//
// JIT support is a compile-time option: the plan's JIT members only exist when
// UPB_USE_JIT_X64 is defined (see struct _upb_decoderplan in this header).
bool upb_decoderplan_hasjitcode(upb_decoderplan *p);


/* upb_decoder ****************************************************************/

// Opaque forward declaration.  Within this header it is only used through a
// pointer, by the JIT-only "dynasm" member of struct _upb_decoderplan.
struct dasm_State;

// One entry of the decoder's internal stack (see upb_decoder.stack).
typedef struct {
  // NOTE(review): presumably the field this frame was pushed for -- confirm
  // against the decoder implementation.
  const upb_fielddef *f;
  // NOTE(review): presumably the stream offset at which this frame ends;
  // UINT64_MAX appears to act as the "no known end" sentinel (see is_packed
  // below) -- confirm.
  uint64_t end_ofs;
  uint32_t group_fieldnum;  // UINT32_MAX for non-groups.
  bool is_sequence;   // frame represents seq or submsg? (f might be both).
  bool is_packed;     // !upb_issubmsg(f) && end_ofs != UINT64_MAX
                      // (strings aren't pushed).
} upb_decoder_frame;

// All state for one streaming decode.  The struct is fully defined here, so
// callers hold it by value and pass its address to the upb_decoder_*()
// functions.  As stated in the decoderplan comment, a upb_decoder is not
// thread-safe.
//
// The layout depends on whether UPB_USE_JIT_X64 is defined, so every
// translation unit that includes this header must agree on that macro.
typedef struct _upb_decoder {
  // The plan being decoded from; set via upb_decoder_resetplan().  The plan
  // must stay alive for as long as the decoder uses it.
  upb_decoderplan *plan;
  upb_byteregion  *input;          // Input data (serialized), not owned.
  upb_status      status;          // Where we store errors that occur.

  // Where we push parsed data.
  // TODO(haberman): make this a pointer and make upb_decoder_resetinput() take
  // one of these instead of a void*.
  upb_sink        sink;

  // Our internal stack.  It is embedded in the struct, so nesting is bounded
  // by UPB_MAX_NESTING frames.
  // NOTE(review): top/limit presumably point into stack[] (current frame and
  // upper bound respectively) -- confirm against the implementation.
  upb_decoder_frame *top, *limit;
  upb_decoder_frame stack[UPB_MAX_NESTING];

  // Current input buffer and its stream offset.
  // NOTE(review): presumably buf is the buffer start, ptr the current read
  // position, end the buffer end, and bufstart_ofs the stream offset of buf
  // -- confirm.
  const char *buf, *ptr, *end;
  uint64_t bufstart_ofs;

  // End of the delimited region, relative to ptr, or NULL if not in this buf.
  const char *delim_end;
  // True if the top stack frame represents a packed field.
  bool top_is_packed;

#ifdef UPB_USE_JIT_X64
  // For JIT, which doesn't do bounds checks in the middle of parsing a field.
  // The trailing comment describes effective_end.
  const char *jit_end, *effective_end;  // == MIN(jit_end, delim_end)

  // Used momentarily by the generated code to store a value while a user
  // function is called.
  uint32_t tmp_len;
#endif

  // For exiting the decoder on error.  This member is why <setjmp.h> is
  // included at the top of this header.
  jmp_buf exitjmp;
} upb_decoder;

// Set-up and tear-down of a caller-provided upb_decoder.
// NOTE(review): presumably upb_decoder_init() must be called before any other
// upb_decoder_*() function and upb_decoder_uninit() releases whatever init
// acquired -- confirm against the implementation.
void upb_decoder_init(upb_decoder *d);
void upb_decoder_uninit(upb_decoder *d);

// Resets the plan that the decoder will parse from.
//
// NOTE(review): this comment used to say that "msg_offset" indicates which
// message from the plan will be used as the top-level message, but the
// function takes no such parameter; selecting the top-level message is still
// listed as a TODO next to upb_decoderplan_new().
//
// This will also reset the decoder's input to be uninitialized --
// upb_decoder_resetinput() must be called before parsing can occur.  The plan
// must live until the decoder is destroyed or reset to a different plan.
//
// Must be called before upb_decoder_resetinput() or upb_decoder_decode().
void upb_decoder_resetplan(upb_decoder *d, upb_decoderplan *p);

// Resets the input of an already-allocated decoder.  This puts it in a state
// where it has not seen any data, and expects the next data to be from the
// beginning of a new protobuf.  Decoders must have their input reset before
// they can be used.  A decoder can have its input reset multiple times.
// "input" must live until the decoder is destroyed or has its input reset
// again (the decoder does not own it).  "c" is the closure that will be passed
// to the handlers.
//
// Must be called before upb_decoder_decode(), and after
// upb_decoder_resetplan() (which leaves the input uninitialized).
void upb_decoder_resetinput(upb_decoder *d, upb_byteregion *input, void *c);

// Decodes serialized data (calling handlers as the data is parsed), returning
// the success of the operation (call upb_decoder_status() for details).
//
// Both upb_decoder_resetplan() and upb_decoder_resetinput() must have been
// called on "d" beforehand.
upb_success_t upb_decoder_decode(upb_decoder *d);

// Read-only accessor for the decoder's status, which is where errors that
// occur during decoding are stored.  The result points into "d" itself, so it
// is only usable while "d" is.
INLINE const upb_status *upb_decoder_status(upb_decoder *d) {
  const upb_status *result = &(*d).status;
  return result;
}

// Implementation details

// Definition behind the upb_decoderplan typedef declared earlier in this
// header.  Without UPB_USE_JIT_X64 the plan holds only the handlers pointer;
// with it, the struct also carries the generated code and the tables used to
// build it.  Because the layout depends on that macro, every translation unit
// that includes this header must agree on whether it is defined.
struct _upb_decoderplan {
  // The top-level handlers that this plan calls into.  We own a ref.
  const upb_handlers *handlers;

#ifdef UPB_USE_JIT_X64
  // JIT-generated machine code (else NULL).
  // NOTE(review): jit_size is presumably the byte size of jit_code, and
  // debug_info presumably holds debugger-facing data for that code -- confirm
  // against the JIT implementation.
  char *jit_code;
  size_t jit_size;
  char *debug_info;

  // For storing upb_jitmsginfo, which contains per-msg runtime data needed
  // by the JIT.
  // Maps upb_handlers* -> upb_jitmsginfo.
  upb_inttable msginfo;

  // The following members are used only while the JIT is being built.

  // This pointer is allocated by dasm_init() and freed by dasm_free().
  struct dasm_State *dynasm;

  // For storing pclabel bases while we are building the JIT.
  // Maps (upb_handlers* or upb_fielddef*) -> int32 pclabel_base
  upb_inttable pclabels;

  // This is not the same as len(pclabels) because the table only contains base
  // offsets for each def, but each def can have many pclabels.
  uint32_t pclabel_count;
#endif
};

#ifdef __cplusplus
}  /* extern "C" */
#endif

#endif  /* UPB_DECODER_H_ */
upb_parser -> upb_decoder 15 years ago			`/*`
			`* upb - a minimalist implementation of protocol buffers.`
			`*`
Update copyright to be Google Inc. This doesn't reflect any material change in how I will be working on upb, and I have no problem making this change. It's still open source under the BSD license, and I'll still be working on it well beyond the hours that constitute a normal job. 14 years ago			`* Copyright (c) 2009-2010 Google Inc. See LICENSE for details.`
			`* Author: Josh Haberman <jhaberman@gmail.com>`
			`*`
More work on the decoder. 15 years ago			`* upb_decoder implements a high performance, streaming decoder for protobuf`
Refinement of upb_bytesrc interface. Added a upb_byteregion that tracks a region of the input buffer; decoders use this instead of using a upb_bytesrc directly. upb_byteregion is also used as the way of passing a string to a upb_handlers callback. This symmetry makes decoders compose better; if you want to take a parsed string and decode it as something else, you can take the string directly from the callback and feed it as input to another parser. A commented-out version of a pinning interface is present; I decline to actually implement it (and accept its extra complexity) until/unless it is clear that it is actually a win. But it is included as a proof-of-concept, to show that it fits well with the existing interface. 13 years ago			`* data that works by getting its input data from a upb_byteregion and calling`
Header cleanup, clarify/correct comments for interfaces. 13 years ago			`* into a upb_handlers.`
upb_parser -> upb_decoder 15 years ago			`*/`

			`#ifndef UPB_DECODER_H_`
			`#define UPB_DECODER_H_`

Decoder redesign in preparation for packed fields and start/endseq. 14 years ago			`#include <setjmp.h>`
Sync with 8 months of Google-internal development. Many things have changed and been simplified. The memory-management story for upb_def and upb_handlers is much more robust; upb_def and upb_handlers should be fairly stable interfaces now. There is still much work to do for the runtime component (upb_sink). 12 years ago			`#include "upb/bytestream.h"`
			`#include "upb/sink.h"`
upb_parser -> upb_decoder 15 years ago
			`#ifdef __cplusplus`
			`extern "C" {`
			`#endif`

Sync with internal Google development. This breaks the open-source build, will follow up with a change to fix it. 13 years ago			`/* upb_decoderplan ************************************************************/`

			`// A decoderplan contains whatever data structures and generated (JIT-ted) code`
			`// are necessary to decode protobuf data of a specific type to a specific set`
			`// of handlers. By generating the plan ahead of time, we avoid having to`
			`// redo this work every time we decode.`
			`//`
			`// A decoderplan is threadsafe, meaning that it can be used concurrently by`
			`// different upb_decoders in different threads. However, the upb_decoders are`
			`// *not* thread-safe.`
			`struct _upb_decoderplan;`
			`typedef struct _upb_decoderplan upb_decoderplan;`

Sync with 8 months of Google-internal development. Many things have changed and been simplified. The memory-management story for upb_def and upb_handlers is much more robust; upb_def and upb_handlers should be fairly stable interfaces now. There is still much work to do for the runtime component (upb_sink). 12 years ago			`// TODO(haberman):`
			`// - add support for letting any message in the plan be at the top level.`
			`// - make this object a handlers instead (when bytesrc/bytesink are merged`
			`// into handlers).`
			`// - add support for sharing code with previously-built plans/handlers.`
			`upb_decoderplan *upb_decoderplan_new(const upb_handlers *h, bool allowjit);`
Sync with internal Google development. This breaks the open-source build, will follow up with a change to fix it. 13 years ago			`void upb_decoderplan_unref(upb_decoderplan *p);`

			`// Returns true if the plan contains JIT-ted code. This may not be the same as`
			`// the "allowjit" parameter to the constructor if support for JIT-ting was not`
			`// compiled in.`
			`bool upb_decoderplan_hasjitcode(upb_decoderplan *p);`


			`/* upb_decoder ****************************************************************/`
upb_parser -> upb_decoder 15 years ago
First rough version of the JIT. It can successfully parse SpeedMessage1. Preliminary results: 750MB/s on Core2 2.4GHz. This number is 2.5x proto2. This isn't apples-to-apples, because proto2 is parsing to a struct and we are just doing stream parsing, but for apps that are currently using proto2, this is the improvement they would see if they could move to stream-based processing. Unfortunately perf-regression-test.py is broken, and I'm not 100% sure why. It would be nice to fix it first (to ensure that there are no performance regressions for the table-based decoder) but I'm really impatient to get the JIT checked in. 14 years ago			`struct dasm_State;`

Sync with 8 months of Google-internal development. Many things have changed and been simplified. The memory-management story for upb_def and upb_handlers is much more robust; upb_def and upb_handlers should be fairly stable interfaces now. There is still much work to do for the runtime component (upb_sink). 12 years ago			`typedef struct {`
			`const upb_fielddef *f;`
			`uint64_t end_ofs;`
			`uint32_t group_fieldnum; // UINT32_MAX for non-groups.`
			`bool is_sequence; // frame represents seq or submsg? (f might be both).`
			`bool is_packed; // !upb_issubmsg(f) && end_ofs != UINT64_MAX`
			`// (strings aren't pushed).`
			`} upb_decoder_frame;`

Major refactoring: upb_string is gone in favor of upb_strref. 14 years ago			`typedef struct _upb_decoder {`
Sync with internal Google development. This breaks the open-source build, will follow up with a change to fix it. 13 years ago			`upb_decoderplan *plan;`
			`upb_byteregion *input; // Input data (serialized), not owned.`
			`upb_status status; // Where we store errors that occur.`

Sync with 8 months of Google-internal development. Many things have changed and been simplified. The memory-management story for upb_def and upb_handlers is much more robust; upb_def and upb_handlers should be fairly stable interfaces now. There is still much work to do for the runtime component (upb_sink). 12 years ago			`// Where we push parsed data.`
			`// TODO(haberman): make this a pointer and make upb_decoder_resetinput() take`
			`// one of these instead of a void*.`
			`upb_sink sink;`

			`// Our internal stack.`
			`upb_decoder_frame *top, *limit;`
			`upb_decoder_frame stack[UPB_MAX_NESTING];`
Tons of work: we're close to passing test_vs_proto2 again. 14 years ago
Header cleanup, clarify/correct comments for interfaces. 13 years ago			`// Current input buffer and its stream offset.`
Major refactoring: upb_string is gone in favor of upb_strref. 14 years ago			`const char *buf, *ptr, *end;`
Refinement of upb_bytesrc interface. Added a upb_byteregion that tracks a region of the input buffer; decoders use this instead of using a upb_bytesrc directly. upb_byteregion is also used as the way of passing a string to a upb_handlers callback. This symmetry makes decoders compose better; if you want to take a parsed string and decode it as something else, you can take the string directly from the callback and feed it as input to another parser. A commented-out version of a pinning interface is present; I decline to actually implement it (and accept its extra complexity) until/unless it is clear that it is actually a win. But it is included as a proof-of-concept, to show that it fits well with the existing interface. 13 years ago			`uint64_t bufstart_ofs;`
Remove upb_dstate and specialize upb_decode_fixed for perf improvement. The compiler wasn't keeping upb_dstate in memory anyway (which was the original goal). This simplifies the decoder. upb_decode_fixed was intended to minimize the number of branches, but since it was calling out to memcpy as a function, this turned out to be a pessimization. Performance is encouraging: plain32.parsestream_googlemessage1.upb_table: 254 -> 242 (-4.72) plain32.parsestream_googlemessage2.upb_table: 357 -> 400 (12.04) plain32.parsetostruct_googlemessage1.upb_table_byref: 143 -> 144 (0.70) plain32.parsetostruct_googlemessage1.upb_table_byval: 122 -> 118 (-3.28) plain32.parsetostruct_googlemessage2.upb_table_byref: 189 -> 200 (5.82) plain32.parsetostruct_googlemessage2.upb_table_byval: 198 -> 200 (1.01) omitfp32.parsestream_googlemessage1.upb_table: 267 -> 265 (-0.75) omitfp32.parsestream_googlemessage2.upb_table: 377 -> 465 (23.34) omitfp32.parsetostruct_googlemessage1.upb_table_byref: 140 -> 151 (7.86) omitfp32.parsetostruct_googlemessage1.upb_table_byval: 131 -> 131 (0.00) omitfp32.parsetostruct_googlemessage2.upb_table_byref: 204 -> 214 (4.90) omitfp32.parsetostruct_googlemessage2.upb_table_byval: 200 -> 206 (3.00) plain.parsestream_googlemessage1.upb_table: 313 -> 317 (1.28) plain.parsestream_googlemessage2.upb_table: 476 -> 541 (13.66) plain.parsetostruct_googlemessage1.upb_table_byref: 189 -> 189 (0.00) plain.parsetostruct_googlemessage1.upb_table_byval: 165 -> 165 (0.00) plain.parsetostruct_googlemessage2.upb_table_byref: 263 -> 270 (2.66) plain.parsetostruct_googlemessage2.upb_table_byval: 248 -> 255 (2.82) omitfp.parsestream_googlemessage1.upb_table: 306 -> 305 (-0.33) omitfp.parsestream_googlemessage2.upb_table: 471 -> 531 (12.74) omitfp.parsetostruct_googlemessage1.upb_table_byref: 189 -> 190 (0.53) omitfp.parsetostruct_googlemessage1.upb_table_byval: 166 -> 172 (3.61) omitfp.parsetostruct_googlemessage2.upb_table_byref: 258 -> 270 (4.65) 
omitfp.parsetostruct_googlemessage2.upb_table_byval: 248 -> 265 (6.85) 14 years ago
Some source cleanup/commenting. 13 years ago			`// End of the delimited region, relative to ptr, or NULL if not in this buf.`
			`const char *delim_end;`
Sync with internal Google development. This breaks the open-source build, will follow up with a change to fix it. 13 years ago			`// True if the top stack frame represents a packed field.`
Add packed field support (untested). 13 years ago			`bool top_is_packed;`
Track buffer end instead of buffer length, for a small perf improvement. 14 years ago
Major refactoring: upb_string is gone in favor of upb_strref. 14 years ago			`#ifdef UPB_USE_JIT_X64`
			`// For JIT, which doesn't do bounds checks in the middle of parsing a field.`
Sync with 8 months of Google-internal development. Many things have changed and been simplified. The memory-management story for upb_def and upb_handlers is much more robust; upb_def and upb_handlers should be fairly stable interfaces now. There is still much work to do for the runtime component (upb_sink). 12 years ago			`const char *jit_end, *effective_end; // == MIN(jit_end, delim_end)`

			`// Used momentarily by the generated code to store a value while a user`
			`// function is called.`
			`uint32_t tmp_len;`
Major refactoring: upb_string is gone in favor of upb_strref. 14 years ago			`#endif`
Change dispatcher error handling model. Now the dispatcher will call error handlers instead of returning statuses that the caller has to constantly check. 14 years ago
Header cleanup, clarify/correct comments for interfaces. 13 years ago			`// For exiting the decoder on error.`
Got decoder & textprinter compiling in kernel mode. 13 years ago			`jmp_buf exitjmp;`
Major refactoring: upb_string is gone in favor of upb_strref. 14 years ago			`} upb_decoder;`
upb_parser -> upb_decoder 15 years ago
Sync with internal Google development. This breaks the open-source build, will follow up with a change to fix it. 13 years ago			`void upb_decoder_init(upb_decoder *d);`
Tons of work: we're close to passing test_vs_proto2 again. 14 years ago			`void upb_decoder_uninit(upb_decoder *d);`
upb_parser -> upb_decoder 15 years ago
Sync with internal Google development. This breaks the open-source build, will follow up with a change to fix it. 13 years ago			`// Resets the plan that the decoder will parse from. "msg_offset" indicates`
			`// which message from the plan will be used as the top-level message.`
			`//`
			`// This will also reset the decoder's input to be uninitialized --`
			`// upb_decoder_resetinput() must be called before parsing can occur. The plan`
			`// must live until the decoder is destroyed or reset to a different plan.`
			`//`
			`// Must be called before upb_decoder_resetinput() or upb_decoder_decode().`
Sync with 8 months of Google-internal development. Many things have changed and been simplified. The memory-management story for upb_def and upb_handlers is much more robust; upb_def and upb_handlers should be fairly stable interfaces now. There is still much work to do for the runtime component (upb_sink). 12 years ago			`void upb_decoder_resetplan(upb_decoder *d, upb_decoderplan *p);`
Sync with internal Google development. This breaks the open-source build, will follow up with a change to fix it. 13 years ago
			`// Resets the input of an already-allocated decoder. This puts it in a state`
			`// where it has not seen any data, and expects the next data to be from the`
			`// beginning of a new protobuf. Decoders must have their input reset before`
			`// they can be used. A decoder can have its input reset multiple times.`
			`// "input" must live until the decoder is destroyed or has its input reset`
			`// again. "c" is the closure that will be passed to the handlers.`
			`//`
			`// Must be called before upb_decoder_decode().`
			`void upb_decoder_resetinput(upb_decoder *d, upb_byteregion *input, void *c);`

			`// Decodes serialized data (calling handlers as the data is parsed), returning`
			`// the success of the operation (call upb_decoder_status() for details).`
			`upb_success_t upb_decoder_decode(upb_decoder *d);`

			`INLINE const upb_status *upb_decoder_status(upb_decoder *d) {`
			`return &d->status;`
			`}`

			`// Implementation details`

			`struct _upb_decoderplan {`
Sync with 8 months of Google-internal development. Many things have changed and been simplified. The memory-management story for upb_def and upb_handlers is much more robust; upb_def and upb_handlers should be fairly stable interfaces now. There is still much work to do for the runtime component (upb_sink). 12 years ago			`// The top-level handlers that this plan calls into. We own a ref.`
			`const upb_handlers *handlers;`
Sync with internal Google development. This breaks the open-source build, will follow up with a change to fix it. 13 years ago
			`#ifdef UPB_USE_JIT_X64`
			`// JIT-generated machine code (else NULL).`
			`char *jit_code;`
			`size_t jit_size;`
			`char *debug_info;`
WIP: intrusive changes to upb_decoder. 15 years ago
Sync with 8 months of Google-internal development. Many things have changed and been simplified. The memory-management story for upb_def and upb_handlers is much more robust; upb_def and upb_handlers should be fairly stable interfaces now. There is still much work to do for the runtime component (upb_sink). 12 years ago			`// For storing upb_jitmsginfo, which contains per-msg runtime data needed`
			`// by the JIT.`
			`// Maps upb_handlers* -> upb_jitmsginfo.`
			`upb_inttable msginfo;`

			`// The following members are used only while the JIT is being built.`

Sync with internal Google development. This breaks the open-source build, will follow up with a change to fix it. 13 years ago			`// This pointer is allocated by dasm_init() and freed by dasm_free().`
			`struct dasm_State *dynasm;`
Sync with 8 months of Google-internal development. Many things have changed and been simplified. The memory-management story for upb_def and upb_handlers is much more robust; upb_def and upb_handlers should be fairly stable interfaces now. There is still much work to do for the runtime component (upb_sink). 12 years ago
			`// For storing pclabel bases while we are building the JIT.`
			`// Maps (upb_handlers* or upb_fielddef*) -> int32 pclabel_base`
			`upb_inttable pclabels;`

			`// This is not the same as len(pclabels) because the table only contains base`
			`// offsets for each def, but each def can have many pclabels.`
			`uint32_t pclabel_count;`
Sync with internal Google development. This breaks the open-source build, will follow up with a change to fix it. 13 years ago			`#endif`
			`};`
upb_parser -> upb_decoder 15 years ago
			`#ifdef __cplusplus`
			`} /* extern "C" */`
			`#endif`

			`#endif /* UPB_DECODER_H_ */`