diff --git a/Makefile b/Makefile index bebe023b72..5320876340 100644 --- a/Makefile +++ b/Makefile @@ -162,13 +162,9 @@ upb/pb/jit_debug_elf_file.o: upb/pb/jit_debug_elf_file.s $(E) GAS $< $(Q) gcc -c upb/pb/jit_debug_elf_file.s -o upb/pb/jit_debug_elf_file.o -upb/pb/jit_debug_elf_file2.o: upb/pb/jit_debug_elf_file.o - $(E) OBJCOPY $< - $(Q) objcopy --change-section-address .text=0x12345678 $< $@ - -upb/pb/jit_debug_elf_file.h: upb/pb/jit_debug_elf_file2.o +upb/pb/jit_debug_elf_file.h: upb/pb/jit_debug_elf_file.o $(E) XXD $< - $(Q) xxd -i < upb/pb/jit_debug_elf_file2.o > upb/pb/jit_debug_elf_file.h + $(Q) xxd -i < upb/pb/jit_debug_elf_file.o > upb/pb/jit_debug_elf_file.h upb/pb/decoder_x64.h: upb/pb/jit_debug_elf_file.h endif @@ -232,15 +228,13 @@ VALGRIND=valgrind --leak-check=full --error-exitcode=1 test: tests @echo Running all tests under valgrind. @set -e # Abort on error. - @for test in $(SIMPLE_TESTS) $(SIMPLE_CXX_TESTS); do \ + @for test in $(TESTS); do \ if [ -x ./$$test ] ; then \ echo !!! $(VALGRIND) ./$$test; \ - $(VALGRIND) ./$$test tests/test.proto.pb || exit 1; \ + $(VALGRIND) ./$$test || exit 1; \ fi \ done; \ - $(VALGRIND) ./tests/t.test_vs_proto2.googlemessage1 benchmarks/google_messages.proto.pb benchmarks/google_message1.dat - $(VALGRIND) ./tests/t.test_vs_proto2.googlemessage2 benchmarks/google_messages.proto.pb benchmarks/google_message2.dat - @echo "All tests passed!" + echo "All tests passed!" 
tests/t.test_vs_proto2.googlemessage1 \ tests/t.test_vs_proto2.googlemessage2: \ diff --git a/benchmarks/parsestream.upb.c b/benchmarks/parsestream.upb.c index 19d8ccf5fd..4d13e9d137 100644 --- a/benchmarks/parsestream.upb.c +++ b/benchmarks/parsestream.upb.c @@ -76,7 +76,8 @@ static size_t run(int i) (void)i; upb_status status = UPB_STATUS_INIT; upb_stringsrc_reset(&stringsrc, input_str, input_len); - upb_decoder_reset(&decoder, upb_stringsrc_allbytes(&stringsrc), NULL); + upb_decoder_reset(&decoder, upb_stringsrc_bytesrc(&stringsrc), + 0, UPB_NONDELIMITED, NULL); upb_decoder_decode(&decoder, &status); if(!upb_ok(&status)) goto err; return input_len; diff --git a/benchmarks/parsetoproto2.upb.cc b/benchmarks/parsetoproto2.upb.cc index 03a1039eec..75cd10c2fa 100644 --- a/benchmarks/parsetoproto2.upb.cc +++ b/benchmarks/parsetoproto2.upb.cc @@ -24,7 +24,6 @@ #include #undef private -char *str; static size_t len; MESSAGE_CIDENT msg[NUM_MESSAGES]; MESSAGE_CIDENT msg2; @@ -54,13 +53,9 @@ upb_flow_t proto2_setstr(void *m, upb_value fval, upb_value val) { const upb_fielddef *f = upb_value_getfielddef(fval); std::string **str = (std::string**)UPB_INDEX(m, f->offset, 1); if (*str == f->default_ptr) *str = new std::string; - const upb_byteregion *ref = upb_value_getbyteregion(val); - uint32_t len; - (*str)->assign( - upb_byteregion_getptr(ref, upb_byteregion_startofs(ref), &len), - upb_byteregion_len(ref)); - assert(len == upb_byteregion_len(ref)); + const upb_strref *ref = upb_value_getstrref(val); // XXX: only supports contiguous strings atm. + (*str)->assign(ref->ptr, ref->len); return UPB_CONTINUE; } @@ -69,13 +64,9 @@ upb_flow_t proto2_append_str(void *_r, upb_value fval, upb_value val) { typedef google::protobuf::RepeatedPtrField R; (void)fval; R *r = (R*)_r; - const upb_byteregion *ref = upb_value_getbyteregion(val); + const upb_strref *ref = upb_value_getstrref(val); // XXX: only supports contiguous strings atm. 
- uint32_t len; - r->Add()->assign( - upb_byteregion_getptr(ref, upb_byteregion_startofs(ref), &len), - upb_byteregion_len(ref)); - assert(len == upb_byteregion_len(ref)); + r->Add()->assign(ref->ptr, ref->len); return UPB_CONTINUE; } @@ -274,7 +265,7 @@ static bool initialize() upb_symtab_unref(s); // Read the message data itself. - str = upb_readfile(MESSAGE_FILE, &len); + char *str = upb_readfile(MESSAGE_FILE, &len); if(str == NULL) { fprintf(stderr, "Error reading " MESSAGE_FILE "\n"); return false; @@ -284,6 +275,7 @@ static bool initialize() msg2.ParseFromArray(str, len); upb_stringsrc_init(&strsrc); + upb_stringsrc_reset(&strsrc, str, len); upb_handlers *h = upb_handlers_new(); upb_accessors_reghandlers(h, def); if (!JIT) h->should_jit = false; @@ -304,8 +296,8 @@ static size_t run(int i) (void)i; upb_status status = UPB_STATUS_INIT; msg[i % NUM_MESSAGES].Clear(); - upb_stringsrc_reset(&strsrc, str, len); - upb_decoder_reset(&d, upb_stringsrc_allbytes(&strsrc), &msg[i % NUM_MESSAGES]); + upb_decoder_reset(&d, upb_stringsrc_bytesrc(&strsrc), + 0, UPB_NONDELIMITED, &msg[i % NUM_MESSAGES]); upb_decoder_decode(&d, &status); if(!upb_ok(&status)) goto err; return len; diff --git a/benchmarks/parsetostruct.upb.c b/benchmarks/parsetostruct.upb.c index 4eeafbb580..5e7aa3573b 100644 --- a/benchmarks/parsetostruct.upb.c +++ b/benchmarks/parsetostruct.upb.c @@ -8,7 +8,6 @@ #include "upb/pb/glue.h" static const upb_msgdef *def; -char *str; static size_t len; static void *msg[NUM_MESSAGES]; static upb_stringsrc strsrc; @@ -34,7 +33,7 @@ static bool initialize() upb_symtab_unref(s); // Read the message data itself. 
- str = upb_readfile(MESSAGE_FILE, &len); + char *str = upb_readfile(MESSAGE_FILE, &len); if(str == NULL) { fprintf(stderr, "Error reading " MESSAGE_FILE "\n"); return false; @@ -44,6 +43,7 @@ static bool initialize() msg[i] = upb_stdmsg_new(def); upb_stringsrc_init(&strsrc); + upb_stringsrc_reset(&strsrc, str, len); upb_handlers *h = upb_handlers_new(); upb_accessors_reghandlers(h, def); if (!JIT) h->should_jit = false; @@ -70,8 +70,8 @@ static size_t run(int i) upb_status status = UPB_STATUS_INIT; i %= NUM_MESSAGES; upb_msg_clear(msg[i], def); - upb_stringsrc_reset(&strsrc, str, len); - upb_decoder_reset(&d, upb_stringsrc_allbytes(&strsrc), msg[i]); + upb_decoder_reset(&d, upb_stringsrc_bytesrc(&strsrc), + 0, UPB_NONDELIMITED, msg[i]); upb_decoder_decode(&d, &status); if(!upb_ok(&status)) goto err; return len; diff --git a/bindings/cpp/upb/bytestream.cc b/bindings/cpp/upb/bytestream.cc new file mode 100644 index 0000000000..df0797e736 --- /dev/null +++ b/bindings/cpp/upb/bytestream.cc @@ -0,0 +1,39 @@ +// +// upb - a minimalist implementation of protocol buffers. +// +// Copyright (c) 2011 Google Inc. See LICENSE for details. 
+// Author: Josh Haberman + +#include "bytestream.hpp" + +namespace upb { + +upb_bytesrc_vtbl* ByteSourceBase::vtable() { + static upb_bytesrc_vtbl vtbl = { + &ByteSourceBase::VFetch, + &ByteSourceBase::VDiscard, + &ByteSourceBase::VCopy, + &ByteSourceBase::VGetPtr, + }; + return &vtbl; +} + +upb_bytesuccess_t ByteSourceBase::VFetch(void *src, uint64_t ofs, size_t *len) { + return static_cast(src)->Fetch(ofs, len); +} + +void ByteSourceBase::VCopy( + const void *src, uint64_t ofs, size_t len, char* dest) { + static_cast(src)->Copy(ofs, len, dest); +} + +void ByteSourceBase::VDiscard(void *src, uint64_t ofs) { + static_cast(src)->Discard(ofs); +} + +const char * ByteSourceBase::VGetPtr( + const void *src, uint64_t ofs, size_t* len) { + return static_cast(src)->GetPtr(ofs, len); +} + +} // namespace upb diff --git a/bindings/cpp/upb/bytestream.hpp b/bindings/cpp/upb/bytestream.hpp new file mode 100644 index 0000000000..968d542c2a --- /dev/null +++ b/bindings/cpp/upb/bytestream.hpp @@ -0,0 +1,238 @@ +// +// upb - a minimalist implementation of protocol buffers. +// +// Copyright (c) 2011 Google Inc. See LICENSE for details. +// Author: Josh Haberman +// +// This file defines three core interfaces: +// - upb::ByteSink: for writing streams of data. +// - upb::ByteSource: for reading streams of data. +// - upb::ByteRegion: for reading from a specific region of a ByteSource; +// should be used by decoders instead of using a ByteSource directly. +// +// These interfaces are used by streaming encoders and decoders: for example, a +// protobuf parser gets its input from a upb::ByteRegion. They are virtual +// base classes so concrete implementations can get the data from a fd, a +// FILE*, a string, etc. +// +// A ByteRegion represents a region of data from a ByteSource. +// +// Parsers get data from this interface instead of a bytesrc because we often +// want to parse only a specific region of the input. 
For example, if we parse +// a string from our input but know that the string represents a protobuf, we +// can pass its ByteRegion to an appropriate protobuf parser. +// +// Since the bytes may be coming from a file or network socket, bytes must be +// fetched before they can be read (though in some cases this fetch may be a +// no-op). "fetch" is the only operation on a byteregion that could fail or +// block, because it is the only operation that actually performs I/O. +// +// Bytes can be discarded when they are no longer needed. Parsers should +// always discard bytes they no longer need, both so the buffers can be freed +// when possible and to give better visibility into what bytes the parser is +// still using. +// +// start discard read fetch end +// ofs ofs ofs ofs ofs +// | |--->Discard() | |--->Fetch() | +// V V V V V +// +-------------+-------------------------+-----------------+-----------------+ +// | discarded | | | fetchable | +// +-------------+-------------------------+-----------------+-----------------+ +// | <------------- loaded ------------------> | +// | <- available -> | +// | <---------- remaining ----------> | +// +// Note that the start offset may be something other than zero! A byteregion +// is a view into an underlying bytesrc stream, and the region may start +// somewhere other than the beginning of that stream. +// +// The region can be either delimited or nondelimited. A non-delimited region +// will keep returning data until the underlying data source returns EOF. A +// delimited region will return EOF at a predetermined offset. +// +// end +// ofs +// | +// V +// +-----------------------+ +// | delimited region | <-- hard EOF, even if data source has more data. +// +-----------------------+ +// +// +------------------------ +// | nondelimited region Z <-- won't return EOF until data source hits EOF. 
+// +------------------------ + +#ifndef UPB_BYTESTREAM_HPP +#define UPB_BYTESTREAM_HPP + +#include "upb/bytestream.h" +#include "upb/upb.hpp" + +namespace upb { + +typedef upb_bytesuccess_t ByteSuccess; + +// Implement this interface to vend bytes to ByteRegions which will be used by +// a decoder. +class ByteSourceBase : public upb_bytesrc { + public: + ByteSourceBase() { upb_bytesrc_init(this, vtable()); } + virtual ~ByteSourceBase() { upb_bytesrc_uninit(this); } + + // Fetches at least one byte starting at ofs, setting *len to the actual + // number of bytes fetched (or 0 on EOF or error: see return value for + // details). It is valid for bytes to be fetched multiple times, as long as + // the bytes have not been previously discarded. + virtual ByteSuccess Fetch(uint64_t ofs, size_t* len) = 0; + + // Discards all data prior to ofs (except data that is pinned, if pinning + // support is added -- see TODO below). + virtual void Discard(uint64_t ofs) = 0; + + // Copies "len" bytes of data from ofs to "dst", which must be at least "len" + // bytes long. The given region must not be discarded. + virtual void Copy(uint64_t ofs, size_t len, char *dst) const = 0; + + // Returns a pointer to the bytesrc's internal buffer, storing in *len how + // much data is available. The given offset must not be discarded. The + // returned buffer is valid for as long as its bytes are not discarded (in + // the case that part of the returned buffer is discarded, only the + // non-discarded bytes remain valid). + virtual const char *GetPtr(uint64_t ofs, size_t *len) const = 0; + + // TODO: Add if/when there is a demonstrated need: + // + // // When the caller pins a region (which must not be already discarded), it + // // is guaranteed that the region will not be discarded (nor will the + // // bytesrc be destroyed) until the region is unpinned. However, not all + // // bytesrc's support pinning; a false return indicates that a pin was not + // // possible. 
+ // virtual bool Pin(uint64_t ofs, size_t len); + // + // // Releases some number of pinned bytes from the beginning of a pinned + // // region (which may be fewer than the total number of bytes pinned). + // virtual void Unpin(uint64_t ofs, size_t len, size_t bytes_to_release); + // + // Adding pinning support would also involve adding a "pin_ofs" parameter to + // upb_bytesrc_fetch, so that the fetch can extend an already-pinned region. + private: + static upb_bytesrc_vtbl* vtable(); + static upb_bytesuccess_t VFetch(void*, uint64_t, size_t*); + static void VDiscard(void*, uint64_t); + static void VCopy(const void*, uint64_t, size_t, char*); + static const char *VGetPtr(const void*, uint64_t, size_t*); +}; + +class ByteRegion : public upb_byteregion { + public: + static const uint64_t kNondelimited = UPB_NONDELIMITED; + + ByteRegion() { upb_byteregion_init(this); } + ~ByteRegion() { upb_byteregion_uninit(this); } + + // Accessors for the regions bounds -- the meaning of these is described in + // the diagram above. + uint64_t start_ofs() const { return upb_byteregion_startofs(this); } + uint64_t discard_ofs() const { return upb_byteregion_discardofs(this); } + uint64_t fetch_ofs() const { return upb_byteregion_fetchofs(this); } + uint64_t end_ofs() const { return upb_byteregion_endofs(this); } + + // Returns how many bytes are fetched and available for reading starting from + // offset "offset". + uint64_t BytesAvailable(uint64_t offset) const { + return upb_byteregion_available(this, offset); + } + + // Returns the total number of bytes remaining after offset "offset", or + // kNondelimited if the byteregion is non-delimited. + uint64_t BytesRemaining(uint64_t offset) const { + return upb_byteregion_remaining(this, offset); + } + + uint64_t Length() const { return upb_byteregion_len(this); } + + // Sets the value of this byteregion to be a subset of the given byteregion's + // data. 
The caller is responsible for releasing this region before the src + // region is released (unless the region is first pinned, if pinning support + // is added. see below). + void Reset(const upb_byteregion *src, uint64_t ofs, uint64_t len) { + upb_byteregion_reset(this, src, ofs, len); + } + void Release() { upb_byteregion_release(this); } + + // Attempts to fetch more data, extending the fetched range of this + // byteregion. Returns true if the fetched region was extended by at least + // one byte, false on EOF or error (see *s for details). + ByteSuccess Fetch() { return upb_byteregion_fetch(this); } + + // Fetches all remaining data, returning false if the operation failed (see + // *s for details). May only be used on delimited byteregions. + ByteSuccess FetchAll() { return upb_byteregion_fetchall(this); } + + // Discards bytes from the byteregion up until ofs (which must be greater or + // equal to discard_ofs()). It is valid to discard bytes that have not been + // fetched (such bytes will never be fetched) but it is an error to discard + // past the end of a delimited byteregion. + void Discard(uint64_t ofs) { return upb_byteregion_discard(this, ofs); } + + // Copies "len" bytes of data into "dst", starting at ofs. The specified + // region must be available. + void Copy(uint64_t ofs, size_t len, char *dst) const { + upb_byteregion_copy(this, ofs, len, dst); + } + + // Copies all bytes from the byteregion into dst. Requires that the entire + // byteregion is fetched and that none has been discarded. + void CopyAll(char *dst) const { + upb_byteregion_copyall(this, dst); + } + + // Returns a pointer to the internal buffer for the byteregion starting at + // offset "ofs." Stores the number of bytes available in this buffer in *len. + // The returned buffer is invalidated when the byteregion is reset or + // released, or when the bytes are discarded. 
If the byteregion is not + // currently pinned, the pointer is only valid for the lifetime of the parent + // byteregion. + const char *GetPtr(uint64_t ofs, size_t *len) const { + return upb_byteregion_getptr(this, ofs, len); + } + + // Copies the contents of the byteregion into a newly-allocated, + // NULL-terminated string. Requires that the byteregion is fully fetched. + char *StrDup() const { + return upb_byteregion_strdup(this); + } + + // TODO: add if/when there is a demonstrated need. + // + // // Pins this byteregion's bytes in memory, allowing it to outlive its + // // parent byteregion. Normally a byteregion may only be used while its + // // parent is still valid, but a pinned byteregion may continue to be used + // // until it is reset or released. A byteregion must be fully fetched to + // // be pinned (this implies that the byteregion must be delimited). + // // + // // In some cases this operation may cause the input data to be copied. + // // + // // void Pin(); +}; + +class StringSource : public upb_stringsrc { + public: + StringSource() : upb_stringsrc() { upb_stringsrc_init(this); } + ~StringSource() { upb_stringsrc_uninit(this); } + + void Reset(const char* data, size_t len) { + upb_stringsrc_reset(this, data, len); + } + + ByteRegion* AllBytes() { + return static_cast(upb_stringsrc_allbytes(this)); + } + + upb_bytesrc* ByteSource() { return upb_stringsrc_bytesrc(this); } +}; + +} // namespace upb + +#endif diff --git a/bindings/cpp/upb/def.hpp b/bindings/cpp/upb/def.hpp index ac9aff1796..030ba40e86 100644 --- a/bindings/cpp/upb/def.hpp +++ b/bindings/cpp/upb/def.hpp @@ -1,42 +1,41 @@ -/* - * upb - a minimalist implementation of protocol buffers. - * - * Copyright (c) 2011 Google Inc. See LICENSE for details. - * Author: Josh Haberman - * - * The set of upb::*Def classes and upb::SymbolTable allow for defining and - * manipulating schema information (as defined in .proto files). - * - * Defs go through two distinct phases of life: - * - * 1. 
MUTABLE: when first created, the properties of the def can be set freely - * (for example a message's name, its list of fields, the name/number of - * fields, etc). During this phase the def is *not* thread-safe, and may - * not be used for any purpose except to set its properties (it can't be - * used to parse anything, create any messages in memory, etc). - * - * 2. FINALIZED: after being added to a symtab (which links the defs together) - * the defs become finalized (thread-safe and immutable). Programs may only - * access defs through a CONST POINTER during this stage -- upb_symtab will - * help you out with this requirement by only vending const pointers, but - * you need to make sure not to use any non-const pointers you still have - * sitting around. In practice this means that you may not call any setters - * on the defs (or functions that themselves call the setters). If you want - * to modify an existing immutable def, copy it with upb_*_dup(), modify the - * copy, and add the modified def to the symtab (replacing the existing - * def). - * - * You can test for which stage of life a def is in by calling - * upb::Def::IsMutable(). This is particularly useful for dynamic language - * bindings, which must properly guarantee that the dynamic language cannot - * break the rules laid out above. - * - * It would be possible to make the defs thread-safe during stage 1 by using - * mutexes internally and changing any methods returning pointers to return - * copies instead. This could be important if we are integrating with a VM or - * interpreter that does not naturally serialize access to wrapped objects (for - * example, in the case of Python this is not necessary because of the GIL). - */ +// +// upb - a minimalist implementation of protocol buffers. +// +// Copyright (c) 2011 Google Inc. See LICENSE for details. 
+// Author: Josh Haberman +// +// The set of upb::*Def classes and upb::SymbolTable allow for defining and +// manipulating schema information (as defined in .proto files). +// +// Defs go through two distinct phases of life: +// +// 1. MUTABLE: when first created, the properties of the def can be set freely +// (for example a message's name, its list of fields, the name/number of +// fields, etc). During this phase the def is *not* thread-safe, and may +// not be used for any purpose except to set its properties (it can't be +// used to parse anything, create any messages in memory, etc). +// +// 2. FINALIZED: after being added to a symtab (which links the defs together) +// the defs become finalized (thread-safe and immutable). Programs may only +// access defs through a CONST POINTER during this stage -- upb_symtab will +// help you out with this requirement by only vending const pointers, but +// you need to make sure not to use any non-const pointers you still have +// sitting around. In practice this means that you may not call any setters +// on the defs (or functions that themselves call the setters). If you want +// to modify an existing immutable def, copy it with upb_*_dup(), modify the +// copy, and add the modified def to the symtab (replacing the existing +// def). +// +// You can test for which stage of life a def is in by calling +// upb::Def::IsMutable(). This is particularly useful for dynamic language +// bindings, which must properly guarantee that the dynamic language cannot +// break the rules laid out above. +// +// It would be possible to make the defs thread-safe during stage 1 by using +// mutexes internally and changing any methods returning pointers to return +// copies instead. This could be important if we are integrating with a VM or +// interpreter that does not naturally serialize access to wrapped objects (for +// example, in the case of Python this is not necessary because of the GIL). 
#ifndef UPB_DEF_HPP #define UPB_DEF_HPP diff --git a/bindings/cpp/upb/handlers.hpp b/bindings/cpp/upb/handlers.hpp index 07683f6130..d356a33de3 100644 --- a/bindings/cpp/upb/handlers.hpp +++ b/bindings/cpp/upb/handlers.hpp @@ -1,15 +1,14 @@ -/* - * upb - a minimalist implementation of protocol buffers. - * - * Copyright (c) 2011 Google Inc. See LICENSE for details. - * Author: Josh Haberman - * - * upb::Handlers is a generic visitor-like interface for iterating over a - * stream of protobuf data. You can register function pointers that will be - * called for each message and/or field as the data is being parsed or iterated - * over, without having to know the source format that we are parsing from. - * This decouples the parsing logic from the processing logic. - */ +// +// upb - a minimalist implementation of protocol buffers. +// +// Copyright (c) 2011 Google Inc. See LICENSE for details. +// Author: Josh Haberman +// +// upb::Handlers is a generic visitor-like interface for iterating over a +// stream of protobuf data. You can register function pointers that will be +// called for each message and/or field as the data is being parsed or iterated +// over, without having to know the source format that we are parsing from. +// This decouples the parsing logic from the processing logic. #ifndef UPB_HANDLERS_HPP #define UPB_HANDLERS_HPP @@ -18,6 +17,7 @@ namespace upb { +typedef upb_fieldtype_t FieldType; typedef upb_flow_t Flow; class MessageHandlers; @@ -30,8 +30,8 @@ class FieldHandlers : public upb_fhandlers { // The FieldHandlers will live at least as long as the upb::Handlers to // which it belongs, but can be Ref'd/Unref'd to make it live longer (which // will prolong the life of the underlying upb::Handlers also). - void Ref() const { upb_fhandlers_ref(this); } - void Unref() const { upb_fhandlers_unref(this); } + void Ref() { upb_fhandlers_ref(this); } + void Unref() { upb_fhandlers_unref(this); } // Functions to set this field's handlers. 
// These return "this" so they can be conveniently chained, eg. @@ -46,13 +46,13 @@ class FieldHandlers : public upb_fhandlers { upb_fhandlers_setstartseq(this, h); return this; } FieldHandlers* SetEndSequenceHandler(EndFieldHandler* h) { - upb_fhandlers_endseq(this, h); return this; + upb_fhandlers_setendseq(this, h); return this; } FieldHandlers* SetStartSubmessageHandler(StartFieldHandler* h) { upb_fhandlers_setstartsubmsg(this, h); return this; } FieldHandlers* SetEndSubmessageHandler(EndFieldHandler* h) { - upb_fhandlers_endsubmsg(this, h); return this; + upb_fhandlers_setendsubmsg(this, h); return this; } // Get/Set the field's bound value, which will be passed to its handlers. @@ -62,27 +62,20 @@ class FieldHandlers : public upb_fhandlers { } // Returns the MessageHandlers to which we belong. - MessageHandlers* GetMessageHandlers() const { - return upb_fhandlers_msg(this); - } - + MessageHandlers* GetMessageHandlers() const; // Returns the MessageHandlers for this field's submessage (invalid to call // unless this field's type UPB_TYPE(MESSAGE) or UPB_TYPE(GROUP). - MessageHandlers* GetSubMessageHandlers() const { - return upb_fhandlers_submsg(this); - } - + MessageHandlers* GetSubMessageHandlers() const; // If set to >=0, the given hasbit will be set after the value callback is - // called (relative to the current closure). - int32_t GetValueHasbit() const { return upb_fhandler_valuehasbit(this); } - void SetValueHasbit(int32_t bit) { upb_fhandler_setvaluehasbit(this, bit); } + // called (offset relative to the current closure). + int32_t GetValueHasbit() const { return upb_fhandlers_getvaluehasbit(this); } + void SetValueHasbit(int32_t bit) { upb_fhandlers_setvaluehasbit(this, bit); } private: FieldHandlers(); // Only created by upb::Handlers. ~FieldHandlers(); // Only destroyed by refcounting. 
}; - class MessageHandlers : public upb_mhandlers { public: typedef upb_startmsg_handler StartMessageHandler; @@ -91,8 +84,8 @@ class MessageHandlers : public upb_mhandlers { // The MessageHandlers will live at least as long as the upb::Handlers to // which it belongs, but can be Ref'd/Unref'd to make it live longer (which // will prolong the life of the underlying upb::Handlers also). - void Ref() const { upb_mhandlers_ref(this); } - void Unref() const { upb_mhandlers_unref(this); } + void Ref() { upb_mhandlers_ref(this); } + void Unref() { upb_mhandlers_unref(this); } // Functions to set this message's handlers. // These return "this" so they can be conveniently chained, eg. @@ -107,12 +100,10 @@ class MessageHandlers : public upb_mhandlers { } // Functions to create new FieldHandlers for this message. - FieldHandlers* NewFieldHandlers(uint32_t fieldnum, upb_fieldtype_t type, + FieldHandlers* NewFieldHandlers(uint32_t fieldnum, FieldType type, bool repeated) { - return upb_mhandlers_newfhandlers(this, fieldnum, type, repeated); - } - FieldHandlers* NewFieldHandlers(FieldDef* f) { - return upb_mhandlers_newfhandlers_fordef(f); + return static_cast( + upb_mhandlers_newfhandlers(this, fieldnum, type, repeated)); } // Like the previous but for MESSAGE or GROUP fields. For GROUP fields, the @@ -120,15 +111,10 @@ class MessageHandlers : public upb_mhandlers { FieldHandlers* NewFieldHandlersForSubmessage(uint32_t n, const char *name, FieldType type, bool repeated, MessageHandlers* subm) { - return upb_mhandlers_newsubmsgfhandlers(this, n, type, repeated, subm); - } - - FieldHandlers* NewFieldHandlersForSubmessage(FieldDef* f, - MessageHandlers* subm) { - return upb_mhandlers_newsubmsgfhandlers_fordef(f); + return static_cast( + upb_mhandlers_newfhandlers_subm(this, n, type, repeated, subm)); } - private: MessageHandlers(); // Only created by upb::Handlers. ~MessageHandlers(); // Only destroyed by refcounting. 
@@ -137,26 +123,31 @@ class MessageHandlers : public upb_mhandlers { class Handlers : public upb_handlers { public: // Creates a new Handlers instance. - Handlers* New() { return static_cast(upb_handlers_new()); } + static Handlers* New() { return static_cast(upb_handlers_new()); } void Ref() { upb_handlers_ref(this); } void Unref() { upb_handlers_unref(this); } // Returns a new MessageHandlers object. The first such message that is // obtained will be the top-level message for this Handlers object. - MessageHandlers* NewMessageHandlers() { return upb_handlers_newmhandlers(this); } - - // Freezes the handlers against future modification. Handlers must be - // finalized before they can be passed to a data producer. After Finalize() - // has been called, you may only call const methods on the Handlers and its - // MessageHandlers/FieldHandlers. - void Finalize() { upb_handlers_finalize(this); } + MessageHandlers* NewMessageHandlers() { + return static_cast(upb_handlers_newmhandlers(this)); + } private: - FieldHandlers(); // Only created by Handlers::New(). - ~FieldHandlers(); // Only destroyed by refcounting. + Handlers(); // Only created by Handlers::New(). + ~Handlers(); // Only destroyed by refcounting. }; + +MessageHandlers* FieldHandlers::GetMessageHandlers() const { + return static_cast(upb_fhandlers_getmsg(this)); +} + +MessageHandlers* FieldHandlers::GetSubMessageHandlers() const { + return static_cast(upb_fhandlers_getsubmsg(this)); +} + } // namespace upb #endif diff --git a/bindings/cpp/upb/pb/decoder.hpp b/bindings/cpp/upb/pb/decoder.hpp new file mode 100644 index 0000000000..05bcb8a787 --- /dev/null +++ b/bindings/cpp/upb/pb/decoder.hpp @@ -0,0 +1,83 @@ +// +// upb - a minimalist implementation of protocol buffers. +// +// Copyright (c) 2011 Google Inc. See LICENSE for details. 
+// Author: Josh Haberman +// +// upb::Decoder is a high performance, streaming decoder for protobuf +// data that works by getting its input data from a upb::ByteRegion and calling +// into a upb::Handlers. +// +// A DecoderPlan contains whatever data structures and generated (JIT-ted) code +// are necessary to decode protobuf data of a specific type to a specific set +// of handlers. By generating the plan ahead of time, we avoid having to +// redo this work every time we decode. +// +// A DecoderPlan is threadsafe, meaning that it can be used concurrently by +// different upb::Decoders in different threads. However, the upb::Decoders are +// *not* thread-safe. + +#ifndef UPB_PB_DECODER_HPP +#define UPB_PB_DECODER_HPP + +#include "upb/pb/decoder.h" + +#include "upb/bytestream.hpp" +#include "upb/upb.hpp" + +namespace upb { + +class DecoderPlan : public upb_decoderplan { + public: + static DecoderPlan* New(Handlers* h, bool allow_jit) { + return static_cast(upb_decoderplan_new(h, allow_jit)); + } + void Unref() { upb_decoderplan_unref(this); } + + // Returns true if the plan contains JIT-ted code. This may not be the same + // as the "allow_jit" parameter to the constructor if support for JIT-ting was + // not compiled in. + bool HasJitCode() { return upb_decoderplan_hasjitcode(this); } + + private: + DecoderPlan() {} // Only constructed by New +}; + +class Decoder : public upb_decoder { + public: + Decoder() { upb_decoder_init(this); } + ~Decoder() { upb_decoder_uninit(this); } + + // Resets the plan that the decoder will parse from. This will also reset the + // decoder's input to be uninitialized -- ResetInput() must be called before + // parsing can occur. The plan must live until the decoder is destroyed or + // reset to a different plan. + // + // Must be called before ResetInput() or Decode(). + void ResetPlan(DecoderPlan* plan, int32_t msg_offset) { + upb_decoder_resetplan(this, plan, msg_offset); + } + + // Resets the input of the decoder.
This puts it in a state where it has not + // seen any data, and expects the next data to be from the beginning of a new + // protobuf. + // + // ResetInput() must be called before Decode() but may be called more than + // once. "byte_region" must live until the decoder is destroyed or ResetInput is + // called again. "c" is the closure that will be passed to the handlers. + void ResetInput(ByteRegion* byte_region, void* c) { + upb_decoder_resetinput(this, byte_region, c); + } + + // Decodes serialized data (calling Handlers as the data is parsed) until + // error or EOF (see status() for details). + Success Decode() { return upb_decoder_decode(this); } + + const upb::Status& status() { + return static_cast(*upb_decoder_status(this)); + } +}; + +} // namespace upb + +#endif diff --git a/bindings/cpp/upb/upb.hpp b/bindings/cpp/upb/upb.hpp index 4fb337dd9c..226859c459 100644 --- a/bindings/cpp/upb/upb.hpp +++ b/bindings/cpp/upb/upb.hpp @@ -1,23 +1,34 @@ -/* - * upb - a minimalist implementation of protocol buffers. - * - * Copyright (c) 2011 Google Inc. See LICENSE for details. - * Author: Josh Haberman - */ +// +// upb - a minimalist implementation of protocol buffers. +// +// Copyright (c) 2011 Google Inc. See LICENSE for details.
+// Author: Josh Haberman #ifndef UPB_HPP #define UPB_HPP #include "upb/upb.h" +#include namespace upb { +typedef upb_success_t Success; + class Status : public upb_status { public: Status() { upb_status_init(this); } ~Status() { upb_status_uninit(this); } + bool ok() const { return upb_ok(this); } + bool eof() const { return upb_eof(this); } + const char *GetString() const { return upb_status_getstr(this); } + void SetEof() { upb_status_seteof(this); } + void SetErrorLiteral(const char* msg) { + upb_status_seterrliteral(this, msg); + } + + void Clear() { upb_status_clear(this); } }; class Value : public upb_value { diff --git a/examples/stream_transcode.c b/examples/stream_transcode.c new file mode 100644 index 0000000000..21c375bce6 --- /dev/null +++ b/examples/stream_transcode.c @@ -0,0 +1,76 @@ + +#include +#include "upb/bytestream.h" +#include "upb/pb/decoder.h" +#include "upb/pb/glue.h" +#include "upb/pb/textprinter.h" + +int main(int argc, char *argv[]) { + if (argc < 3) { + fprintf(stderr, "Usage: stream_transcode \n"); + return 1; + } + + upb_symtab *symtab = upb_symtab_new(); + size_t desc_len; + const char *desc = upb_readfile(argv[1], &desc_len); + if (!desc) { + fprintf(stderr, "Couldn't open descriptor file: %s\n", argv[1]); + return 1; + } + + upb_status status = UPB_STATUS_INIT; + upb_load_descriptor_into_symtab(symtab, desc, desc_len, &status); + if (!upb_ok(&status)) { + fprintf(stderr, "Error parsing descriptor: %s", upb_status_getstr(&status)); + return 1; + } + free((void*)desc); + + const upb_def *md = upb_symtab_lookup(symtab, argv[2]); + if (!md) { + fprintf(stderr, "Descriptor did not contain message: %s\n", argv[2]); + return 1; + } + + const upb_msgdef *m = upb_dyncast_msgdef_const(md); + if (!m) { + fprintf(stderr, "Def was not a msgdef.\n"); + return 1; + } + + upb_stdio in, out; + upb_stdio_init(&in); + upb_stdio_init(&out); + upb_stdio_reset(&in, stdin); + upb_stdio_reset(&out, stdout); + + upb_handlers *handlers = upb_handlers_new(); 
+ upb_textprinter *p = upb_textprinter_new(); + upb_textprinter_reset(p, upb_stdio_bytesink(&out), false); + upb_textprinter_reghandlers(handlers, m); + + upb_decoder d; + upb_decoder_init(&d, handlers); + upb_decoder_reset(&d, upb_stdio_bytesrc(&in), 0, UPB_NONDELIMITED, p); + + upb_status_clear(&status); + upb_decoder_decode(&d, &status); + + if (!upb_ok(&status)) { + fprintf(stderr, "Error parsing input: %s", upb_status_getstr(&status)); + } + + upb_status_uninit(&status); + upb_stdio_uninit(&in); + upb_stdio_uninit(&out); + upb_decoder_uninit(&d); + upb_textprinter_free(p); + upb_def_unref(UPB_UPCAST(m)); + upb_symtab_unref(symtab); + + // Prevent C library from holding buffers open, so Valgrind doesn't see + // memory leaks. + fclose(stdin); + fclose(stdout); +} diff --git a/tests/test_cpp.cc b/tests/test_cpp.cc index ecf27bf14c..5182217725 100644 --- a/tests/test_cpp.cc +++ b/tests/test_cpp.cc @@ -9,7 +9,11 @@ #include #include +#include "upb/bytestream.hpp" #include "upb/def.hpp" +#include "upb/handlers.hpp" +#include "upb/upb.hpp" +#include "upb/pb/decoder.hpp" #include "upb/pb/glue.hpp" static void TestSymbolTable(const char *descriptor_file) { @@ -26,11 +30,22 @@ static void TestSymbolTable(const char *descriptor_file) { md->Unref(); } +static void TestByteStream() { + upb::StringSource stringsrc; + stringsrc.Reset("testing", 7); + upb::ByteRegion* byteregion = stringsrc.AllBytes(); + assert(byteregion->FetchAll() == UPB_BYTE_OK); + char* str = byteregion->StrDup(); + assert(strcmp(str, "testing") == 0); + free(str); +} + int main(int argc, char *argv[]) { if (argc < 2) { fprintf(stderr, "Usage: test_cpp \n"); return 1; } TestSymbolTable(argv[1]); + TestByteStream(); return 0; } diff --git a/tests/test_decoder.c b/tests/test_decoder.c index 84a90cdee6..0db3bfa0d3 100644 --- a/tests/test_decoder.c +++ b/tests/test_decoder.c @@ -1,76 +1,666 @@ +/* + * upb - a minimalist implementation of protocol buffers. + * + * Copyright (c) 2011 Google Inc. 
See LICENSE for details. + * + * An exhaustive set of tests for parsing both valid and invalid protobuf + * input, with buffer breaks in arbitrary places. + * + * Tests to add: + * - unknown field handler called appropriately + * - unknown fields can be inserted in random places + * - fuzzing of valid input + * - resource limits (max stack depth, max string len) + * - testing of groups + * - more thorough testing of sequences + * - test skipping of submessages + * - test suspending the decoder + * - buffers that are close enough to the end of the address space that + * pointers overflow (this might be difficult). + * - a few "kitchen sink" examples (one proto that uses all types, lots + * of submsg/sequences, etc.) + */ +#include +#include +#include #include -#include "upb/bytestream.h" +#include +#include "upb/handlers.h" #include "upb/pb/decoder.h" -#include "upb/pb/glue.h" -#include "upb/pb/textprinter.h" +#include "upb/pb/varint.h" +#include "upb/upb.h" +#include "upb_test.h" -int main(int argc, char *argv[]) { - if (argc < 3) { - fprintf(stderr, "Usage: test_decoder \n"); - return 1; +typedef struct { + char *buf; + size_t len; +} buffer; + +// Mem is initialized to NULL. +buffer *buffer_new(size_t len) { + buffer *buf = malloc(sizeof(*buf)); + buf->buf = malloc(len); + buf->len = len; + memset(buf->buf, 0, buf->len); + return buf; +} + +buffer *buffer_new2(const void *data, size_t len) { + buffer *buf = buffer_new(len); + memcpy(buf->buf, data, len); + return buf; +} + +buffer *buffer_new3(const char *data) { + return buffer_new2(data, strlen(data)); +} + +buffer *buffer_dup(buffer *buf) { return buffer_new2(buf->buf, buf->len); } + +void buffer_free(buffer *buf) { + free(buf->buf); + free(buf); +} + +void buffer_appendf(buffer *buf, const char *fmt, ...)
{ + va_list args; + va_start(args, fmt); + size_t size = buf->len; + buf->len += upb_vrprintf(&buf->buf, &size, buf->len, fmt, args); + va_end(args); +} + +void buffer_cat(buffer *buf, buffer *buf2) { + size_t newlen = buf->len + buf2->len; + buf->buf = realloc(buf->buf, newlen); + memcpy(buf->buf + buf->len, buf2->buf, buf2->len); + buf->len = newlen; + buffer_free(buf2); +} + +bool buffer_eql(buffer *buf, buffer *buf2) { + return buf->len == buf2->len && memcmp(buf->buf, buf2->buf, buf->len) == 0; +} + + +/* Routines for building arbitrary protos *************************************/ + +buffer *cat(buffer *arg1, ...) { + va_list ap; + buffer *arg; + va_start(ap, arg1); + while ((arg = va_arg(ap, buffer*)) != NULL) { + buffer_cat(arg1, arg); } + va_end(ap); + return arg1; +} + +buffer *varint(uint64_t x) { + buffer *buf = buffer_new(UPB_PB_VARINT_MAX_LEN + 1); + buf->len = upb_vencode64(x, buf->buf); + return buf; +} + +// TODO: proper byte-swapping for big-endian machines. +buffer *fixed32(void *data) { return buffer_new2(data, 4); } +buffer *fixed64(void *data) { return buffer_new2(data, 8); } + +buffer *delim(buffer *buf) { return cat( varint(buf->len), buf, NULL ); } +buffer *uint32(uint32_t u32) { return fixed32(&u32); } +buffer *uint64(uint64_t u64) { return fixed64(&u64); } +buffer *flt(float f) { return fixed32(&f); } +buffer *dbl(double d) { return fixed64(&d); } +buffer *zz32(int32_t x) { return varint(upb_zzenc_32(x)); } +buffer *zz64(int64_t x) { return varint(upb_zzenc_64(x)); } + +buffer *tag(uint32_t fieldnum, char wire_type) { + return varint((fieldnum << 3) | wire_type); +} + +buffer *submsg(uint32_t fn, buffer *buf) { + return cat( tag(fn, UPB_WIRE_TYPE_DELIMITED), delim(buf), NULL ); +} - upb_symtab *symtab = upb_symtab_new(); - size_t desc_len; - const char *desc = upb_readfile(argv[1], &desc_len); - if (!desc) { - fprintf(stderr, "Couldn't open descriptor file: %s\n", argv[1]); - return 1; + +/* A set of handlers that covers all .proto types 
*****************************/ + +// The handlers simply append to a string indicating what handlers were called. +// This string is similar to protobuf text format but fields are referred to by +// number instead of name and sequences are explicitly delimited. + +#define VALUE_HANDLER(member, fmt) \ + upb_flow_t value_ ## member(void *closure, upb_value fval, upb_value val) { \ + buffer_appendf(closure, "%" PRIu32 ":%" fmt "; ", \ + upb_value_getuint32(fval), upb_value_get ## member(val)); \ + return UPB_CONTINUE; \ } - upb_status status = UPB_STATUS_INIT; - upb_load_descriptor_into_symtab(symtab, desc, desc_len, &status); - if (!upb_ok(&status)) { - fprintf(stderr, "Error parsing descriptor: %s", upb_status_getstr(&status)); - return 1; +VALUE_HANDLER(uint32, PRIu32) +VALUE_HANDLER(uint64, PRIu64) +VALUE_HANDLER(int32, PRId32) +VALUE_HANDLER(int64, PRId64) +VALUE_HANDLER(float, "g") +VALUE_HANDLER(double, "g") + +upb_flow_t value_bool(void *closure, upb_value fval, upb_value val) { + buffer_appendf(closure, "%" PRIu32 ":%s; ", + upb_value_getuint32(fval), + upb_value_getbool(val) ? "true" : "false"); + return UPB_CONTINUE; +} + +upb_flow_t value_string(void *closure, upb_value fval, upb_value val) { + // Note: won't work with strings that contain NULL. 
+ char *str = upb_byteregion_strdup(upb_value_getbyteregion(val)); + buffer_appendf(closure, "%" PRIu32 ":%s; ", upb_value_getuint32(fval), str); + free(str); + return UPB_CONTINUE; +} + +upb_sflow_t startsubmsg(void *closure, upb_value fval) { + buffer_appendf(closure, "%" PRIu32 ":{ ", upb_value_getuint32(fval)); + return UPB_CONTINUE_WITH(closure); +} + +upb_flow_t endsubmsg(void *closure, upb_value fval) { + buffer_appendf(closure, "} "); + return UPB_CONTINUE; +} + +upb_sflow_t startseq(void *closure, upb_value fval) { + buffer_appendf(closure, "%" PRIu32 ":[ ", upb_value_getuint32(fval)); + return UPB_CONTINUE_WITH(closure); +} + +upb_flow_t endseq(void *closure, upb_value fval) { + buffer_appendf(closure, "] "); + return UPB_CONTINUE; +} + +void doreg(upb_mhandlers *m, uint32_t num, upb_fieldtype_t type, bool repeated, + upb_value_handler *handler) { + upb_fhandlers *f = upb_mhandlers_newfhandlers(m, num, type, repeated); + ASSERT(f); + upb_fhandlers_setvalue(f, handler); + upb_fhandlers_setstartseq(f, &startseq); + upb_fhandlers_setendseq(f, &endseq); + upb_fhandlers_setfval(f, upb_value_uint32(num)); +} + +// The repeated field number to correspond to the given non-repeated field +// number. +uint32_t rep_fn(uint32_t fn) { + return (UPB_MAX_FIELDNUMBER - 1000) + fn; +} + +#define NOP_FIELD 40 +#define UNKNOWN_FIELD 666 + +void reg(upb_mhandlers *m, upb_fieldtype_t type, upb_value_handler *handler) { + // We register both a repeated and a non-repeated field for every type. + // For the non-repeated field we make the field number the same as the + // type. For the repeated field we make it a function of the type. 
+ doreg(m, type, type, false, handler); + doreg(m, rep_fn(type), type, true, handler); +} + +void reg_subm(upb_mhandlers *m, uint32_t num, upb_fieldtype_t type, + bool repeated) { + upb_fhandlers *f = + upb_mhandlers_newfhandlers_subm(m, num, type, repeated, m); + ASSERT(f); + upb_fhandlers_setstartseq(f, &startseq); + upb_fhandlers_setendseq(f, &endseq); + upb_fhandlers_setstartsubmsg(f, &startsubmsg); + upb_fhandlers_setendsubmsg(f, &endsubmsg); + upb_fhandlers_setfval(f, upb_value_uint32(num)); +} + +void reghandlers(upb_mhandlers *m) { + // Register handlers for each type. + reg(m, UPB_TYPE(DOUBLE), &value_double); + reg(m, UPB_TYPE(FLOAT), &value_float); + reg(m, UPB_TYPE(INT64), &value_int64); + reg(m, UPB_TYPE(UINT64), &value_uint64); + reg(m, UPB_TYPE(INT32) , &value_int32); + reg(m, UPB_TYPE(FIXED64), &value_uint64); + reg(m, UPB_TYPE(FIXED32), &value_uint32); + reg(m, UPB_TYPE(BOOL), &value_bool); + reg(m, UPB_TYPE(STRING), &value_string); + reg(m, UPB_TYPE(BYTES), &value_string); + reg(m, UPB_TYPE(UINT32), &value_uint32); + reg(m, UPB_TYPE(ENUM), &value_int32); + reg(m, UPB_TYPE(SFIXED32), &value_int32); + reg(m, UPB_TYPE(SFIXED64), &value_int64); + reg(m, UPB_TYPE(SINT32), &value_int32); + reg(m, UPB_TYPE(SINT64), &value_int64); + + // Register submessage/group handlers that are self-recursive + // to this type, eg: message M { optional M m = 1; } + reg_subm(m, UPB_TYPE(MESSAGE), UPB_TYPE(MESSAGE), false); + reg_subm(m, UPB_TYPE(GROUP), UPB_TYPE(GROUP), false); + reg_subm(m, rep_fn(UPB_TYPE(MESSAGE)), UPB_TYPE(MESSAGE), true); + reg_subm(m, rep_fn(UPB_TYPE(GROUP)), UPB_TYPE(GROUP), true); + + // Register a no-op string field so we can pad the proto wherever we want. 
+ upb_mhandlers_newfhandlers(m, NOP_FIELD, UPB_TYPE(STRING), false); +} + + +/* Custom bytesrc that can insert buffer seams in arbitrary places ************/ + +typedef struct { + upb_bytesrc bytesrc; + const char *str; + size_t len, seam1, seam2; + upb_byteregion byteregion; +} upb_seamsrc; + +size_t upb_seamsrc_avail(const upb_seamsrc *src, size_t ofs) { + if (ofs < src->seam1) return src->seam1 - ofs; + if (ofs < src->seam2) return src->seam2 - ofs; + return src->len - ofs; +} + +upb_bytesuccess_t upb_seamsrc_fetch(void *_src, uint64_t ofs, size_t *read) { + upb_seamsrc *src = _src; + assert(ofs < src->len); + if (ofs == src->len) { + upb_status_seteof(&src->bytesrc.status); + return UPB_BYTE_EOF; } - free((void*)desc); + *read = upb_seamsrc_avail(src, ofs); + return UPB_BYTE_OK; +} + +void upb_seamsrc_copy(const void *_src, uint64_t ofs, + size_t len, char *dst) { + const upb_seamsrc *src = _src; + assert(ofs + len <= src->len); + memcpy(dst, src->str + ofs, len); +} + +void upb_seamsrc_discard(void *src, uint64_t ofs) { + (void)src; + (void)ofs; +} + +const char *upb_seamsrc_getptr(const void *_s, uint64_t ofs, size_t *len) { + const upb_seamsrc *src = _s; + *len = upb_seamsrc_avail(src, ofs); + return src->str + ofs; +} - const upb_def *md = upb_symtab_lookup(symtab, argv[2]); - if (!md) { - fprintf(stderr, "Descriptor did not contain message: %s\n", argv[2]); - return 1; +void upb_seamsrc_init(upb_seamsrc *s, const char *str, size_t len) { + static upb_bytesrc_vtbl vtbl = { + &upb_seamsrc_fetch, + &upb_seamsrc_discard, + &upb_seamsrc_copy, + &upb_seamsrc_getptr, + }; + upb_bytesrc_init(&s->bytesrc, &vtbl); + s->seam1 = 0; + s->seam2 = 0; + s->str = str; + s->len = len; + s->byteregion.bytesrc = &s->bytesrc; + s->byteregion.toplevel = true; + s->byteregion.start = 0; + s->byteregion.end = len; +} + +void upb_seamsrc_resetseams(upb_seamsrc *s, size_t seam1, size_t seam2) { + ASSERT(seam1 <= seam2); + s->seam1 = seam1; + s->seam2 = seam2; + 
s->byteregion.discard = 0; + s->byteregion.fetch = 0; +} + +void upb_seamsrc_uninit(upb_seamsrc *s) { (void)s; } + +upb_bytesrc *upb_seamsrc_bytesrc(upb_seamsrc *s) { + return &s->bytesrc; +} + +// Returns the top-level upb_byteregion* for this seamsrc. Invalidated when +// the seamsrc is reset. +upb_byteregion *upb_seamsrc_allbytes(upb_seamsrc *s) { + return &s->byteregion; +} + + +/* Running of test cases ******************************************************/ + +upb_decoderplan *plan; + +void run_decoder(buffer *proto, buffer *expected_output) { + upb_seamsrc src; + upb_seamsrc_init(&src, proto->buf, proto->len); + upb_decoder d; + upb_decoder_init(&d); + upb_decoder_resetplan(&d, plan, 0); + for (size_t i = 0; i < proto->len; i++) { + for (size_t j = i; j < proto->len; j++) { + upb_seamsrc_resetseams(&src, i, j); + upb_byteregion *input = upb_seamsrc_allbytes(&src); + buffer *output = buffer_new(0); + upb_decoder_resetinput(&d, input, output); + upb_success_t success = UPB_SUSPENDED; + while (success == UPB_SUSPENDED) + success = upb_decoder_decode(&d); + ASSERT(upb_ok(upb_decoder_status(&d)) == (success == UPB_OK)); + if (expected_output) { + ASSERT(success == UPB_OK); + // The input should be fully consumed. 
+ ASSERT(upb_byteregion_fetchofs(input) == upb_byteregion_endofs(input)); + ASSERT(upb_byteregion_discardofs(input) == + upb_byteregion_endofs(input)); + if (!buffer_eql(output, expected_output)) { + fprintf(stderr, "Text mismatch: '%s' vs '%s'\n", + output->buf, expected_output->buf); + } + ASSERT(strcmp(output->buf, expected_output->buf) == 0); + } else { + ASSERT(success == UPB_ERROR); + } + buffer_free(output); + } } + upb_seamsrc_uninit(&src); + upb_decoder_uninit(&d); + buffer_free(proto); +} + +void assert_successful_parse_at_eof(buffer *proto, const char *expected_fmt, + va_list args) { + buffer *expected_text = buffer_new(0); + size_t size = expected_text->len; + expected_text->len += upb_vrprintf(&expected_text->buf, &size, + expected_text->len, expected_fmt, args); + run_decoder(proto, expected_text); + buffer_free(expected_text); +} + +void assert_does_not_parse_at_eof(buffer *proto) { + run_decoder(proto, NULL); +} + +void assert_successful_parse(buffer *proto, const char *expected_fmt, ...) { + // The JIT is only used for data >=20 bytes from end-of-buffer, so + // repeat once with no-op padding data at the end of buffer. + va_list args, args2; + va_start(args, expected_fmt); + va_copy(args2, args); + assert_successful_parse_at_eof(buffer_dup(proto), expected_fmt, args); + assert_successful_parse_at_eof( + cat( proto, + tag(NOP_FIELD, UPB_WIRE_TYPE_DELIMITED), delim(buffer_new(30)), + NULL ), + expected_fmt, args2); + va_end(args); + va_end(args2); +} + +void assert_does_not_parse(buffer *proto) { + // The JIT is only used for data >=20 bytes from end-of-buffer, so + // repeat once with no-op padding data at the end of buffer. 
+ assert_does_not_parse_at_eof(buffer_dup(proto)); + assert_does_not_parse_at_eof( + cat( proto, + tag(NOP_FIELD, UPB_WIRE_TYPE_DELIMITED), delim( buffer_new(30)), + NULL )); +} + + +/* The actual tests ***********************************************************/ + +void test_premature_eof_for_type(upb_fieldtype_t type) { + // Incomplete values for each wire type. + static const char *incompletes[] = { + "\x80", // UPB_WIRE_TYPE_VARINT + "abcdefg", // UPB_WIRE_TYPE_64BIT + "\x80", // UPB_WIRE_TYPE_DELIMITED (partial length) + NULL, // UPB_WIRE_TYPE_START_GROUP (no value required) + NULL, // UPB_WIRE_TYPE_END_GROUP (no value required) + "abc" // UPB_WIRE_TYPE_32BIT + }; + + uint32_t fieldnum = type; + uint32_t rep_fieldnum = rep_fn(type); + int wire_type = upb_types[type].native_wire_type; + const char *incomplete = incompletes[wire_type]; + + // EOF before a known non-repeated value. + assert_does_not_parse_at_eof(tag(fieldnum, wire_type)); + + // EOF before a known repeated value. + assert_does_not_parse_at_eof(tag(rep_fieldnum, wire_type)); + + // EOF before an unknown value. + assert_does_not_parse_at_eof(tag(UNKNOWN_FIELD, wire_type)); + + // EOF inside a known non-repeated value. + assert_does_not_parse_at_eof( + cat( tag(fieldnum, wire_type), buffer_new3(incomplete), NULL )); + + // EOF inside a known repeated value. + assert_does_not_parse_at_eof( + cat( tag(rep_fieldnum, wire_type), buffer_new3(incomplete), NULL )); + + // EOF inside an unknown value. + assert_does_not_parse_at_eof( + cat( tag(UNKNOWN_FIELD, wire_type), buffer_new3(incomplete), NULL )); + + if (wire_type == UPB_WIRE_TYPE_DELIMITED) { + // EOF in the middle of delimited data for known non-repeated value. + assert_does_not_parse_at_eof( + cat( tag(fieldnum, wire_type), varint(1), NULL )); + + // EOF in the middle of delimited data for known repeated value. 
+ assert_does_not_parse_at_eof( + cat( tag(rep_fieldnum, wire_type), varint(1), NULL )); - const upb_msgdef *m = upb_dyncast_msgdef_const(md); - if (!m) { - fprintf(stderr, "Def was not a msgdef.\n"); - return 1; + // EOF in the middle of delimited data for unknown value. + assert_does_not_parse_at_eof( + cat( tag(UNKNOWN_FIELD, wire_type), varint(1), NULL )); + + if (type == UPB_TYPE(MESSAGE)) { + // Submessage ends in the middle of a value. + buffer *incomplete_submsg = + cat ( tag(UPB_TYPE(INT32), UPB_WIRE_TYPE_VARINT), + buffer_new3(incompletes[UPB_WIRE_TYPE_VARINT]), NULL ); + assert_does_not_parse( + cat( tag(fieldnum, UPB_WIRE_TYPE_DELIMITED), + varint(incomplete_submsg->len), + incomplete_submsg, NULL )); + } + } else { + // Packed region ends in the middle of a value. + assert_does_not_parse( + cat( tag(rep_fieldnum, UPB_WIRE_TYPE_DELIMITED), + varint(strlen(incomplete)), + buffer_new3(incomplete), NULL )); + + // EOF in the middle of packed region. + assert_does_not_parse_at_eof( + cat( tag(rep_fieldnum, UPB_WIRE_TYPE_DELIMITED), varint(1), NULL )); } +} - upb_stdio in, out; - upb_stdio_init(&in); - upb_stdio_init(&out); - upb_stdio_reset(&in, stdin); - upb_stdio_reset(&out, stdout); +// "33" and "66" are just two random values that all numeric types can +// represent. 
+void test_valid_data_for_type(upb_fieldtype_t type, + buffer *enc33, buffer *enc66) { + uint32_t fieldnum = type; + uint32_t rep_fieldnum = rep_fn(type); + int wire_type = upb_types[type].native_wire_type; - upb_handlers *handlers = upb_handlers_new(); - upb_textprinter *p = upb_textprinter_new(); - upb_textprinter_reset(p, upb_stdio_bytesink(&out), false); - upb_textprinter_reghandlers(handlers, m); + // Non-repeated + assert_successful_parse( + cat( tag(fieldnum, wire_type), buffer_dup(enc33), + tag(fieldnum, wire_type), buffer_dup(enc66), NULL ), + "%u:33; %u:66; ", fieldnum, fieldnum); - upb_decoder d; - upb_decoder_init(&d, handlers); - upb_decoder_reset(&d, upb_stdio_allbytes(&in), p); + // Non-packed repeated. + assert_successful_parse( + cat( tag(rep_fieldnum, wire_type), buffer_dup(enc33), + tag(rep_fieldnum, wire_type), buffer_dup(enc66), NULL ), + "%u:[ %u:33; %u:66; ] ", rep_fieldnum, rep_fieldnum, rep_fieldnum); + + // Packed repeated. + assert_successful_parse( + cat( tag(rep_fieldnum, UPB_WIRE_TYPE_DELIMITED), + delim(cat( buffer_dup(enc33), buffer_dup(enc66), NULL )), NULL ), + "%u:[ %u:33; %u:66; ] ", rep_fieldnum, rep_fieldnum, rep_fieldnum); + + buffer_free(enc33); + buffer_free(enc66); +} + +void test_valid_data_for_signed_type(upb_fieldtype_t type, + buffer *enc33, buffer *enc66) { + uint32_t fieldnum = type; + uint32_t rep_fieldnum = rep_fn(type); + int wire_type = upb_types[type].native_wire_type; + + // Non-repeated + assert_successful_parse( + cat( tag(fieldnum, wire_type), buffer_dup(enc33), + tag(fieldnum, wire_type), buffer_dup(enc66), NULL ), + "%u:33; %u:-66; ", fieldnum, fieldnum); + + // Non-packed repeated. + assert_successful_parse( + cat( tag(rep_fieldnum, wire_type), buffer_dup(enc33), + tag(rep_fieldnum, wire_type), buffer_dup(enc66), NULL ), + "%u:[ %u:33; %u:-66; ] ", rep_fieldnum, rep_fieldnum, rep_fieldnum); + + // Packed repeated. 
+ assert_successful_parse( + cat( tag(rep_fieldnum, UPB_WIRE_TYPE_DELIMITED), + delim(cat( buffer_dup(enc33), buffer_dup(enc66), NULL )), NULL ), + "%u:[ %u:33; %u:-66; ] ", rep_fieldnum, rep_fieldnum, rep_fieldnum); + + buffer_free(enc33); + buffer_free(enc66); +} + +// Test that invalid protobufs are properly detected (without crashing) and +// have an error reported. Field numbers match registered handlers above. +void test_invalid() { + test_premature_eof_for_type(UPB_TYPE(DOUBLE)); + test_premature_eof_for_type(UPB_TYPE(FLOAT)); + test_premature_eof_for_type(UPB_TYPE(INT64)); + test_premature_eof_for_type(UPB_TYPE(UINT64)); + test_premature_eof_for_type(UPB_TYPE(INT32)); + test_premature_eof_for_type(UPB_TYPE(FIXED64)); + test_premature_eof_for_type(UPB_TYPE(FIXED32)); + test_premature_eof_for_type(UPB_TYPE(BOOL)); + test_premature_eof_for_type(UPB_TYPE(STRING)); + test_premature_eof_for_type(UPB_TYPE(BYTES)); + test_premature_eof_for_type(UPB_TYPE(UINT32)); + test_premature_eof_for_type(UPB_TYPE(ENUM)); + test_premature_eof_for_type(UPB_TYPE(SFIXED32)); + test_premature_eof_for_type(UPB_TYPE(SFIXED64)); + test_premature_eof_for_type(UPB_TYPE(SINT32)); + test_premature_eof_for_type(UPB_TYPE(SINT64)); + + // EOF inside a tag's varint. + assert_does_not_parse_at_eof( buffer_new3("\x80") ); + + // EOF inside a known group. + assert_does_not_parse_at_eof( tag(4, UPB_WIRE_TYPE_START_GROUP) ); + + // EOF inside an unknown group. + assert_does_not_parse_at_eof( tag(UNKNOWN_FIELD, UPB_WIRE_TYPE_START_GROUP) ); - upb_status_clear(&status); - upb_decoder_decode(&d, &status); + // End group that we are not currently in. + assert_does_not_parse( tag(4, UPB_WIRE_TYPE_END_GROUP) ); - if (!upb_ok(&status)) { - fprintf(stderr, "Error parsing input: %s", upb_status_getstr(&status)); + // Field number is 0. + assert_does_not_parse( + cat( tag(0, UPB_WIRE_TYPE_DELIMITED), varint(0), NULL )); + + // Field number is too large. 
+ assert_does_not_parse( + cat( tag(UPB_MAX_FIELDNUMBER + 1, UPB_WIRE_TYPE_DELIMITED), + varint(0), NULL )); + + // Test exceeding the resource limit of stack depth. + buffer *buf = buffer_new3(""); + for (int i = 0; i < UPB_MAX_NESTING; i++) { + buf = submsg(UPB_TYPE(MESSAGE), buf); } + assert_does_not_parse(buf); - upb_status_uninit(&status); - upb_stdio_uninit(&in); - upb_stdio_uninit(&out); - upb_decoder_uninit(&d); - upb_textprinter_free(p); - upb_def_unref(UPB_UPCAST(m)); - upb_symtab_unref(symtab); - - // Prevent C library from holding buffers open, so Valgrind doesn't see - // memory leaks. - fclose(stdin); - fclose(stdout); + // Staying within the stack limit should work properly. + buf = buffer_new3(""); + buffer *textbuf = buffer_new3(""); + int total = UPB_MAX_NESTING - 1; + for (int i = 0; i < total; i++) { + buf = submsg(UPB_TYPE(MESSAGE), buf); + buffer_appendf(textbuf, "%u:{ ", UPB_TYPE(MESSAGE)); + } + for (int i = 0; i < total; i++) { + buffer_appendf(textbuf, "} "); + } + assert_successful_parse(buf, "%s", textbuf->buf); + buffer_free(textbuf); +} + +void test_valid() { + test_valid_data_for_signed_type(UPB_TYPE(DOUBLE), dbl(33), dbl(-66)); + test_valid_data_for_signed_type(UPB_TYPE(FLOAT), flt(33), flt(-66)); + test_valid_data_for_signed_type(UPB_TYPE(INT64), varint(33), varint(-66)); + test_valid_data_for_signed_type(UPB_TYPE(INT32), varint(33), varint(-66)); + test_valid_data_for_signed_type(UPB_TYPE(ENUM), varint(33), varint(-66)); + test_valid_data_for_signed_type(UPB_TYPE(SFIXED32), uint32(33), uint32(-66)); + test_valid_data_for_signed_type(UPB_TYPE(SFIXED64), uint64(33), uint64(-66)); + test_valid_data_for_signed_type(UPB_TYPE(SINT32), zz32(33), zz32(-66)); + test_valid_data_for_signed_type(UPB_TYPE(SINT64), zz64(33), zz64(-66)); + + test_valid_data_for_type(UPB_TYPE(UINT64), varint(33), varint(66)); + test_valid_data_for_type(UPB_TYPE(UINT32), varint(33), varint(66)); + test_valid_data_for_type(UPB_TYPE(FIXED64), uint64(33), uint64(66)); 
+ test_valid_data_for_type(UPB_TYPE(FIXED32), uint32(33), uint32(66)); + + // Submessage tests. + uint32_t msg_fn = UPB_TYPE(MESSAGE); + assert_successful_parse( + submsg(msg_fn, submsg(msg_fn, submsg(msg_fn, buffer_new3("")))), + "%u:{ %u:{ %u:{ } } } ", msg_fn, msg_fn, msg_fn); + + uint32_t repm_fn = rep_fn(UPB_TYPE(MESSAGE)); + assert_successful_parse( + submsg(repm_fn, submsg(repm_fn, buffer_new3(""))), + "%u:[ %u:{ %u:[ %u:{ } ] } ] ", repm_fn, repm_fn, repm_fn, repm_fn); +} + +void run_tests() { + test_invalid(); + test_valid(); +} + +int main() { + // Construct decoder plan. + upb_handlers *h = upb_handlers_new(); + reghandlers(upb_handlers_newmhandlers(h)); + + // Test without JIT. + plan = upb_decoderplan_new(h, false); + run_tests(); + upb_decoderplan_unref(plan); + + // Test JIT. + plan = upb_decoderplan_new(h, true); + run_tests(); + upb_decoderplan_unref(plan); + + plan = NULL; + printf("All tests passed, %d assertions.\n", num_assertions); + upb_handlers_unref(h); + return 0; } diff --git a/tests/test_varint.c b/tests/test_varint.c index 4c076b3a47..0fc93f02ed 100644 --- a/tests/test_varint.c +++ b/tests/test_varint.c @@ -8,12 +8,39 @@ #include "upb/pb/varint.h" #include "upb_test.h" +// Test that we can round-trip from int->varint->int. 
+static void test_varint_for_num(upb_decoderet (*decoder)(const char*), + uint64_t num) { + char buf[16]; + memset(buf, 0xff, sizeof(buf)); + size_t bytes = upb_vencode64(num, buf); + + if (num <= UINT32_MAX) { + char buf2[16]; + memset(buf2, 0, sizeof(buf2)); + uint64_t encoded = upb_vencode32(num); + memcpy(&buf2, &encoded, 8); + upb_decoderet r = decoder(buf2); + ASSERT(r.val == num); + ASSERT(r.p == buf2 + upb_value_size(encoded)); + ASSERT(upb_zzenc_32(upb_zzdec_32(num)) == num); + } + + upb_decoderet r = decoder(buf); + ASSERT(r.val == num); + ASSERT(r.p == buf + bytes); + ASSERT(upb_zzenc_64(upb_zzdec_64(num)) == num); +} + static void test_varint_decoder(upb_decoderet (*decoder)(const char*)) { #define TEST(bytes, expected_val) {\ - const char buf[] = bytes "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" ; \ + size_t n = sizeof(bytes) - 1; /* for NULL */ \ + char buf[UPB_PB_VARINT_MAX_LEN]; \ + memset(buf, 0xff, sizeof(buf)); \ + memcpy(buf, bytes, n); \ upb_decoderet r = decoder(buf); \ ASSERT(r.val == expected_val); \ - ASSERT(r.p == buf + sizeof(buf) - 16); /* - 1 for NULL */ \ + ASSERT(r.p == buf + n); \ } TEST("\x00", 0ULL); @@ -30,12 +57,19 @@ static void test_varint_decoder(upb_decoderet (*decoder)(const char*)) { TEST("\x81\x83\x87\x8f\x9f\xbf\xff\x81\x83\x07", 0x8303fdf9f1e1c181ULL); #undef TEST - char twelvebyte[16] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, 0x01}; + char twelvebyte[16] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x01, 0x01}; const char *twelvebyte_buf = twelvebyte; // A varint that terminates before hitting the end of the provided buffer, // but in too many bytes (11 instead of 10). 
upb_decoderet r = decoder(twelvebyte_buf); ASSERT(r.p == NULL); + + + for (uint64_t num = 5; num * 1.5 > num; num *= 1.5) { + test_varint_for_num(decoder, num); + } + test_varint_for_num(decoder, 0); } diff --git a/tests/test_vs_proto2.cc b/tests/test_vs_proto2.cc index 8d13f33c1f..c43649cf62 100644 --- a/tests/test_vs_proto2.cc +++ b/tests/test_vs_proto2.cc @@ -7,15 +7,19 @@ * given proto type and input protobuf. */ +#define __STDC_LIMIT_MACROS // So we get UINT32_MAX #include #include +#include #include #include #include -#include "benchmarks/google_messages.pb.h" +#include +#include "upb/benchmarks/google_messages.pb.h" #include "upb/def.h" #include "upb/msg.h" #include "upb/pb/glue.h" +#include "upb/pb/varint.h" #include "upb_test.h" size_t string_size; @@ -179,13 +183,13 @@ void compare(const google::protobuf::Message& proto2_msg, void parse_and_compare(MESSAGE_CIDENT *proto2_msg, void *upb_msg, const upb_msgdef *upb_md, - const char *str, size_t len) + const char *str, size_t len, bool allow_jit) { // Parse to both proto2 and upb. ASSERT(proto2_msg->ParseFromArray(str, len)); upb_status status = UPB_STATUS_INIT; upb_msg_clear(upb_msg, upb_md); - upb_strtomsg(str, len, upb_msg, upb_md, &status); + upb_strtomsg(str, len, upb_msg, upb_md, allow_jit, &status); if (!upb_ok(&status)) { fprintf(stderr, "Error parsing protobuf: %s", upb_status_getstr(&status)); exit(1); @@ -241,8 +245,10 @@ int main(int argc, char *argv[]) // Run twice to test proper object reuse. 
MESSAGE_CIDENT proto2_msg; void *upb_msg = upb_stdmsg_new(msgdef); - parse_and_compare(&proto2_msg, upb_msg, msgdef, str, len); - parse_and_compare(&proto2_msg, upb_msg, msgdef, str, len); + parse_and_compare(&proto2_msg, upb_msg, msgdef, str, len, true); + parse_and_compare(&proto2_msg, upb_msg, msgdef, str, len, false); + parse_and_compare(&proto2_msg, upb_msg, msgdef, str, len, true); + parse_and_compare(&proto2_msg, upb_msg, msgdef, str, len, false); printf("All tests passed, %d assertions.\n", num_assertions); upb_stdmsg_free(upb_msg, msgdef); @@ -250,6 +256,17 @@ int main(int argc, char *argv[]) free((void*)str); upb_symtab_unref(symtab); upb_status_uninit(&status); + + // Test Zig-Zag encoding/decoding. + for (uint64_t num = 5; num * 1.5 > num; num *= 1.5) { + ASSERT(upb_zzenc_64(num) == + google::protobuf::internal::WireFormatLite::ZigZagEncode64(num)); + if (num < UINT32_MAX) { + ASSERT(upb_zzenc_32(num) == + google::protobuf::internal::WireFormatLite::ZigZagEncode32(num)); + } + } + google::protobuf::ShutdownProtobufLibrary(); return 0; diff --git a/tests/tests.c b/tests/tests.c index 83fb3ef311..12ff4bb23d 100644 --- a/tests/tests.c +++ b/tests/tests.c @@ -39,9 +39,13 @@ static void test_upb_jit() { upb_handlers *h = upb_handlers_new(); upb_handlerset hset = {NULL, NULL, &upb_test_onvalue, NULL, NULL, NULL, NULL}; upb_handlers_reghandlerset(h, upb_downcast_msgdef_const(def), &hset); - upb_decoder d; - upb_decoder_init(&d, h); - upb_decoder_uninit(&d); + upb_decoderplan *p = upb_decoderplan_new(h, true); +#ifdef UPB_USE_JIT_X64 + ASSERT(upb_decoderplan_hasjitcode(p)); +#else + ASSERT(!upb_decoderplan_hasjitcode(p)); +#endif + upb_decoderplan_unref(p); upb_symtab_unref(s); upb_def_unref(def); upb_handlers_unref(h); diff --git a/upb/bytestream.c b/upb/bytestream.c index 135f269535..8feb678037 100644 --- a/upb/bytestream.c +++ b/upb/bytestream.c @@ -25,7 +25,7 @@ upb_byteregion *upb_byteregion_new(const void *str) { return upb_byteregion_newl(str, 
strlen(str)); } -upb_byteregion *upb_byteregion_newl(const void *str, uint32_t len) { +upb_byteregion *upb_byteregion_newl(const void *str, size_t len) { upb_stringsrc *src = malloc(sizeof(*src)); upb_stringsrc_init(src); char *ptr = malloc(len + 1); @@ -37,7 +37,7 @@ upb_byteregion *upb_byteregion_newl(const void *str, uint32_t len) { void upb_byteregion_free(upb_byteregion *r) { if (!r) return; - uint32_t len; + size_t len; free((char*)upb_byteregion_getptr(r, 0, &len)); upb_stringsrc_uninit((upb_stringsrc*)r->bytesrc); free(r->bytesrc); @@ -64,16 +64,14 @@ void upb_byteregion_reset(upb_byteregion *r, const upb_byteregion *src, r->fetch = UPB_MIN(src->fetch, r->end); } -bool upb_byteregion_fetch(upb_byteregion *r, upb_status *s) { +upb_bytesuccess_t upb_byteregion_fetch(upb_byteregion *r) { uint64_t fetchable = upb_byteregion_remaining(r, r->fetch); - if (fetchable == 0) { - upb_status_seteof(s); - return false; - } - uint64_t num = upb_bytesrc_fetch(r->bytesrc, r->fetch, s); - if (num == 0) return false; - r->fetch += UPB_MIN(num, fetchable); - return true; + if (fetchable == 0) return UPB_BYTE_EOF; + size_t fetched; + upb_bytesuccess_t ret = upb_bytesrc_fetch(r->bytesrc, r->fetch, &fetched); + if (ret != UPB_BYTE_OK) return false; + r->fetch += UPB_MIN(fetched, fetchable); + return UPB_BYTE_OK; } @@ -93,10 +91,10 @@ static upb_stdio_buf *upb_stdio_findbuf(const upb_stdio *s, uint64_t ofs) { static upb_stdio_buf *upb_stdio_rotatebufs(upb_stdio *s) { upb_stdio_buf **reuse = NULL; // XXX - uint32_t num_reused = 0, num_inuse = 0; + int num_reused = 0, num_inuse = 0; // Could sweep only a subset of bufs if this was a hotspot. 
- for (uint32_t i = 0; i < s->nbuf; i++) { + for (int i = 0; i < s->nbuf; i++) { upb_stdio_buf *buf = s->bufs[i]; if (buf->refcount > 0) { s->bufs[num_inuse++] = buf; @@ -120,28 +118,37 @@ void upb_stdio_discard(void *src, uint64_t ofs) { (void)ofs; } -uint32_t upb_stdio_fetch(void *src, uint64_t ofs, upb_status *s) { +upb_bytesuccess_t upb_stdio_fetch(void *src, uint64_t ofs, size_t *bytes_read) { (void)ofs; upb_stdio *stdio = (upb_stdio*)src; upb_stdio_buf *buf = upb_stdio_rotatebufs(stdio); - uint32_t read = fread(&buf->data, 1, BUF_SIZE, stdio->file); - buf->len = read; - if(read < (uint32_t)BUF_SIZE) { +retry: + *bytes_read = fread(&buf->data, 1, BUF_SIZE, stdio->file); + buf->len = *bytes_read; + if (*bytes_read < (size_t)BUF_SIZE) { // Error or EOF. - if(feof(stdio->file)) { - upb_status_seteof(s); - return read; + if (feof(stdio->file)) { + upb_status_seteof(&stdio->src.status); + return UPB_BYTE_EOF; } - if(ferror(stdio->file)) { - upb_status_fromerrno(s); - return 0; + if (ferror(stdio->file)) { +#ifdef EINTR + // If we encounter a client who doesn't want to retry EINTR, we can easily + // add a boolean property of the stdio that controls this behavior. + if (errno == EINTR) { + clearerr(stdio->file); + goto retry; + } +#endif + upb_status_fromerrno(&stdio->src.status); + return upb_errno_is_wouldblock() ? 
UPB_BYTE_WOULDBLOCK : UPB_BYTE_ERROR; } assert(false); } - return buf->ofs + buf->len; + return UPB_BYTE_OK; } -void upb_stdio_read(const void *src, uint64_t ofs, uint32_t len, char *dst) { +void upb_stdio_copy(const void *src, uint64_t ofs, size_t len, char *dst) { upb_stdio_buf *buf = upb_stdio_findbuf(src, ofs); ofs -= buf->ofs; memcpy(dst, buf->data + ofs, BUF_SIZE - ofs); @@ -149,14 +156,14 @@ void upb_stdio_read(const void *src, uint64_t ofs, uint32_t len, char *dst) { dst += (BUF_SIZE - ofs); while (len > 0) { ++buf; - uint32_t bytes = UPB_MIN(len, BUF_SIZE); + size_t bytes = UPB_MIN(len, BUF_SIZE); memcpy(dst, buf->data, bytes); len -= bytes; dst += bytes; } } -const char *upb_stdio_getptr(const void *src, uint64_t ofs, uint32_t *len) { +const char *upb_stdio_getptr(const void *src, uint64_t ofs, size_t *len) { upb_stdio_buf *buf = upb_stdio_findbuf(src, ofs); ofs -= buf->ofs; *len = BUF_SIZE - ofs; @@ -168,7 +175,7 @@ upb_strlen_t upb_stdio_putstr(upb_bytesink *sink, upb_string *str, upb_status *s upb_stdio *stdio = (upb_stdio*)((char*)sink - offsetof(upb_stdio, sink)); upb_strlen_t len = upb_string_len(str); upb_strlen_t written = fwrite(upb_string_getrobuf(str), 1, len, stdio->file); - if(written < len) { + if (written < len) { upb_status_setf(status, UPB_ERROR, "Error writing to stdio stream."); return -1; } @@ -191,7 +198,7 @@ void upb_stdio_init(upb_stdio *stdio) { static upb_bytesrc_vtbl bytesrc_vtbl = { &upb_stdio_fetch, &upb_stdio_discard, - &upb_stdio_read, + &upb_stdio_copy, &upb_stdio_getptr, }; upb_bytesrc_init(&stdio->src, &bytesrc_vtbl); @@ -226,20 +233,25 @@ void upb_stdio_uninit(upb_stdio *stdio) { stdio->file = NULL; } -upb_byteregion* upb_stdio_allbytes(upb_stdio *stdio) { return &stdio->byteregion; } +upb_bytesrc* upb_stdio_bytesrc(upb_stdio *stdio) { return &stdio->src; } upb_bytesink* upb_stdio_bytesink(upb_stdio *stdio) { return &stdio->sink; } /* upb_stringsrc **************************************************************/ -uint32_t 
upb_stringsrc_fetch(void *_src, uint64_t ofs, upb_status *s) { +upb_bytesuccess_t upb_stringsrc_fetch(void *_src, uint64_t ofs, size_t *read) { upb_stringsrc *src = _src; - upb_status_seteof(s); - return src->len - ofs; + assert(ofs <= src->len); /* ofs == len is a legal fetch position; it reports EOF below. */ + if (ofs == src->len) { + upb_status_seteof(&src->bytesrc.status); + return UPB_BYTE_EOF; + } + *read = src->len - ofs; + return UPB_BYTE_OK; } -void upb_stringsrc_read(const void *_src, uint64_t ofs, - uint32_t len, char *dst) { +void upb_stringsrc_copy(const void *_src, uint64_t ofs, + size_t len, char *dst) { const upb_stringsrc *src = _src; assert(ofs + len <= src->len); memcpy(dst, src->str + ofs, len); @@ -250,7 +262,7 @@ void upb_stringsrc_discard(void *src, uint64_t ofs) { (void)ofs; } -const char *upb_stringsrc_getptr(const void *_s, uint64_t ofs, uint32_t *len) { +const char *upb_stringsrc_getptr(const void *_s, uint64_t ofs, size_t *len) { const upb_stringsrc *src = _s; *len = src->len - ofs; return src->str + ofs; @@ -260,7 +272,7 @@ void upb_stringsrc_init(upb_stringsrc *s) { static upb_bytesrc_vtbl vtbl = { &upb_stringsrc_fetch, &upb_stringsrc_discard, - &upb_stringsrc_read, + &upb_stringsrc_copy, &upb_stringsrc_getptr, }; upb_bytesrc_init(&s->bytesrc, &vtbl); @@ -269,7 +281,7 @@ void upb_stringsrc_init(upb_stringsrc *s) { s->byteregion.toplevel = true; } -void upb_stringsrc_reset(upb_stringsrc *s, const char *str, uint32_t len) { +void upb_stringsrc_reset(upb_stringsrc *s, const char *str, size_t len) { s->str = str; s->len = len; s->byteregion.start = 0; @@ -280,18 +292,13 @@ void upb_stringsrc_reset(upb_stringsrc *s, const char *str, uint32_t len) { void upb_stringsrc_uninit(upb_stringsrc *s) { (void)s; } -upb_bytesrc *upb_stringsrc_bytesrc(upb_stringsrc *s) { - return &s->bytesrc; -} - - /* upb_stringsink *************************************************************/ void upb_stringsink_uninit(upb_stringsink *s) { free(s->str); } -void upb_stringsink_reset(upb_stringsink *s, char *str, uint32_t size) { 
+void upb_stringsink_reset(upb_stringsink *s, char *str, size_t size) { free(s->str); s->str = str; s->len = 0; diff --git a/upb/bytestream.h b/upb/bytestream.h index 3b339f17d7..409ae80f1c 100644 --- a/upb/bytestream.h +++ b/upb/bytestream.h @@ -63,11 +63,17 @@ // +------------------------ // | nondelimited region Z <-- won't return EOF until data source hits EOF. // +------------------------ +// +// TODO: if 64-bit math for stream offsets is a performance issue on +// non-64-bit machines, we could introduce a upb_off_t typedef that can be +// defined as a 32-bit type for applications that don't need to handle +// streams longer than 4GB. #ifndef UPB_BYTESTREAM_H #define UPB_BYTESTREAM_H +#include #include #include #include @@ -79,6 +85,12 @@ extern "C" { #endif +typedef enum { + UPB_BYTE_OK = UPB_OK, + UPB_BYTE_WOULDBLOCK = UPB_SUSPENDED, + UPB_BYTE_ERROR = UPB_ERROR, + UPB_BYTE_EOF +} upb_bytesuccess_t; /* upb_bytesrc ****************************************************************/ @@ -90,10 +102,10 @@ extern "C" { // upb_bytesrc is a virtual base class with implementations that get data from // eg. a string, a cord, a file descriptor, a FILE*, etc. 
-typedef uint32_t upb_bytesrc_fetch_func(void*, uint64_t, upb_status*); +typedef upb_bytesuccess_t upb_bytesrc_fetch_func(void*, uint64_t, size_t*); typedef void upb_bytesrc_discard_func(void*, uint64_t); -typedef void upb_bytesrc_copy_func(const void*, uint64_t, uint32_t, char*); -typedef const char *upb_bytesrc_getptr_func(const void*, uint64_t, uint32_t*); +typedef void upb_bytesrc_copy_func(const void*, uint64_t, size_t, char*); +typedef const char *upb_bytesrc_getptr_func(const void*, uint64_t, size_t*); typedef struct _upb_bytesrc_vtbl { upb_bytesrc_fetch_func *fetch; upb_bytesrc_discard_func *discard; @@ -102,21 +114,27 @@ typedef struct _upb_bytesrc_vtbl { } upb_bytesrc_vtbl; typedef struct { - upb_bytesrc_vtbl *vtbl; + const upb_bytesrc_vtbl *vtbl; + upb_status status; } upb_bytesrc; -INLINE void upb_bytesrc_init(upb_bytesrc *src, upb_bytesrc_vtbl *vtbl) { +INLINE void upb_bytesrc_init(upb_bytesrc *src, const upb_bytesrc_vtbl *vtbl) { src->vtbl = vtbl; + upb_status_init(&src->status); +} + +INLINE void upb_bytesrc_uninit(upb_bytesrc *src) { + upb_status_uninit(&src->status); } -// Fetches at least one byte starting at ofs, returning the actual number of -// bytes fetched (or 0 on EOF or error: see *s for details). Some bytesrc's -// may set EOF on *s after a successful read if no further data is available, -// but not all bytesrc's support this. It is valid for bytes to be fetched -// multiple times, as long as the bytes have not been previously discarded. -INLINE uint32_t upb_bytesrc_fetch(upb_bytesrc *src, uint64_t ofs, - upb_status *s) { - return src->vtbl->fetch(src, ofs, s); +// Fetches at least one byte starting at ofs, returning the success or failure +// of the operation. If UPB_BYTE_OK is returned, *read indicates the number +// of bytes successfully fetched; any error or EOF status will be reflected in +// upb_bytesrc_status(). It is valid for bytes to be fetched multiple times, +// as long as the bytes have not been previously discarded. 
+INLINE upb_bytesuccess_t upb_bytesrc_fetch(upb_bytesrc *src, uint64_t ofs, + size_t *read) { + return src->vtbl->fetch(src, ofs, read); } // Discards all data prior to ofs (except data that is pinned, if pinning @@ -127,7 +145,7 @@ INLINE void upb_bytesrc_discard(upb_bytesrc *src, uint64_t ofs) { // Copies "len" bytes of data from ofs to "dst", which must be at least "len" // bytes long. The given region must not be discarded. -INLINE void upb_bytesrc_copy(const upb_bytesrc *src, uint64_t ofs, uint32_t len, +INLINE void upb_bytesrc_copy(const upb_bytesrc *src, uint64_t ofs, size_t len, char *dst) { src->vtbl->copy(src, ofs, len, dst); } @@ -138,7 +156,7 @@ INLINE void upb_bytesrc_copy(const upb_bytesrc *src, uint64_t ofs, uint32_t len, // part of the returned buffer is discarded, only the non-discarded bytes // remain valid). INLINE const char *upb_bytesrc_getptr(const upb_bytesrc *src, uint64_t ofs, - uint32_t *len) { + size_t *len) { return src->vtbl->getptr(src, ofs, len); } @@ -148,14 +166,14 @@ INLINE const char *upb_bytesrc_getptr(const upb_bytesrc *src, uint64_t ofs, // // is guaranteed that the region will not be discarded (nor will the bytesrc // // be destroyed) until the region is unpinned. However, not all bytesrc's // // support pinning; a false return indicates that a pin was not possible. -// INLINE bool upb_bytesrc_pin(upb_bytesrc *src, uint64_t ofs, uint32_t len) { +// INLINE bool upb_bytesrc_pin(upb_bytesrc *src, uint64_t ofs, size_t len) { // return src->vtbl->refregion(src, ofs, len); // } // // // Releases some number of pinned bytes from the beginning of a pinned // // region (which may be fewer than the total number of bytes pinned). 
-// INLINE void upb_bytesrc_unpin(upb_bytesrc *src, uint64_t ofs, uint32_t len, -// uint32_t bytes_to_release) { +// INLINE void upb_bytesrc_unpin(upb_bytesrc *src, uint64_t ofs, size_t len, +// size_t bytes_to_release) { // src->vtbl->unpin(src, ofs, len); // } // @@ -173,7 +191,7 @@ typedef struct _upb_byteregion { uint64_t fetch; uint64_t end; // UPB_NONDELIMITED if nondelimited. upb_bytesrc *bytesrc; - bool toplevel; // If true, discards hit the underlying byteregion. + bool toplevel; // If true, discards hit the underlying bytesrc. } upb_byteregion; // Initializes a byteregion. Its initial value will be empty. No methods may @@ -225,14 +243,17 @@ void upb_byteregion_release(upb_byteregion *r); // Attempts to fetch more data, extending the fetched range of this byteregion. // Returns true if the fetched region was extended by at least one byte, false // on EOF or error (see *s for details). -bool upb_byteregion_fetch(upb_byteregion *r, upb_status *s); +upb_bytesuccess_t upb_byteregion_fetch(upb_byteregion *r); -// Fetches all remaining data for "r", returning false if the operation failed -// (see "*s" for details). May only be used on delimited byteregions. -INLINE bool upb_byteregion_fetchall(upb_byteregion *r, upb_status *s) { +// Fetches all remaining data for "r", returning the success of the operation +// May only be used on delimited byteregions. +INLINE upb_bytesuccess_t upb_byteregion_fetchall(upb_byteregion *r) { assert(upb_byteregion_len(r) != UPB_NONDELIMITED); - while (upb_byteregion_fetch(r, s)) ; // Empty body. - return upb_eof(s); + upb_bytesuccess_t ret; + do { + ret = upb_byteregion_fetch(r); + } while (ret == UPB_BYTE_OK); + return ret == UPB_BYTE_EOF ? 
UPB_BYTE_OK : ret; } // Discards bytes from the byteregion up until ofs (which must be greater or @@ -243,13 +264,14 @@ INLINE void upb_byteregion_discard(upb_byteregion *r, uint64_t ofs) { assert(ofs >= upb_byteregion_discardofs(r)); assert(ofs <= upb_byteregion_endofs(r)); r->discard = ofs; + if (ofs > r->fetch) r->fetch = ofs; if (r->toplevel) upb_bytesrc_discard(r->bytesrc, ofs); } // Copies "len" bytes of data into "dst", starting at ofs. The specified // region must be available. INLINE void upb_byteregion_copy(const upb_byteregion *r, uint64_t ofs, - uint32_t len, char *dst) { + size_t len, char *dst) { assert(ofs >= upb_byteregion_discardofs(r)); assert(len <= upb_byteregion_available(r, ofs)); upb_bytesrc_copy(r->bytesrc, ofs, len, dst); @@ -268,7 +290,7 @@ INLINE void upb_byteregion_copyall(const upb_byteregion *r, char *dst) { // or when the bytes are discarded. If the byteregion is not currently pinned, // the pointer is only valid for the lifetime of the parent byteregion. INLINE const char *upb_byteregion_getptr(const upb_byteregion *r, - uint64_t ofs, uint32_t *len) { + uint64_t ofs, size_t *len) { assert(ofs >= upb_byteregion_discardofs(r)); const char *ret = upb_bytesrc_getptr(r->bytesrc, ofs, len); *len = UPB_MIN(*len, upb_byteregion_available(r, ofs)); @@ -295,7 +317,7 @@ INLINE const char *upb_byteregion_getptr(const upb_byteregion *r, // The string data in the returned region is guaranteed to be contiguous and // NULL-terminated. upb_byteregion *upb_byteregion_new(const void *str); -upb_byteregion *upb_byteregion_newl(const void *str, uint32_t len); +upb_byteregion *upb_byteregion_newl(const void *str, size_t len); // May *only* be called on a byteregion created with upb_byteregion_new[l]()! 
void upb_byteregion_free(upb_byteregion *r); @@ -399,7 +421,7 @@ INLINE void upb_bytesink_rewind(upb_bytesink *sink, uint64_t offset) { typedef struct { uint64_t ofs; - uint32_t len; + size_t len; uint32_t refcount; char data[]; } upb_stdio_buf; @@ -414,7 +436,6 @@ typedef struct { bool should_close; upb_stdio_buf **bufs; uint32_t nbuf, szbuf; - upb_byteregion byteregion; } upb_stdio; void upb_stdio_init(upb_stdio *stdio); @@ -433,7 +454,7 @@ void upb_stdio_reset(upb_stdio *stdio, FILE *file); void upb_stdio_open(upb_stdio *stdio, const char *filename, const char *mode, upb_status *s); -upb_byteregion *upb_stdio_allbytes(upb_stdio *stdio); +upb_bytesrc *upb_stdio_bytesrc(upb_stdio *stdio); upb_bytesink *upb_stdio_bytesink(upb_stdio *stdio); @@ -444,7 +465,7 @@ upb_bytesink *upb_stdio_bytesink(upb_stdio *stdio); typedef struct { upb_bytesrc bytesrc; const char *str; - uint32_t len; + size_t len; upb_byteregion byteregion; } upb_stringsrc; @@ -454,7 +475,11 @@ void upb_stringsrc_uninit(upb_stringsrc *s); // Resets the stringsrc to a state where it will vend the given string. The // string data must be valid until the stringsrc is reset again or destroyed. -void upb_stringsrc_reset(upb_stringsrc *s, const char *str, uint32_t len); +void upb_stringsrc_reset(upb_stringsrc *s, const char *str, size_t len); + +INLINE upb_bytesrc *upb_stringsrc_bytesrc(upb_stringsrc *s) { + return &s->bytesrc; +} // Returns the top-level upb_byteregion* for this stringsrc. Invalidated when // the stringsrc is reset. @@ -468,7 +493,7 @@ INLINE upb_byteregion *upb_stringsrc_allbytes(upb_stringsrc *s) { struct _upb_stringsink { upb_bytesink bytesink; char *str; - uint32_t len, size; + size_t len, size; }; typedef struct _upb_stringsink upb_stringsink; @@ -478,12 +503,12 @@ void upb_stringsink_uninit(upb_stringsink *s); // Resets the sink's string to "str", which the sink takes ownership of. // "str" may be NULL, which will make the sink allocate a new string. 
-void upb_stringsink_reset(upb_stringsink *s, char *str, uint32_t len); +void upb_stringsink_reset(upb_stringsink *s, char *str, size_t len); // Releases ownership of the returned string (which is "len" bytes long) and // resets the internal string to be empty again (as if reset were called with // NULL). -const char *upb_stringsink_release(upb_stringsink *s, uint32_t *len); +const char *upb_stringsink_release(upb_stringsink *s, size_t *len); // Returns the upb_bytesink* for this stringsrc. Invalidated by reset above. upb_bytesink *upb_stringsink_bytesink(upb_stringsink *s); diff --git a/upb/def.c b/upb/def.c index 13418c6fdd..246e9bb7a0 100644 --- a/upb/def.c +++ b/upb/def.c @@ -334,7 +334,7 @@ static bool upb_fielddef_resolve(upb_fielddef *f, upb_def *def, upb_status *s) { if (upb_byteregion_len(bytes) == 0) { upb_value_setint32(&f->defaultval, e->defaultval); } else { - uint32_t len; + size_t len; // ptr is guaranteed to be NULL-terminated because the byteregion was // created with upb_byteregion_newl(). 
const char *ptr = upb_byteregion_getptr(bytes, 0, &len); diff --git a/upb/handlers.c b/upb/handlers.c index 0af09ef43c..d1b68ad81e 100644 --- a/upb/handlers.c +++ b/upb/handlers.c @@ -13,7 +13,7 @@ static upb_mhandlers *upb_mhandlers_new() { upb_mhandlers *m = malloc(sizeof(*m)); - upb_inttable_init(&m->fieldtab, 8, sizeof(upb_fhandlers)); + upb_inttable_init(&m->fieldtab, 8, sizeof(upb_itofhandlers_ent)); m->startmsg = NULL; m->endmsg = NULL; m->is_group = false; @@ -26,21 +26,21 @@ static upb_mhandlers *upb_mhandlers_new() { static upb_fhandlers *_upb_mhandlers_newfhandlers(upb_mhandlers *m, uint32_t n, upb_fieldtype_t type, bool repeated) { - uint32_t tag = n << 3 | upb_types[type].native_wire_type; - upb_fhandlers *f = upb_inttable_lookup(&m->fieldtab, tag); - if (f) abort(); - upb_fhandlers new_f = {false, type, repeated, - repeated && upb_isprimitivetype(type), UPB_ATOMIC_INIT(0), + upb_itofhandlers_ent *e = upb_inttable_lookup(&m->fieldtab, n); + // TODO: design/refine the API for changing the set of fields or modifying + // existing handlers. 
+ if (e) return NULL; + upb_fhandlers new_f = {type, repeated, UPB_ATOMIC_INIT(0), n, -1, m, NULL, UPB_NO_VALUE, NULL, NULL, NULL, NULL, NULL, #ifdef UPB_USE_JIT_X64 0, 0, 0, #endif NULL}; - upb_inttable_insert(&m->fieldtab, tag, &new_f); - f = upb_inttable_lookup(&m->fieldtab, tag); - assert(f); - assert(f->type == type); - return f; + upb_fhandlers *ptr = malloc(sizeof(*ptr)); + memcpy(ptr, &new_f, sizeof(upb_fhandlers)); + upb_itofhandlers_ent ent = {false, ptr}; + upb_inttable_insert(&m->fieldtab, n, &ent); + return ptr; } upb_fhandlers *upb_mhandlers_newfhandlers(upb_mhandlers *m, uint32_t n, @@ -57,6 +57,7 @@ upb_fhandlers *upb_mhandlers_newfhandlers_subm(upb_mhandlers *m, uint32_t n, assert(type == UPB_TYPE(MESSAGE) || type == UPB_TYPE(GROUP)); assert(subm); upb_fhandlers *f = _upb_mhandlers_newfhandlers(m, n, type, repeated); + if (!f) return NULL; f->submsg = subm; if (type == UPB_TYPE(GROUP)) _upb_mhandlers_newfhandlers(subm, n, UPB_TYPE_ENDGROUP, false); @@ -82,6 +83,12 @@ void upb_handlers_unref(upb_handlers *h) { if (upb_atomic_unref(&h->refcount)) { for (int i = 0; i < h->msgs_len; i++) { upb_mhandlers *mh = h->msgs[i]; + for(upb_inttable_iter j = upb_inttable_begin(&mh->fieldtab); + !upb_inttable_done(j); + j = upb_inttable_next(&mh->fieldtab, j)) { + upb_itofhandlers_ent *e = upb_inttable_iter_value(j); + free(e->f); + } upb_inttable_free(&mh->fieldtab); #ifdef UPB_USE_JIT_X64 free(mh->tablearray); @@ -154,41 +161,24 @@ upb_mhandlers *upb_handlers_regmsgdef(upb_handlers *h, const upb_msgdef *m, /* upb_dispatcher *************************************************************/ -static upb_fhandlers toplevel_f = { - false, UPB_TYPE(GROUP), false, false, UPB_ATOMIC_INIT(0), 0, - -1, NULL, NULL, // submsg -#ifdef NDEBUG - {{0}}, -#else - {{0}, -1}, -#endif - NULL, NULL, NULL, NULL, NULL, -#ifdef UPB_USE_JIT_X64 - 0, 0, 0, -#endif - NULL}; - -void upb_dispatcher_init(upb_dispatcher *d, upb_handlers *h, - upb_skip_handler *skip, upb_exit_handler *exit, 
+void upb_dispatcher_init(upb_dispatcher *d, upb_status *status, + upb_exit_handler UPB_NORETURN *exit, void *srcclosure) { - d->handlers = h; - upb_handlers_ref(h); - for (int i = 0; i < h->msgs_len; i++) { - upb_mhandlers *m = h->msgs[i]; - upb_inttable_compact(&m->fieldtab); - } - d->stack[0].f = &toplevel_f; + d->stack[0].f = NULL; // Should never be read. d->limit = &d->stack[UPB_MAX_NESTING]; - d->skip = skip; - d->exit = exit; + d->exitjmp = exit; d->srcclosure = srcclosure; d->top_is_implicit = false; - upb_status_init(&d->status); + d->msgent = NULL; + d->top = NULL; + d->toplevel_msgent = NULL; + d->status = status; } -upb_dispatcher_frame *upb_dispatcher_reset(upb_dispatcher *d, void *closure) { - d->msgent = d->handlers->msgs[0]; - d->dispatch_table = &d->msgent->fieldtab; +upb_dispatcher_frame *upb_dispatcher_reset(upb_dispatcher *d, void *closure, + upb_mhandlers *top) { + d->msgent = top; + d->toplevel_msgent = top; d->top = d->stack; d->top->closure = closure; d->top->is_sequence = false; @@ -197,46 +187,32 @@ upb_dispatcher_frame *upb_dispatcher_reset(upb_dispatcher *d, void *closure) { } void upb_dispatcher_uninit(upb_dispatcher *d) { - upb_handlers_unref(d->handlers); - upb_status_uninit(&d->status); } void upb_dispatch_startmsg(upb_dispatcher *d) { upb_flow_t flow = UPB_CONTINUE; if (d->msgent->startmsg) d->msgent->startmsg(d->top->closure); - if (flow != UPB_CONTINUE) _upb_dispatcher_unwind(d, flow); + if (flow != UPB_CONTINUE) _upb_dispatcher_abortjmp(d); } void upb_dispatch_endmsg(upb_dispatcher *d, upb_status *status) { assert(d->top == d->stack); - if (d->msgent->endmsg) d->msgent->endmsg(d->top->closure, &d->status); + if (d->msgent->endmsg) d->msgent->endmsg(d->top->closure, d->status); // TODO: should we avoid this copy by passing client's status obj to cbs? 
- upb_status_copy(status, &d->status); -} - -void indent(upb_dispatcher *d) { - for (int i = 0; i < (d->top - d->stack); i++) fprintf(stderr, " "); -} - -void indentm1(upb_dispatcher *d) { - for (int i = 0; i < (d->top - d->stack - 1); i++) fprintf(stderr, " "); + upb_status_copy(status, d->status); } upb_dispatcher_frame *upb_dispatch_startseq(upb_dispatcher *d, upb_fhandlers *f) { - //indent(d); - //fprintf(stderr, "START SEQ: %d\n", f->number); - if((d->top+1) >= d->limit) { - upb_status_seterrliteral(&d->status, "Nesting too deep."); - _upb_dispatcher_unwind(d, UPB_BREAK); - return d->top; // Dummy. + if (d->top + 1 >= d->limit) { + upb_status_seterrliteral(d->status, "Nesting too deep."); + _upb_dispatcher_abortjmp(d); } upb_sflow_t sflow = UPB_CONTINUE_WITH(d->top->closure); if (f->startseq) sflow = f->startseq(d->top->closure, f->fval); if (sflow.flow != UPB_CONTINUE) { - _upb_dispatcher_unwind(d, sflow.flow); - return d->top; // Dummy. + _upb_dispatcher_abortjmp(d); } ++d->top; @@ -248,8 +224,6 @@ upb_dispatcher_frame *upb_dispatch_startseq(upb_dispatcher *d, } upb_dispatcher_frame *upb_dispatch_endseq(upb_dispatcher *d) { - //indentm1(d); - //fprintf(stderr, "END SEQ\n"); assert(d->top > d->stack); assert(d->top->is_sequence); upb_fhandlers *f = d->top->f; @@ -257,30 +231,23 @@ upb_dispatcher_frame *upb_dispatch_endseq(upb_dispatcher *d) { upb_flow_t flow = UPB_CONTINUE; if (f->endseq) flow = f->endseq(d->top->closure, f->fval); if (flow != UPB_CONTINUE) { - printf("YO, UNWINDING!\n"); - _upb_dispatcher_unwind(d, flow); - return d->top; // Dummy. + _upb_dispatcher_abortjmp(d); } - d->msgent = d->top->f->submsg ? d->top->f->submsg : d->handlers->msgs[0]; - d->dispatch_table = &d->msgent->fieldtab; + d->msgent = d->top->f ? 
d->top->f->submsg : d->toplevel_msgent; return d->top; } upb_dispatcher_frame *upb_dispatch_startsubmsg(upb_dispatcher *d, upb_fhandlers *f) { - //indent(d); - //fprintf(stderr, "START SUBMSG: %d\n", f->number); - if((d->top+1) >= d->limit) { - upb_status_seterrliteral(&d->status, "Nesting too deep."); - _upb_dispatcher_unwind(d, UPB_BREAK); - return d->top; // Dummy. + if (d->top + 1 >= d->limit) { + upb_status_seterrliteral(d->status, "Nesting too deep."); + _upb_dispatcher_abortjmp(d); } upb_sflow_t sflow = UPB_CONTINUE_WITH(d->top->closure); if (f->startsubmsg) sflow = f->startsubmsg(d->top->closure, f->fval); if (sflow.flow != UPB_CONTINUE) { - _upb_dispatcher_unwind(d, sflow.flow); - return d->top; // Dummy. + _upb_dispatcher_abortjmp(d); } ++d->top; @@ -289,24 +256,20 @@ upb_dispatcher_frame *upb_dispatch_startsubmsg(upb_dispatcher *d, d->top->is_packed = false; d->top->closure = sflow.closure; d->msgent = f->submsg; - d->dispatch_table = &d->msgent->fieldtab; upb_dispatch_startmsg(d); return d->top; } upb_dispatcher_frame *upb_dispatch_endsubmsg(upb_dispatcher *d) { - //indentm1(d); - //fprintf(stderr, "END SUBMSG\n"); assert(d->top > d->stack); assert(!d->top->is_sequence); upb_fhandlers *f = d->top->f; - if (d->msgent->endmsg) d->msgent->endmsg(d->top->closure, &d->status); + if (d->msgent->endmsg) d->msgent->endmsg(d->top->closure, d->status); d->msgent = d->top->f->msg; - d->dispatch_table = &d->msgent->fieldtab; --d->top; upb_flow_t flow = UPB_CONTINUE; if (f->endsubmsg) f->endsubmsg(d->top->closure, f->fval); - if (flow != UPB_CONTINUE) _upb_dispatcher_unwind(d, flow); + if (flow != UPB_CONTINUE) _upb_dispatcher_abortjmp(d); return d->top; } @@ -320,14 +283,7 @@ bool upb_dispatcher_islegalend(upb_dispatcher *d) { return false; } -void _upb_dispatcher_unwind(upb_dispatcher *d, upb_flow_t flow) { - upb_dispatcher_frame *frame = d->top; - while (1) { - frame->f->submsg->endmsg(frame->closure, &d->status); - frame->f->endsubmsg(frame->closure, 
frame->f->fval); - --frame; - if (frame < d->stack) { d->exit(d->srcclosure); return; } - d->top = frame; - if (flow == UPB_SKIPSUBMSG) return; - } +void _upb_dispatcher_abortjmp(upb_dispatcher *d) { + d->exitjmp(d->srcclosure); + assert(false); // Never returns. } diff --git a/upb/handlers.h b/upb/handlers.h index e17a72694f..9ed02c114f 100644 --- a/upb/handlers.h +++ b/upb/handlers.h @@ -132,13 +132,15 @@ typedef upb_flow_t (upb_endfield_handler)(void *closure, upb_value fval); // A upb_fhandlers object represents the set of handlers associated with one // specific message field. +// +// TODO: remove upb_decoder-specific fields from this, and instead have +// upb_decoderplan make a deep copy of the whole graph with its own fields +// added. struct _upb_decoder; struct _upb_mhandlers; typedef struct _upb_fieldent { - bool junk; upb_fieldtype_t type; bool repeated; - bool is_repeated_primitive; upb_atomic_t refcount; uint32_t number; int32_t valuehasbit; @@ -158,6 +160,11 @@ typedef struct _upb_fieldent { void (*decode)(struct _upb_decoder *d, struct _upb_fieldent *f); } upb_fhandlers; +typedef struct { + bool junk; // Stolen by table impl; see table.h for details. + upb_fhandlers *f; +} upb_itofhandlers_ent; + // fhandlers are created as part of a upb_handlers instance, but can be ref'd // and unref'd to prolong the life of the handlers. void upb_fhandlers_ref(upb_fhandlers *m); @@ -194,16 +201,18 @@ typedef struct _upb_mhandlers { upb_inttable fieldtab; // Maps field number -> upb_fhandlers. bool is_group; #ifdef UPB_USE_JIT_X64 - uint32_t jit_startmsg_pclabel; - uint32_t jit_endofbuf_pclabel; - uint32_t jit_endofmsg_pclabel; - uint32_t jit_dyndispatch_pclabel; - uint32_t jit_unknownfield_pclabel; - int32_t jit_parent_field_done_pclabel; + // Used inside the JIT to track labels (jmp targets) in the generated code. + uint32_t jit_startmsg_pclabel; // Starting a parse of this (sub-)message. + uint32_t jit_endofbuf_pclabel; // ptr hitend, but delim_end or jit_end? 
+ uint32_t jit_endofmsg_pclabel; // Done parsing this (sub-)message. + uint32_t jit_dyndispatch_pclabel; // Dispatch by table lookup. + uint32_t jit_unknownfield_pclabel; // Parsed an unknown field. uint32_t max_field_number; // Currently keyed on field number. Could also try keying it // on encoded or decoded tag, or on encoded field number. void **tablearray; + // Pointer to the JIT code for parsing this message. + void *jit_func; #endif } upb_mhandlers; @@ -316,62 +325,47 @@ INLINE upb_mhandlers *upb_handlers_reghandlerset(upb_handlers *h, const upb_msgd typedef struct { upb_fhandlers *f; void *closure; - - // Members to use as the data source requires. - void *srcclosure; uint64_t end_ofs; - uint16_t msgindex; - uint16_t fieldindex; - bool is_sequence; // frame represents seq or submsg? (f might be both). bool is_packed; // !upb_issubmsg(f) && end_ofs != UINT64_MAX // (strings aren't pushed). } upb_dispatcher_frame; -// Called when some of the input needs to be skipped. All frames from the -// current top to "bottom", inclusive, should be skipped. -typedef void upb_skip_handler(void *, upb_dispatcher_frame *bottom); typedef void upb_exit_handler(void *); typedef struct { upb_dispatcher_frame *top, *limit; - upb_handlers *handlers; - // Msg and dispatch table for the current level. upb_mhandlers *msgent; - upb_inttable *dispatch_table; - upb_skip_handler *skip; - upb_exit_handler *exit; + upb_mhandlers *toplevel_msgent; + upb_exit_handler UPB_NORETURN *exitjmp; void *srcclosure; bool top_is_implicit; // Stack. - upb_status status; + upb_status *status; upb_dispatcher_frame stack[UPB_MAX_NESTING]; } upb_dispatcher; -void upb_dispatcher_init(upb_dispatcher *d, upb_handlers *h, - upb_skip_handler *skip, upb_exit_handler *exit, - void *closure); -upb_dispatcher_frame *upb_dispatcher_reset(upb_dispatcher *d, void *topclosure); +// Caller retains ownership of the status object. 
+void upb_dispatcher_init(upb_dispatcher *d, upb_status *status, + upb_exit_handler UPB_NORETURN *exit, void *closure); +upb_dispatcher_frame *upb_dispatcher_reset(upb_dispatcher *d, void *topclosure, + upb_mhandlers *top_msg); void upb_dispatcher_uninit(upb_dispatcher *d); // Tests whether the message could legally end here (either the stack is empty // or the only open stack frame is implicit). bool upb_dispatcher_islegalend(upb_dispatcher *d); -// Looks up a field by number for the current message. -INLINE upb_fhandlers *upb_dispatcher_lookup(upb_dispatcher *d, uint32_t n) { - return (upb_fhandlers*)upb_inttable_fastlookup( - d->dispatch_table, n, sizeof(upb_fhandlers)); -} - -void _upb_dispatcher_unwind(upb_dispatcher *d, upb_flow_t flow); +// Aborts the current operation by calling the registered exit handler +// (d->exitjmp) with d->srcclosure. This function never returns. +void _upb_dispatcher_abortjmp(upb_dispatcher *d) UPB_NORETURN; INLINE void _upb_dispatcher_sethas(void *_p, int32_t hasbit) { char *p = (char*)_p; - if (hasbit >= 0) p[hasbit / 8] |= (1 << (hasbit % 8)); + if (hasbit >= 0) p[(uint32_t)hasbit / 8] |= (1 << ((uint32_t)hasbit % 8)); } // Dispatch functions -- call the user handler and handle errors. 
@@ -380,11 +374,12 @@ INLINE void upb_dispatch_value(upb_dispatcher *d, upb_fhandlers *f, upb_flow_t flow = UPB_CONTINUE; if (f->value) flow = f->value(d->top->closure, f->fval, val); _upb_dispatcher_sethas(d->top->closure, f->valuehasbit); - if (flow != UPB_CONTINUE) _upb_dispatcher_unwind(d, flow); + if (flow != UPB_CONTINUE) _upb_dispatcher_abortjmp(d); } void upb_dispatch_startmsg(upb_dispatcher *d); void upb_dispatch_endmsg(upb_dispatcher *d, upb_status *status); -upb_dispatcher_frame *upb_dispatch_startsubmsg(upb_dispatcher *d, upb_fhandlers *f); +upb_dispatcher_frame *upb_dispatch_startsubmsg(upb_dispatcher *d, + upb_fhandlers *f); upb_dispatcher_frame *upb_dispatch_endsubmsg(upb_dispatcher *d); upb_dispatcher_frame *upb_dispatch_startseq(upb_dispatcher *d, upb_fhandlers *f); upb_dispatcher_frame *upb_dispatch_endseq(upb_dispatcher *d); diff --git a/upb/msg.c b/upb/msg.c index 78309cf7ed..77521e5f0a 100644 --- a/upb/msg.c +++ b/upb/msg.c @@ -86,14 +86,16 @@ void upb_stdmsg_sethas(void *_m, upb_value fval) { assert(_m != NULL); char *m = _m; const upb_fielddef *f = upb_value_getfielddef(fval); - if (f->hasbit >= 0) m[f->hasbit / 8] |= (1 << (f->hasbit % 8)); + if (f->hasbit >= 0) + m[(uint32_t)f->hasbit / 8] |= (1 << ((uint32_t)f->hasbit % 8)); } bool upb_stdmsg_has(const void *_m, upb_value fval) { assert(_m != NULL); const char *m = _m; const upb_fielddef *f = upb_value_getfielddef(fval); - return f->hasbit < 0 || (m[f->hasbit / 8] & (1 << (f->hasbit % 8))); + return f->hasbit < 0 || + (m[(uint32_t)f->hasbit / 8] & (1 << ((uint32_t)f->hasbit % 8))); } #define UPB_ACCESSORS(type, ctype) \ diff --git a/upb/pb/decoder.c b/upb/pb/decoder.c index ae54e47af8..1b5fc17f54 100644 --- a/upb/pb/decoder.c +++ b/upb/pb/decoder.c @@ -13,14 +13,95 @@ #include "upb/pb/decoder.h" #include "upb/pb/varint.h" +/* upb_decoderplan ************************************************************/ + #ifdef UPB_USE_JIT_X64 -#define Dst_DECL upb_decoder *d -#define Dst_REF (d->dynasm) 
-#define Dst (d) +// These defines are necessary for DynASM codegen. +// See dynasm/dasm_proto.h for more info. +#define Dst_DECL upb_decoderplan *plan +#define Dst_REF (plan->dynasm) +#define Dst (plan) + +// In debug mode, make DynASM do internal checks (must be defined before any +// dasm header is included). +#ifndef NDEBUG +#define DASM_CHECKS +#endif + #include "dynasm/dasm_proto.h" #include "upb/pb/decoder_x64.h" #endif +typedef struct { + upb_fhandlers base; + void (*decode)(struct _upb_decoder *d, struct _upb_fieldent *f); +#ifdef UPB_USE_JIT_X64 + uint32_t jit_pclabel; + uint32_t jit_pclabel_notypecheck; +#endif +} upb_dplanfield; + +typedef struct { + upb_mhandlers base; +#ifdef UPB_USE_JIT_X64 + uint32_t jit_startmsg_pclabel; + uint32_t jit_endofbuf_pclabel; + uint32_t jit_endofmsg_pclabel; + uint32_t jit_dyndispatch_pclabel; + uint32_t jit_unknownfield_pclabel; + int32_t jit_parent_field_done_pclabel; + uint32_t max_field_number; + // Currently keyed on field number. Could also try keying it + // on encoded or decoded tag, or on encoded field number. + void **tablearray; +#endif +} upb_dplanmsg; + +static void *upb_decoderplan_fptrs[]; + +void upb_decoderplan_initfhandlers(upb_fhandlers *f) { + f->decode = upb_decoderplan_fptrs[f->type]; +} + +upb_decoderplan *upb_decoderplan_new(upb_handlers *h, bool allowjit) { + upb_decoderplan *p = malloc(sizeof(*p)); + p->handlers = h; + upb_handlers_ref(h); + h->should_jit = allowjit; +#ifdef UPB_USE_JIT_X64 + p->jit_code = NULL; + if (allowjit) upb_decoderplan_makejit(p); +#endif + // Set function pointers for each field's decode function. 
+ for (int i = 0; i < h->msgs_len; i++) { + upb_mhandlers *m = h->msgs[i]; + for(upb_inttable_iter i = upb_inttable_begin(&m->fieldtab); + !upb_inttable_done(i); + i = upb_inttable_next(&m->fieldtab, i)) { + upb_itofhandlers_ent *e = upb_inttable_iter_value(i); + upb_fhandlers *f = e->f; + upb_decoderplan_initfhandlers(f); + } + } + return p; +} + +void upb_decoderplan_unref(upb_decoderplan *p) { + // TODO: make truly refcounted. + upb_handlers_unref(p->handlers); +#ifdef UPB_USE_JIT_X64 + if (p->jit_code) upb_decoderplan_freejit(p); +#endif + free(p); +} + +bool upb_decoderplan_hasjitcode(upb_decoderplan *p) { + return p->jit_code != NULL; +} + + +/* upb_decoder ****************************************************************/ + // It's unfortunate that we have to micro-manage the compiler this way, // especially since this tuning is necessarily specific to one hardware // configuration. But emperically on a Core i7, performance increases 30-50% @@ -29,18 +110,17 @@ #define FORCEINLINE static __attribute__((always_inline)) #define NOINLINE static __attribute__((noinline)) -static void upb_decoder_exit(upb_decoder *d) { +UPB_NORETURN static void upb_decoder_exitjmp(upb_decoder *d) { // Resumable decoder would back out to completed_ptr (and possibly get a // previous buffer). siglongjmp(d->exitjmp, 1); } -static void upb_decoder_exit2(void *_d) { - upb_decoder *d = _d; - upb_decoder_exit(d); +UPB_NORETURN static void upb_decoder_exitjmp2(void *d) { + upb_decoder_exitjmp(d); } -static void upb_decoder_abort(upb_decoder *d, const char *msg) { - upb_status_seterrliteral(d->status, msg); - upb_decoder_exit(d); +UPB_NORETURN static void upb_decoder_abortjmp(upb_decoder *d, const char *msg) { + upb_status_seterrliteral(&d->status, msg); + upb_decoder_exitjmp(d); } /* Buffering ******************************************************************/ @@ -50,8 +130,12 @@ static void upb_decoder_abort(upb_decoder *d, const char *msg) { // the next one. 
When we've committed our progress we discard any previous // buffers' regions. -static uint32_t upb_decoder_bufleft(upb_decoder *d) { return d->end - d->ptr; } -static void upb_decoder_advance(upb_decoder *d, uint32_t len) { +static size_t upb_decoder_bufleft(upb_decoder *d) { + assert(d->end >= d->ptr); + return d->end - d->ptr; +} + +static void upb_decoder_advance(upb_decoder *d, size_t len) { assert(upb_decoder_bufleft(d) >= len); d->ptr += len; } @@ -66,29 +150,49 @@ uint64_t upb_decoder_bufendofs(upb_decoder *d) { static void upb_decoder_setmsgend(upb_decoder *d) { upb_dispatcher_frame *f = d->dispatcher.top; - uint32_t delimlen = f->end_ofs - d->bufstart_ofs; - uint32_t buflen = d->end - d->buf; + size_t delimlen = f->end_ofs - d->bufstart_ofs; + size_t buflen = d->end - d->buf; d->delim_end = (f->end_ofs != UPB_NONDELIMITED && delimlen <= buflen) ? d->buf + delimlen : NULL; // NULL if not in this buf. d->top_is_packed = f->is_packed; + d->dispatch_table = &d->dispatcher.msgent->fieldtab; } -static bool upb_trypullbuf(upb_decoder *d) { - assert(upb_decoder_bufleft(d) == 0); - d->bufstart_ofs = upb_decoder_offset(d); +static void upb_decoder_skiptonewbuf(upb_decoder *d, uint64_t ofs) { + assert(ofs >= upb_decoder_offset(d)); + if (ofs > upb_byteregion_endofs(d->input)) + upb_decoder_abortjmp(d, "Unexpected EOF"); d->buf = NULL; d->ptr = NULL; d->end = NULL; - if (upb_byteregion_available(d->input, upb_decoder_offset(d)) == 0 && - !upb_byteregion_fetch(d->input, d->status)) { - if (upb_eof(d->status)) return false; - upb_decoder_exit(d); // Non-EOF error. 
+ d->delim_end = NULL; +#ifdef UPB_USE_JIT_X64 + d->jit_end = NULL; +#endif + d->bufstart_ofs = ofs; +} + +static bool upb_trypullbuf(upb_decoder *d) { + assert(upb_decoder_bufleft(d) == 0); + upb_decoder_skiptonewbuf(d, upb_decoder_offset(d)); + if (upb_byteregion_available(d->input, d->bufstart_ofs) == 0) { + switch (upb_byteregion_fetch(d->input)) { + case UPB_BYTE_OK: + assert(upb_byteregion_available(d->input, d->bufstart_ofs) > 0); + break; + case UPB_BYTE_EOF: return false; + case UPB_BYTE_ERROR: upb_decoder_abortjmp(d, "I/O error in input"); + // Decoder resuming is not yet supported. + case UPB_BYTE_WOULDBLOCK: + upb_decoder_abortjmp(d, "Input returned WOULDBLOCK"); + } } - uint32_t len; + size_t len; d->buf = upb_byteregion_getptr(d->input, d->bufstart_ofs, &len); assert(len > 0); d->ptr = d->buf; d->end = d->buf + len; + upb_decoder_setmsgend(d); #ifdef UPB_USE_JIT_X64 // If we start parsing a value, we can parse up to 20 bytes without // having to bounds-check anything (2 10-byte varints). Since the @@ -96,27 +200,29 @@ static bool upb_trypullbuf(upb_decoder *d) { // JIT bails if there are not 20 bytes available. 
d->jit_end = d->end - 20; #endif - upb_decoder_setmsgend(d); + assert(upb_decoder_bufleft(d) > 0); return true; } static void upb_pullbuf(upb_decoder *d) { - if (!upb_trypullbuf(d)) upb_decoder_abort(d, "Unexpected EOF"); + if (!upb_trypullbuf(d)) upb_decoder_abortjmp(d, "Unexpected EOF"); } -void upb_decoder_skipto(upb_decoder *d, uint64_t ofs) { - if (ofs < upb_decoder_bufendofs(d)) { +void upb_decoder_checkpoint(upb_decoder *d) { + upb_byteregion_discard(d->input, upb_decoder_offset(d)); +} + +void upb_decoder_discardto(upb_decoder *d, uint64_t ofs) { + if (ofs <= upb_decoder_bufendofs(d)) { upb_decoder_advance(d, ofs - upb_decoder_offset(d)); } else { - d->buf = NULL; - d->ptr = NULL; - d->end = NULL; - d->bufstart_ofs = ofs; + upb_decoder_skiptonewbuf(d, ofs); } + upb_decoder_checkpoint(d); } -void upb_decoder_checkpoint(upb_decoder *d) { - upb_byteregion_discard(d->input, upb_decoder_offset(d)); +void upb_decoder_discard(upb_decoder *d, size_t bytes) { + upb_decoder_discardto(d, upb_decoder_offset(d) + bytes); } @@ -126,15 +232,13 @@ NOINLINE uint64_t upb_decode_varint_slow(upb_decoder *d) { uint8_t byte = 0x80; uint64_t u64 = 0; int bitpos; - const char *ptr = d->ptr; for(bitpos = 0; bitpos < 70 && (byte & 0x80); bitpos += 7) { - if (upb_decoder_bufleft(d) == 0) { - upb_pullbuf(d); - ptr = d->ptr; - } - u64 |= ((uint64_t)(byte = *ptr++) & 0x7F) << bitpos; + if (upb_decoder_bufleft(d) == 0) upb_pullbuf(d); + u64 |= ((uint64_t)(byte = *d->ptr) & 0x7F) << bitpos; + upb_decoder_advance(d, 1); } - if(bitpos == 70 && (byte & 0x80)) upb_decoder_abort(d, "Unterminated varint"); + if(bitpos == 70 && (byte & 0x80)) + upb_decoder_abortjmp(d, "Unterminated varint"); return u64; } @@ -151,7 +255,7 @@ FORCEINLINE uint32_t upb_decode_varint32(upb_decoder *d) { if ((*(p++) & 0x80) == 0) goto done; // likely slow: u64 = upb_decode_varint_slow(d); - if (u64 > 0xffffffff) upb_decoder_abort(d, "Unterminated 32-bit varint"); + if (u64 > UINT32_MAX) upb_decoder_abortjmp(d, 
"Unterminated 32-bit varint"); ret = (uint32_t)u64; p = d->ptr; // Turn the next line into a nop. done: @@ -174,7 +278,7 @@ FORCEINLINE uint64_t upb_decode_varint(upb_decoder *d) { if (upb_decoder_bufleft(d) >= 10) { // Fast case. upb_decoderet r = upb_vdecode_fast(d->ptr); - if (r.p == NULL) upb_decoder_abort(d, "Unterminated varint"); + if (r.p == NULL) upb_decoder_abortjmp(d, "Unterminated varint"); upb_decoder_advance(d, r.p - d->ptr); return r.val; } else if (upb_decoder_bufleft(d) > 0) { @@ -200,11 +304,12 @@ FORCEINLINE void upb_decode_fixed(upb_decoder *d, char *buf, size_t bytes) { } else { // Slow case. size_t read = 0; - while (read < bytes) { - size_t avail = upb_decoder_bufleft(d); + while (1) { + size_t avail = UPB_MIN(upb_decoder_bufleft(d), bytes - read); memcpy(buf + read, d->ptr, avail); upb_decoder_advance(d, avail); read += avail; + if (read == bytes) break; upb_pullbuf(d); } } @@ -213,26 +318,28 @@ FORCEINLINE void upb_decode_fixed(upb_decoder *d, char *buf, size_t bytes) { FORCEINLINE uint32_t upb_decode_fixed32(upb_decoder *d) { uint32_t u32; upb_decode_fixed(d, (char*)&u32, sizeof(uint32_t)); - return u32; // TODO: proper byte swapping + return u32; // TODO: proper byte swapping for big-endian machines. } FORCEINLINE uint64_t upb_decode_fixed64(upb_decoder *d) { uint64_t u64; upb_decode_fixed(d, (char*)&u64, sizeof(uint64_t)); - return u64; // TODO: proper byte swapping + return u64; // TODO: proper byte swapping for big-endian machines. } INLINE upb_byteregion *upb_decode_string(upb_decoder *d) { uint32_t strlen = upb_decode_varint32(d); uint64_t offset = upb_decoder_offset(d); + if (offset + strlen > upb_byteregion_endofs(d->input)) + upb_decoder_abortjmp(d, "Unexpected EOF"); upb_byteregion_reset(&d->str_byteregion, d->input, offset, strlen); // Could make it an option on the callback whether we fetchall() first or not. 
- upb_byteregion_fetchall(&d->str_byteregion, d->status); - if (!upb_ok(d->status)) upb_decoder_exit(d); - upb_decoder_skipto(d, offset + strlen); + if (upb_byteregion_fetchall(&d->str_byteregion) != UPB_BYTE_OK) + upb_decoder_abortjmp(d, "Couldn't fetchall() on string."); + upb_decoder_discardto(d, offset + strlen); return &d->str_byteregion; } -INLINE void upb_push(upb_decoder *d, upb_fhandlers *f, uint64_t end) { +INLINE void upb_push_msg(upb_decoder *d, upb_fhandlers *f, uint64_t end) { upb_dispatch_startsubmsg(&d->dispatcher, f)->end_ofs = end; upb_decoder_setmsgend(d); } @@ -253,8 +360,6 @@ INLINE void upb_push(upb_decoder *d, upb_fhandlers *f, uint64_t end) { static double upb_asdouble(uint64_t n) { double d; memcpy(&d, &n, 8); return d; } static float upb_asfloat(uint32_t n) { float f; memcpy(&f, &n, 4); return f; } -static int32_t upb_zzdec_32(uint32_t n) { return (n >> 1) ^ -(int32_t)(n & 1); } -static int64_t upb_zzdec_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); } T(INT32, varint, int32, int32_t) T(INT64, varint, int64, int64_t) @@ -271,9 +376,10 @@ T(FLOAT, fixed32, float, upb_asfloat) T(SINT32, varint, int32, upb_zzdec_32) T(SINT64, varint, int64, upb_zzdec_64) T(STRING, string, byteregion, upb_byteregion*) +#undef T static void upb_decode_GROUP(upb_decoder *d, upb_fhandlers *f) { - upb_push(d, f, UPB_NONDELIMITED); + upb_push_msg(d, f, UPB_NONDELIMITED); } static void upb_endgroup(upb_decoder *d, upb_fhandlers *f) { (void)f; @@ -281,15 +387,30 @@ static void upb_endgroup(upb_decoder *d, upb_fhandlers *f) { upb_decoder_setmsgend(d); } static void upb_decode_MESSAGE(upb_decoder *d, upb_fhandlers *f) { - upb_push(d, f, upb_decode_varint32(d) + upb_decoder_offset(d)); + uint32_t len = upb_decode_varint32(d); + upb_push_msg(d, f, upb_decoder_offset(d) + len); } +#define F(type) &upb_decode_ ## type +static void *upb_decoderplan_fptrs[] = { + &upb_endgroup, F(DOUBLE), F(FLOAT), F(INT64), + F(UINT64), F(INT32), F(FIXED64), F(FIXED32), F(BOOL), 
F(STRING), + F(GROUP), F(MESSAGE), F(STRING), F(UINT32), F(ENUM), F(SFIXED32), + F(SFIXED64), F(SINT32), F(SINT64)}; +#undef F + /* The main decoding loop *****************************************************/ static void upb_decoder_checkdelim(upb_decoder *d) { + // TODO: This doesn't work for the case that no buffer is currently loaded + // (ie. d->buf == NULL) because delim_end is NULL even if we are at + // end-of-delim. Need to add a test that exercises this by putting a buffer + // seam in the middle of the final delimited value in a proto that we skip + // for some reason (like because it's unknown and we have no unknown field + // handler). while (d->delim_end != NULL && d->ptr >= d->delim_end) { - if (d->ptr > d->delim_end) upb_decoder_abort(d, "Bad submessage end"); + if (d->ptr > d->delim_end) upb_decoder_abortjmp(d, "Bad submessage end"); if (d->dispatcher.top->is_sequence) { upb_dispatch_endseq(&d->dispatcher); } else { @@ -299,33 +420,36 @@ static void upb_decoder_checkdelim(upb_decoder *d) { } } -static void upb_decoder_enterjit(upb_decoder *d) { - (void)d; -#ifdef UPB_USE_JIT_X64 - if (d->jit_code && d->dispatcher.top == d->dispatcher.stack && d->ptr < d->jit_end) { - // Decodes as many fields as possible, updating d->ptr appropriately, - // before falling through to the slow(er) path. - void (*upb_jit_decode)(upb_decoder *d) = (void*)d->jit_code; - upb_jit_decode(d); - } -#endif -} - INLINE upb_fhandlers *upb_decode_tag(upb_decoder *d) { while (1) { uint32_t tag; if (!upb_trydecode_varint32(d, &tag)) return NULL; uint8_t wire_type = tag & 0x7; - upb_fhandlers *f = upb_dispatcher_lookup(&d->dispatcher, tag); + uint32_t fieldnum = tag >> 3; + upb_itofhandlers_ent *e = upb_inttable_fastlookup( + d->dispatch_table, fieldnum, sizeof(upb_itofhandlers_ent)); + upb_fhandlers *f = e ? e->f : NULL; + + if (f) { + // Wire type check. 
+ if (wire_type == upb_types[f->type].native_wire_type || + (wire_type == UPB_WIRE_TYPE_DELIMITED && + upb_types[f->type].is_numeric)) { + // Wire type is ok. + } else { + f = NULL; + } + } // There are no explicit "startseq" or "endseq" markers in protobuf // streams, so we have to infer them by noticing when a repeated field // starts or ends. - if (d->dispatcher.top->is_sequence && d->dispatcher.top->f != f) { + upb_dispatcher_frame *fr = d->dispatcher.top; + if (fr->is_sequence && fr->f != f) { upb_dispatch_endseq(&d->dispatcher); upb_decoder_setmsgend(d); } - if (f && f->repeated && d->dispatcher.top->f != f) { + if (f && f->repeated && (!fr->is_sequence || fr->f != f)) { uint64_t old_end = d->dispatcher.top->end_ofs; upb_dispatcher_frame *fr = upb_dispatch_startseq(&d->dispatcher, f); if (wire_type != UPB_WIRE_TYPE_DELIMITED || @@ -334,7 +458,8 @@ INLINE upb_fhandlers *upb_decode_tag(upb_decoder *d) { fr->end_ofs = old_end; } else { // Packed primitive field. - fr->end_ofs = upb_decoder_offset(d) + upb_decode_varint(d); + uint32_t len = upb_decode_varint32(d); + fr->end_ofs = upb_decoder_offset(d) + len; fr->is_packed = true; } upb_decoder_setmsgend(d); @@ -343,14 +468,20 @@ INLINE upb_fhandlers *upb_decode_tag(upb_decoder *d) { if (f) return f; // Unknown field. 
+ if (fieldnum == 0 || fieldnum > UPB_MAX_FIELDNUMBER) + upb_decoder_abortjmp(d, "Invalid field number"); switch (wire_type) { case UPB_WIRE_TYPE_VARINT: upb_decode_varint(d); break; - case UPB_WIRE_TYPE_32BIT: upb_decoder_advance(d, 4); break; - case UPB_WIRE_TYPE_64BIT: upb_decoder_advance(d, 8); break; + case UPB_WIRE_TYPE_32BIT: upb_decoder_discard(d, 4); break; + case UPB_WIRE_TYPE_64BIT: upb_decoder_discard(d, 8); break; case UPB_WIRE_TYPE_DELIMITED: - upb_decoder_advance(d, upb_decode_varint32(d)); break; + upb_decoder_discard(d, upb_decode_varint32(d)); break; + case UPB_WIRE_TYPE_START_GROUP: + upb_decoder_abortjmp(d, "Can't handle unknown groups yet"); + case UPB_WIRE_TYPE_END_GROUP: + upb_decoder_abortjmp(d, "Unmatched ENDGROUP tag"); default: - upb_decoder_abort(d, "Invalid wire type"); + upb_decoder_abortjmp(d, "Invalid wire type"); } // TODO: deliver to unknown field callback. upb_decoder_checkpoint(d); @@ -358,16 +489,22 @@ INLINE upb_fhandlers *upb_decode_tag(upb_decoder *d) { } } -void upb_decoder_decode(upb_decoder *d, upb_status *status) { - if (sigsetjmp(d->exitjmp, 0)) { assert(!upb_ok(status)); return; } - d->status = status; +upb_success_t upb_decoder_decode(upb_decoder *d) { + assert(d->input); + if (sigsetjmp(d->exitjmp, 0)) { + assert(!upb_ok(&d->status)); + return UPB_ERROR; + } upb_dispatch_startmsg(&d->dispatcher); // Prime the buf so we can hit the JIT immediately. upb_trypullbuf(d); upb_fhandlers *f = d->dispatcher.top->f; - while(1) { // Main loop: executed once per tag/field pair. + while(1) { upb_decoder_checkdelim(d); +#ifdef UPB_USE_JIT_X64 upb_decoder_enterjit(d); + upb_decoder_checkpoint(d); +#endif if (!d->top_is_packed) f = upb_decode_tag(d); if (!f) { // Sucessful EOF. We may need to dispatch a top-level implicit frame. 
@@ -375,64 +512,46 @@ void upb_decoder_decode(upb_decoder *d, upb_status *status) { assert(d->dispatcher.top->is_sequence); upb_dispatch_endseq(&d->dispatcher); } - return; + return UPB_OK; } f->decode(d, f); upb_decoder_checkpoint(d); } } -static void upb_decoder_skip(void *_d, upb_dispatcher_frame *f) { - upb_decoder *d = _d; - if (f->end_ofs != UPB_NONDELIMITED) { - upb_decoder_skipto(d, d->dispatcher.top->end_ofs); - } else { - // TODO: how to support skipping groups? Dispatcher could drop callbacks, - // or it could be special-cased inside the decoder. - } +void upb_decoder_init(upb_decoder *d) { + upb_status_init(&d->status); + upb_dispatcher_init(&d->dispatcher, &d->status, &upb_decoder_exitjmp2, d); + d->plan = NULL; + d->input = NULL; } -void upb_decoder_init(upb_decoder *d, upb_handlers *handlers) { - upb_dispatcher_init( - &d->dispatcher, handlers, upb_decoder_skip, upb_decoder_exit2, d); -#ifdef UPB_USE_JIT_X64 - d->jit_code = NULL; - if (d->dispatcher.handlers->should_jit) upb_decoder_makejit(d); -#endif - // Set function pointers for each field's decode function. 
- for (int i = 0; i < handlers->msgs_len; i++) { - upb_mhandlers *m = handlers->msgs[i]; - for(upb_inttable_iter i = upb_inttable_begin(&m->fieldtab); !upb_inttable_done(i); - i = upb_inttable_next(&m->fieldtab, i)) { - upb_fhandlers *f = upb_inttable_iter_value(i); -#define F(type) &upb_decode_ ## type - static void *fptrs[] = {&upb_endgroup, F(DOUBLE), F(FLOAT), F(INT64), - F(UINT64), F(INT32), F(FIXED64), F(FIXED32), F(BOOL), F(STRING), - F(GROUP), F(MESSAGE), F(STRING), F(UINT32), F(ENUM), F(SFIXED32), - F(SFIXED64), F(SINT32), F(SINT64)}; - f->decode = fptrs[f->type]; - } - } +void upb_decoder_resetplan(upb_decoder *d, upb_decoderplan *p, int msg_offset) { + assert(msg_offset >= 0); + assert(msg_offset < p->handlers->msgs_len); + d->plan = p; + d->msg_offset = msg_offset; + d->input = NULL; } -void upb_decoder_reset(upb_decoder *d, upb_byteregion *input, void *closure) { - upb_dispatcher_frame *f = upb_dispatcher_reset(&d->dispatcher, closure); +void upb_decoder_resetinput(upb_decoder *d, upb_byteregion *input, + void *closure) { + assert(d->plan); + upb_dispatcher_frame *f = + upb_dispatcher_reset(&d->dispatcher, closure, d->plan->handlers->msgs[0]); + upb_status_clear(&d->status); f->end_ofs = UPB_NONDELIMITED; d->input = input; - d->bufstart_ofs = upb_byteregion_startofs(input); - d->buf = NULL; - d->ptr = NULL; - d->end = NULL; // Force a buffer pull. - d->delim_end = NULL; // But don't let end-of-message get triggered. d->str_byteregion.bytesrc = input->bytesrc; -#ifdef UPB_USE_JIT_X64 - d->jit_end = NULL; -#endif + + // Protect against assert in skiptonewbuf(). 
+ d->bufstart_ofs = 0; + d->ptr = NULL; + d->buf = NULL; + upb_decoder_skiptonewbuf(d, upb_byteregion_startofs(input)); } void upb_decoder_uninit(upb_decoder *d) { -#ifdef UPB_USE_JIT_X64 - if (d->dispatcher.handlers->should_jit) upb_decoder_freejit(d); -#endif upb_dispatcher_uninit(&d->dispatcher); + upb_status_uninit(&d->status); } diff --git a/upb/pb/decoder.h b/upb/pb/decoder.h index c35bec4f83..13e5774936 100644 --- a/upb/pb/decoder.h +++ b/upb/pb/decoder.h @@ -21,15 +21,43 @@ extern "C" { #endif -/* upb_decoder *****************************************************************/ +/* upb_decoderplan ************************************************************/ + +// A decoderplan contains whatever data structures and generated (JIT-ted) code +// are necessary to decode protobuf data of a specific type to a specific set +// of handlers. By generating the plan ahead of time, we avoid having to +// redo this work every time we decode. +// +// A decoderplan is threadsafe, meaning that it can be used concurrently by +// different upb_decoders in different threads. However, the upb_decoders are +// *not* thread-safe. +struct _upb_decoderplan; +typedef struct _upb_decoderplan upb_decoderplan; + +// TODO: add parameter for a list of other decoder plans that we can share +// generated code with. +upb_decoderplan *upb_decoderplan_new(upb_handlers *h, bool allowjit); +void upb_decoderplan_unref(upb_decoderplan *p); + +// Returns true if the plan contains JIT-ted code. This may not be the same as +// the "allowjit" parameter to the constructor if support for JIT-ting was not +// compiled in. +bool upb_decoderplan_hasjitcode(upb_decoderplan *p); + + +/* upb_decoder ****************************************************************/ struct dasm_State; typedef struct _upb_decoder { - upb_byteregion *input; // Input data (serialized). - upb_dispatcher dispatcher; // Dispatcher to which we push parsed data. - upb_status *status; // Where we will store any errors that occur. 
- upb_byteregion str_byteregion; // For passing string data to callbacks. + upb_decoderplan *plan; + int msg_offset; // Which message from the plan is top-level. + upb_byteregion *input; // Input data (serialized), not owned. + upb_dispatcher dispatcher; // Dispatcher to which we push parsed data. + upb_status status; // Where we store errors that occur. + upb_byteregion str_byteregion; // For passing string data to callbacks. + + upb_inttable *dispatch_table; // Current input buffer and its stream offset. const char *buf, *ptr, *end; @@ -37,40 +65,64 @@ typedef struct _upb_decoder { // End of the delimited region, relative to ptr, or NULL if not in this buf. const char *delim_end; + // True if the top stack frame represents a packed field. bool top_is_packed; #ifdef UPB_USE_JIT_X64 // For JIT, which doesn't do bounds checks in the middle of parsing a field. const char *jit_end, *effective_end; // == MIN(jit_end, submsg_end) - - // JIT-generated machine code (else NULL). - char *jit_code; - size_t jit_size; - char *debug_info; - - struct dasm_State *dynasm; #endif // For exiting the decoder on error. sigjmp_buf exitjmp; } upb_decoder; -// Initializes/uninitializes a decoder for calling into the given handlers -// or to write into the given msgdef, given its accessors). Takes a ref -// on the handlers. -void upb_decoder_init(upb_decoder *d, upb_handlers *h); +void upb_decoder_init(upb_decoder *d); void upb_decoder_uninit(upb_decoder *d); -// Resets the internal state of an already-allocated decoder. This puts it in a -// state where it has not seen any data, and expects the next data to be from -// the beginning of a new protobuf. Decoders must be reset before they can be -// used. A decoder can be reset multiple times. "input" must live until the -// decoder is reset again (or destroyed). -void upb_decoder_reset(upb_decoder *d, upb_byteregion *input, void *closure); +// Resets the plan that the decoder will parse from. 
"msg_offset" indicates +// which message from the plan will be used as the top-level message. +// +// This will also reset the decoder's input to be uninitialized -- +// upb_decoder_resetinput() must be called before parsing can occur. The plan +// must live until the decoder is destroyed or reset to a different plan. +// +// Must be called before upb_decoder_resetinput() or upb_decoder_decode(). +void upb_decoder_resetplan(upb_decoder *d, upb_decoderplan *p, int msg_offset); + +// Resets the input of an already-allocated decoder. This puts it in a state +// where it has not seen any data, and expects the next data to be from the +// beginning of a new protobuf. Decoders must have their input reset before +// they can be used. A decoder can have its input reset multiple times. +// "input" must live until the decoder is destroyed or has it input reset +// again. "c" is the closure that will be passed to the handlers. +// +// Must be called before upb_decoder_decode(). +void upb_decoder_resetinput(upb_decoder *d, upb_byteregion *input, void *c); + +// Decodes serialized data (calling handlers as the data is parsed), returning +// the success of the operation (call upb_decoder_status() for details). +upb_success_t upb_decoder_decode(upb_decoder *d); + +INLINE const upb_status *upb_decoder_status(upb_decoder *d) { + return &d->status; +} + +// Implementation details + +struct _upb_decoderplan { + upb_handlers *handlers; // owns reference. + +#ifdef UPB_USE_JIT_X64 + // JIT-generated machine code (else NULL). + char *jit_code; + size_t jit_size; + char *debug_info; -// Decodes serialized data (calling handlers as the data is parsed) until error -// or EOF (see *status for details). -void upb_decoder_decode(upb_decoder *d, upb_status *status); + // This pointer is allocated by dasm_init() and freed by dasm_free(). 
+ struct dasm_State *dynasm; +#endif +}; #ifdef __cplusplus } /* extern "C" */ diff --git a/upb/pb/decoder_x64.dasc b/upb/pb/decoder_x64.dasc index 75e5b6b46a..807191b8e6 100644 --- a/upb/pb/decoder_x64.dasc +++ b/upb/pb/decoder_x64.dasc @@ -4,20 +4,15 @@ |// Copyright (c) 2011 Google Inc. See LICENSE for details. |// Author: Josh Haberman |// -|// JIT compiler for upb_decoder on x86. Given a upb_handlers object, -|// generates code specialized to parsing the specific message and -|// calling specific handlers. +|// JIT compiler for upb_decoder on x86. Given a upb_decoderplan object (which +|// contains an embedded set of upb_handlers), generates code specialized to +|// parsing the specific message and calling specific handlers. |// |// Since the JIT can call other functions (the JIT'ted code is not a leaf |// function) we must respect alignment rules. On OS X, this means aligning |// the stack to 16 bytes. -#define UPB_NONE -1 -#define UPB_MULTIPLE -2 -#define UPB_TOPLEVEL_ONE -3 - #include -#include "dynasm/dasm_proto.h" #include "dynasm/dasm_x86.h" #ifndef MAP_ANONYMOUS @@ -73,15 +68,15 @@ gdb_jit_descriptor __jit_debug_descriptor = {1, GDB_JIT_NOACTION, NULL, NULL}; void __attribute__((noinline)) __jit_debug_register_code() { __asm__ __volatile__(""); } -void upb_reg_jit_gdb(upb_decoder *d) { +void upb_reg_jit_gdb(upb_decoderplan *plan) { // Create debug info. 
size_t elf_len = sizeof(upb_jit_debug_elf_file); - d->debug_info = malloc(elf_len); - memcpy(d->debug_info, upb_jit_debug_elf_file, elf_len); - uint64_t *p = (void*)d->debug_info; - for (; (void*)(p+1) <= (void*)d->debug_info + elf_len; ++p) { - if (*p == 0x12345678) { *p = (uintptr_t)d->jit_code; } - if (*p == 0x321) { *p = d->jit_size; } + plan->debug_info = malloc(elf_len); + memcpy(plan->debug_info, upb_jit_debug_elf_file, elf_len); + uint64_t *p = (void*)plan->debug_info; + for (; (void*)(p+1) <= (void*)plan->debug_info + elf_len; ++p) { + if (*p == 0x12345678) { *p = (uintptr_t)plan->jit_code; } + if (*p == 0x321) { *p = plan->jit_size; } } // Register the JIT-ted code with GDB. @@ -89,7 +84,7 @@ void upb_reg_jit_gdb(upb_decoder *d) { e->next_entry = __jit_debug_descriptor.first_entry; e->prev_entry = NULL; if (e->next_entry) e->next_entry->prev_entry = e; - e->symfile_addr = d->debug_info; + e->symfile_addr = plan->debug_info; e->symfile_size = elf_len; __jit_debug_descriptor.first_entry = e; __jit_debug_descriptor.relevant_entry = e; @@ -99,12 +94,17 @@ void upb_reg_jit_gdb(upb_decoder *d) { #else -void upb_reg_jit_gdb(upb_decoder *d) { - (void)d; +void upb_reg_jit_gdb(upb_decoderplan *plan) { + (void)plan; } #endif +// Has to be a separate function, otherwise GCC will complain about +// expressions like (&foo != NULL) because they will never evaluate +// to false. +static void upb_assert_notnull(void *addr) { assert(addr != NULL); } + |.arch x64 |.actionlist upb_jit_actionlist |.globals UPB_JIT_GLOBAL_ @@ -126,7 +126,7 @@ void upb_reg_jit_gdb(upb_decoder *d) { |// ALL of the code in this file uses these register allocations. |// When we "call" within this file, we do not use regular calling |// conventions, but of course when calling to user callbacks we must. -|.define PTR, rbx +|.define PTR, rbx // Writing this to DECODER->ptr commits our progress. 
|.define CLOSURE, r12 |.type FRAME, upb_dispatcher_frame, r13 |.type BYTEREGION,upb_byteregion, r14 @@ -134,6 +134,7 @@ void upb_reg_jit_gdb(upb_decoder *d) { |.type STDARRAY, upb_stdarray | |.macro callp, addr +|| upb_assert_notnull(addr); || if ((uintptr_t)addr < 0xffffffff) { | call &addr || } else { @@ -191,11 +192,12 @@ void upb_reg_jit_gdb(upb_decoder *d) { | decode_loaded_varint, 0 | mov ecx, edx | shr ecx, 3 -| and edx, 0x7 +| and edx, 0x7 // For the type check that will happen later. | cmp ecx, m->max_field_number // Bounds-check the field. | ja ->exit_jit // In the future; could be unknown label || if ((uintptr_t)m->tablearray < 0xffffffff) { -| mov rax, qword [rcx*8 + m->tablearray] // TODO: support hybrid array/hash tables. +| // TODO: support hybrid array/hash tables. +| mov rax, qword [rcx*8 + m->tablearray] || } else { | mov64 rax, (uintptr_t)m->tablearray | mov rax, qword [rax + rcx*8] @@ -217,8 +219,9 @@ void upb_reg_jit_gdb(upb_decoder *d) { | lea rax, [FRAME + sizeof(upb_dispatcher_frame)] // rax for shorter addressing. | cmp rax, qword DECODER->dispatcher.limit | jae ->exit_jit // Frame stack overflow. -| mov qword FRAME:rax->f, f -| mov dword FRAME:rax->end_ofs, end_offset_ +| mov64 r8, (uintptr_t)f +| mov qword FRAME:rax->f, r8 +| mov qword FRAME:rax->end_ofs, end_offset_ | mov byte FRAME:rax->is_sequence, is_sequence_ | mov DECODER->dispatcher.top, rax | mov FRAME, rax @@ -294,7 +297,7 @@ void upb_reg_jit_gdb(upb_decoder *d) { | |.macro sethas, reg, hasbit || if (hasbit >= 0) { -| or byte [reg + (hasbit / 8)], (1 << (hasbit % 8)) +| or byte [reg + ((uint32_t)hasbit / 8)], (1 << ((uint32_t)hasbit % 8)) || } |.endmacro @@ -304,8 +307,9 @@ void upb_reg_jit_gdb(upb_decoder *d) { #include "upb/msg.h" // Decodes the next val into ARG3, advances PTR. 
-static void upb_decoder_jit_decodefield(upb_decoder *d, upb_mhandlers *m, - uint8_t type, size_t tag_size) { +static void upb_decoderplan_jit_decodefield(upb_decoderplan *plan, + upb_mhandlers *m, + uint8_t type, size_t tag_size) { // Decode the value into arg 3 for the callback. switch (type) { case UPB_TYPE(DOUBLE): @@ -365,9 +369,9 @@ static void upb_decoder_jit_decodefield(upb_decoder *d, upb_mhandlers *m, // robust checks. | mov ecx, dword [PTR + tag_size] | decode_loaded_varint tag_size - | mov rdi, DECODER->effective_end + | mov rdi, DECODER->end | sub rdi, rax - | cmp ARG3_64, rdi // if (len > d->effective_end - str) + | cmp ARG3_64, rdi // if (len > d->end - str) | ja ->exit_jit // Can't deliver, whole string not in buf. // Update PTR to point past end of string. @@ -401,8 +405,8 @@ static void upb_decoder_jit_decodefield(upb_decoder *d, upb_mhandlers *m, #if 0 // These appear not to speed things up, but keeping around for // further experimentation. -static void upb_decoder_jit_doappend(upb_decoder *d, uint8_t size, - upb_fhandlers *f) { +static void upb_decoderplan_jit_doappend(upb_decoderplan *plan, uint8_t size, + upb_fhandlers *f) { | mov eax, STDARRAY:ARG1_64->len | cmp eax, STDARRAY:ARG1_64->size | jne >2 @@ -434,18 +438,19 @@ static void upb_decoder_jit_doappend(upb_decoder *d, uint8_t size, } #endif -static void upb_decoder_jit_callcb(upb_decoder *d, upb_fhandlers *f) { +static void upb_decoderplan_jit_callcb(upb_decoderplan *plan, + upb_fhandlers *f) { // Call callbacks. if (upb_issubmsgtype(f->type)) { if (f->type == UPB_TYPE(MESSAGE)) { | mov rsi, PTR | sub rsi, DECODER->buf - | add esi, ARG3_32 // = (d->ptr - d->buf) + delim_len + | add rsi, ARG3_64 // = (d->ptr - d->buf) + delim_len } else { assert(f->type == UPB_TYPE(GROUP)); - | mov esi, UPB_NONDELIMITED + | mov rsi, UPB_NONDELIMITED } - | pushframe f, esi, false + | pushframe f, rsi, false // Call startsubmsg handler (if any). 
if (f->startsubmsg) { @@ -456,15 +461,11 @@ static void upb_decoder_jit_callcb(upb_decoder *d, upb_fhandlers *f) { | mov CLOSURE, rdx } | mov qword FRAME->closure, CLOSURE + // TODO: Handle UPB_SKIPSUBMSG, UPB_BREAK + | mov DECODER->ptr, PTR const upb_mhandlers *sub_m = upb_fhandlers_getsubmsg(f); - if (sub_m->jit_parent_field_done_pclabel != UPB_MULTIPLE) { - | jmp =>sub_m->jit_startmsg_pclabel; - } else { - | call =>sub_m->jit_startmsg_pclabel; - } - - |=>f->jit_submsg_done_pclabel: + | call =>sub_m->jit_startmsg_pclabel; // Call endsubmsg handler (if any). if (f->endsubmsg) { @@ -474,6 +475,8 @@ static void upb_decoder_jit_callcb(upb_decoder *d, upb_fhandlers *f) { | callp f->endsubmsg } | popframe upb_fhandlers_getmsg(f) + // TODO: Handle UPB_SKIPSUBMSG, UPB_BREAK + | mov DECODER->ptr, PTR } else { | mov ARG1_64, CLOSURE // Test for callbacks we can specialize. @@ -499,15 +502,15 @@ static void upb_decoder_jit_callcb(upb_decoder *d, upb_fhandlers *f) { f->value == &upb_stdmsg_setuint64_r || f->value == &upb_stdmsg_setptr_r || f->value == &upb_stdmsg_setdouble_r) { - upb_decoder_jit_doappend(d, 8, f); + upb_decoderplan_jit_doappend(plan, 8, f); } else if (f->value == &upb_stdmsg_setint32_r || f->value == &upb_stdmsg_setuint32_r || f->value == &upb_stdmsg_setfloat_r) { - upb_decoder_jit_doappend(d, 4, f); + upb_decoderplan_jit_doappend(plan, 4, f); } else if (f->value == &upb_stdmsg_setbool_r) { - upb_decoder_jit_doappend(d, 1, f); + upb_decoderplan_jit_doappend(plan, 1, f); #endif - } else { + } else if (f->value) { // Load closure and fval into arg registers. ||#ifndef NDEBUG ||// Since upb_value carries type information in debug mode @@ -519,14 +522,15 @@ static void upb_decoder_jit_callcb(upb_decoder *d, upb_fhandlers *f) { | callp f->value } | sethas CLOSURE, f->valuehasbit + // TODO: Handle UPB_SKIPSUBMSG, UPB_BREAK + | mov DECODER->ptr, PTR } - // TODO: Handle UPB_SKIPSUBMSG, UPB_BREAK } // PTR should point to the beginning of the tag. 
-static void upb_decoder_jit_field(upb_decoder *d, uint32_t tag, - uint32_t next_tag, upb_mhandlers *m, - upb_fhandlers *f, upb_fhandlers *next_f) { +static void upb_decoderplan_jit_field(upb_decoderplan *plan, uint64_t tag, + uint64_t next_tag, upb_mhandlers *m, + upb_fhandlers *f, upb_fhandlers *next_f) { // PC-label for the dispatch table. // We check the wire type (which must be loaded in edx) because the // table is keyed on field number, not type. @@ -535,8 +539,8 @@ static void upb_decoder_jit_field(upb_decoder *d, uint32_t tag, | jne ->exit_jit // In the future: could be an unknown field or packed. |=>f->jit_pclabel_notypecheck: if (f->repeated) { - | mov esi, FRAME->end_ofs - | pushframe f, esi, true + | mov rsi, FRAME->end_ofs + | pushframe f, rsi, true if (f->startseq) { | mov ARG1_64, CLOSURE | loadfval f @@ -555,8 +559,8 @@ static void upb_decoder_jit_field(upb_decoder *d, uint32_t tag, return; } - upb_decoder_jit_decodefield(d, m, f->type, tag_size); - upb_decoder_jit_callcb(d, f); + upb_decoderplan_jit_decodefield(plan, m, f->type, tag_size); + upb_decoderplan_jit_callcb(plan, f); // Epilogue: load next tag, check for repeated field. | check_eob m @@ -586,13 +590,11 @@ static int upb_compare_uint32(const void *a, const void *b) { return *(uint32_t*)a - *(uint32_t*)b; } -static void upb_decoder_jit_msg(upb_decoder *d, upb_mhandlers *m) { +static void upb_decoderplan_jit_msg(upb_decoderplan *plan, upb_mhandlers *m) { |=>m->jit_startmsg_pclabel: + // There was a call to get here, so we need to align the stack. + | sub rsp, 8 - if (m->jit_parent_field_done_pclabel == UPB_MULTIPLE) { - // There was a call to get here, so we need to align the stack. 
- | sub rsp, 8 - } // Call startmsg handler (if any): if (m->startmsg) { // upb_flow_t startmsg(void *closure); @@ -615,23 +617,30 @@ static void upb_decoder_jit_msg(upb_decoder *d, upb_mhandlers *m) { int num_keys = upb_inttable_count(&m->fieldtab); uint32_t *keys = malloc(num_keys * sizeof(*keys)); int idx = 0; - for(upb_inttable_iter i = upb_inttable_begin(&m->fieldtab); !upb_inttable_done(i); + for(upb_inttable_iter i = upb_inttable_begin(&m->fieldtab); + !upb_inttable_done(i); i = upb_inttable_next(&m->fieldtab, i)) { keys[idx++] = upb_inttable_iter_key(i); } qsort(keys, num_keys, sizeof(uint32_t), &upb_compare_uint32); upb_fhandlers *last_f = NULL; - uint32_t last_tag = 0; + uint64_t last_encoded_tag = 0; for(int i = 0; i < num_keys; i++) { - uint32_t key = keys[i]; - upb_fhandlers *f = upb_inttable_lookup(&m->fieldtab, key); - uint32_t tag = upb_vencode32(key); - if (last_f) upb_decoder_jit_field(d, last_tag, tag, m, last_f, f); - last_tag = tag; + uint32_t fieldnum = keys[i]; + upb_itofhandlers_ent *e = upb_inttable_lookup(&m->fieldtab, fieldnum); + upb_fhandlers *f = e->f; + assert(f->number == fieldnum); + uint32_t tag = (f->number << 3) | upb_types[f->type].native_wire_type; + uint64_t encoded_tag = upb_vencode32(tag); + // No tag should be greater than 5 bytes. + assert(encoded_tag <= 0xffffffffff); + if (last_f) upb_decoderplan_jit_field( + plan, last_encoded_tag, encoded_tag, m, last_f, f); + last_encoded_tag = encoded_tag; last_f = f; } - upb_decoder_jit_field(d, last_tag, 0, m, last_f, NULL); + upb_decoderplan_jit_field(plan, last_encoded_tag, 0, m, last_f, NULL); free(keys); @@ -655,22 +664,29 @@ static void upb_decoder_jit_msg(upb_decoder *d, upb_mhandlers *m) { | callp m->endmsg } - if (m->jit_parent_field_done_pclabel == UPB_MULTIPLE) { - // Counter previous alignment. 
- | add rsp, 8 - | ret - } else if (m->jit_parent_field_done_pclabel == UPB_TOPLEVEL_ONE) { - | jmp ->exit_jit - } else { - | jmp =>m->jit_parent_field_done_pclabel + if (m->is_group) { + // Advance past the "end group" tag. + // TODO: Handle UPB_BREAK + | mov DECODER->ptr, PTR } + // Counter previous alignment. + | add rsp, 8 + | ret } -static const char *dbgfmt = - "JIT encountered unknown field! wt=%d, fn=%d\n"; - -static void upb_decoder_jit(upb_decoder *d) { +static void upb_decoderplan_jit(upb_decoderplan *plan) { + // The JIT prologue/epilogue trampoline that is generated in this function + // does not depend on the handlers, so it will never vary. Ideally we would + // put it in an object file and just link it into upb so we could have only a + // single copy of it instead of one copy for each decoderplan. But our + // options for doing that are undesirable: GCC inline assembly is + // complicated, not portable to other compilers, and comes with subtle + // caveats about incorrect things what the optimizer might do if you eg. + // execute non-local jumps. Putting this code in a .s file would force us to + // calculate the structure offsets ourself instead of symbolically + // (ie. [r15 + 0xcd] instead of DECODER->ptr). So we tolerate a bit of + // unnecessary duplication/redundancy. | push rbp | mov rbp, rsp | push r15 @@ -686,18 +702,14 @@ static void upb_decoder_jit(upb_decoder *d) { | mov CLOSURE, FRAME->closure | mov PTR, DECODER->ptr - upb_handlers *h = d->dispatcher.handlers; - if (h->msgs[0]->jit_parent_field_done_pclabel == UPB_MULTIPLE) { - | call =>h->msgs[0]->jit_startmsg_pclabel - | jmp ->exit_jit - } - // TODO: push return addresses for re-entry (will be necessary for multiple // buffer support). - for (int i = 0; i < h->msgs_len; i++) upb_decoder_jit_msg(d, h->msgs[i]); + | call ARG2_64 |->exit_jit: - | mov DECODER->ptr, PTR + // Restore stack pointer to where it was before any "call" instructions + // inside our generated code. 
+ | lea rsp, [rbp - 48] // Counter previous alignment. | add rsp, 8 | pop rbx @@ -707,122 +719,128 @@ static void upb_decoder_jit(upb_decoder *d) { | pop r15 | leave | ret - |=>0: - | mov rdi, stderr - | mov rsi, dbgfmt - | callp fprintf - | callp abort + + upb_handlers *h = plan->handlers; + for (int i = 0; i < h->msgs_len; i++) + upb_decoderplan_jit_msg(plan, h->msgs[i]); } -void upb_decoder_jit_assignfieldlabs(upb_fhandlers *f, - uint32_t *pclabel_count) { +static void upb_decoderplan_jit_assignfieldlabs(upb_fhandlers *f, + uint32_t *pclabel_count) { f->jit_pclabel = (*pclabel_count)++; f->jit_pclabel_notypecheck = (*pclabel_count)++; - f->jit_submsg_done_pclabel = (*pclabel_count)++; } -void upb_decoder_jit_assignmsglabs(upb_mhandlers *m, uint32_t *pclabel_count) { +static void upb_decoderplan_jit_assignmsglabs(upb_mhandlers *m, + uint32_t *pclabel_count) { m->jit_startmsg_pclabel = (*pclabel_count)++; m->jit_endofbuf_pclabel = (*pclabel_count)++; m->jit_endofmsg_pclabel = (*pclabel_count)++; m->jit_dyndispatch_pclabel = (*pclabel_count)++; m->jit_unknownfield_pclabel = (*pclabel_count)++; - m->jit_parent_field_done_pclabel = UPB_NONE; m->max_field_number = 0; upb_inttable_iter i; for(i = upb_inttable_begin(&m->fieldtab); !upb_inttable_done(i); i = upb_inttable_next(&m->fieldtab, i)) { uint32_t key = upb_inttable_iter_key(i); m->max_field_number = UPB_MAX(m->max_field_number, key); - upb_fhandlers *f = upb_inttable_iter_value(i); - upb_decoder_jit_assignfieldlabs(f, pclabel_count); + upb_itofhandlers_ent *e = upb_inttable_iter_value(i); + upb_decoderplan_jit_assignfieldlabs(e->f, pclabel_count); } - // XXX: Won't work for large field numbers; will need to use a upb_table. + // TODO: support large field numbers by either using a hash table or + // generating code for a binary search. For now large field numbers + // will just fall back to the table decoder. 
+ m->max_field_number = UPB_MIN(m->max_field_number, 16000); m->tablearray = malloc((m->max_field_number + 1) * sizeof(void*)); } -// Second pass: for messages that have only one parent, link them to the field -// from which they are called. -void upb_decoder_jit_assignmsglabs2(upb_mhandlers *m) { - upb_inttable_iter i; - for(i = upb_inttable_begin(&m->fieldtab); !upb_inttable_done(i); - i = upb_inttable_next(&m->fieldtab, i)) { - upb_fhandlers *f = upb_inttable_iter_value(i); - if (upb_issubmsgtype(f->type)) { - upb_mhandlers *sub_m = upb_fhandlers_getsubmsg(f); - if (sub_m->jit_parent_field_done_pclabel == UPB_NONE) { - sub_m->jit_parent_field_done_pclabel = f->jit_submsg_done_pclabel; - } else { - sub_m->jit_parent_field_done_pclabel = UPB_MULTIPLE; - } - } - } -} - -void upb_decoder_makejit(upb_decoder *d) { - d->debug_info = NULL; +static void upb_decoderplan_makejit(upb_decoderplan *plan) { + plan->debug_info = NULL; // Assign pclabels. - uint32_t pclabel_count = 1; - upb_handlers *h = d->dispatcher.handlers; + uint32_t pclabel_count = 0; + upb_handlers *h = plan->handlers; for (int i = 0; i < h->msgs_len; i++) - upb_decoder_jit_assignmsglabs(h->msgs[i], &pclabel_count); - for (int i = 0; i < h->msgs_len; i++) - upb_decoder_jit_assignmsglabs2(h->msgs[i]); - - if (h->msgs[0]->jit_parent_field_done_pclabel == UPB_NONE) { - h->msgs[0]->jit_parent_field_done_pclabel = UPB_TOPLEVEL_ONE; - } + upb_decoderplan_jit_assignmsglabs(h->msgs[i], &pclabel_count); void **globals = malloc(UPB_JIT_GLOBAL__MAX * sizeof(*globals)); - dasm_init(d, 1); - dasm_setupglobal(d, globals, UPB_JIT_GLOBAL__MAX); - dasm_growpc(d, pclabel_count); - dasm_setup(d, upb_jit_actionlist); + dasm_init(plan, 1); + dasm_setupglobal(plan, globals, UPB_JIT_GLOBAL__MAX); + dasm_growpc(plan, pclabel_count); + dasm_setup(plan, upb_jit_actionlist); - upb_decoder_jit(d); + upb_decoderplan_jit(plan); - dasm_link(d, &d->jit_size); + int dasm_status = dasm_link(plan, &plan->jit_size); + (void)dasm_status; + 
assert(dasm_status == DASM_S_OK); - d->jit_code = mmap(NULL, d->jit_size, PROT_READ | PROT_WRITE, - MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + plan->jit_code = mmap(NULL, plan->jit_size, PROT_READ | PROT_WRITE, + MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); - upb_reg_jit_gdb(d); + upb_reg_jit_gdb(plan); - dasm_encode(d, d->jit_code); + dasm_encode(plan, plan->jit_code); // Create dispatch tables. for (int i = 0; i < h->msgs_len; i++) { upb_mhandlers *m = h->msgs[i]; + m->jit_func = + plan->jit_code + dasm_getpclabel(plan, m->jit_startmsg_pclabel); for (uint32_t j = 0; j <= m->max_field_number; j++) { - upb_fhandlers *f = NULL; - for (int k = 0; k < 8; k++) { - f = upb_inttable_lookup(&m->fieldtab, (j << 3) | k); - if (f) break; - } + upb_itofhandlers_ent *e = upb_inttable_lookup(&m->fieldtab, j); + upb_fhandlers *f = e ? e->f : NULL; if (f) { - m->tablearray[j] = d->jit_code + dasm_getpclabel(d, f->jit_pclabel); + m->tablearray[j] = + plan->jit_code + dasm_getpclabel(plan, f->jit_pclabel); } else { - // Don't handle unknown fields yet. - m->tablearray[j] = d->jit_code + dasm_getpclabel(d, 0); + // TODO: extend the JIT to handle unknown fields. + // For the moment we exit the JIT for any unknown field. 
+ m->tablearray[j] = globals[UPB_JIT_GLOBAL_exit_jit]; } } } - dasm_free(d); + dasm_free(plan); free(globals); - mprotect(d->jit_code, d->jit_size, PROT_EXEC | PROT_READ); + mprotect(plan->jit_code, plan->jit_size, PROT_EXEC | PROT_READ); // View with: objdump -M intel -D -b binary -mi386 -Mx86-64 /tmp/machine-code // Or: ndisasm -b 64 /tmp/machine-code FILE *f = fopen("/tmp/machine-code", "wb"); - fwrite(d->jit_code, d->jit_size, 1, f); + fwrite(plan->jit_code, plan->jit_size, 1, f); fclose(f); } -void upb_decoder_freejit(upb_decoder *d) { - munmap(d->jit_code, d->jit_size); - free(d->debug_info); +static void upb_decoderplan_freejit(upb_decoderplan *plan) { + munmap(plan->jit_code, plan->jit_size); + free(plan->debug_info); // TODO: unregister } + +static void upb_decoder_enterjit(upb_decoder *d) { + if (d->plan->jit_code && + d->dispatcher.top == d->dispatcher.stack && + d->ptr && d->ptr < d->jit_end) { +#ifndef NDEBUG + register uint64_t rbx asm ("rbx") = 11; + register uint64_t r12 asm ("r12") = 12; + register uint64_t r13 asm ("r13") = 13; + register uint64_t r14 asm ("r14") = 14; + register uint64_t r15 asm ("r15") = 15; +#endif + // Decodes as many fields as possible, updating d->ptr appropriately, + // before falling through to the slow(er) path. + void (*upb_jit_decode)(upb_decoder *d, void*) = (void*)d->plan->jit_code; + upb_jit_decode(d, d->plan->handlers->msgs[d->msg_offset]->jit_func); + assert(d->ptr <= d->end); + + // Test that callee-save registers were properly restored. 
+ assert(rbx == 11); + assert(r12 == 12); + assert(r13 == 13); + assert(r14 == 14); + assert(r15 == 15); + } +} diff --git a/upb/pb/glue.c b/upb/pb/glue.c index 3176355ac9..4949fe3e24 100644 --- a/upb/pb/glue.c +++ b/upb/pb/glue.c @@ -12,8 +12,8 @@ #include "upb/pb/glue.h" #include "upb/pb/textprinter.h" -void upb_strtomsg(const char *str, size_t len, void *msg, const upb_msgdef *md, - upb_status *status) { +bool upb_strtomsg(const char *str, size_t len, void *msg, const upb_msgdef *md, + bool allow_jit, upb_status *status) { upb_stringsrc strsrc; upb_stringsrc_init(&strsrc); upb_stringsrc_reset(&strsrc, str, len); @@ -21,13 +21,21 @@ void upb_strtomsg(const char *str, size_t len, void *msg, const upb_msgdef *md, upb_decoder d; upb_handlers *h = upb_handlers_new(); upb_accessors_reghandlers(h, md); - upb_decoder_init(&d, h); + upb_decoderplan *p = upb_decoderplan_new(h, allow_jit); + upb_decoder_init(&d); upb_handlers_unref(h); - upb_decoder_reset(&d, upb_stringsrc_allbytes(&strsrc), msg); - upb_decoder_decode(&d, status); + upb_decoder_resetplan(&d, p, 0); + upb_decoder_resetinput(&d, upb_stringsrc_allbytes(&strsrc), msg); + upb_success_t ret = upb_decoder_decode(&d); + // stringsrc and the handlers registered by upb_accessors_reghandlers() + // should not suspend. 
+ assert((ret == UPB_OK) == upb_ok(upb_decoder_status(&d))); + if (status) upb_status_copy(status, upb_decoder_status(&d)); upb_stringsrc_uninit(&strsrc); upb_decoder_uninit(&d); + upb_decoderplan_unref(p); + return ret == UPB_OK; } void *upb_filetonewmsg(const char *fname, const upb_msgdef *md, upb_status *s) { @@ -35,7 +43,7 @@ void *upb_filetonewmsg(const char *fname, const upb_msgdef *md, upb_status *s) { size_t len; char *data = upb_readfile(fname, &len); if (!data) goto err; - upb_strtomsg(data, len, msg, md, s); + upb_strtomsg(data, len, msg, md, false, s); if (!upb_ok(s)) goto err; return msg; @@ -69,7 +77,6 @@ void upb_msgtotext(upb_string *str, upb_msg *msg, upb_msgdef *md, } #endif -// TODO: read->load. upb_def **upb_load_defs_from_descriptor(const char *str, size_t len, int *n, upb_status *status) { upb_stringsrc strsrc; @@ -79,17 +86,21 @@ upb_def **upb_load_defs_from_descriptor(const char *str, size_t len, int *n, upb_handlers *h = upb_handlers_new(); upb_descreader_reghandlers(h); + upb_decoderplan *p = upb_decoderplan_new(h, false); upb_decoder d; - upb_decoder_init(&d, h); + upb_decoder_init(&d); upb_handlers_unref(h); upb_descreader r; upb_descreader_init(&r); - upb_decoder_reset(&d, upb_stringsrc_allbytes(&strsrc), &r); + upb_decoder_resetplan(&d, p, 0); + upb_decoder_resetinput(&d, upb_stringsrc_allbytes(&strsrc), &r); - upb_decoder_decode(&d, status); + upb_success_t ret = upb_decoder_decode(&d); + if (status) upb_status_copy(status, upb_decoder_status(&d)); upb_stringsrc_uninit(&strsrc); upb_decoder_uninit(&d); - if (!upb_ok(status)) { + upb_decoderplan_unref(p); + if (ret != UPB_OK) { upb_descreader_uninit(&r); return NULL; } diff --git a/upb/pb/glue.h b/upb/pb/glue.h index 38e8d8ec06..ff8c85e535 100644 --- a/upb/pb/glue.h +++ b/upb/pb/glue.h @@ -36,8 +36,8 @@ extern "C" { // Decodes the given string, which must be in protobuf binary format, to the // given upb_msg with msgdef "md", storing the status of the operation in "s". 
-void upb_strtomsg(const char *str, size_t len, void *msg, - const upb_msgdef *md, upb_status *s); +bool upb_strtomsg(const char *str, size_t len, void *msg, + const upb_msgdef *md, bool allow_jit, upb_status *s); // Parses the given file into a new message of the given type. Caller owns // the returned message (or NULL if an error occurred). diff --git a/upb/pb/varint.h b/upb/pb/varint.h index 19977e97e9..815a7a1ea2 100644 --- a/upb/pb/varint.h +++ b/upb/pb/varint.h @@ -19,6 +19,18 @@ extern "C" { #endif +// The maximum number of bytes that it takes to encode a 64-bit varint. +// Note that with a better encoding this could be 9 (TODO: write up a +// wiki document about this). +#define UPB_PB_VARINT_MAX_LEN 10 + +/* Zig-zag encoding/decoding **************************************************/ + +INLINE int32_t upb_zzdec_32(uint32_t n) { return (n >> 1) ^ -(int32_t)(n & 1); } +INLINE int64_t upb_zzdec_64(uint64_t n) { return (n >> 1) ^ -(int64_t)(n & 1); } +INLINE uint32_t upb_zzenc_32(int32_t n) { return (n << 1) ^ (n >> 31); } +INLINE uint64_t upb_zzenc_64(int64_t n) { return (n << 1) ^ (n >> 63); } + /* Decoding *******************************************************************/ // All decoding functions return this struct by value. @@ -56,7 +68,7 @@ done: INLINE upb_decoderet upb_vdecode_branch64(const char *p) { uint64_t val; uint64_t b; - upb_decoderet r = {(void*)0, 0}; + upb_decoderet r = {NULL, 0}; b = *(p++); val = (b & 0x7f) ; if(!(b & 0x80)) goto done; b = *(p++); val |= (b & 0x7f) << 7; if(!(b & 0x80)) goto done; b = *(p++); val |= (b & 0x7f) << 14; if(!(b & 0x80)) goto done; @@ -124,17 +136,33 @@ INLINE int upb_value_size(uint64_t val) { return val == 0 ? 1 : high_bit / 8 + 1; } +// Encodes a 64-bit varint into buf (which must be >=UPB_PB_VARINT_MAX_LEN +// bytes long), returning how many bytes were used. +// +// TODO: benchmark and optimize if necessary. 
+INLINE size_t upb_vencode64(uint64_t val, char *buf) { + if (val == 0) { buf[0] = 0; return 1; } + size_t i = 0; + while (val) { + uint8_t byte = val & 0x7f; + val >>= 7; + if (val) byte |= 0x80; + buf[i++] = byte; + } + return i; +} + // Encodes a 32-bit varint, *not* sign-extended. INLINE uint64_t upb_vencode32(uint32_t val) { + char buf[UPB_PB_VARINT_MAX_LEN]; + size_t bytes = upb_vencode64(val, buf); uint64_t ret = 0; - for (int bitpos = 0; val; bitpos+=8, val >>=7) { - if (bitpos > 0) ret |= (1 << (bitpos-1)); - ret |= (val & 0x7f) << bitpos; - } + assert(bytes <= 5); + memcpy(&ret, buf, bytes); + assert(ret <= 0xffffffffff); return ret; } - #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/upb/table.h b/upb/table.h index 0786a1afc2..0c0a7854c0 100644 --- a/upb/table.h +++ b/upb/table.h @@ -127,6 +127,8 @@ INLINE bool _upb_inttable_isarrkey(const upb_inttable *t, uint32_t k) { // We have the caller specify the entry_size because fixing this as a literal // (instead of reading table->entry_size) gives the compiler more ability to // optimize. +// +// Note: All returned pointers are invalidated by inserts! 
INLINE void *_upb_inttable_fastlookup(const upb_inttable *t, uint32_t key, size_t entry_size, size_t value_size) { upb_inttable_value *arrval = @@ -203,8 +205,11 @@ typedef struct { } upb_inttable_iter; upb_inttable_iter upb_inttable_begin(const upb_inttable *t); -upb_inttable_iter upb_inttable_next(const upb_inttable *t, upb_inttable_iter iter); -INLINE bool upb_inttable_done(upb_inttable_iter iter) { return iter.value == NULL; } +upb_inttable_iter upb_inttable_next(const upb_inttable *t, + upb_inttable_iter iter); +INLINE bool upb_inttable_done(upb_inttable_iter iter) { + return iter.value == NULL; +} INLINE uint32_t upb_inttable_iter_key(upb_inttable_iter iter) { return iter.key; } diff --git a/upb/upb.c b/upb/upb.c index 5002e10b9c..a3e07e4a3f 100644 --- a/upb/upb.c +++ b/upb/upb.c @@ -15,29 +15,32 @@ #include "upb/bytestream.h" #define alignof(t) offsetof(struct { char c; t x; }, x) -#define TYPE_INFO(wire_type, ctype, inmemory_type) \ - {alignof(ctype), sizeof(ctype), wire_type, UPB_TYPE(inmemory_type), #ctype}, +#define TYPE_INFO(wire_type, ctype, inmemory_type, is_numeric) \ + {alignof(ctype), sizeof(ctype), wire_type, UPB_TYPE(inmemory_type), \ + #ctype, is_numeric}, const upb_type_info upb_types[] = { - TYPE_INFO(UPB_WIRE_TYPE_END_GROUP, void*, MESSAGE) // ENDGROUP (fake) - TYPE_INFO(UPB_WIRE_TYPE_64BIT, double, DOUBLE) // DOUBLE - TYPE_INFO(UPB_WIRE_TYPE_32BIT, float, FLOAT) // FLOAT - TYPE_INFO(UPB_WIRE_TYPE_VARINT, int64_t, INT64) // INT64 - TYPE_INFO(UPB_WIRE_TYPE_VARINT, uint64_t, UINT64) // UINT64 - TYPE_INFO(UPB_WIRE_TYPE_VARINT, int32_t, INT32) // INT32 - TYPE_INFO(UPB_WIRE_TYPE_64BIT, uint64_t, UINT64) // FIXED64 - TYPE_INFO(UPB_WIRE_TYPE_32BIT, uint32_t, UINT32) // FIXED32 - TYPE_INFO(UPB_WIRE_TYPE_VARINT, bool, BOOL) // BOOL - TYPE_INFO(UPB_WIRE_TYPE_DELIMITED, void*, STRING) // STRING - TYPE_INFO(UPB_WIRE_TYPE_START_GROUP, void*, MESSAGE) // GROUP - TYPE_INFO(UPB_WIRE_TYPE_DELIMITED, void*, MESSAGE) // MESSAGE - 
TYPE_INFO(UPB_WIRE_TYPE_DELIMITED, void*, STRING) // BYTES - TYPE_INFO(UPB_WIRE_TYPE_VARINT, uint32_t, UINT32) // UINT32 - TYPE_INFO(UPB_WIRE_TYPE_VARINT, uint32_t, INT32) // ENUM - TYPE_INFO(UPB_WIRE_TYPE_32BIT, int32_t, INT32) // SFIXED32 - TYPE_INFO(UPB_WIRE_TYPE_64BIT, int64_t, INT64) // SFIXED64 - TYPE_INFO(UPB_WIRE_TYPE_VARINT, int32_t, INT32) // SINT32 - TYPE_INFO(UPB_WIRE_TYPE_VARINT, int64_t, INT64) // SINT64 + // END_GROUP is not real, but used to signify the pseudo-field that + // ends a group from within the group. + TYPE_INFO(UPB_WIRE_TYPE_END_GROUP, void*, MESSAGE, false) // ENDGROUP + TYPE_INFO(UPB_WIRE_TYPE_64BIT, double, DOUBLE, true) // DOUBLE + TYPE_INFO(UPB_WIRE_TYPE_32BIT, float, FLOAT, true) // FLOAT + TYPE_INFO(UPB_WIRE_TYPE_VARINT, int64_t, INT64, true) // INT64 + TYPE_INFO(UPB_WIRE_TYPE_VARINT, uint64_t, UINT64, true) // UINT64 + TYPE_INFO(UPB_WIRE_TYPE_VARINT, int32_t, INT32, true) // INT32 + TYPE_INFO(UPB_WIRE_TYPE_64BIT, uint64_t, UINT64, true) // FIXED64 + TYPE_INFO(UPB_WIRE_TYPE_32BIT, uint32_t, UINT32, true) // FIXED32 + TYPE_INFO(UPB_WIRE_TYPE_VARINT, bool, BOOL, true) // BOOL + TYPE_INFO(UPB_WIRE_TYPE_DELIMITED, void*, STRING, false) // STRING + TYPE_INFO(UPB_WIRE_TYPE_START_GROUP, void*, MESSAGE, false) // GROUP + TYPE_INFO(UPB_WIRE_TYPE_DELIMITED, void*, MESSAGE, false) // MESSAGE + TYPE_INFO(UPB_WIRE_TYPE_DELIMITED, void*, STRING, false) // BYTES + TYPE_INFO(UPB_WIRE_TYPE_VARINT, uint32_t, UINT32, true) // UINT32 + TYPE_INFO(UPB_WIRE_TYPE_VARINT, uint32_t, INT32, true) // ENUM + TYPE_INFO(UPB_WIRE_TYPE_32BIT, int32_t, INT32, true) // SFIXED32 + TYPE_INFO(UPB_WIRE_TYPE_64BIT, int64_t, INT64, true) // SFIXED64 + TYPE_INFO(UPB_WIRE_TYPE_VARINT, int32_t, INT32, true) // SINT32 + TYPE_INFO(UPB_WIRE_TYPE_VARINT, int64_t, INT64, true) // SINT64 }; #ifdef NDEBUG @@ -66,13 +69,13 @@ void upb_status_seterrf(upb_status *s, const char *msg, ...) 
{ } void upb_status_seterrliteral(upb_status *status, const char *msg) { - status->code = UPB_ERROR; + status->error = true; status->str = msg; status->space = NULL; } void upb_status_copy(upb_status *to, const upb_status *from) { - to->status = from->status; + to->error = from->error; to->eof = from->eof; to->code = from->code; to->space = from->space; @@ -92,15 +95,20 @@ const char *upb_status_getstr(const upb_status *_status) { // Function is logically const but can modify internal state to materialize // the string. upb_status *status = (upb_status*)_status; - if (status->str == NULL && status->space && status->space->code_to_string) { - status->space->code_to_string(status->code, status->buf, status->bufsize); - status->str = status->buf; + if (status->str == NULL && status->space) { + if (status->space->code_to_string) { + status->space->code_to_string(status->code, status->buf, status->bufsize); + status->str = status->buf; + } else { + upb_status_seterrf(status, "No message, error space=%s, code=%d\n", + status->space->name, status->code); + } } return status->str; } void upb_status_clear(upb_status *status) { - status->status = UPB_OK; + status->error = false; status->eof = false; status->code = 0; status->space = NULL; @@ -114,19 +122,38 @@ void upb_status_setcode(upb_status *status, upb_errorspace *space, int code) { } void upb_status_fromerrno(upb_status *status) { - if (errno == 0) { - status->status = UPB_OK; - } else if (errno == EAGAIN || errno == EWOULDBLOCK) { - status->status = UPB_WOULDBLOCK; - } else { - status->status = UPB_ERROR; + if (errno != 0 && !upb_errno_is_wouldblock()) { + status->error = true; + upb_status_setcode(status, &upb_posix_errorspace, errno); + } +} + +bool upb_errno_is_wouldblock() { + return +#ifdef EAGAIN + errno == EAGAIN || +#endif +#ifdef EWOULDBLOCK + errno == EWOULDBLOCK || +#endif + false; +} + +bool upb_posix_codetostr(int code, char *buf, size_t len) { + if (strerror_r(code, buf, len) == -1) { + if (errno == 
EINVAL) { + return snprintf(buf, len, "Invalid POSIX error number %d\n", code) >= len; + } else if (errno == ERANGE) { + return false; + } + assert(false); } - upb_status_setcode(status, &upb_posix_errorspace, errno); + return true; } -upb_errorspace upb_posix_errorspace = {"POSIX", NULL}; // TODO +upb_errorspace upb_posix_errorspace = {"POSIX", &upb_posix_codetostr}; -int upb_vrprintf(char **buf, uint32_t *size, uint32_t ofs, +int upb_vrprintf(char **buf, size_t *size, size_t ofs, const char *fmt, va_list args) { // Try once without reallocating. We have to va_copy because we might have // to call vsnprintf again. @@ -141,7 +168,7 @@ int upb_vrprintf(char **buf, uint32_t *size, uint32_t ofs, // Need to print again, because some characters were truncated. vsnprintf // will not write the entire string unless you give it space to store the // NULL terminator also. - while (*size < (ofs + true_len + 1)) *size = UPB_MAX(*size * 2, 2); + *size = (ofs + true_len + 1); char *newbuf = realloc(*buf, *size); if (!newbuf) return -1; vsnprintf(newbuf + ofs, true_len + 1, fmt, args); diff --git a/upb/upb.h b/upb/upb.h index e43418fbea..d11c7cb15a 100644 --- a/upb/upb.h +++ b/upb/upb.h @@ -10,10 +10,12 @@ #ifndef UPB_H_ #define UPB_H_ -#include -#include #include #include +#include +#include +#include +#include #include "descriptor_const.h" #include "atomic.h" @@ -26,6 +28,12 @@ extern "C" { #define INLINE static inline #endif +#ifdef __GNUC__ +#define UPB_NORETURN __attribute__((__noreturn__)) +#else +#define UPB_NORETURN +#endif + #define UPB_MAX(x, y) ((x) > (y) ? (x) : (y)) #define UPB_MIN(x, y) ((x) < (y) ? (x) : (y)) #define UPB_INDEX(base, i, m) (void*)((char*)(base) + ((i)*(m))) @@ -115,6 +123,7 @@ typedef struct { uint8_t native_wire_type; uint8_t inmemory_type; // For example, INT32, SINT32, and SFIXED32 -> INT32 const char *ctype; + bool is_numeric; // Only numeric types can be packed. 
} upb_type_info; // A static array of info about all of the field types, indexed by type number. @@ -176,6 +185,7 @@ typedef struct { return val.val.membername; \ } \ INLINE void upb_value_set ## name(upb_value *val, ctype cval) { \ + memset(val, 0, sizeof(*val)); \ SET_TYPE(val->type, proto_type); \ val->val.membername = cval; \ } \ @@ -206,27 +216,31 @@ extern upb_value UPB_NO_VALUE; /* upb_status *****************************************************************/ -enum { +typedef enum { UPB_OK, // The operation completed successfully. - UPB_WOULDBLOCK, // Stream is nonblocking and the operation would block. + UPB_SUSPENDED, // The operation was suspended and may be resumed later. UPB_ERROR, // An error occurred. -}; +} upb_success_t; typedef struct { const char *name; // Writes a NULL-terminated string to "buf" containing an error message for // the given error code, returning false if the message was too large to fit. - bool (*code_to_string)(int code, char *buf, uint32_t len); + bool (*code_to_string)(int code, char *buf, size_t len); } upb_errorspace; typedef struct { - char status; + bool error; bool eof; - int code; // Can be set to a more specific code (defined by error space). + + // Specific status code defined by some error space (optional). + int code; upb_errorspace *space; + + // Error message (optional). const char *str; // NULL when no message is present. NULL-terminated. char *buf; // Owned by the status. 
- uint32_t bufsize; + size_t bufsize; } upb_status; #define UPB_STATUS_INIT {UPB_OK, false, 0, NULL, NULL, NULL, 0} @@ -234,7 +248,7 @@ typedef struct { void upb_status_init(upb_status *status); void upb_status_uninit(upb_status *status); -INLINE bool upb_ok(const upb_status *status) { return status->code == UPB_OK; } +INLINE bool upb_ok(const upb_status *status) { return !status->error; } INLINE bool upb_eof(const upb_status *status) { return status->eof; } void upb_status_clear(upb_status *status); @@ -248,6 +262,7 @@ void upb_status_copy(upb_status *to, const upb_status *from); extern upb_errorspace upb_posix_errorspace; void upb_status_fromerrno(upb_status *status); +bool upb_errno_is_wouldblock(); // Like vasprintf (which allocates a string large enough for the result), but // uses *buf (which can be NULL) as a starting point and reallocates it only if @@ -255,7 +270,7 @@ void upb_status_fromerrno(upb_status *status); // of the buffer. Starts writing at the given offset into the string; bytes // preceding this offset are unaffected. Returns the new length of the string, // or -1 on memory allocation failure. -int upb_vrprintf(char **buf, uint32_t *size, uint32_t ofs, +int upb_vrprintf(char **buf, size_t *size, size_t ofs, const char *fmt, va_list args); #ifdef __cplusplus