Refinement of upb_bytesrc interface.

Added a upb_byteregion that tracks a region of
the input buffer; decoders use this instead of
using a upb_bytesrc directly.  upb_byteregion
is also used as the way of passing a string to
a upb_handlers callback.  This symmetry makes
decoders compose better; if you want to take
a parsed string and decode it as something else,
you can take the string directly from the callback
and feed it as input to another parser.

A commented-out version of a pinning interface
is present; I decline to actually implement it
(and accept its extra complexity) until/unless
it is clear that it is actually a win.  But it
is included as a proof-of-concept, to show that
it fits well with the existing interface.
pull/13171/head
Joshua Haberman 13 years ago
parent 99ae0ed397
commit b5f5ee867e
  1. 3
      benchmarks/parsestream.upb.c
  2. 24
      benchmarks/parsetoproto2.upb.cc
  3. 8
      benchmarks/parsetostruct.upb.c
  4. 1
      tests/test_cpp.cc
  5. 2
      tests/test_decoder.c
  6. 1
      tests/test_varint.c
  7. 8
      tests/tests.c
  8. 161
      upb/bytestream.c
  9. 378
      upb/bytestream.h
  10. 30
      upb/def.c
  11. 15
      upb/descriptor.c
  12. 10
      upb/handlers.h
  13. 11
      upb/msg.c
  14. 149
      upb/pb/decoder.c
  15. 34
      upb/pb/decoder.h
  16. 47
      upb/pb/decoder_x64.dasc
  17. 13
      upb/pb/glue.c
  18. 13
      upb/pb/textprinter.c
  19. 2
      upb/pb/varint.h
  20. 1
      upb/table.h
  21. 6
      upb/upb.c
  22. 42
      upb/upb.h

@ -76,8 +76,7 @@ static size_t run(int i)
(void)i; (void)i;
upb_status status = UPB_STATUS_INIT; upb_status status = UPB_STATUS_INIT;
upb_stringsrc_reset(&stringsrc, input_str, input_len); upb_stringsrc_reset(&stringsrc, input_str, input_len);
upb_decoder_reset(&decoder, upb_stringsrc_bytesrc(&stringsrc), upb_decoder_reset(&decoder, upb_stringsrc_allbytes(&stringsrc), NULL);
0, UPB_NONDELIMITED, NULL);
upb_decoder_decode(&decoder, &status); upb_decoder_decode(&decoder, &status);
if(!upb_ok(&status)) goto err; if(!upb_ok(&status)) goto err;
return input_len; return input_len;

@ -24,6 +24,7 @@
#include <google/protobuf/descriptor.h> #include <google/protobuf/descriptor.h>
#undef private #undef private
char *str;
static size_t len; static size_t len;
MESSAGE_CIDENT msg[NUM_MESSAGES]; MESSAGE_CIDENT msg[NUM_MESSAGES];
MESSAGE_CIDENT msg2; MESSAGE_CIDENT msg2;
@ -53,9 +54,13 @@ upb_flow_t proto2_setstr(void *m, upb_value fval, upb_value val) {
const upb_fielddef *f = upb_value_getfielddef(fval); const upb_fielddef *f = upb_value_getfielddef(fval);
std::string **str = (std::string**)UPB_INDEX(m, f->offset, 1); std::string **str = (std::string**)UPB_INDEX(m, f->offset, 1);
if (*str == f->default_ptr) *str = new std::string; if (*str == f->default_ptr) *str = new std::string;
const upb_strref *ref = upb_value_getstrref(val); const upb_byteregion *ref = upb_value_getbyteregion(val);
uint32_t len;
(*str)->assign(
upb_byteregion_getptr(ref, upb_byteregion_startofs(ref), &len),
upb_byteregion_len(ref));
assert(len == upb_byteregion_len(ref));
// XXX: only supports contiguous strings atm. // XXX: only supports contiguous strings atm.
(*str)->assign(ref->ptr, ref->len);
return UPB_CONTINUE; return UPB_CONTINUE;
} }
@ -64,9 +69,13 @@ upb_flow_t proto2_append_str(void *_r, upb_value fval, upb_value val) {
typedef google::protobuf::RepeatedPtrField<std::string> R; typedef google::protobuf::RepeatedPtrField<std::string> R;
(void)fval; (void)fval;
R *r = (R*)_r; R *r = (R*)_r;
const upb_strref *ref = upb_value_getstrref(val); const upb_byteregion *ref = upb_value_getbyteregion(val);
// XXX: only supports contiguous strings atm. // XXX: only supports contiguous strings atm.
r->Add()->assign(ref->ptr, ref->len); uint32_t len;
r->Add()->assign(
upb_byteregion_getptr(ref, upb_byteregion_startofs(ref), &len),
upb_byteregion_len(ref));
assert(len == upb_byteregion_len(ref));
return UPB_CONTINUE; return UPB_CONTINUE;
} }
@ -265,7 +274,7 @@ static bool initialize()
upb_symtab_unref(s); upb_symtab_unref(s);
// Read the message data itself. // Read the message data itself.
char *str = upb_readfile(MESSAGE_FILE, &len); str = upb_readfile(MESSAGE_FILE, &len);
if(str == NULL) { if(str == NULL) {
fprintf(stderr, "Error reading " MESSAGE_FILE "\n"); fprintf(stderr, "Error reading " MESSAGE_FILE "\n");
return false; return false;
@ -275,7 +284,6 @@ static bool initialize()
msg2.ParseFromArray(str, len); msg2.ParseFromArray(str, len);
upb_stringsrc_init(&strsrc); upb_stringsrc_init(&strsrc);
upb_stringsrc_reset(&strsrc, str, len);
upb_handlers *h = upb_handlers_new(); upb_handlers *h = upb_handlers_new();
upb_accessors_reghandlers(h, def); upb_accessors_reghandlers(h, def);
if (!JIT) h->should_jit = false; if (!JIT) h->should_jit = false;
@ -296,8 +304,8 @@ static size_t run(int i)
(void)i; (void)i;
upb_status status = UPB_STATUS_INIT; upb_status status = UPB_STATUS_INIT;
msg[i % NUM_MESSAGES].Clear(); msg[i % NUM_MESSAGES].Clear();
upb_decoder_reset(&d, upb_stringsrc_bytesrc(&strsrc), upb_stringsrc_reset(&strsrc, str, len);
0, UPB_NONDELIMITED, &msg[i % NUM_MESSAGES]); upb_decoder_reset(&d, upb_stringsrc_allbytes(&strsrc), &msg[i % NUM_MESSAGES]);
upb_decoder_decode(&d, &status); upb_decoder_decode(&d, &status);
if(!upb_ok(&status)) goto err; if(!upb_ok(&status)) goto err;
return len; return len;

@ -8,6 +8,7 @@
#include "upb/pb/glue.h" #include "upb/pb/glue.h"
static const upb_msgdef *def; static const upb_msgdef *def;
char *str;
static size_t len; static size_t len;
static void *msg[NUM_MESSAGES]; static void *msg[NUM_MESSAGES];
static upb_stringsrc strsrc; static upb_stringsrc strsrc;
@ -33,7 +34,7 @@ static bool initialize()
upb_symtab_unref(s); upb_symtab_unref(s);
// Read the message data itself. // Read the message data itself.
char *str = upb_readfile(MESSAGE_FILE, &len); str = upb_readfile(MESSAGE_FILE, &len);
if(str == NULL) { if(str == NULL) {
fprintf(stderr, "Error reading " MESSAGE_FILE "\n"); fprintf(stderr, "Error reading " MESSAGE_FILE "\n");
return false; return false;
@ -43,7 +44,6 @@ static bool initialize()
msg[i] = upb_stdmsg_new(def); msg[i] = upb_stdmsg_new(def);
upb_stringsrc_init(&strsrc); upb_stringsrc_init(&strsrc);
upb_stringsrc_reset(&strsrc, str, len);
upb_handlers *h = upb_handlers_new(); upb_handlers *h = upb_handlers_new();
upb_accessors_reghandlers(h, def); upb_accessors_reghandlers(h, def);
if (!JIT) h->should_jit = false; if (!JIT) h->should_jit = false;
@ -70,8 +70,8 @@ static size_t run(int i)
upb_status status = UPB_STATUS_INIT; upb_status status = UPB_STATUS_INIT;
i %= NUM_MESSAGES; i %= NUM_MESSAGES;
upb_msg_clear(msg[i], def); upb_msg_clear(msg[i], def);
upb_decoder_reset(&d, upb_stringsrc_bytesrc(&strsrc), upb_stringsrc_reset(&strsrc, str, len);
0, UPB_NONDELIMITED, msg[i]); upb_decoder_reset(&d, upb_stringsrc_allbytes(&strsrc), msg[i]);
upb_decoder_decode(&d, &status); upb_decoder_decode(&d, &status);
if(!upb_ok(&status)) goto err; if(!upb_ok(&status)) goto err;
return len; return len;

@ -7,6 +7,7 @@
* Tests for C++ wrappers. * Tests for C++ wrappers.
*/ */
#include <stdio.h>
#include <iostream> #include <iostream>
#include "upb/def.hpp" #include "upb/def.hpp"
#include "upb/pb/glue.hpp" #include "upb/pb/glue.hpp"

@ -52,7 +52,7 @@ int main(int argc, char *argv[]) {
upb_decoder d; upb_decoder d;
upb_decoder_init(&d, handlers); upb_decoder_init(&d, handlers);
upb_decoder_reset(&d, upb_stdio_bytesrc(&in), 0, UPB_NONDELIMITED, p); upb_decoder_reset(&d, upb_stdio_allbytes(&in), p);
upb_status_clear(&status); upb_status_clear(&status);
upb_decoder_decode(&d, &status); upb_decoder_decode(&d, &status);

@ -4,6 +4,7 @@
* Copyright (c) 2011 Google Inc. See LICENSE for details. * Copyright (c) 2011 Google Inc. See LICENSE for details.
*/ */
#include <stdio.h>
#include "upb/pb/varint.h" #include "upb/pb/varint.h"
#include "upb_test.h" #include "upb_test.h"

@ -16,15 +16,16 @@ static upb_symtab *load_test_proto() {
ASSERT(s); ASSERT(s);
upb_status status = UPB_STATUS_INIT; upb_status status = UPB_STATUS_INIT;
if (!upb_load_descriptor_file_into_symtab(s, descriptor_file, &status)) { if (!upb_load_descriptor_file_into_symtab(s, descriptor_file, &status)) {
fprintf(stderr, "Error loading descriptor file: %s\n", upb_status_getstr(&status)); fprintf(stderr, "Error loading descriptor file: %s\n",
upb_status_getstr(&status));
exit(1); exit(1);
} }
upb_status_uninit(&status); upb_status_uninit(&status);
return s; return s;
} }
static upb_flow_t upb_test_onvalue(void *closure, upb_value fval, upb_value val) { static upb_flow_t upb_test_onvalue(void *c, upb_value fval, upb_value val) {
(void)closure; (void)c;
(void)fval; (void)fval;
(void)val; (void)val;
return UPB_CONTINUE; return UPB_CONTINUE;
@ -56,6 +57,7 @@ static void test_upb_symtab() {
upb_symtab_unref(s); upb_symtab_unref(s);
const upb_msgdef *m = upb_downcast_msgdef_const(def); const upb_msgdef *m = upb_downcast_msgdef_const(def);
upb_msg_iter i = upb_msg_begin(m); upb_msg_iter i = upb_msg_begin(m);
ASSERT(!upb_msg_done(i));
upb_fielddef *f = upb_msg_iter_field(i); upb_fielddef *f = upb_msg_iter_field(i);
ASSERT(upb_hassubdef(f)); ASSERT(upb_hassubdef(f));
upb_def *def2 = f->def; upb_def *def2 = f->def;

@ -14,29 +14,33 @@
// We can make this configurable if necessary. // We can make this configurable if necessary.
#define BUF_SIZE 32768 #define BUF_SIZE 32768
char *upb_strref_dup(const struct _upb_strref *r) { char *upb_byteregion_strdup(const struct _upb_byteregion *r) {
char *ret = (char*)malloc(r->len + 1); char *ret = malloc(upb_byteregion_len(r) + 1);
upb_bytesrc_read(r->bytesrc, r->stream_offset, r->len, ret); upb_byteregion_copyall(r, ret);
ret[r->len] = '\0'; ret[upb_byteregion_len(r)] = '\0';
return ret; return ret;
} }
upb_strref *upb_strref_new(const char *str) { upb_byteregion *upb_byteregion_new(const void *str) {
return upb_strref_newl(str, strlen(str)); return upb_byteregion_newl(str, strlen(str));
} }
upb_strref *upb_strref_newl(const void *str, size_t len) { upb_byteregion *upb_byteregion_newl(const void *str, uint32_t len) {
upb_strref *s = malloc(sizeof(*s)); upb_stringsrc *src = malloc(sizeof(*src));
s->bytesrc = NULL; upb_stringsrc_init(src);
s->ptr = malloc(len); char *ptr = malloc(len + 1);
memcpy((void*)s->ptr, str, len); memcpy(ptr, str, len);
return s; ptr[len] = '\0';
upb_stringsrc_reset(src, ptr, len);
return upb_stringsrc_allbytes(src);
} }
void upb_strref_free(upb_strref *ref) { void upb_byteregion_free(upb_byteregion *r) {
if (!ref) return; if (!r) return;
free((char*)ref->ptr); uint32_t len;
free(ref); free((char*)upb_byteregion_getptr(r, 0, &len));
upb_stringsrc_uninit((upb_stringsrc*)r->bytesrc);
free(r->bytesrc);
} }
void upb_bytesink_init(upb_bytesink *sink, upb_bytesink_vtbl *vtbl) { void upb_bytesink_init(upb_bytesink *sink, upb_bytesink_vtbl *vtbl) {
@ -48,6 +52,31 @@ void upb_bytesink_uninit(upb_bytesink *sink) {
upb_status_uninit(&sink->status); upb_status_uninit(&sink->status);
} }
// Makes "r" a view of the sub-range of "src" that starts at offset "ofs" and
// spans "len" bytes.  "r" shares src's bytesrc; nothing is fetched or copied.
//
// "ofs" must not precede the start of "src", and "len" must not exceed what
// remains in "src" after "ofs" (both are checked by assert only).  Passing
// UPB_NONDELIMITED as "len" is accepted when "src" is itself nondelimited and
// produces a nondelimited region.
void upb_byteregion_reset(upb_byteregion *r, const upb_byteregion *src,
                          uint64_t ofs, uint64_t len) {
  assert(ofs >= upb_byteregion_startofs(src));
  assert(len <= upb_byteregion_remaining(src, ofs));
  r->bytesrc = src->bytesrc;
  // A region derived from another region never forwards discards itself.
  r->toplevel = false;
  r->start = ofs;
  r->discard = ofs;
  // UINT64_MAX is the value of the UPB_NONDELIMITED sentinel.  Saturate rather
  // than let "ofs + len" wrap around: a nondelimited length combined with a
  // non-zero offset would otherwise produce a small, bogus end offset.
  r->end = (len > UINT64_MAX - ofs) ? UINT64_MAX : ofs + len;
  // Bytes the parent has already fetched are usable here too, but never past
  // our own end.
  r->fetch = UPB_MIN(src->fetch, r->end);
}
// Asks the bytesrc for more data at the current fetch offset and advances the
// fetch offset by however much was obtained, capped at what remains in the
// region.  Returns true if the fetch offset advanced.  Returns false, with EOF
// set on *s, when nothing remains to fetch; also returns false when the
// bytesrc reports zero bytes (in which case *s is whatever the bytesrc left).
bool upb_byteregion_fetch(upb_byteregion *r, upb_status *s) {
  uint64_t left = upb_byteregion_remaining(r, r->fetch);
  if (left != 0) {
    uint64_t got = upb_bytesrc_fetch(r->bytesrc, r->fetch, s);
    if (got != 0) {
      // The source may hand back more than this region is allowed to see.
      r->fetch += UPB_MIN(got, left);
      return true;
    }
    return false;
  }
  upb_status_seteof(s);
  return false;
}
/* upb_stdio ******************************************************************/ /* upb_stdio ******************************************************************/
int upb_stdio_cmpbuf(const void *_key, const void *_elem) { int upb_stdio_cmpbuf(const void *_key, const void *_elem) {
@ -86,63 +115,54 @@ static upb_stdio_buf *upb_stdio_rotatebufs(upb_stdio *s) {
return s->bufs[s->nbuf-num_reused]; return s->bufs[s->nbuf-num_reused];
} }
size_t upb_stdio_fetch(void *src, uint64_t ofs, upb_status *s) { void upb_stdio_discard(void *src, uint64_t ofs) {
(void)src;
(void)ofs;
}
uint32_t upb_stdio_fetch(void *src, uint64_t ofs, upb_status *s) {
(void)ofs; (void)ofs;
upb_stdio *stdio = (upb_stdio*)src; upb_stdio *stdio = (upb_stdio*)src;
upb_stdio_buf *buf = upb_stdio_rotatebufs(stdio); upb_stdio_buf *buf = upb_stdio_rotatebufs(stdio);
size_t read = fread(&buf->data, 1, BUF_SIZE, stdio->file); uint32_t read = fread(&buf->data, 1, BUF_SIZE, stdio->file);
if(read < (size_t)BUF_SIZE) { buf->len = read;
if(read < (uint32_t)BUF_SIZE) {
// Error or EOF. // Error or EOF.
if(feof(stdio->file)) return 0; if(feof(stdio->file)) {
upb_status_seteof(s);
return read;
}
if(ferror(stdio->file)) { if(ferror(stdio->file)) {
upb_status_fromerrno(s); upb_status_fromerrno(s);
return -1; return 0;
} }
assert(false); assert(false);
} }
buf->len = read;
return buf->ofs + buf->len; return buf->ofs + buf->len;
} }
void upb_stdio_read(const void *src, uint64_t src_ofs, size_t len, char *dst) { void upb_stdio_read(const void *src, uint64_t ofs, uint32_t len, char *dst) {
upb_stdio_buf *buf = upb_stdio_findbuf(src, src_ofs); upb_stdio_buf *buf = upb_stdio_findbuf(src, ofs);
src_ofs -= buf->ofs; ofs -= buf->ofs;
memcpy(dst, &buf->data[src_ofs], BUF_SIZE - src_ofs); memcpy(dst, buf->data + ofs, BUF_SIZE - ofs);
len -= (BUF_SIZE - src_ofs); len -= (BUF_SIZE - ofs);
dst += (BUF_SIZE - src_ofs); dst += (BUF_SIZE - ofs);
while (len > 0) { while (len > 0) {
++buf; ++buf;
size_t bytes = UPB_MIN(len, BUF_SIZE); uint32_t bytes = UPB_MIN(len, BUF_SIZE);
memcpy(dst, buf->data, bytes); memcpy(dst, buf->data, bytes);
len -= bytes; len -= bytes;
dst += bytes; dst += bytes;
} }
} }
const char *upb_stdio_getptr(void *src, uint64_t ofs, size_t *len) { const char *upb_stdio_getptr(const void *src, uint64_t ofs, uint32_t *len) {
upb_stdio_buf *buf = upb_stdio_findbuf(src, ofs); upb_stdio_buf *buf = upb_stdio_findbuf(src, ofs);
ofs -= buf->ofs; ofs -= buf->ofs;
*len = BUF_SIZE - ofs; *len = BUF_SIZE - ofs;
return &buf->data[ofs]; return &buf->data[ofs];
} }
void upb_stdio_refregion(void *src, uint64_t ofs, size_t len) {
upb_stdio_buf *buf = upb_stdio_findbuf(src, ofs);
len -= (BUF_SIZE - ofs);
++buf->refcount;
while (len > 0) {
len -= BUF_SIZE;
++buf;
++buf->refcount;
}
}
void upb_stdio_unrefregion(void *src, uint64_t ofs, size_t len) {
(void)src;
(void)ofs;
(void)len;
}
#if 0 #if 0
upb_strlen_t upb_stdio_putstr(upb_bytesink *sink, upb_string *str, upb_status *status) { upb_strlen_t upb_stdio_putstr(upb_bytesink *sink, upb_string *str, upb_status *status) {
upb_stdio *stdio = (upb_stdio*)((char*)sink - offsetof(upb_stdio, sink)); upb_stdio *stdio = (upb_stdio*)((char*)sink - offsetof(upb_stdio, sink));
@ -154,7 +174,6 @@ upb_strlen_t upb_stdio_putstr(upb_bytesink *sink, upb_string *str, upb_status *s
} }
return written; return written;
} }
#endif
uint32_t upb_stdio_vprintf(upb_bytesink *sink, upb_status *status, uint32_t upb_stdio_vprintf(upb_bytesink *sink, upb_status *status,
const char *fmt, va_list args) { const char *fmt, va_list args) {
@ -166,16 +185,14 @@ uint32_t upb_stdio_vprintf(upb_bytesink *sink, upb_status *status,
} }
return written; return written;
} }
#endif
void upb_stdio_init(upb_stdio *stdio) { void upb_stdio_init(upb_stdio *stdio) {
static upb_bytesrc_vtbl bytesrc_vtbl = { static upb_bytesrc_vtbl bytesrc_vtbl = {
upb_stdio_fetch, &upb_stdio_fetch,
upb_stdio_read, &upb_stdio_discard,
upb_stdio_getptr, &upb_stdio_read,
upb_stdio_refregion, &upb_stdio_getptr,
upb_stdio_unrefregion,
NULL,
NULL
}; };
upb_bytesrc_init(&stdio->src, &bytesrc_vtbl); upb_bytesrc_init(&stdio->src, &bytesrc_vtbl);
@ -209,26 +226,32 @@ void upb_stdio_uninit(upb_stdio *stdio) {
stdio->file = NULL; stdio->file = NULL;
} }
upb_bytesrc* upb_stdio_bytesrc(upb_stdio *stdio) { return &stdio->src; } upb_byteregion* upb_stdio_allbytes(upb_stdio *stdio) { return &stdio->byteregion; }
upb_bytesink* upb_stdio_bytesink(upb_stdio *stdio) { return &stdio->sink; } upb_bytesink* upb_stdio_bytesink(upb_stdio *stdio) { return &stdio->sink; }
/* upb_stringsrc **************************************************************/ /* upb_stringsrc **************************************************************/
size_t upb_stringsrc_fetch(void *_src, uint64_t ofs, upb_status *s) { uint32_t upb_stringsrc_fetch(void *_src, uint64_t ofs, upb_status *s) {
upb_stringsrc *src = _src; upb_stringsrc *src = _src;
(void)s; // No errors can occur. upb_status_seteof(s);
return src->len - ofs; return src->len - ofs;
} }
void upb_stringsrc_read(const void *_src, uint64_t src_ofs, void upb_stringsrc_read(const void *_src, uint64_t ofs,
size_t len, char *dst) { uint32_t len, char *dst) {
const upb_stringsrc *src = _src; const upb_stringsrc *src = _src;
memcpy(dst, src->str + src_ofs, len); assert(ofs + len <= src->len);
memcpy(dst, src->str + ofs, len);
} }
const char *upb_stringsrc_getptr(void *_src, uint64_t ofs, size_t *len) { void upb_stringsrc_discard(void *src, uint64_t ofs) {
upb_stringsrc *src = _src; (void)src;
(void)ofs;
}
const char *upb_stringsrc_getptr(const void *_s, uint64_t ofs, uint32_t *len) {
const upb_stringsrc *src = _s;
*len = src->len - ofs; *len = src->len - ofs;
return src->str + ofs; return src->str + ofs;
} }
@ -236,17 +259,23 @@ const char *upb_stringsrc_getptr(void *_src, uint64_t ofs, size_t *len) {
void upb_stringsrc_init(upb_stringsrc *s) { void upb_stringsrc_init(upb_stringsrc *s) {
static upb_bytesrc_vtbl vtbl = { static upb_bytesrc_vtbl vtbl = {
&upb_stringsrc_fetch, &upb_stringsrc_fetch,
&upb_stringsrc_discard,
&upb_stringsrc_read, &upb_stringsrc_read,
&upb_stringsrc_getptr, &upb_stringsrc_getptr,
NULL, NULL, NULL, NULL
}; };
upb_bytesrc_init(&s->bytesrc, &vtbl); upb_bytesrc_init(&s->bytesrc, &vtbl);
s->str = NULL; s->str = NULL;
s->byteregion.bytesrc = &s->bytesrc;
s->byteregion.toplevel = true;
} }
void upb_stringsrc_reset(upb_stringsrc *s, const char *str, size_t len) { void upb_stringsrc_reset(upb_stringsrc *s, const char *str, uint32_t len) {
s->str = str; s->str = str;
s->len = len; s->len = len;
s->byteregion.start = 0;
s->byteregion.discard = 0;
s->byteregion.fetch = 0;
s->byteregion.end = len;
} }
void upb_stringsrc_uninit(upb_stringsrc *s) { (void)s; } void upb_stringsrc_uninit(upb_stringsrc *s) { (void)s; }
@ -262,7 +291,7 @@ void upb_stringsink_uninit(upb_stringsink *s) {
free(s->str); free(s->str);
} }
void upb_stringsink_reset(upb_stringsink *s, char *str, size_t size) { void upb_stringsink_reset(upb_stringsink *s, char *str, uint32_t size) {
free(s->str); free(s->str);
s->str = str; s->str = str;
s->len = 0; s->len = 0;

@ -4,19 +4,73 @@
* Copyright (c) 2011 Google Inc. See LICENSE for details. * Copyright (c) 2011 Google Inc. See LICENSE for details.
* Author: Josh Haberman <jhaberman@gmail.com> * Author: Josh Haberman <jhaberman@gmail.com>
* *
* This file contains upb_bytesrc and upb_bytesink, which are abstractions of * This file defines three core interfaces:
* stdio (fread()/fwrite()/etc) that provide useful buffering/sharing * - upb_bytesink: for writing streams of data.
* semantics. They are virtual base classes so concrete implementations * - upb_bytesrc: for reading streams of data.
* can get the data from a fd, a string, a cord, etc. * - upb_byteregion: for reading from a specific region of a upb_bytesrc;
* should be used by decoders instead of using upb_bytesrc directly.
* *
* Byte streams are NOT thread-safe! (Like f{read,write}_unlocked()) * These interfaces are used by streaming encoders and decoders: for example, a
* This may change (in particular, bytesrc objects may be better thread-safe). * protobuf parser gets its input from a upb_byteregion. They are virtual base
* classes so concrete implementations can get the data from a fd, a FILE*, a
* string, etc.
*/ */
// A upb_byteregion represents a region of data from a bytesrc.
//
// Parsers get data from this interface instead of a bytesrc because we often
// want to parse only a specific region of the input. For example, if we parse
// a string from our input but know that the string represents a protobuf, we
// can pass its upb_byteregion to an appropriate protobuf parser.
//
// Since the bytes may be coming from a file or network socket, bytes must be
// fetched before they can be read (though in some cases this fetch may be a
// no-op). "fetch" is the only operation on a byteregion that could fail or
// block, because it is the only operation that actually performs I/O.
//
// Bytes can be discarded when they are no longer needed. Parsers should
// always discard bytes they no longer need, both so the buffers can be freed
// when possible and to give better visibility into what bytes the parser is
// still using.
//
// start      discard                     read             fetch             end
// ofs          ofs                       ofs               ofs              ofs
// |             |--->discard()            |                 |--->fetch()      |
// V             V                         V                 V                 V
// +-------------+-------------------------+-----------------+-----------------+
// |  discarded  |                         |                 |    fetchable    |
// +-------------+-------------------------+-----------------+-----------------+
//               | <------------- loaded ------------------> |
//                                         | <- available -> |
//                                         | <---------- remaining ----------> |
//
// Note that the start offset may be something other than zero! A byteregion
// is a view into an underlying bytesrc stream, and the region may start
// somewhere other than the beginning of that stream.
//
// The region can be either delimited or nondelimited. A non-delimited region
// will keep returning data until the underlying data source returns EOF. A
// delimited region will return EOF at a predetermined offset.
//
//                        end
//                        ofs
//                         |
//                         V
// +-----------------------+
// |   delimited region    | <-- hard EOF, even if data source has more data.
// +-----------------------+
//
// +------------------------
// |  nondelimited region  Z <-- won't return EOF until data source hits EOF.
// +------------------------
#ifndef UPB_BYTESTREAM_H #ifndef UPB_BYTESTREAM_H
#define UPB_BYTESTREAM_H #define UPB_BYTESTREAM_H
#include <stdarg.h> #include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include "upb.h" #include "upb.h"
@ -29,25 +83,22 @@ extern "C" {
/* upb_bytesrc ****************************************************************/ /* upb_bytesrc ****************************************************************/
// A upb_bytesrc allows the consumer of a stream of bytes to obtain buffers as // A upb_bytesrc allows the consumer of a stream of bytes to obtain buffers as
// they become available, and to preserve some trailing amount of data, which // they become available, and to preserve some trailing amount of data before
// is useful for lazy parsing (among other things). If there is a submessage // it is discarded. Consumers should not use upb_bytesrc directly, but rather
// that we want to parse later we can take a reference on that region of the // should use a upb_byteregion (which allows access to a region of a bytesrc).
// input buffer. This will guarantee that the bytesrc keeps the submessage //
// data around for later use, without requiring a copy out of the input // upb_bytesrc is a virtual base class with implementations that get data from
// buffers. // eg. a string, a cord, a file descriptor, a FILE*, etc.
typedef size_t upb_bytesrc_fetch_func(void*, uint64_t, upb_status*);
typedef void upb_bytesrc_read_func(const void*, uint64_t, size_t, char*); typedef uint32_t upb_bytesrc_fetch_func(void*, uint64_t, upb_status*);
typedef const char *upb_bytesrc_getptr_func(void*, uint64_t, size_t*); typedef void upb_bytesrc_discard_func(void*, uint64_t);
typedef void upb_bytesrc_refregion_func(void*, uint64_t, size_t); typedef void upb_bytesrc_copy_func(const void*, uint64_t, uint32_t, char*);
typedef void upb_bytesrc_ref_func(void*); typedef const char *upb_bytesrc_getptr_func(const void*, uint64_t, uint32_t*);
typedef struct _upb_bytesrc_vtbl { typedef struct _upb_bytesrc_vtbl {
upb_bytesrc_fetch_func *fetch; upb_bytesrc_fetch_func *fetch;
upb_bytesrc_read_func *read; upb_bytesrc_discard_func *discard;
upb_bytesrc_copy_func *copy;
upb_bytesrc_getptr_func *getptr; upb_bytesrc_getptr_func *getptr;
upb_bytesrc_refregion_func *refregion;
upb_bytesrc_refregion_func *unrefregion;
upb_bytesrc_ref_func *ref;
upb_bytesrc_ref_func *unref;
} upb_bytesrc_vtbl; } upb_bytesrc_vtbl;
typedef struct { typedef struct {
@ -59,114 +110,198 @@ INLINE void upb_bytesrc_init(upb_bytesrc *src, upb_bytesrc_vtbl *vtbl) {
} }
// Fetches at least one byte starting at ofs, returning the actual number of // Fetches at least one byte starting at ofs, returning the actual number of
// bytes fetched (or 0 on error: see "s" for details). A successful return // bytes fetched (or 0 on EOF or error: see *s for details). Some bytesrc's
// gives caller a ref on the fetched region. // may set EOF on *s after a successful read if no further data is available,
// // but not all bytesrc's support this. It is valid for bytes to be fetched
// If "ofs" may be greater or equal than the end of the already-fetched region. // multiple times, as long as the bytes have not been previously discarded.
// It may also be less than the end of the already-fetch region *if* either of INLINE uint32_t upb_bytesrc_fetch(upb_bytesrc *src, uint64_t ofs,
// the following is true: upb_status *s) {
//
// * the region is ref'd (this implies that the data is still in-memory)
// * the bytesrc is seekable (this implies that the data can be fetched again).
INLINE size_t upb_bytesrc_fetch(upb_bytesrc *src, uint64_t ofs, upb_status *s) {
return src->vtbl->fetch(src, ofs, s); return src->vtbl->fetch(src, ofs, s);
} }
// Copies "len" bytes of data from offset src_ofs to "dst", which must be at // Discards all data prior to ofs (except data that is pinned, if pinning
// least "len" bytes long. The caller must own a ref on the given region. // support is added -- see TODO below).
INLINE void upb_bytesrc_read(const upb_bytesrc *src, uint64_t src_ofs, INLINE void upb_bytesrc_discard(upb_bytesrc *src, uint64_t ofs) {
size_t len, char *dst) { src->vtbl->discard(src, ofs);
src->vtbl->read(src, src_ofs, len, dst); }
// Copies "len" bytes, beginning at stream offset "ofs", into "dst".  The
// caller must supply a destination of at least "len" bytes, and the requested
// range must not have been discarded.
INLINE void upb_bytesrc_copy(const upb_bytesrc *src, uint64_t ofs, uint32_t len,
                             char *dst) {
  // Dispatch to the concrete bytesrc implementation through its vtable.
  (*src->vtbl->copy)(src, ofs, len, dst);
}
// Returns a pointer to the bytesrc's internal buffer, storing in *len how much // Returns a pointer to the bytesrc's internal buffer, storing in *len how much
// data is available. The caller must own refs on the given region. The // data is available. The given offset must not be discarded. The returned
// returned buffer is valid for as long as the region remains ref'd. // buffer is valid for as long as its bytes are not discarded (in the case that
// // part of the returned buffer is discarded, only the non-discarded bytes
// TODO: if more data is available than the caller has ref'd is it ok for the // remain valid).
// caller to read *len bytes? INLINE const char *upb_bytesrc_getptr(const upb_bytesrc *src, uint64_t ofs,
INLINE const char *upb_bytesrc_getptr(upb_bytesrc *src, uint64_t ofs, uint32_t *len) {
size_t *len) {
return src->vtbl->getptr(src, ofs, len); return src->vtbl->getptr(src, ofs, len);
} }
// Gives the caller a ref on the given region. The caller must know that the // TODO: Add if/when there is a demonstrated need:
// given region is already ref'd (for example, inside a upb_handlers callback //
// that receives a upb_strref, the region is guaranteed to be ref'd -- this // // When the caller pins a region (which must not be already discarded), it
// function allows that handler to take its own ref). // // is guaranteed that the region will not be discarded (nor will the bytesrc
INLINE void upb_bytesrc_refregion(upb_bytesrc *src, uint64_t ofs, size_t len) { // // be destroyed) until the region is unpinned. However, not all bytesrc's
src->vtbl->refregion(src, ofs, len); // // support pinning; a false return indicates that a pin was not possible.
} // INLINE bool upb_bytesrc_pin(upb_bytesrc *src, uint64_t ofs, uint32_t len) {
// return src->vtbl->refregion(src, ofs, len);
// }
//
// // Releases some number of pinned bytes from the beginning of a pinned
// // region (which may be fewer than the total number of bytes pinned).
// INLINE void upb_bytesrc_unpin(upb_bytesrc *src, uint64_t ofs, uint32_t len,
// uint32_t bytes_to_release) {
// src->vtbl->unpin(src, ofs, len);
// }
//
// Adding pinning support would also involve adding a "pin_ofs" parameter to
// upb_bytesrc_fetch, so that the fetch can extend an already-pinned region.
// Releases a ref on the given region, which the caller must have previously
// ref'd.
INLINE void upb_bytesrc_unrefregion(upb_bytesrc *src, uint64_t ofs, size_t len) {
src->vtbl->unrefregion(src, ofs, len);
}
// Attempts to ref the bytesrc itself, returning false if this bytesrc is /* upb_byteregion *************************************************************/
// not ref-able.
INLINE bool upb_bytesrc_tryref(upb_bytesrc *src) {
if (src->vtbl->ref) {
src->vtbl->ref(src);
return true;
} else {
return false;
}
}
// Unref's the bytesrc itself. May only be called when upb_bytesrc_tryref() #define UPB_NONDELIMITED (0xffffffffffffffffULL)
// has previously returned true.
INLINE void upb_bytesrc_unref(upb_bytesrc *src) {
assert(src->vtbl->unref);
src->vtbl->unref(src);
}
// A view onto a range of offsets within a bytesrc.  All four offsets are
// positions in the underlying bytesrc stream, not relative to "start".
typedef struct _upb_byteregion {
uint64_t start;    // Offset at which this region begins.
uint64_t discard;  // Offset up to which bytes have been discarded.
uint64_t fetch;    // Offset up to which bytes have been fetched.
uint64_t end; // End offset of the region; UPB_NONDELIMITED if nondelimited.
upb_bytesrc *bytesrc;  // Source that supplies this region's bytes.
bool toplevel; // If true, discards are forwarded to the underlying bytesrc.
} upb_byteregion;
// Initializes a byteregion. Its initial value will be empty. No methods may
// be called on an empty byteregion except upb_byteregion_reset().
void upb_byteregion_init(upb_byteregion *r);
void upb_byteregion_uninit(upb_byteregion *r);
// Accessors for the region's bounds -- the meaning of these is described in the
// diagram above.
// Returns the offset at which the region begins.
INLINE uint64_t upb_byteregion_startofs(const upb_byteregion *r) {
return r->start;
}
// Returns the region's current discard offset.
INLINE uint64_t upb_byteregion_discardofs(const upb_byteregion *r) {
return r->discard;
}
// Returns the region's current fetch offset.
INLINE uint64_t upb_byteregion_fetchofs(const upb_byteregion *r) {
return r->fetch;
}
// Returns the region's end offset (UPB_NONDELIMITED if nondelimited).
INLINE uint64_t upb_byteregion_endofs(const upb_byteregion *r) {
return r->end;
}
/* upb_strref *****************************************************************/ // Returns how many bytes are fetched and available for reading starting
// from offset "o".
INLINE uint64_t upb_byteregion_available(const upb_byteregion *r, uint64_t o) {
  const uint64_t fetched = r->fetch;
  // "o" must lie between the discard offset and the fetch offset.
  assert(upb_byteregion_discardofs(r) <= o);
  assert(fetched >= o);  // Could relax this.
  return fetched - o;
}
// The structure we pass to upb_handlers for a string value. // Returns the total number of bytes remaining after offset "o", or
typedef struct _upb_strref { // UPB_NONDELIMITED if the byteregion is non-delimited.
// Pointer to the string data. NULL if the string spans multiple input INLINE uint64_t upb_byteregion_remaining(const upb_byteregion *r, uint64_t o) {
// buffers (in which case upb_bytesrc_getptr() must be called to obtain return r->end == UPB_NONDELIMITED ? UPB_NONDELIMITED : r->end - o;
// the actual pointers). }
const char *ptr;
// Total length of the string. INLINE uint64_t upb_byteregion_len(const upb_byteregion *r) {
uint32_t len; return upb_byteregion_remaining(r, r->start);
}
// Offset in the bytesrc that represents the beginning of this string. // Sets the value of this byteregion to be a subset of the given byteregion's
uint32_t stream_offset; // data. The caller is responsible for releasing this region before the src
// region is released (unless the region is first pinned, if pinning support is
// added. see below).
void upb_byteregion_reset(upb_byteregion *r, const upb_byteregion *src,
uint64_t ofs, uint64_t len);
void upb_byteregion_release(upb_byteregion *r);
// Attempts to fetch more data, extending the fetched range of this byteregion.
// Returns true if the fetched region was extended by at least one byte, false
// on EOF or error (see *s for details).
bool upb_byteregion_fetch(upb_byteregion *r, upb_status *s);
// Repeatedly fetches until upb_byteregion_fetch() reports that the fetched
// range was not extended, then returns whether "*s" indicates EOF (true) as
// opposed to an error (false).  "r" must be a delimited byteregion.
INLINE bool upb_byteregion_fetchall(upb_byteregion *r, upb_status *s) {
  assert(upb_byteregion_len(r) != UPB_NONDELIMITED);
  for (;;) {
    // A false return means EOF or error; either way there is nothing more to
    // pull, and "*s" tells the two cases apart below.
    if (!upb_byteregion_fetch(r, s)) break;
  }
  return upb_eof(s);
}
// Bytesrc from which this string data comes. May be NULL if ptr is set. If // Discards bytes from the byteregion up until ofs (which must be greater or
// non-NULL, the bytesrc is only guaranteed to be alive from inside the // equal to upb_byteregion_discardofs()). It is valid to discard bytes that
// callback; however if the handler knows more about its type and how to // have not been fetched (such bytes will never be fetched) but it is an error
// prolong its life, it may do so. // to discard past the end of a delimited byteregion.
upb_bytesrc *bytesrc; INLINE void upb_byteregion_discard(upb_byteregion *r, uint64_t ofs) {
assert(ofs >= upb_byteregion_discardofs(r));
assert(ofs <= upb_byteregion_endofs(r));
r->discard = ofs;
if (r->toplevel) upb_bytesrc_discard(r->bytesrc, ofs);
}
// Possibly add optional members here like start_line, start_column, etc. // Copies "len" bytes of data into "dst", starting at ofs. The specified
} upb_strref; // region must be available.
INLINE void upb_byteregion_copy(const upb_byteregion *r, uint64_t ofs,
uint32_t len, char *dst) {
// Preconditions (checked only when assertions are enabled): "ofs" is not
// before the discard offset, and at least "len" bytes are fetched and
// available starting at "ofs".
assert(ofs >= upb_byteregion_discardofs(r));
assert(len <= upb_byteregion_available(r, ofs));
// The copy itself is delegated to the region's underlying bytesrc.
upb_bytesrc_copy(r->bytesrc, ofs, len, dst);
}
// Copies the contents of the strref into a newly-allocated, NULL-terminated // Copies all bytes from the byteregion into dst. Requires that the entire
// string. // byteregion is fetched and that none has been discarded.
char *upb_strref_dup(const struct _upb_strref *r); INLINE void upb_byteregion_copyall(const upb_byteregion *r, char *dst) {
assert(r->start == r->discard && r->end == r->fetch);
upb_byteregion_copy(r, r->start, upb_byteregion_len(r), dst);
}
INLINE void upb_strref_read(const struct _upb_strref *r, char *buf) { // Returns a pointer to the internal buffer for the byteregion starting at
if (r->ptr) { // offset "ofs." Stores the number of bytes available in this buffer in *len.
memcpy(buf, r->ptr, r->len); // The returned buffer is invalidated when the byteregion is reset or released,
} else { // or when the bytes are discarded. If the byteregion is not currently pinned,
assert(r->bytesrc); // the pointer is only valid for the lifetime of the parent byteregion.
upb_bytesrc_read(r->bytesrc, r->stream_offset, r->len, buf); INLINE const char *upb_byteregion_getptr(const upb_byteregion *r,
} uint64_t ofs, uint32_t *len) {
assert(ofs >= upb_byteregion_discardofs(r));
const char *ret = upb_bytesrc_getptr(r->bytesrc, ofs, len);
*len = UPB_MIN(*len, upb_byteregion_available(r, ofs));
return ret;
} }
// Dynamically allocates a upb_strref object whose contents are the given // TODO: add if/when there is a demonstrated need.
// string. The given string data is copied into the strref, which makes these //
// functions unsuitable for tight loops (in those cases a strref should be made // // Pins this byteregion's bytes in memory, allowing it to outlive its parent
// to point to existing string data). // // byteregion. Normally a byteregion may only be used while its parent is
upb_strref *upb_strref_new(const char *str); // // still valid, but a pinned byteregion may continue to be used until it is
upb_strref *upb_strref_newl(const void *str, size_t len); // // reset or released. A byteregion must be fully fetched to be pinned
void upb_strref_free(upb_strref *ref); // // (this implies that the byteregion must be delimited).
// //
// // In some cases this operation may cause the input data to be copied.
// //
// // void upb_byteregion_pin(upb_byteregion *r);
// Convenience functions for creating and destroying a byteregion with a simple
// string as its data. These are relatively inefficient compared with creating
// your own bytesrc (they call malloc() and copy the string data) so should not
// be used on any critical path.
//
// The string data in the returned region is guaranteed to be contiguous and
// NULL-terminated.
upb_byteregion *upb_byteregion_new(const void *str);
upb_byteregion *upb_byteregion_newl(const void *str, uint32_t len);
// May *only* be called on a byteregion created with upb_byteregion_new[l]()!
void upb_byteregion_free(upb_byteregion *r);
// Copies the contents of the byteregion into a newly-allocated, NULL-terminated
// string. Requires that the byteregion is fully fetched.
char *upb_byteregion_strdup(const upb_byteregion *r);
/* upb_bytesink ***************************************************************/ /* upb_bytesink ***************************************************************/
@ -279,6 +414,7 @@ typedef struct {
bool should_close; bool should_close;
upb_stdio_buf **bufs; upb_stdio_buf **bufs;
uint32_t nbuf, szbuf; uint32_t nbuf, szbuf;
upb_byteregion byteregion;
} upb_stdio; } upb_stdio;
void upb_stdio_init(upb_stdio *stdio); void upb_stdio_init(upb_stdio *stdio);
@ -297,7 +433,7 @@ void upb_stdio_reset(upb_stdio *stdio, FILE *file);
void upb_stdio_open(upb_stdio *stdio, const char *filename, const char *mode, void upb_stdio_open(upb_stdio *stdio, const char *filename, const char *mode,
upb_status *s); upb_status *s);
upb_bytesrc *upb_stdio_bytesrc(upb_stdio *stdio); upb_byteregion *upb_stdio_allbytes(upb_stdio *stdio);
upb_bytesink *upb_stdio_bytesink(upb_stdio *stdio); upb_bytesink *upb_stdio_bytesink(upb_stdio *stdio);
@ -305,24 +441,26 @@ upb_bytesink *upb_stdio_bytesink(upb_stdio *stdio);
// bytesrc/bytesink for a simple contiguous string. // bytesrc/bytesink for a simple contiguous string.
struct _upb_stringsrc { typedef struct {
upb_bytesrc bytesrc; upb_bytesrc bytesrc;
const char *str; const char *str;
size_t len; uint32_t len;
}; upb_byteregion byteregion;
typedef struct _upb_stringsrc upb_stringsrc; } upb_stringsrc;
// Create/free a stringsrc. // Create/free a stringsrc.
void upb_stringsrc_init(upb_stringsrc *s); void upb_stringsrc_init(upb_stringsrc *s);
void upb_stringsrc_uninit(upb_stringsrc *s); void upb_stringsrc_uninit(upb_stringsrc *s);
// Resets the stringsrc to a state where it will vend the given string. The // Resets the stringsrc to a state where it will vend the given string. The
// stringsrc will take a reference on the string, so the caller need not ensure // string data must be valid until the stringsrc is reset again or destroyed.
// that it outlives the stringsrc. A stringsrc can be reset multiple times. void upb_stringsrc_reset(upb_stringsrc *s, const char *str, uint32_t len);
void upb_stringsrc_reset(upb_stringsrc *s, const char *str, size_t len);
// Returns the upb_bytesrc* for this stringsrc. // Returns the top-level upb_byteregion* for this stringsrc. Invalidated when
upb_bytesrc *upb_stringsrc_bytesrc(upb_stringsrc *s); // the stringsrc is reset.
INLINE upb_byteregion *upb_stringsrc_allbytes(upb_stringsrc *s) {
// The byteregion is a member embedded in the stringsrc itself, so this
// returns a pointer into "s" rather than a newly created object.
return &s->byteregion;
}
/* upb_stringsink *************************************************************/ /* upb_stringsink *************************************************************/
@ -330,7 +468,7 @@ upb_bytesrc *upb_stringsrc_bytesrc(upb_stringsrc *s);
struct _upb_stringsink { struct _upb_stringsink {
upb_bytesink bytesink; upb_bytesink bytesink;
char *str; char *str;
size_t len, size; uint32_t len, size;
}; };
typedef struct _upb_stringsink upb_stringsink; typedef struct _upb_stringsink upb_stringsink;
@ -340,12 +478,12 @@ void upb_stringsink_uninit(upb_stringsink *s);
// Resets the sink's string to "str", which the sink takes ownership of. // Resets the sink's string to "str", which the sink takes ownership of.
// "str" may be NULL, which will make the sink allocate a new string. // "str" may be NULL, which will make the sink allocate a new string.
void upb_stringsink_reset(upb_stringsink *s, char *str, size_t size); void upb_stringsink_reset(upb_stringsink *s, char *str, uint32_t len);
// Releases ownership of the returned string (which is "len" bytes long) and // Releases ownership of the returned string (which is "len" bytes long) and
// resets the internal string to be empty again (as if reset were called with // resets the internal string to be empty again (as if reset were called with
// NULL). // NULL).
const char *upb_stringsink_release(upb_stringsink *s, size_t *len); const char *upb_stringsink_release(upb_stringsink *s, uint32_t *len);
// Returns the upb_bytesink* for this stringsink. Invalidated by reset above. // Returns the upb_bytesink* for this stringsink. Invalidated by reset above.
upb_bytesink *upb_stringsink_bytesink(upb_stringsink *s); upb_bytesink *upb_stringsink_bytesink(upb_stringsink *s);

@ -251,7 +251,8 @@ static void upb_fielddef_init_default(upb_fielddef *f) {
case UPB_TYPE(FIXED32): upb_value_setuint32(&f->defaultval, 0); break; case UPB_TYPE(FIXED32): upb_value_setuint32(&f->defaultval, 0); break;
case UPB_TYPE(BOOL): upb_value_setbool(&f->defaultval, false); break; case UPB_TYPE(BOOL): upb_value_setbool(&f->defaultval, false); break;
case UPB_TYPE(STRING): case UPB_TYPE(STRING):
case UPB_TYPE(BYTES): upb_value_setstrref(&f->defaultval, upb_strref_new("")); break; case UPB_TYPE(BYTES):
upb_value_setbyteregion(&f->defaultval, upb_byteregion_new("")); break;
case UPB_TYPE(GROUP): case UPB_TYPE(GROUP):
case UPB_TYPE(MESSAGE): upb_value_setptr(&f->defaultval, NULL); break; case UPB_TYPE(MESSAGE): upb_value_setptr(&f->defaultval, NULL); break;
} }
@ -260,7 +261,7 @@ static void upb_fielddef_init_default(upb_fielddef *f) {
static void upb_fielddef_uninit_default(upb_fielddef *f) { static void upb_fielddef_uninit_default(upb_fielddef *f) {
if (upb_isstring(f) || f->default_is_symbolic) { if (upb_isstring(f) || f->default_is_symbolic) {
upb_strref_free((upb_strref*)upb_value_getstrref(f->defaultval)); upb_byteregion_free(upb_value_getbyteregion(f->defaultval));
} }
} }
@ -324,24 +325,29 @@ static bool upb_fielddef_resolve(upb_fielddef *f, upb_def *def, upb_status *s) {
f->def = def; f->def = def;
if (f->type == UPB_TYPE(ENUM) && f->default_is_symbolic) { if (f->type == UPB_TYPE(ENUM) && f->default_is_symbolic) {
// Resolve the enum's default from a string to an integer. // Resolve the enum's default from a string to an integer.
upb_strref *str = (upb_strref*)upb_value_getstrref(f->defaultval); upb_byteregion *bytes = upb_value_getbyteregion(f->defaultval);
assert(str); // Should point to either a real default or the empty string. assert(bytes); // Points to either a real default or the empty string.
upb_enumdef *e = upb_downcast_enumdef(f->def); upb_enumdef *e = upb_downcast_enumdef(f->def);
int32_t val = 0; int32_t val = 0;
// Could do a sanity check that the default value does not have embedded // Could do a sanity check that the default value does not have embedded
// NULLs. // NULLs.
if (str->ptr[0] == '\0') { if (upb_byteregion_len(bytes) == 0) {
upb_value_setint32(&f->defaultval, e->defaultval); upb_value_setint32(&f->defaultval, e->defaultval);
} else { } else {
bool success = upb_enumdef_ntoi(e, str->ptr, &val); uint32_t len;
// ptr is guaranteed to be NULL-terminated because the byteregion was
// created with upb_byteregion_newl().
const char *ptr = upb_byteregion_getptr(bytes, 0, &len);
assert(len == upb_byteregion_len(bytes)); // Should all be in one chunk.
bool success = upb_enumdef_ntoi(e, ptr, &val);
if (!success) { if (!success) {
upb_status_seterrf( upb_status_seterrf(
s, "Default enum value (%s) is not a member of the enum", str); s, "Default enum value (%s) is not a member of the enum", ptr);
return false; return false;
} }
upb_value_setint32(&f->defaultval, val); upb_value_setint32(&f->defaultval, val);
} }
upb_strref_free(str); upb_byteregion_free(bytes);
} }
return true; return true;
} }
@ -381,10 +387,10 @@ void upb_fielddef_setdefault(upb_fielddef *f, upb_value value) {
void upb_fielddef_setdefaultstr(upb_fielddef *f, const void *str, size_t len) { void upb_fielddef_setdefaultstr(upb_fielddef *f, const void *str, size_t len) {
assert(upb_isstring(f) || f->type == UPB_TYPE(ENUM)); assert(upb_isstring(f) || f->type == UPB_TYPE(ENUM));
const upb_strref *ref = upb_value_getstrref(f->defaultval); upb_byteregion *bytes = upb_value_getbyteregion(f->defaultval);
assert(ref); assert(bytes);
upb_strref_free((upb_strref*)ref); upb_byteregion_free(bytes);
upb_value_setstrref(&f->defaultval, upb_strref_newl(str, len)); upb_value_setbyteregion(&f->defaultval, upb_byteregion_newl(str, len));
f->default_is_symbolic = true; f->default_is_symbolic = true;
} }

@ -123,7 +123,8 @@ static upb_flow_t upb_descreader_FileDescriptorProto_package(void *_r,
upb_value val) { upb_value val) {
(void)fval; (void)fval;
upb_descreader *r = _r; upb_descreader *r = _r;
upb_descreader_setscopename(r, upb_strref_dup(upb_value_getstrref(val))); upb_descreader_setscopename(
r, upb_byteregion_strdup(upb_value_getbyteregion(val)));
return UPB_CONTINUE; return UPB_CONTINUE;
} }
@ -180,7 +181,7 @@ static upb_flow_t upb_enumdef_EnumValueDescriptorProto_name(void *_r,
(void)fval; (void)fval;
upb_descreader *r = _r; upb_descreader *r = _r;
free(r->name); free(r->name);
r->name = upb_strref_dup(upb_value_getstrref(val)); r->name = upb_byteregion_strdup(upb_value_getbyteregion(val));
r->saw_name = true; r->saw_name = true;
return UPB_CONTINUE; return UPB_CONTINUE;
} }
@ -259,7 +260,7 @@ static upb_flow_t upb_enumdef_EnumDescriptorProto_name(void *_r,
upb_descreader *r = _r; upb_descreader *r = _r;
upb_enumdef *e = upb_downcast_enumdef(upb_descreader_last(r)); upb_enumdef *e = upb_downcast_enumdef(upb_descreader_last(r));
free(e->base.fqname); free(e->base.fqname);
e->base.fqname = upb_strref_dup(upb_value_getstrref(val)); e->base.fqname = upb_byteregion_strdup(upb_value_getbyteregion(val));
return UPB_CONTINUE; return UPB_CONTINUE;
} }
@ -423,7 +424,7 @@ static upb_flow_t upb_fielddef_onnumber(void *_r, upb_value fval, upb_value val)
static upb_flow_t upb_fielddef_onname(void *_r, upb_value fval, upb_value val) { static upb_flow_t upb_fielddef_onname(void *_r, upb_value fval, upb_value val) {
(void)fval; (void)fval;
upb_descreader *r = _r; upb_descreader *r = _r;
char *name = upb_strref_dup(upb_value_getstrref(val)); char *name = upb_byteregion_strdup(upb_value_getbyteregion(val));
upb_fielddef_setname(r->f, name); upb_fielddef_setname(r->f, name);
free(name); free(name);
return UPB_CONTINUE; return UPB_CONTINUE;
@ -433,7 +434,7 @@ static upb_flow_t upb_fielddef_ontypename(void *_r, upb_value fval,
upb_value val) { upb_value val) {
(void)fval; (void)fval;
upb_descreader *r = _r; upb_descreader *r = _r;
char *name = upb_strref_dup(upb_value_getstrref(val)); char *name = upb_byteregion_strdup(upb_value_getbyteregion(val));
upb_fielddef_settypename(r->f, name); upb_fielddef_settypename(r->f, name);
free(name); free(name);
return UPB_CONTINUE; return UPB_CONTINUE;
@ -446,7 +447,7 @@ static upb_flow_t upb_fielddef_ondefaultval(void *_r, upb_value fval,
// Have to convert from string to the correct type, but we might not know the // Have to convert from string to the correct type, but we might not know the
// type yet. // type yet.
free(r->default_string); free(r->default_string);
r->default_string = upb_strref_dup(upb_value_getstrref(val)); r->default_string = upb_byteregion_strdup(upb_value_getbyteregion(val));
return UPB_CONTINUE; return UPB_CONTINUE;
} }
@ -499,7 +500,7 @@ static upb_flow_t upb_msgdef_onname(void *_r, upb_value fval, upb_value val) {
assert(val.type == UPB_TYPE(STRING)); assert(val.type == UPB_TYPE(STRING));
upb_msgdef *m = upb_descreader_top(r); upb_msgdef *m = upb_descreader_top(r);
free(m->base.fqname); free(m->base.fqname);
m->base.fqname = upb_strref_dup(upb_value_getstrref(val)); m->base.fqname = upb_byteregion_strdup(upb_value_getbyteregion(val));
upb_descreader_setscopename(r, strdup(m->base.fqname)); upb_descreader_setscopename(r, strdup(m->base.fqname));
return UPB_CONTINUE; return UPB_CONTINUE;
} }

@ -324,13 +324,13 @@ typedef struct {
uint16_t fieldindex; uint16_t fieldindex;
bool is_sequence; // frame represents seq or submsg? (f might be both). bool is_sequence; // frame represents seq or submsg? (f might be both).
bool is_packed; // !upb_issubmsg(f) && end_ofs != UINT64_MAX (strings aren't pushed) bool is_packed; // !upb_issubmsg(f) && end_ofs != UINT64_MAX
// (strings aren't pushed).
} upb_dispatcher_frame; } upb_dispatcher_frame;
// Called when some of the input needs to be skipped. All frames from // Called when some of the input needs to be skipped. All frames from the
// top to bottom, inclusive, should be skipped. // current top to "bottom", inclusive, should be skipped.
typedef void upb_skip_handler(void *, upb_dispatcher_frame *top, typedef void upb_skip_handler(void *, upb_dispatcher_frame *bottom);
upb_dispatcher_frame *bottom);
typedef void upb_exit_handler(void *); typedef void upb_exit_handler(void *);
typedef struct { typedef struct {

@ -151,13 +151,14 @@ static void _upb_stdmsg_setstr(void *_dst, upb_value src) {
*dstp = dst; *dstp = dst;
} }
dst->len = 0; dst->len = 0;
const upb_strref *ref = upb_value_getstrref(src); const upb_byteregion *bytes = upb_value_getbyteregion(src);
if (ref->len > dst->size) { uint32_t len = upb_byteregion_len(bytes);
dst->size = ref->len; if (len > dst->size) {
dst->size = len;
dst->ptr = realloc(dst->ptr, dst->size); dst->ptr = realloc(dst->ptr, dst->size);
} }
dst->len = ref->len; dst->len = len;
upb_bytesrc_read(ref->bytesrc, ref->stream_offset, ref->len, dst->ptr); upb_byteregion_copyall(bytes, dst->ptr);
} }
upb_flow_t upb_stdmsg_setstr(void *_m, upb_value fval, upb_value val) { upb_flow_t upb_stdmsg_setstr(void *_m, upb_value fval, upb_value val) {

@ -45,27 +45,29 @@ static void upb_decoder_abort(upb_decoder *d, const char *msg) {
/* Buffering ******************************************************************/ /* Buffering ******************************************************************/
// We operate on one buffer at a time, which may be a subset of the bytesrc // We operate on one buffer at a time, which may be a subset of the currently
// region we have ref'd. When data for the buffer is completely gone we pull // loaded byteregion data. When data for the buffer is completely gone we pull
// the next one. When we've committed our progress we release our ref on any // the next one. When we've committed our progress we discard any previous
// previous buffers' regions. // buffers' regions.
static size_t upb_decoder_bufleft(upb_decoder *d) { return d->end - d->ptr; } static uint32_t upb_decoder_bufleft(upb_decoder *d) { return d->end - d->ptr; }
static void upb_decoder_advance(upb_decoder *d, size_t len) { static void upb_decoder_advance(upb_decoder *d, uint32_t len) {
assert((size_t)(d->end - d->ptr) >= len); assert(upb_decoder_bufleft(d) >= len);
d->ptr += len; d->ptr += len;
} }
size_t upb_decoder_offset(upb_decoder *d) { uint64_t upb_decoder_offset(upb_decoder *d) {
size_t offset = d->bufstart_ofs; return d->bufstart_ofs + (d->ptr - d->buf);
if (d->ptr) offset += (d->ptr - d->buf); }
return offset;
uint64_t upb_decoder_bufendofs(upb_decoder *d) {
return d->bufstart_ofs + (d->end - d->buf);
} }
static void upb_decoder_setmsgend(upb_decoder *d) { static void upb_decoder_setmsgend(upb_decoder *d) {
upb_dispatcher_frame *f = d->dispatcher.top; upb_dispatcher_frame *f = d->dispatcher.top;
size_t delimlen = f->end_ofs - d->bufstart_ofs; uint32_t delimlen = f->end_ofs - d->bufstart_ofs;
size_t buflen = d->end - d->buf; uint32_t buflen = d->end - d->buf;
d->delim_end = (f->end_ofs != UPB_NONDELIMITED && delimlen <= buflen) ? d->delim_end = (f->end_ofs != UPB_NONDELIMITED && delimlen <= buflen) ?
d->buf + delimlen : NULL; // NULL if not in this buf. d->buf + delimlen : NULL; // NULL if not in this buf.
d->top_is_packed = f->is_packed; d->top_is_packed = f->is_packed;
@ -73,24 +75,25 @@ static void upb_decoder_setmsgend(upb_decoder *d) {
static bool upb_trypullbuf(upb_decoder *d) { static bool upb_trypullbuf(upb_decoder *d) {
assert(upb_decoder_bufleft(d) == 0); assert(upb_decoder_bufleft(d) == 0);
if (d->bufend_ofs == d->refend_ofs) { d->bufstart_ofs = upb_decoder_offset(d);
size_t read = upb_bytesrc_fetch(d->bytesrc, d->refend_ofs, d->status); d->buf = NULL;
if (read <= 0) {
d->ptr = NULL; d->ptr = NULL;
d->end = NULL; d->end = NULL;
if (read == 0) return false; // EOF if (upb_byteregion_available(d->input, upb_decoder_offset(d)) == 0 &&
!upb_byteregion_fetch(d->input, d->status)) {
if (upb_eof(d->status)) return false;
upb_decoder_exit(d); // Non-EOF error. upb_decoder_exit(d); // Non-EOF error.
} }
d->refend_ofs += read; uint32_t len;
} d->buf = upb_byteregion_getptr(d->input, d->bufstart_ofs, &len);
d->bufstart_ofs = d->bufend_ofs;
size_t len;
d->buf = upb_bytesrc_getptr(d->bytesrc, d->bufstart_ofs, &len);
assert(len > 0); assert(len > 0);
d->bufend_ofs = d->bufstart_ofs + len;
d->ptr = d->buf; d->ptr = d->buf;
d->end = d->buf + len; d->end = d->buf + len;
#ifdef UPB_USE_JIT_X64 #ifdef UPB_USE_JIT_X64
// If we start parsing a value, we can parse up to 20 bytes without
// having to bounds-check anything (2 10-byte varints). Since the
// JIT bounds-checks only *between* values (and for strings), the
// JIT bails if there are not 20 bytes available.
d->jit_end = d->end - 20; d->jit_end = d->end - 20;
#endif #endif
upb_decoder_setmsgend(d); upb_decoder_setmsgend(d);
@ -101,16 +104,21 @@ static void upb_pullbuf(upb_decoder *d) {
if (!upb_trypullbuf(d)) upb_decoder_abort(d, "Unexpected EOF"); if (!upb_trypullbuf(d)) upb_decoder_abort(d, "Unexpected EOF");
} }
void upb_decoder_commit(upb_decoder *d) { void upb_decoder_skipto(upb_decoder *d, uint64_t ofs) {
d->completed_ptr = d->ptr; if (ofs < upb_decoder_bufendofs(d)) {
if (d->refstart_ofs < d->bufstart_ofs) { upb_decoder_advance(d, ofs - upb_decoder_offset(d));
// Drop our ref on the previous buf's region. } else {
upb_bytesrc_refregion(d->bytesrc, d->bufstart_ofs, d->refend_ofs); d->buf = NULL;
upb_bytesrc_unrefregion(d->bytesrc, d->refstart_ofs, d->refend_ofs); d->ptr = NULL;
d->refstart_ofs = d->bufstart_ofs; d->end = NULL;
d->bufstart_ofs = ofs;
} }
} }
// Discards all bytes of the decoder's input byteregion that precede the
// decoder's current stream offset.
void upb_decoder_checkpoint(upb_decoder *d) {
upb_byteregion_discard(d->input, upb_decoder_offset(d));
}
/* Decoding of wire types *****************************************************/ /* Decoding of wire types *****************************************************/
@ -151,11 +159,12 @@ done:
return ret; return ret;
} }
// Returns true on success or false if we've hit a valid EOF.
FORCEINLINE bool upb_trydecode_varint32(upb_decoder *d, uint32_t *val) { FORCEINLINE bool upb_trydecode_varint32(upb_decoder *d, uint32_t *val) {
if (upb_decoder_bufleft(d) == 0 && upb_dispatcher_islegalend(&d->dispatcher)) { if (upb_decoder_bufleft(d) == 0 &&
// Check for our two successful end-of-message conditions upb_dispatcher_islegalend(&d->dispatcher) &&
// (user-specified EOM and bytesrc EOF). !upb_trypullbuf(d)) {
if (d->bufend_ofs == d->end_ofs || !upb_trypullbuf(d)) return false; return false;
} }
*val = upb_decode_varint32(d); *val = upb_decode_varint32(d);
return true; return true;
@ -212,26 +221,15 @@ FORCEINLINE uint64_t upb_decode_fixed64(upb_decoder *d) {
return u64; // TODO: proper byte swapping return u64; // TODO: proper byte swapping
} }
INLINE upb_strref *upb_decode_string(upb_decoder *d) { INLINE upb_byteregion *upb_decode_string(upb_decoder *d) {
uint32_t strlen = upb_decode_varint32(d); uint32_t strlen = upb_decode_varint32(d);
d->strref.stream_offset = upb_decoder_offset(d); uint64_t offset = upb_decoder_offset(d);
d->strref.len = strlen; upb_byteregion_reset(&d->str_byteregion, d->input, offset, strlen);
if (upb_decoder_bufleft(d) == 0) upb_pullbuf(d); // Could make it an option on the callback whether we fetchall() first or not.
if (upb_decoder_bufleft(d) >= strlen) { upb_byteregion_fetchall(&d->str_byteregion, d->status);
// Fast case. if (!upb_ok(d->status)) upb_decoder_exit(d);
d->strref.ptr = d->ptr; upb_decoder_skipto(d, offset + strlen);
upb_decoder_advance(d, strlen); return &d->str_byteregion;
} else {
// Slow case.
while (1) {
size_t consume = UPB_MIN(upb_decoder_bufleft(d), strlen);
upb_decoder_advance(d, consume);
strlen -= consume;
if (strlen == 0) break;
upb_pullbuf(d);
}
}
return &d->strref;
} }
INLINE void upb_push(upb_decoder *d, upb_fhandlers *f, uint64_t end) { INLINE void upb_push(upb_decoder *d, upb_fhandlers *f, uint64_t end) {
@ -272,7 +270,7 @@ T(DOUBLE, fixed64, double, upb_asdouble)
T(FLOAT, fixed32, float, upb_asfloat) T(FLOAT, fixed32, float, upb_asfloat)
T(SINT32, varint, int32, upb_zzdec_32) T(SINT32, varint, int32, upb_zzdec_32)
T(SINT64, varint, int64, upb_zzdec_64) T(SINT64, varint, int64, upb_zzdec_64)
T(STRING, string, strref, upb_strref*) T(STRING, string, byteregion, upb_byteregion*)
static void upb_decode_GROUP(upb_decoder *d, upb_fhandlers *f) { static void upb_decode_GROUP(upb_decoder *d, upb_fhandlers *f) {
upb_push(d, f, UPB_NONDELIMITED); upb_push(d, f, UPB_NONDELIMITED);
@ -352,10 +350,10 @@ INLINE upb_fhandlers *upb_decode_tag(upb_decoder *d) {
case UPB_WIRE_TYPE_DELIMITED: case UPB_WIRE_TYPE_DELIMITED:
upb_decoder_advance(d, upb_decode_varint32(d)); break; upb_decoder_advance(d, upb_decode_varint32(d)); break;
default: default:
upb_decoder_abort(d, "Invavlid wire type"); upb_decoder_abort(d, "Invalid wire type");
} }
// TODO: deliver to unknown field callback. // TODO: deliver to unknown field callback.
upb_decoder_commit(d); upb_decoder_checkpoint(d);
upb_decoder_checkdelim(d); upb_decoder_checkdelim(d);
} }
} }
@ -380,24 +378,18 @@ void upb_decoder_decode(upb_decoder *d, upb_status *status) {
return; return;
} }
f->decode(d, f); f->decode(d, f);
upb_decoder_commit(d); upb_decoder_checkpoint(d);
} }
} }
static void upb_decoder_skip(void *_d, upb_dispatcher_frame *top, static void upb_decoder_skip(void *_d, upb_dispatcher_frame *f) {
upb_dispatcher_frame *bottom) {
(void)top;
(void)bottom;
(void)_d;
#if 0
upb_decoder *d = _d; upb_decoder *d = _d;
// TODO if (f->end_ofs != UPB_NONDELIMITED) {
if (bottom->end_offset == UPB_NONDELIMITED) { upb_decoder_skipto(d, d->dispatcher.top->end_ofs);
// TODO: support skipping groups. } else {
abort(); // TODO: how to support skipping groups? Dispatcher could drop callbacks,
// or it could be special-cased inside the decoder.
} }
d->ptr = d->buf.ptr + bottom->end_offset;
#endif
} }
void upb_decoder_init(upb_decoder *d, upb_handlers *handlers) { void upb_decoder_init(upb_decoder *d, upb_handlers *handlers) {
@ -423,24 +415,19 @@ void upb_decoder_init(upb_decoder *d, upb_handlers *handlers) {
} }
} }
void upb_decoder_reset(upb_decoder *d, upb_bytesrc *bytesrc, uint64_t start_ofs, void upb_decoder_reset(upb_decoder *d, upb_byteregion *input, void *closure) {
uint64_t end_ofs, void *closure) {
upb_dispatcher_frame *f = upb_dispatcher_reset(&d->dispatcher, closure); upb_dispatcher_frame *f = upb_dispatcher_reset(&d->dispatcher, closure);
f->end_ofs = end_ofs; f->end_ofs = UPB_NONDELIMITED;
d->end_ofs = end_ofs; d->input = input;
d->refstart_ofs = start_ofs; d->bufstart_ofs = upb_byteregion_startofs(input);
d->refend_ofs = start_ofs;
d->bufstart_ofs = start_ofs;
d->bufend_ofs = start_ofs;
d->bytesrc = bytesrc;
d->buf = NULL; d->buf = NULL;
d->ptr = NULL; d->ptr = NULL;
d->end = NULL; // Force a buffer pull. d->end = NULL; // Force a buffer pull.
d->delim_end = NULL; // But don't let end-of-message get triggered.
d->str_byteregion.bytesrc = input->bytesrc;
#ifdef UPB_USE_JIT_X64 #ifdef UPB_USE_JIT_X64
d->jit_end = NULL; d->jit_end = NULL;
#endif #endif
d->delim_end = NULL; // But don't let end-of-message get triggered.
d->strref.bytesrc = bytesrc;
} }
void upb_decoder_uninit(upb_decoder *d) { void upb_decoder_uninit(upb_decoder *d) {

@ -5,7 +5,7 @@
* Author: Josh Haberman <jhaberman@gmail.com> * Author: Josh Haberman <jhaberman@gmail.com>
* *
* upb_decoder implements a high performance, streaming decoder for protobuf * upb_decoder implements a high performance, streaming decoder for protobuf
* data that works by getting its input data from a upb_bytesrc and calling * data that works by getting its input data from a upb_byteregion and calling
* into a upb_handlers. * into a upb_handlers.
*/ */
@ -26,24 +26,14 @@ extern "C" {
struct dasm_State; struct dasm_State;
typedef struct _upb_decoder { typedef struct _upb_decoder {
upb_bytesrc *bytesrc; // Source of our serialized data. upb_byteregion *input; // Input data (serialized).
upb_dispatcher dispatcher; // Dispatcher to which we push parsed data. upb_dispatcher dispatcher; // Dispatcher to which we push parsed data.
upb_status *status; // Where we will store any errors that occur. upb_status *status; // Where we will store any errors that occur.
upb_strref strref; // For passing string data to callbacks. upb_byteregion str_byteregion; // For passing string data to callbacks.
// Offsets for the bytesrc region we currently have ref'd.
uint64_t refstart_ofs, refend_ofs;
// Current input buffer and its stream offset. // Current input buffer and its stream offset.
const char *buf, *ptr, *end; const char *buf, *ptr, *end;
uint64_t bufstart_ofs, bufend_ofs; uint64_t bufstart_ofs;
// Stream offset for the end of the top-level message, if any.
uint64_t end_ofs;
// Buf offset as of which we've delivered calbacks; needed for rollback if
// a callback returns UPB_BREAK.
const char *completed_ptr;
// End of the delimited region, relative to ptr, or NULL if not in this buf. // End of the delimited region, relative to ptr, or NULL if not in this buf.
const char *delim_end; const char *delim_end;
@ -65,10 +55,6 @@ typedef struct _upb_decoder {
sigjmp_buf exitjmp; sigjmp_buf exitjmp;
} upb_decoder; } upb_decoder;
// Used for frames that have no specific end offset: groups, repeated primitive
// fields inside groups, and the top-level message.
#define UPB_NONDELIMITED UINT64_MAX
// Initializes/uninitializes a decoder for calling into the given handlers // Initializes/uninitializes a decoder for calling into the given handlers
// (or to write into the given msgdef, given its accessors). Takes a ref // (or to write into the given msgdef, given its accessors). Takes a ref
// on the handlers. // on the handlers.
@ -77,13 +63,13 @@ void upb_decoder_uninit(upb_decoder *d);
// Resets the internal state of an already-allocated decoder. This puts it in a // Resets the internal state of an already-allocated decoder. This puts it in a
// state where it has not seen any data, and expects the next data to be from // state where it has not seen any data, and expects the next data to be from
// the beginning of a new protobuf. Parsers must be reset before they can be // the beginning of a new protobuf. Decoders must be reset before they can be
// used. A decoder can be reset multiple times. // used. A decoder can be reset multiple times. "input" must live until the
// // decoder is reset again (or destroyed).
// Pass UINT64_MAX for end_ofs to indicate a non-delimited top-level message. void upb_decoder_reset(upb_decoder *d, upb_byteregion *input, void *closure);
void upb_decoder_reset(upb_decoder *d, upb_bytesrc *src, uint64_t start_ofs,
uint64_t end_ofs, void *closure);
// Decodes serialized data (calling handlers as the data is parsed) until error
// or EOF (see *status for details).
void upb_decoder_decode(upb_decoder *d, upb_status *status); void upb_decoder_decode(upb_decoder *d, upb_status *status);
#ifdef __cplusplus #ifdef __cplusplus

@ -129,7 +129,7 @@ void upb_reg_jit_gdb(upb_decoder *d) {
|.define PTR, rbx |.define PTR, rbx
|.define CLOSURE, r12 |.define CLOSURE, r12
|.type FRAME, upb_dispatcher_frame, r13 |.type FRAME, upb_dispatcher_frame, r13
|.type STRREF, upb_strref, r14 |.type BYTEREGION,upb_byteregion, r14
|.type DECODER, upb_decoder, r15 |.type DECODER, upb_decoder, r15
|.type STDARRAY, upb_stdarray |.type STDARRAY, upb_stdarray
| |
@ -365,23 +365,26 @@ static void upb_decoder_jit_decodefield(upb_decoder *d, upb_mhandlers *m,
// robust checks. // robust checks.
| mov ecx, dword [PTR + tag_size] | mov ecx, dword [PTR + tag_size]
| decode_loaded_varint tag_size | decode_loaded_varint tag_size
| mov rdi, DECODER->effective_end
| sub rdi, rax
| cmp ARG3_64, rdi // if (len > d->effective_end - str)
| ja ->exit_jit // Can't deliver, whole string not in buf.
// Update PTR to point past end of string.
| mov rdi, rax | mov rdi, rax
| add rdi, ARG3_64 | add rdi, ARG3_64
| mov STRREF->len, ARG3_32
| mov STRREF->ptr, rax
| sub rax, DECODER->buf
| add eax, DECODER->bufstart_ofs // = d->ptr - d->buf + d->bufstart_ofs
| mov STRREF->stream_offset, eax
| mov ARG3_64, STRREF
| cmp rdi, DECODER->effective_end
| ja ->exit_jit // Can't deliver, whole string not in buf.
| mov PTR, rdi | mov PTR, rdi
break;
case UPB_TYPE_ENDGROUP: // A pseudo-type. // Populate BYTEREGION appropriately.
| add PTR, tag_size | sub rax, DECODER->buf
| jmp =>m->jit_endofmsg_pclabel | add rax, DECODER->bufstart_ofs // = d->ptr - d->buf + d->bufstart_ofs
return; | mov BYTEREGION->start, rax
| mov BYTEREGION->discard, rax
| add rax, ARG3_64
| mov BYTEREGION->end, rax
| mov BYTEREGION->fetch, rax // Fast path ensures whole string is loaded
| mov ARG3_64, BYTEREGION
break;
// Will dispatch callbacks and call submessage in a second. // Will dispatch callbacks and call submessage in a second.
case UPB_TYPE(MESSAGE): case UPB_TYPE(MESSAGE):
@ -471,7 +474,6 @@ static void upb_decoder_jit_callcb(upb_decoder *d, upb_fhandlers *f) {
| callp f->endsubmsg | callp f->endsubmsg
} }
| popframe upb_fhandlers_getmsg(f) | popframe upb_fhandlers_getmsg(f)
} else { } else {
| mov ARG1_64, CLOSURE | mov ARG1_64, CLOSURE
// Test for callbacks we can specialize. // Test for callbacks we can specialize.
@ -522,8 +524,8 @@ static void upb_decoder_jit_callcb(upb_decoder *d, upb_fhandlers *f) {
} }
// PTR should point to the beginning of the tag. // PTR should point to the beginning of the tag.
static void upb_decoder_jit_field(upb_decoder *d, uint32_t tag, uint32_t next_tag, static void upb_decoder_jit_field(upb_decoder *d, uint32_t tag,
upb_mhandlers *m, uint32_t next_tag, upb_mhandlers *m,
upb_fhandlers *f, upb_fhandlers *next_f) { upb_fhandlers *f, upb_fhandlers *next_f) {
// PC-label for the dispatch table. // PC-label for the dispatch table.
// We check the wire type (which must be loaded in edx) because the // We check the wire type (which must be loaded in edx) because the
@ -546,7 +548,14 @@ static void upb_decoder_jit_field(upb_decoder *d, uint32_t tag, uint32_t next_ta
|1: // Label for repeating this field. |1: // Label for repeating this field.
upb_decoder_jit_decodefield(d, m, f->type, upb_value_size(tag)); int tag_size = upb_value_size(tag);
if (f->type == UPB_TYPE_ENDGROUP) {
| add PTR, tag_size
| jmp =>m->jit_endofmsg_pclabel
return;
}
upb_decoder_jit_decodefield(d, m, f->type, tag_size);
upb_decoder_jit_callcb(d, f); upb_decoder_jit_callcb(d, f);
// Epilogue: load next tag, check for repeated field. // Epilogue: load next tag, check for repeated field.
@ -673,7 +682,7 @@ static void upb_decoder_jit(upb_decoder *d) {
| sub rsp, 8 | sub rsp, 8
| mov DECODER, ARG1_64 | mov DECODER, ARG1_64
| mov FRAME, DECODER:ARG1_64->dispatcher.top | mov FRAME, DECODER:ARG1_64->dispatcher.top
| lea STRREF, DECODER:ARG1_64->strref | lea BYTEREGION, DECODER:ARG1_64->str_byteregion
| mov CLOSURE, FRAME->closure | mov CLOSURE, FRAME->closure
| mov PTR, DECODER->ptr | mov PTR, DECODER->ptr

@ -23,7 +23,7 @@ void upb_strtomsg(const char *str, size_t len, void *msg, const upb_msgdef *md,
upb_accessors_reghandlers(h, md); upb_accessors_reghandlers(h, md);
upb_decoder_init(&d, h); upb_decoder_init(&d, h);
upb_handlers_unref(h); upb_handlers_unref(h);
upb_decoder_reset(&d, upb_stringsrc_bytesrc(&strsrc), 0, UINT64_MAX, msg); upb_decoder_reset(&d, upb_stringsrc_allbytes(&strsrc), msg);
upb_decoder_decode(&d, status); upb_decoder_decode(&d, status);
upb_stringsrc_uninit(&strsrc); upb_stringsrc_uninit(&strsrc);
@ -84,16 +84,19 @@ upb_def **upb_load_defs_from_descriptor(const char *str, size_t len, int *n,
upb_handlers_unref(h); upb_handlers_unref(h);
upb_descreader r; upb_descreader r;
upb_descreader_init(&r); upb_descreader_init(&r);
upb_decoder_reset(&d, upb_stringsrc_bytesrc(&strsrc), 0, UINT64_MAX, &r); upb_decoder_reset(&d, upb_stringsrc_allbytes(&strsrc), &r);
upb_decoder_decode(&d, status); upb_decoder_decode(&d, status);
upb_stringsrc_uninit(&strsrc);
upb_decoder_uninit(&d);
if (!upb_ok(status)) {
upb_descreader_uninit(&r);
return NULL;
}
upb_def **defs = upb_descreader_getdefs(&r, n); upb_def **defs = upb_descreader_getdefs(&r, n);
upb_def **defscopy = malloc(sizeof(upb_def*) * (*n)); upb_def **defscopy = malloc(sizeof(upb_def*) * (*n));
memcpy(defscopy, defs, sizeof(upb_def*) * (*n)); memcpy(defscopy, defs, sizeof(upb_def*) * (*n));
upb_descreader_uninit(&r); upb_descreader_uninit(&r);
upb_stringsrc_uninit(&strsrc);
upb_decoder_uninit(&d);
// Set default accessors and layouts on all messages. // Set default accessors and layouts on all messages.
for(int i = 0; i < *n; i++) { for(int i = 0; i < *n; i++) {

@ -35,15 +35,16 @@ err:
return -1; return -1;
} }
static int upb_textprinter_putescaped(upb_textprinter *p, const upb_strref *strref, static int upb_textprinter_putescaped(upb_textprinter *p,
const upb_byteregion *bytes,
bool preserve_utf8) { bool preserve_utf8) {
// Based on CEscapeInternal() from Google's protobuf release. // Based on CEscapeInternal() from Google's protobuf release.
// TODO; we could read directly from a bytesrc's buffer instead. // TODO; we could read directly from a bytesrc's buffer instead.
// TODO; we could write strrefs to the sink when possible. // TODO; we could write byteregions to the sink when possible.
char dstbuf[4096], *dst = dstbuf, *dstend = dstbuf + sizeof(dstbuf); char dstbuf[4096], *dst = dstbuf, *dstend = dstbuf + sizeof(dstbuf);
char *buf = malloc(strref->len), *src = buf; char *buf = malloc(upb_byteregion_len(bytes)), *src = buf;
char *end = src + strref->len; char *end = src + upb_byteregion_len(bytes);
upb_bytesrc_read(strref->bytesrc, strref->stream_offset, strref->len, buf); upb_byteregion_copyall(bytes, buf);
// I think hex is prettier and more useful, but proto2 uses octal; should // I think hex is prettier and more useful, but proto2 uses octal; should
// investigate whether it can parse hex also. // investigate whether it can parse hex also.
@ -142,7 +143,7 @@ static upb_flow_t upb_textprinter_putstr(void *_p, upb_value fval,
uint64_t start_ofs = upb_bytesink_getoffset(p->sink); uint64_t start_ofs = upb_bytesink_getoffset(p->sink);
const upb_fielddef *f = upb_value_getfielddef(fval); const upb_fielddef *f = upb_value_getfielddef(fval);
CHECK(upb_bytesink_putc(p->sink, '"')); CHECK(upb_bytesink_putc(p->sink, '"'));
CHECK(upb_textprinter_putescaped(p, upb_value_getstrref(val), CHECK(upb_textprinter_putescaped(p, upb_value_getbyteregion(val),
f->type == UPB_TYPE(STRING))); f->type == UPB_TYPE(STRING)));
CHECK(upb_bytesink_putc(p->sink, '"')); CHECK(upb_bytesink_putc(p->sink, '"'));
return UPB_CONTINUE; return UPB_CONTINUE;

@ -113,7 +113,7 @@ INLINE upb_decoderet upb_vdecode_max8_fast(upb_decoderet r) {
/* Encoding *******************************************************************/ /* Encoding *******************************************************************/
INLINE size_t upb_value_size(uint64_t val) { INLINE int upb_value_size(uint64_t val) {
#ifdef __GNUC__ #ifdef __GNUC__
int high_bit = 63 - __builtin_clzll(val); // 0-based, undef if val == 0. int high_bit = 63 - __builtin_clzll(val); // 0-based, undef if val == 0.
#else #else

@ -20,6 +20,7 @@
#define UPB_TABLE_H_ #define UPB_TABLE_H_
#include <assert.h> #include <assert.h>
#include <stddef.h>
#include "upb.h" #include "upb.h"
#ifdef __cplusplus #ifdef __cplusplus

@ -71,8 +71,9 @@ void upb_status_seterrliteral(upb_status *status, const char *msg) {
status->space = NULL; status->space = NULL;
} }
void upb_status_copy(upb_status *to, upb_status *from) { void upb_status_copy(upb_status *to, const upb_status *from) {
to->status = from->status; to->status = from->status;
to->eof = from->eof;
to->code = from->code; to->code = from->code;
to->space = from->space; to->space = from->space;
if (from->str == from->buf) { if (from->str == from->buf) {
@ -100,6 +101,7 @@ const char *upb_status_getstr(const upb_status *_status) {
void upb_status_clear(upb_status *status) { void upb_status_clear(upb_status *status) {
status->status = UPB_OK; status->status = UPB_OK;
status->eof = false;
status->code = 0; status->code = 0;
status->space = NULL; status->space = NULL;
status->str = NULL; status->str = NULL;
@ -124,7 +126,7 @@ void upb_status_fromerrno(upb_status *status) {
upb_errorspace upb_posix_errorspace = {"POSIX", NULL}; // TODO upb_errorspace upb_posix_errorspace = {"POSIX", NULL}; // TODO
int upb_vrprintf(char **buf, size_t *size, size_t ofs, int upb_vrprintf(char **buf, uint32_t *size, uint32_t ofs,
const char *fmt, va_list args) { const char *fmt, va_list args) {
// Try once without reallocating. We have to va_copy because we might have // Try once without reallocating. We have to va_copy because we might have
// to call vsnprintf again. // to call vsnprintf again.

@ -12,7 +12,6 @@
#include <stdbool.h> #include <stdbool.h>
#include <stdint.h> #include <stdint.h>
#include <stdio.h> // only for size_t.
#include <assert.h> #include <assert.h>
#include <stdarg.h> #include <stdarg.h>
#include "descriptor_const.h" #include "descriptor_const.h"
@ -40,7 +39,7 @@ INLINE void nop_printf(const char *fmt, ...) { (void)fmt; }
#endif #endif
// Rounds val up to the next multiple of align. // Rounds val up to the next multiple of align.
INLINE size_t upb_align_up(size_t val, size_t align) { INLINE uint32_t upb_align_up(uint32_t val, uint32_t align) {
return val % align == 0 ? val : val + align - (val % align); return val % align == 0 ? val : val + align - (val % align);
} }
@ -124,7 +123,7 @@ extern const upb_type_info upb_types[];
/* upb_value ******************************************************************/ /* upb_value ******************************************************************/
struct _upb_strref; struct _upb_byteregion;
struct _upb_fielddef; struct _upb_fielddef;
// Special constants for the upb_value.type field. These must not conflict // Special constants for the upb_value.type field. These must not conflict
@ -144,7 +143,7 @@ typedef struct {
int64_t int64; int64_t int64;
uint32_t uint32; uint32_t uint32;
bool _bool; bool _bool;
const struct _upb_strref *strref; struct _upb_byteregion *byteregion;
const struct _upb_fielddef *fielddef; const struct _upb_fielddef *fielddef;
void *_void; void *_void;
} val; } val;
@ -194,11 +193,13 @@ UPB_VALUE_ACCESSORS(uint32, uint32, uint32_t, UPB_TYPE(UINT32));
UPB_VALUE_ACCESSORS(uint64, uint64, uint64_t, UPB_TYPE(UINT64)); UPB_VALUE_ACCESSORS(uint64, uint64, uint64_t, UPB_TYPE(UINT64));
UPB_VALUE_ACCESSORS(bool, _bool, bool, UPB_TYPE(BOOL)); UPB_VALUE_ACCESSORS(bool, _bool, bool, UPB_TYPE(BOOL));
UPB_VALUE_ACCESSORS(ptr, _void, void*, UPB_VALUETYPE_PTR); UPB_VALUE_ACCESSORS(ptr, _void, void*, UPB_VALUETYPE_PTR);
UPB_VALUE_ACCESSORS(byteregion, byteregion, struct _upb_byteregion*,
UPB_TYPE(STRING));
// upb_fielddef and upb_strref should never be modified from a callback // upb_fielddef should never be modified from a callback
// (ie. when they're getting passed through a upb_value). // (ie. when they're getting passed through a upb_value).
UPB_VALUE_ACCESSORS(strref, strref, const struct _upb_strref*, UPB_TYPE(STRING)); UPB_VALUE_ACCESSORS(fielddef, fielddef, const struct _upb_fielddef*,
UPB_VALUE_ACCESSORS(fielddef, fielddef, const struct _upb_fielddef*, UPB_VALUETYPE_FIELDDEF); UPB_VALUETYPE_FIELDDEF);
extern upb_value UPB_NO_VALUE; extern upb_value UPB_NO_VALUE;
@ -215,43 +216,46 @@ typedef struct {
const char *name; const char *name;
// Writes a NULL-terminated string to "buf" containing an error message for // Writes a NULL-terminated string to "buf" containing an error message for
// the given error code, returning false if the message was too large to fit. // the given error code, returning false if the message was too large to fit.
bool (*code_to_string)(int code, char *buf, size_t len); bool (*code_to_string)(int code, char *buf, uint32_t len);
} upb_errorspace; } upb_errorspace;
// TODO: consider adding error space and code, to let ie. errno be stored
// as a proper code, or application-specific error codes.
typedef struct { typedef struct {
char status; char status;
bool eof;
int code; // Can be set to a more specific code (defined by error space). int code; // Can be set to a more specific code (defined by error space).
upb_errorspace *space; upb_errorspace *space;
const char *str; // NULL when no message is present. NULL-terminated. const char *str; // NULL when no message is present. NULL-terminated.
char *buf; // Owned by the status. char *buf; // Owned by the status.
size_t bufsize; uint32_t bufsize;
} upb_status; } upb_status;
#define UPB_STATUS_INIT {UPB_OK, 0, NULL, NULL, NULL, 0} #define UPB_STATUS_INIT {UPB_OK, false, 0, NULL, NULL, NULL, 0}
void upb_status_init(upb_status *status); void upb_status_init(upb_status *status);
void upb_status_uninit(upb_status *status); void upb_status_uninit(upb_status *status);
INLINE bool upb_ok(upb_status *status) { return status->code == UPB_OK; } INLINE bool upb_ok(const upb_status *status) { return status->code == UPB_OK; }
INLINE bool upb_eof(const upb_status *status) { return status->eof; }
void upb_status_clear(upb_status *status); void upb_status_clear(upb_status *status);
void upb_status_seterrliteral(upb_status *status, const char *msg); void upb_status_seterrliteral(upb_status *status, const char *msg);
void upb_status_seterrf(upb_status *s, const char *msg, ...); void upb_status_seterrf(upb_status *s, const char *msg, ...);
void upb_status_setcode(upb_status *s, upb_errorspace *space, int code); void upb_status_setcode(upb_status *s, upb_errorspace *space, int code);
INLINE void upb_status_seteof(upb_status *s) { s->eof = true; }
// The returned string is invalidated by any other call into the status. // The returned string is invalidated by any other call into the status.
const char *upb_status_getstr(const upb_status *s); const char *upb_status_getstr(const upb_status *s);
void upb_status_copy(upb_status *to, upb_status *from); void upb_status_copy(upb_status *to, const upb_status *from);
extern upb_errorspace upb_posix_errorspace; extern upb_errorspace upb_posix_errorspace;
void upb_status_fromerrno(upb_status *status); void upb_status_fromerrno(upb_status *status);
// Like vaprintf, but uses *buf (which can be NULL) as a starting point and // Like vasprintf (which allocates a string large enough for the result), but
// reallocates it only if the new value will not fit. "size" is updated to // uses *buf (which can be NULL) as a starting point and reallocates it only if
// reflect the allocated size of the buffer. Returns false on memory alloc // the new value will not fit. "size" is updated to reflect the allocated size
// failure. // of the buffer. Starts writing at the given offset into the string; bytes
int upb_vrprintf(char **buf, size_t *size, size_t ofs, // preceding this offset are unaffected. Returns the new length of the string,
// or -1 on memory allocation failure.
int upb_vrprintf(char **buf, uint32_t *size, uint32_t ofs,
const char *fmt, va_list args); const char *fmt, va_list args);
#ifdef __cplusplus #ifdef __cplusplus

Loading…
Cancel
Save