Text format serializer for upb_msg (#242)

* WIP, first version of encoder.

* More progress on text encoder.

* A lot of progress on the text printer.

* Added textencode header file.

* Text encoder now passes conformance tests.

These aren't very stringent though, and more testing is needed.

* Print text into static buffer.  Passes all conformance tests.

* Fixed kokoro errors.

* Fix for indent depth when printing map fields.
pull/13171/head
Joshua Haberman 5 years ago committed by GitHub
parent 888f35cae6
commit ce1a399a19
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 22
      BUILD
  2. 5
      CMakeLists.txt
  3. 41
      tests/conformance_upb.c
  4. 10
      upb/def.c
  5. 1
      upb/def.h
  6. 2
      upb/msg.c
  7. 60
      upb/reflection.c
  8. 27
      upb/reflection.h
  9. 393
      upb/textencode.c
  10. 27
      upb/textencode.h

22
BUILD

@ -142,6 +142,20 @@ cc_library(
],
)
cc_library(
name = "textformat",
srcs = [
"upb/textencode.c",
],
hdrs = [
"upb/textencode.h",
],
visibility = ["//visibility:public"],
deps = [
":reflection",
],
)
# Internal C/C++ libraries #####################################################
cc_library(
@ -522,6 +536,12 @@ upb_proto_library(
deps = ["@com_google_protobuf//:conformance_proto"],
)
upb_proto_reflection_library(
name = "conformance_proto_upbdefs",
testonly = 1,
deps = ["@com_google_protobuf//:conformance_proto"],
)
upb_proto_reflection_library(
name = "test_messages_proto2_upbdefs",
testonly = 1,
@ -546,9 +566,11 @@ cc_binary(
}) + ["-Ibazel-out/k8-fastbuild/bin"],
deps = [
":conformance_proto_upb",
":conformance_proto_upbdefs",
":test_messages_proto2_upbdefs",
":test_messages_proto3_upbdefs",
":reflection",
":textformat",
":upb",
],
)

@ -88,6 +88,11 @@ target_link_libraries(reflection
port
table
upb)
add_library(textformat
upb/textencode.c
upb/textencode.h)
target_link_libraries(textformat
reflection)
add_library(table INTERFACE)
target_link_libraries(table INTERFACE
port

@ -9,11 +9,16 @@
#include <unistd.h>
#include "conformance/conformance.upb.h"
#include "conformance/conformance.upbdefs.h"
#include "src/google/protobuf/test_messages_proto2.upbdefs.h"
#include "src/google/protobuf/test_messages_proto3.upbdefs.h"
#include "upb/decode.h"
#include "upb/encode.h"
#include "upb/reflection.h"
#include "upb/textencode.h"
int test_count = 0;
bool verbose = false; /* Set to true to get req/resp printed on stderr. */
bool CheckedRead(int fd, void *buf, size_t len) {
size_t ofs = 0;
@ -74,6 +79,22 @@ void serialize_proto(const upb_msg *msg, const upb_msgdef *m, const ctx *c) {
}
}
/* Serializes |msg| to text format and stores the result as the text payload of
 * the conformance response.  Unknown fields are omitted unless the request
 * asks for them to be printed. */
void serialize_text(const upb_msg *msg, const upb_msgdef *m, const ctx *c) {
  const bool want_unknown =
      conformance_ConformanceRequest_print_unknown_fields(c->request);
  const int opts = want_unknown ? 0 : UPB_TXTENC_SKIPUNKNOWN;
  size_t needed;
  size_t written;
  char *text;

  /* First pass uses an empty buffer, so it only measures the output. */
  needed = upb_textencode(msg, m, c->symtab, opts, NULL, 0);

  /* Second pass writes into an arena buffer with room for the terminator. */
  text = upb_arena_malloc(c->arena, needed + 1);
  written = upb_textencode(msg, m, c->symtab, opts, text, needed + 1);
  assert(needed == written);

  conformance_ConformanceResponse_set_text_payload(
      c->response, upb_strview_make(text, needed));
}
bool parse_input(upb_msg *msg, const upb_msgdef *m, const ctx* c) {
switch (conformance_ConformanceRequest_payload_case(c->request)) {
case conformance_ConformanceRequest_payload_protobuf_payload:
@ -98,6 +119,9 @@ void write_output(const upb_msg *msg, const upb_msgdef *m, const ctx* c) {
case conformance_PROTOBUF:
serialize_proto(msg, m, c);
break;
case conformance_TEXT_FORMAT:
serialize_text(msg, m, c);
break;
default: {
static const char msg[] = "Unsupported output format.";
conformance_ConformanceResponse_set_skipped(
@ -126,7 +150,14 @@ void DoTest(const ctx* c) {
}
}
bool DoTestIo(const upb_symtab *symtab) {
/* Writes "<label>: <msg as single-line text format>" to stderr.  The text is
 * rendered into a fixed 512-byte local buffer. */
void debug_print(const char *label, const upb_msg *msg, const upb_msgdef *m,
                 const ctx *c) {
  char text[512];
  const int opts = UPB_TXTENC_SINGLELINE;

  upb_textencode(msg, m, c->symtab, opts, text, sizeof(text));
  fprintf(stderr, "%s: %s\n", label, text);
}
bool DoTestIo(upb_symtab *symtab) {
upb_status status;
char *input;
char *output;
@ -166,6 +197,14 @@ bool DoTestIo(const upb_symtab *symtab) {
test_count++;
if (verbose) {
debug_print("Request", c.request,
conformance_ConformanceRequest_getmsgdef(symtab), &c);
debug_print("Response", c.response,
conformance_ConformanceResponse_getmsgdef(symtab), &c);
fprintf(stderr, "\n");
}
upb_arena_free(c.arena);
return true;

@ -610,6 +610,7 @@ bool upb_fielddef_hassubdef(const upb_fielddef *f) {
/* Returns whether |f| tracks explicit presence: never for repeated fields,
 * always for submessages and oneof members, and otherwise only when the
 * defining file uses proto2 syntax. */
bool upb_fielddef_haspresence(const upb_fielddef *f) {
  if (upb_fielddef_isseq(f)) {
    return false;
  }
  if (upb_fielddef_issubmsg(f) || upb_fielddef_containingoneof(f)) {
    return true;
  }
  return f->file->syntax == UPB_SYNTAX_PROTO2;
}
@ -707,6 +708,11 @@ const upb_msglayout *upb_msgdef_layout(const upb_msgdef *m) {
return m->layout;
}
/* Returns the field at position |i| in |m|'s field array, or NULL when |i| is
 * outside [0, field_count).  Negative indexes are rejected explicitly so that
 * a bad iterator value cannot index before the start of the array. */
const upb_fielddef *_upb_msgdef_field(const upb_msgdef *m, int i) {
  if (i < 0 || i >= m->field_count) return NULL;
  return &m->fields[i];
}
/* Returns the map_entry flag stored on the message definition |m|. */
bool upb_msgdef_mapentry(const upb_msgdef *m) {
return m->map_entry;
}
@ -951,7 +957,9 @@ static bool make_layout(const upb_symtab *symtab, const upb_msgdef *m) {
}
if (upb_fielddef_haspresence(f) && !upb_fielddef_containingoneof(f)) {
field->presence = (hasbit++);
/* We don't use hasbit 0, so that 0 can indicate "no presence" in the
* table. This wastes one hasbit, but we don't worry about it for now. */
field->presence = ++hasbit;
} else {
field->presence = 0;
}

@ -427,6 +427,7 @@ const upb_oneofdef *upb_msgdef_ntoo(const upb_msgdef *m, const char *name,
int upb_msgdef_numfields(const upb_msgdef *m);
int upb_msgdef_numoneofs(const upb_msgdef *m);
const upb_msglayout *upb_msgdef_layout(const upb_msgdef *m);
const upb_fielddef *_upb_msgdef_field(const upb_msgdef *m, int i);
UPB_INLINE const upb_oneofdef *upb_msgdef_ntooz(const upb_msgdef *m,
const char *name) {

@ -94,7 +94,7 @@ void upb_msg_addunknown(upb_msg *msg, const char *data, size_t len,
}
const char *upb_msg_getunknown(const upb_msg *msg, size_t *len) {
const upb_msg_internal* in = upb_msg_getinternal_const(msg);
const upb_msg_internal *in = upb_msg_getinternal_const(msg);
*len = in->unknown_len;
return in->unknown;
}

@ -66,27 +66,36 @@ static uint32_t *oneofcase(const upb_msg *msg,
return PTR_AT(msg, ~field->presence, uint32_t);
}
/* Copies the stored bytes for field |f| out of |msg| without consulting any
 * presence information.  Repeated fields copy a pointer-sized value; other
 * fields copy field_size[descriptortype] bytes.  Bytes of the result that are
 * not copied stay zero. */
static upb_msgval _upb_msg_getraw(const upb_msg *msg, const upb_fielddef *f) {
  const upb_msglayout_field *layout = upb_fielddef_layout(f);
  upb_msgval raw = {0};
  int nbytes;

  if (upb_fielddef_isseq(f)) {
    nbytes = sizeof(void *);
  } else {
    nbytes = field_size[layout->descriptortype];
  }

  memcpy(&raw, PTR_AT(msg, layout->offset, char), nbytes);
  return raw;
}
/* Returns true if field |f| is present in |msg|.
 *
 * Three cases, selected by the field's layout entry:
 *   - oneof member: present when the oneof case equals this field's number.
 *   - positive |presence|: it is a hasbit index; present when that bit is set.
 *   - otherwise: only message/group fields are expected here, and they are
 *     present when the stored submessage pointer is non-NULL.
 *
 * NOTE(review): the unconditional UPB_ASSERT(field->presence) and the bare
 * "} else {" from the pre-change version are dropped, because they contradict
 * the presence == 0 branch below -- confirm against the committed file. */
bool upb_msg_has(const upb_msg *msg, const upb_fielddef *f) {
  const upb_msglayout_field *field = upb_fielddef_layout(f);
  if (in_oneof(field)) {
    return *oneofcase(msg, field) == field->number;
  } else if (field->presence > 0) {
    uint32_t hasbit = field->presence;
    /* Test the bit with '&'.  The previous '|' made this expression non-zero
     * for every field, so unset fields were reported as present. */
    return (*PTR_AT(msg, hasbit / 8, char) & (1 << (hasbit % 8))) != 0;
  } else {
    UPB_ASSERT(field->descriptortype == UPB_DESCRIPTOR_TYPE_MESSAGE ||
               field->descriptortype == UPB_DESCRIPTOR_TYPE_GROUP);
    return _upb_msg_getraw(msg, f).msg_val != NULL;
  }
}
upb_msgval upb_msg_get(const upb_msg *msg, const upb_fielddef *f) {
const upb_msglayout_field *field = upb_fielddef_layout(f);
const char *mem = PTR_AT(msg, field->offset, char);
upb_msgval val;
if (field->presence == 0 || upb_msg_has(msg, f)) {
int size = upb_fielddef_isseq(f) ? sizeof(void *)
: field_size[field->descriptortype];
memcpy(&val, mem, size);
if (!upb_fielddef_haspresence(f) || upb_msg_has(msg, f)) {
return _upb_msg_getraw(msg, f);
} else {
/* TODO(haberman): change upb_fielddef to not require this switch(). */
upb_msgval val = {0};
switch (upb_fielddef_type(f)) {
case UPB_TYPE_INT32:
case UPB_TYPE_ENUM:
@ -118,8 +127,8 @@ upb_msgval upb_msg_get(const upb_msg *msg, const upb_fielddef *f) {
val.msg_val = NULL;
break;
}
return val;
}
return val;
}
upb_mutmsgval upb_msg_mutable(upb_msg *msg, const upb_fielddef *f,
@ -157,7 +166,36 @@ void upb_msg_set(upb_msg *msg, const upb_fielddef *f, upb_msgval val,
}
}
#undef DEREF
/* Advances |*iter| to the next field of |m| that is set in |msg|.
 *
 * On success stores the field in |*out_f|, its raw value in |*out_val|, and
 * returns true.  Returns false once _upb_msgdef_field() yields NULL.  In both
 * cases |*iter| is updated to the last index examined.  |ext_pool| is not
 * consulted by this implementation. */
bool upb_msg_next(const upb_msg *msg, const upb_msgdef *m,
                  const upb_symtab *ext_pool, const upb_fielddef **out_f,
                  upb_msgval *out_val, size_t *iter) {
  const upb_msgval empty = {0};
  int idx = *iter;

  for (;;) {
    const upb_fielddef *f = _upb_msgdef_field(m, ++idx);
    upb_msgval val;

    if (f == NULL) break;
    val = _upb_msg_getraw(msg, f);

    if (upb_fielddef_haspresence(f)) {
      /* Fields with explicit presence are skipped when not set. */
      if (!upb_msg_has(msg, f)) continue;
    } else {
      /* Fields without presence are skipped when their value is all zero. */
      upb_msgval probe = val;
      if (upb_fielddef_isstring(f) && !upb_fielddef_isseq(f)) {
        /* An empty string may still carry a non-NULL data pointer, so only
         * the size takes part in the comparison. */
        probe.str_val.data = NULL;
      }
      if (memcmp(&probe, &empty, sizeof(probe)) == 0) continue;
    }

    *out_val = val;
    *out_f = f;
    *iter = idx;
    return true;
  }

  *iter = idx;
  return false;
}
/** upb_array *****************************************************************/

@ -52,6 +52,33 @@ void upb_msg_set(upb_msg *msg, const upb_fielddef *f, upb_msgval val,
/* Clears any field presence and sets the value back to its default. */
void upb_msg_clearfield(upb_msg *msg, const upb_fielddef *f);
/* Iterate over present fields.
*
* size_t iter = UPB_MSG_BEGIN;
* const upb_fielddef *f;
* upb_msgval val;
* while (upb_msg_next(msg, m, ext_pool, &f, &val, &iter)) {
* process_field(f, val);
* }
*
* If ext_pool is NULL, no extensions will be returned. If the given symtab
* returns extensions that don't match what is in this message, those extensions
* will be skipped.
*/
#define UPB_MSG_BEGIN -1
bool upb_msg_next(const upb_msg *msg, const upb_msgdef *m,
const upb_symtab *ext_pool, const upb_fielddef **f,
upb_msgval *val, size_t *iter);
/* Adds unknown data (serialized protobuf data) to the given message. The data
* is copied into the message instance. */
void upb_msg_addunknown(upb_msg *msg, const char *data, size_t len,
upb_arena *arena);
/* Returns a reference to the message's unknown data. */
const char *upb_msg_getunknown(const upb_msg *msg, size_t *len);
/** upb_array *****************************************************************/
/* Creates a new array on the given arena that holds elements of this type. */

@ -0,0 +1,393 @@
#include "upb/textencode.h"
#include <ctype.h>
#include <float.h>
#include <inttypes.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#include "upb/reflection.h"
#include "upb/port_def.inc"
/* State for a single text-encoding pass. */
typedef struct {
  char *buf;        /* Start of the caller-supplied output buffer. */
  char *ptr;        /* Next write position, between buf and end. */
  char *end;        /* One past the last byte of the output buffer. */
  size_t overflow;  /* Count of output bytes that did not fit in the buffer. */
  int indent_depth; /* Current nesting level, used by txtenc_indent(). */
  int options;      /* Bitmask of UPB_TXTENC_* flags. */
  const upb_symtab *ext_pool; /* Passed through to upb_msg_next(). */
} txtenc;
static void txtenc_msg(txtenc *e, const upb_msg *msg, const upb_msgdef *m);
#define CHK(x) do { if (!(x)) { return false; } } while(0)
/* Appends |len| bytes from |data| to the output.
 *
 * As many bytes as fit are copied into the buffer; the remainder is only
 * counted in e->overflow so the caller can learn the full output size.
 *
 * memcpy() is skipped when nothing fits.  In sizing mode the buffer pointer is
 * NULL, and passing a null pointer to memcpy() is undefined behavior even for
 * a zero length. */
static void txtenc_putbytes(txtenc *e, const void *data, size_t len) {
  size_t have = e->end - e->ptr;
  size_t n = len < have ? len : have;

  if (n > 0) {
    memcpy(e->ptr, data, n);
    e->ptr += n;
  }
  e->overflow += len - n;
}
/* Appends the NUL-terminated string |str| (without its terminator). */
static void txtenc_putstr(txtenc *e, const char *str) {
  const size_t n = strlen(str);
  txtenc_putbytes(e, str, n);
}
/* Appends printf-style formatted text to the output.
 *
 * The text is formatted directly into the remaining buffer space.  If it does
 * not fit entirely, the write position moves to the end of the buffer and the
 * shortfall is added to e->overflow.
 *
 * NOTE(review): assumes _upb_vsnprintf() follows vsnprintf() semantics (it
 * returns the untruncated length, or a negative value on error) -- confirm in
 * the port header.  A negative result is treated as "nothing written" rather
 * than being converted to a huge size_t. */
static void txtenc_printf(txtenc *e, const char *fmt, ...) {
  size_t have = e->end - e->ptr;
  size_t n;
  int rc;
  va_list args;

  va_start(args, fmt);
  rc = _upb_vsnprintf(e->ptr, have, fmt, args);
  va_end(args);

  if (rc < 0) return;
  n = (size_t)rc;

  if (UPB_LIKELY(have > n)) {
    e->ptr += n;
  } else {
    /* The formatter used the last byte for its terminator; txtenc_nullz()
     * accounts for that when it finalizes the buffer. */
    e->ptr += have;
    e->overflow += (n - have);
  }
}
/* Emits one indentation unit per nesting level.  Nothing is emitted in
 * single-line mode. */
static void txtenc_indent(txtenc *e) {
  int level;

  if (e->options & UPB_TXTENC_SINGLELINE) return;

  for (level = 0; level < e->indent_depth; level++) {
    txtenc_putstr(e, " ");
  }
}
/* Emits the separator that follows a field: a space in single-line mode, a
 * newline otherwise. */
static void txtenc_endfield(txtenc *e) {
  const bool single_line = (e->options & UPB_TXTENC_SINGLELINE) != 0;
  txtenc_putstr(e, single_line ? " " : "\n");
}
/* Emits enum value |val| for field |f|: the symbolic name when the enum
 * definition has one for this number, otherwise the number itself. */
static void txtenc_enum(int32_t val, const upb_fielddef *f, txtenc *e) {
  const char *label = upb_enumdef_iton(upb_fielddef_enumsubdef(f), val);

  if (label == NULL) {
    txtenc_printf(e, "%" PRId32, val);
    return;
  }
  txtenc_printf(e, "%s", label);
}
/* Emits |str| as a double-quoted, escaped literal.
 *
 * Newline, carriage return, tab, both quote characters and backslash get
 * their two-character escapes.  Other non-printable characters are written as
 * three-digit octal escapes.  When |bytes| is false, bytes >= 0x80 are passed
 * through unchanged; when |bytes| is true they are subject to the same
 * printable check as every other byte.
 *
 * Each byte is converted to unsigned char before it reaches isprint().
 * Passing a negative plain char to a <ctype.h> function is undefined
 * behavior, which the previous code did for high bytes in bytes fields. */
static void txtenc_string(txtenc *e, upb_strview str, bool bytes) {
  const char *ptr = str.data;
  const char *end = ptr + str.size;

  txtenc_putstr(e, "\"");

  while (ptr < end) {
    unsigned char ch = (unsigned char)*ptr;
    switch (ch) {
      case '\n':
        txtenc_putstr(e, "\\n");
        break;
      case '\r':
        txtenc_putstr(e, "\\r");
        break;
      case '\t':
        txtenc_putstr(e, "\\t");
        break;
      case '\"':
        txtenc_putstr(e, "\\\"");
        break;
      case '\'':
        txtenc_putstr(e, "\\'");
        break;
      case '\\':
        txtenc_putstr(e, "\\\\");
        break;
      default:
        if ((bytes || ch < 0x80) && !isprint(ch)) {
          txtenc_printf(e, "\\%03o", (unsigned)ch);
        } else {
          txtenc_putbytes(e, ptr, 1);
        }
        break;
    }
    ptr++;
  }

  txtenc_putstr(e, "\"");
}
/* Emits a single "name: value" entry for field |f| holding |val|, preceded by
 * indentation and followed by the field separator.
 *
 * Changes from the previous version:
 *   - float/double use %.9g / %.17g instead of %f.  %f always prints six
 *     fractional digits, so small values collapsed to 0.000000 and most
 *     values did not survive a round trip.  9 and 17 significant digits are
 *     enough to round-trip IEEE-754 single and double precision (assumes the
 *     platform's float/double are IEEE-754).
 *   - a field separator is emitted after the opening "{" of a submessage, the
 *     same way txtenc_map() and txtenc_unknown() do, so nested fields start
 *     on their own line in multi-line mode. */
static void txtenc_field(txtenc *e, upb_msgval val, const upb_fielddef *f) {
  txtenc_indent(e);
  txtenc_printf(e, "%s: ", upb_fielddef_name(f));

  switch (upb_fielddef_type(f)) {
    case UPB_TYPE_BOOL:
      txtenc_putstr(e, val.bool_val ? "true" : "false");
      break;
    case UPB_TYPE_FLOAT:
      /* The float is promoted to double by the variadic call. */
      txtenc_printf(e, "%.9g", val.float_val);
      break;
    case UPB_TYPE_DOUBLE:
      txtenc_printf(e, "%.17g", val.double_val);
      break;
    case UPB_TYPE_INT32:
      txtenc_printf(e, "%" PRId32, val.int32_val);
      break;
    case UPB_TYPE_UINT32:
      txtenc_printf(e, "%" PRIu32, val.uint32_val);
      break;
    case UPB_TYPE_INT64:
      txtenc_printf(e, "%" PRId64, val.int64_val);
      break;
    case UPB_TYPE_UINT64:
      txtenc_printf(e, "%" PRIu64, val.uint64_val);
      break;
    case UPB_TYPE_STRING:
      txtenc_string(e, val.str_val, false);
      break;
    case UPB_TYPE_BYTES:
      txtenc_string(e, val.str_val, true);
      break;
    case UPB_TYPE_ENUM:
      txtenc_enum(val.int32_val, f, e);
      break;
    case UPB_TYPE_MESSAGE:
      txtenc_putstr(e, "{");
      txtenc_endfield(e);
      e->indent_depth++;
      txtenc_msg(e, val.msg_val, upb_fielddef_msgsubdef(f));
      e->indent_depth--;
      txtenc_indent(e);
      txtenc_putstr(e, "}");
      break;
  }

  txtenc_endfield(e);
}
/*
 * Repeated fields are emitted as one entry per element, each carrying the
 * field name, e.g.
 *
 * foo_field: 1
 * foo_field: 2
 * foo_field: 3
 */
static void txtenc_array(txtenc *e, const upb_array *arr,
                         const upb_fielddef *f) {
  const size_t count = upb_array_size(arr);
  size_t idx = 0;

  while (idx < count) {
    txtenc_field(e, upb_array_get(arr, idx), f);
    idx++;
  }
}
/*
 * Map fields are emitted as one braced key/value entry per map element, e.g.
 *
 * foo_map: {
 * key: "abc"
 * value: 123
 * }
 * foo_map: {
 * key: "def"
 * value: 456
 * }
 *
 * The key and value fields are looked up in the map-entry message definition
 * as field numbers 1 and 2.
 */
static void txtenc_map(txtenc *e, const upb_map *map, const upb_fielddef *f) {
  const upb_msgdef *entry_def = upb_fielddef_msgsubdef(f);
  const upb_fielddef *key_field = upb_msgdef_itof(entry_def, 1);
  const upb_fielddef *value_field = upb_msgdef_itof(entry_def, 2);
  size_t pos;

  for (pos = UPB_MAP_BEGIN; upb_mapiter_next(map, &pos);) {
    upb_msgval k = upb_mapiter_key(map, pos);
    upb_msgval v = upb_mapiter_value(map, pos);

    /* Opening line of the entry. */
    txtenc_indent(e);
    txtenc_printf(e, "%s: {", upb_fielddef_name(f));
    txtenc_endfield(e);

    /* Key and value, one nesting level deeper. */
    e->indent_depth++;
    txtenc_field(e, k, key_field);
    txtenc_field(e, v, value_field);
    e->indent_depth--;

    /* Closing brace of the entry. */
    txtenc_indent(e);
    txtenc_putstr(e, "}");
    txtenc_endfield(e);
  }
}
/* Decodes a base-128 varint starting at |ptr| into |*val|.
 *
 * Returns the position just past the varint, or NULL if the input reaches
 * |limit| before the final byte or the varint is longer than ten bytes.  On
 * failure |*val| holds whatever bits were accumulated so far. */
static const char *txtenc_parsevarint(const char *ptr, const char *limit,
                                      uint64_t *val) {
  int shift = 0;

  *val = 0;
  for (;;) {
    uint8_t byte;

    if (shift >= 70 || ptr >= limit) return NULL;

    byte = *ptr++;
    *val |= (uint64_t)(byte & 0x7F) << shift;

    /* A clear continuation bit marks the last byte. */
    if ((byte & 0x80) == 0) return ptr;
    shift += 7;
  }
}
/*
 * Unknown fields are printed by number.
 *
 * 1001: 123
 * 1002: "hello"
 * 1006: 0xdeadbeef
 * 1003: {
 * 1: 111
 * }
 *
 * Parses wire-format data in [ptr, end) and prints each field.  |groupnum| is
 * the field number of the enclosing group, or -1 when not inside a group.
 *
 * Returns the position after the consumed data on success.  Returns NULL when
 * the data is malformed, or when a group runs out of input before its
 * END_GROUP tag.  On failure the caller is responsible for rewinding any
 * partial output.
 *
 * Changes from the previous version:
 *   - fixed32/fixed64 values are printed with hex conversions.  The old PRIu
 *     conversions printed a decimal number after the "0x" prefix.
 *   - the speculative "is this a submessage?" parse of a length-delimited
 *     field is limited to that field's payload (ptr + len).  It used to run
 *     to |end| and print the following sibling fields inside the braces.
 *   - tags with an unrecognized wire type are rejected instead of being
 *     printed as an empty field.
 */
static const char *txtenc_unknown(txtenc *e, const char *ptr, const char *end,
                                  int groupnum) {
  while (ptr < end) {
    uint64_t tag_64;
    uint32_t tag;

    CHK(ptr = txtenc_parsevarint(ptr, end, &tag_64));
    CHK(tag_64 < UINT32_MAX);
    tag = tag_64;

    if ((tag & 7) == UPB_WIRE_TYPE_END_GROUP) {
      /* Only valid if it closes the group we are currently inside. */
      CHK((int)(tag >> 3) == groupnum);
      return ptr;
    }

    txtenc_indent(e);
    txtenc_printf(e, "%d: ", (int)(tag >> 3));

    switch (tag & 7) {
      case UPB_WIRE_TYPE_VARINT: {
        uint64_t val;
        CHK(ptr = txtenc_parsevarint(ptr, end, &val));
        txtenc_printf(e, "%" PRIu64, val);
        break;
      }
      case UPB_WIRE_TYPE_32BIT: {
        uint32_t val;
        CHK(end - ptr >= 4);
        memcpy(&val, ptr, 4);
        ptr += 4;
        txtenc_printf(e, "0x%08" PRIx32, val);
        break;
      }
      case UPB_WIRE_TYPE_64BIT: {
        uint64_t val;
        CHK(end - ptr >= 8);
        memcpy(&val, ptr, 8);
        ptr += 8;
        txtenc_printf(e, "0x%016" PRIx64, val);
        break;
      }
      case UPB_WIRE_TYPE_DELIMITED: {
        uint64_t len;
        char *start = e->ptr;
        size_t start_overflow = e->overflow;

        CHK(ptr = txtenc_parsevarint(ptr, end, &len));
        CHK((uint64_t)(end - ptr) >= len);

        /* Speculatively try to parse the payload as a message. */
        txtenc_putstr(e, "{");
        txtenc_endfield(e);
        e->indent_depth++;
        if (txtenc_unknown(e, ptr, ptr + len, -1)) {
          e->indent_depth--;
          txtenc_indent(e);
          txtenc_putstr(e, "}");
        } else {
          /* Didn't work out: rewind the output and print as raw bytes. */
          upb_strview str;
          e->indent_depth--;
          e->ptr = start;
          e->overflow = start_overflow;
          str.data = ptr;
          str.size = len;
          txtenc_string(e, str, true);
        }
        ptr += len;
        break;
      }
      case UPB_WIRE_TYPE_START_GROUP:
        txtenc_putstr(e, "{");
        txtenc_endfield(e);
        e->indent_depth++;
        CHK(ptr = txtenc_unknown(e, ptr, end, tag >> 3));
        e->indent_depth--;
        txtenc_indent(e);
        txtenc_putstr(e, "}");
        break;
      default:
        /* Unrecognized wire type: the data is not valid wire format. */
        return NULL;
    }

    txtenc_endfield(e);
  }

  /* Running out of input is only a clean end when not inside a group. */
  return groupnum == -1 ? ptr : NULL;
}
/* Emits every set field of |msg| (described by |m|), followed by its unknown
 * fields unless UPB_TXTENC_SKIPUNKNOWN is set.
 *
 * If the unknown-field data fails to parse, none of it is printed.  Both the
 * write position and the overflow counter are rewound; the old code restored
 * only the position, so a failed parse that had spilled past the buffer left
 * the reported output size too large. */
static void txtenc_msg(txtenc *e, const upb_msg *msg,
                       const upb_msgdef *m) {
  size_t iter = UPB_MSG_BEGIN;
  const upb_fielddef *f;
  upb_msgval val;

  while (upb_msg_next(msg, m, e->ext_pool, &f, &val, &iter)) {
    if (upb_fielddef_ismap(f)) {
      txtenc_map(e, val.map_val, f);
    } else if (upb_fielddef_isseq(f)) {
      txtenc_array(e, val.array_val, f);
    } else {
      txtenc_field(e, val, f);
    }
  }

  if ((e->options & UPB_TXTENC_SKIPUNKNOWN) == 0) {
    size_t len;
    const char *ptr = upb_msg_getunknown(msg, &len);
    char *start = e->ptr;
    size_t start_overflow = e->overflow;

    if (ptr) {
      if (!txtenc_unknown(e, ptr, ptr + len, -1)) {
        /* Unknown failed to parse, back up and don't print it at all. */
        e->ptr = start;
        e->overflow = start_overflow;
      }
    }
  }
}
/* Finalizes the output and returns its total length, counting bytes that did
 * not fit in the buffer.
 *
 * When |size| is non-zero the buffer is NUL-terminated.  If the buffer is
 * completely full, the last byte is overwritten by the terminator.
 *
 * Declared static: this is a file-local helper that is not part of the public
 * header, so it should not be exported from the translation unit. */
static size_t txtenc_nullz(txtenc *e, size_t size) {
  size_t ret = e->ptr - e->buf + e->overflow;

  if (size > 0) {
    if (e->ptr == e->end) e->ptr--;
    *e->ptr = '\0';
  }

  return ret;
}
/* Public entry point: renders |msg| as text format into |buf| (capacity
 * |size|) and returns the full output length as computed by txtenc_nullz(). */
size_t upb_textencode(const upb_msg *msg, const upb_msgdef *m,
                      const upb_symtab *ext_pool, int options, char *buf,
                      size_t size) {
  txtenc enc;

  /* Output window. */
  enc.buf = buf;
  enc.ptr = buf;
  enc.end = buf + size;
  enc.overflow = 0;

  /* Formatting state. */
  enc.options = options;
  enc.ext_pool = ext_pool;
  enc.indent_depth = 0;

  txtenc_msg(&enc, msg, m);
  return txtenc_nullz(&enc, size);
}
#undef CHK

@ -0,0 +1,27 @@
#ifndef UPB_TEXTENCODE_H_
#define UPB_TEXTENCODE_H_
#include "upb/def.h"
enum {
/* When set, prints everything on a single line. */
UPB_TXTENC_SINGLELINE = 1,
/* When set, unknown fields are not printed. */
UPB_TXTENC_SKIPUNKNOWN = 2
};
/* Encodes the given |msg| to text format. The message's reflection is given in
* |m|. The symtab in |ext_pool| is used to find extensions (if NULL, extensions
* will not be printed).
*
* Output is placed in the given buffer and, when |size| > 0, is always
* NUL-terminated. The output size (excluding the NUL) is returned. This means
* that a return value >= |size| implies that the output was truncated. (These
* are the same semantics as snprintf()). */
size_t upb_textencode(const upb_msg *msg, const upb_msgdef *m,
const upb_symtab *ext_pool, int options, char *buf,
size_t size);
#endif /* UPB_TEXTENCODE_H_ */
Loading…
Cancel
Save