Protocol Buffers - Google's data interchange format (grpc依赖) https://developers.google.com/protocol-buffers/
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

441 lines
16 KiB

/*
* Copyright (c) 2009-2021, Google LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Google LLC nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
** Our memory representation for parsing tables and messages themselves.
** Functions in this file are used by generated code and possibly reflection.
**
** The definitions in this file are internal to upb.
**/
#ifndef UPB_MSG_INT_H_
#define UPB_MSG_INT_H_
#include <stdlib.h>
#include <string.h>
#include "upb/collections/map_internal.h"
#include "upb/extension_registry.h"
#include "upb/internal/table.h"
#include "upb/msg.h"
#include "upb/upb.h"
// Must be last.
#include "upb/port_def.inc"
#ifdef __cplusplus
extern "C" {
#endif
3 years ago
/** upb_*Int* conversion routines ********************************************/
UPB_INLINE int32_t _upb_Int32_FromI(int v) { return (int32_t)v; }
UPB_INLINE int64_t _upb_Int64_FromLL(long long v) { return (int64_t)v; }
UPB_INLINE uint32_t _upb_UInt32_FromU(unsigned v) { return (uint32_t)v; }
UPB_INLINE uint64_t _upb_UInt64_FromULL(unsigned long long v) {
return (uint64_t)v;
}
extern const float kUpb_FltInfinity;
extern const double kUpb_Infinity;
/** upb_MiniTable *************************************************************/
/* upb_MiniTable represents the memory layout of a given upb_MessageDef. The
* members are public so generated code can initialize them, but users MUST NOT
* read or write any of its members. */
typedef struct {
uint32_t number;
uint16_t offset;
3 years ago
int16_t presence; // If >0, hasbit_index. If <0, ~oneof_index
3 years ago
uint16_t submsg_index; // kUpb_NoSub if descriptortype != MESSAGE/GROUP/ENUM
uint8_t descriptortype;
uint8_t mode; /* upb_FieldMode | upb_LabelFlags |
(upb_FieldRep << kUpb_FieldRep_Shift) */
} upb_MiniTable_Field;
3 years ago
#define kUpb_NoSub ((uint16_t)-1)
typedef enum {
kUpb_FieldMode_Map = 0,
kUpb_FieldMode_Array = 1,
kUpb_FieldMode_Scalar = 2,
} upb_FieldMode;
// Mask to isolate the upb_FieldMode from field.mode.
#define kUpb_FieldMode_Mask 3
3 years ago
/* Extra flags on the mode field. */
typedef enum {
kUpb_LabelFlags_IsPacked = 4,
kUpb_LabelFlags_IsExtension = 8,
// Indicates that this descriptor type is an "alternate type":
// - for Int32, this indicates that the actual type is Enum (but was
// rewritten to Int32 because it is an open enum that requires no check).
// - for Bytes, this indicates that the actual type is String (but does
// not require any UTF-8 check).
kUpb_LabelFlags_IsAlternate = 16,
} upb_LabelFlags;
// Note: we sort by this number when calculating layout order.
typedef enum {
kUpb_FieldRep_1Byte = 0,
kUpb_FieldRep_4Byte = 1,
kUpb_FieldRep_StringView = 2,
kUpb_FieldRep_8Byte = 3,
kUpb_FieldRep_Shift = 6, // Bit offset of the rep in upb_MiniTable_Field.mode
kUpb_FieldRep_Max = kUpb_FieldRep_8Byte,
} upb_FieldRep;
UPB_INLINE upb_FieldMode upb_FieldMode_Get(const upb_MiniTable_Field* field) {
return (upb_FieldMode)(field->mode & 3);
}
UPB_INLINE bool upb_IsRepeatedOrMap(const upb_MiniTable_Field* field) {
/* This works because upb_FieldMode has no value 3. */
return !(field->mode & kUpb_FieldMode_Scalar);
}
UPB_INLINE bool upb_IsSubMessage(const upb_MiniTable_Field* field) {
return field->descriptortype == kUpb_FieldType_Message ||
field->descriptortype == kUpb_FieldType_Group;
}
struct upb_Decoder;
struct upb_MiniTable;
typedef const char* _upb_FieldParser(struct upb_Decoder* d, const char* ptr,
upb_Message* msg, intptr_t table,
uint64_t hasbits, uint64_t data);
typedef struct {
uint64_t field_data;
_upb_FieldParser* field_parser;
} _upb_FastTable_Entry;
3 years ago
typedef struct {
uint32_t mask_limit; // Limit enum value that can be tested with mask.
uint32_t value_count; // Number of values after the bitfield.
uint32_t data[]; // Bitmask + enumerated values follow.
} upb_MiniTable_Enum;
3 years ago
typedef enum {
_kUpb_FastEnumCheck_ValueIsInEnum = 0,
_kUpb_FastEnumCheck_ValueIsNotInEnum = 1,
_kUpb_FastEnumCheck_CannotCheckFast = 2,
} _kUpb_FastEnumCheck_Status;
UPB_INLINE _kUpb_FastEnumCheck_Status
_upb_MiniTable_CheckEnumValueFast(const upb_MiniTable_Enum* e, uint32_t val) {
if (UPB_UNLIKELY(val >= 64)) return _kUpb_FastEnumCheck_CannotCheckFast;
uint64_t mask = e->data[0] | ((uint64_t)e->data[1] << 32);
return (mask & (1ULL << val)) ? _kUpb_FastEnumCheck_ValueIsInEnum
: _kUpb_FastEnumCheck_ValueIsNotInEnum;
}
UPB_INLINE bool _upb_MiniTable_CheckEnumValueSlow(const upb_MiniTable_Enum* e,
uint32_t val) {
if (val < e->mask_limit) return e->data[val / 32] & (1ULL << (val % 32));
// OPT: binary search long lists?
const uint32_t* start = &e->data[e->mask_limit / 32];
const uint32_t* limit = &e->data[(e->mask_limit / 32) + e->value_count];
for (const uint32_t* p = start; p < limit; p++) {
if (*p == val) return true;
}
return false;
}
// Validates enum value against range defined by enum mini table.
UPB_INLINE bool upb_MiniTable_Enum_CheckValue(const upb_MiniTable_Enum* e,
uint32_t val) {
_kUpb_FastEnumCheck_Status status = _upb_MiniTable_CheckEnumValueFast(e, val);
if (UPB_UNLIKELY(status == _kUpb_FastEnumCheck_CannotCheckFast)) {
return _upb_MiniTable_CheckEnumValueSlow(e, val);
}
return status == _kUpb_FastEnumCheck_ValueIsInEnum ? true : false;
}
typedef union {
const struct upb_MiniTable* submsg;
const upb_MiniTable_Enum* subenum;
} upb_MiniTable_Sub;
typedef enum {
kUpb_ExtMode_NonExtendable = 0, // Non-extendable message.
kUpb_ExtMode_Extendable = 1, // Normal extendable message.
kUpb_ExtMode_IsMessageSet = 2, // MessageSet message.
kUpb_ExtMode_IsMessageSet_ITEM =
3, // MessageSet item (temporary only, see decode.c)
// During table building we steal a bit to indicate that the message is a map
// entry. *Only* used during table building!
kUpb_ExtMode_IsMapEntry = 4,
} upb_ExtMode;
struct upb_MiniTable {
const upb_MiniTable_Sub* subs;
const upb_MiniTable_Field* fields;
/* Must be aligned to sizeof(void*). Doesn't include internal members like
* unknown fields, extension dict, pointer to msglayout, etc. */
uint16_t size;
uint16_t field_count;
uint8_t ext; // upb_ExtMode, declared as uint8_t so sizeof(ext) == 1
Optimized decoder and paved the way for parsing extensions. The primary motivation for this change is to avoid referring to the `upb_msglayout` object when we are trying to fetch the `upb_msglayout` object for a sub-message. This will help pave the way for parsing extensions. We also implement several optimizations so that we can make this change without regressing performance. Normally we compute the layout for a sub-message field like so: ``` const upb_msglayout *get_submsg_layout( const upb_msglayout *layout, const upb_msglayout_field *field) { return layout->submsgs[field->submsg_index] } ``` The reason for this indirection is to avoid storing a pointer directly in `upb_msglayout_field`, as this would double its size (from 12 to 24 bytes on 64-bit architectures) which is wasteful as this pointer is only needed for message typed fields. However `get_submsg_layout` as written above does not work for extensions, as they will not have entries in the message's `layout->submsgs` array by nature, and we want to avoid creating an entire fake `upb_msglayout` for each such extension since that would also be wasteful. This change removes the dependency on `upb_msglayout` by passing down the `submsgs` array instead: ``` const upb_msglayout *get_submsg_layout( const upb_msglayout *const *submsgs, const upb_msglayout_field *field) { return submsgs[field->submsg_index] } ``` This will pave the way for parsing extensions, as we can more easily create an alternative `submsgs` array for extension fields without extra overhead or waste. Along the way several optimizations presented themselves that allow a nice increase in performance: 1. Passing the parsed `wireval` by address instead of by value ended up avoiding an expensive and useless stack copy (this is on Clang, which was used for all measurements). 2. When field numbers are densely packed, we can find a field by number with a single indexed lookup instead of linear search. At codegen time we can compute the maximum field number that will allow such an indexed lookup. 3. For fields that do require linear search, we can start the linear search at the location where we found the previous field, taking advantage of the fact that field numbers are generally increasing. 4. When the hasbit index is less than 32 (the common case) we can use a less expensive code sequence to set it. 5. We check for the hasbit case before the oneof case, as optional fields are more common than oneof fields. Benchmark results indicate a 20% improvement in parse speed with a small code size increase: ``` name old time/op new time/op delta ArenaOneAlloc 21.3ns ± 0% 21.5ns ± 0% +0.96% (p=0.000 n=12+12) ArenaInitialBlockOneAlloc 6.32ns ± 0% 6.32ns ± 0% +0.03% (p=0.000 n=12+10) LoadDescriptor_Upb 53.5µs ± 1% 51.5µs ± 2% -3.70% (p=0.000 n=12+12) LoadAdsDescriptor_Upb 2.78ms ± 2% 2.68ms ± 0% -3.57% (p=0.000 n=12+12) LoadDescriptor_Proto2 240µs ± 0% 240µs ± 0% +0.12% (p=0.001 n=12+12) LoadAdsDescriptor_Proto2 12.8ms ± 0% 12.7ms ± 0% -1.15% (p=0.000 n=12+10) Parse_Upb_FileDesc<UseArena,Copy> 13.2µs ± 2% 10.7µs ± 0% -18.49% (p=0.000 n=10+12) Parse_Upb_FileDesc<UseArena,Alias> 11.3µs ± 0% 9.6µs ± 0% -15.11% (p=0.000 n=12+11) Parse_Upb_FileDesc<InitBlock,Copy> 12.7µs ± 0% 10.3µs ± 0% -19.00% (p=0.000 n=10+12) Parse_Upb_FileDesc<InitBlock,Alias> 10.9µs ± 0% 9.2µs ± 0% -15.82% (p=0.000 n=12+12) Parse_Proto2<FileDesc,NoArena,Copy> 29.4µs ± 0% 29.5µs ± 0% +0.61% (p=0.000 n=12+12) Parse_Proto2<FileDesc,UseArena,Copy> 20.7µs ± 2% 20.6µs ± 2% ~ (p=0.260 n=12+11) Parse_Proto2<FileDesc,InitBlock,Copy> 16.7µs ± 1% 16.7µs ± 0% -0.25% (p=0.036 n=12+10) Parse_Proto2<FileDescSV,InitBlock,Alias> 16.5µs ± 0% 16.5µs ± 0% +0.20% (p=0.016 n=12+11) SerializeDescriptor_Proto2 5.30µs ± 1% 5.36µs ± 1% +1.09% (p=0.000 n=12+11) SerializeDescriptor_Upb 12.9µs ± 0% 13.0µs ± 0% +0.90% (p=0.000 n=12+11) FILE SIZE VM SIZE -------------- -------------- +1.5% +176 +1.6% +176 upb/decode.c +1.8% +176 +1.9% +176 decode_msg +0.4% +64 +0.4% +64 upb/def.c +1.4% +64 +1.4% +64 _upb_symtab_addfile +1.2% +48 +1.4% +48 upb/reflection.c +15% +32 +18% +32 upb_msg_set +2.9% +16 +3.1% +16 upb_msg_mutable -9.3% -288 [ = ] 0 [Unmapped] [ = ] 0 +0.2% +288 TOTAL ```
4 years ago
uint8_t dense_below;
uint8_t table_mask;
uint8_t required_count; // Required fields have the lowest hasbits.
/* To statically initialize the tables of variable length, we need a flexible
* array member, and we need to compile in gnu99 mode (constant initialization
* of flexible array members is a GNU extension, not in C99 unfortunately. */
_upb_FastTable_Entry fasttable[];
};
struct upb_MiniTable_Extension {
upb_MiniTable_Field field;
const upb_MiniTable* extendee;
upb_MiniTable_Sub sub; /* NULL unless submessage or proto2 enum */
};
typedef struct {
const upb_MiniTable** msgs;
const upb_MiniTable_Enum** enums;
const upb_MiniTable_Extension** exts;
int msg_count;
3 years ago
int enum_count;
int ext_count;
} upb_MiniTable_File;
// Computes a bitmask in which the |l->required_count| lowest bits are set,
// except that we skip the lowest bit (because upb never uses hasbit 0).
//
// Sample output:
// requiredmask(1) => 0b10 (0x2)
// requiredmask(5) => 0b111110 (0x3e)
UPB_INLINE uint64_t upb_MiniTable_requiredmask(const upb_MiniTable* l) {
int n = l->required_count;
assert(0 < n && n <= 63);
return ((1ULL << n) - 1) << 1;
}
/** upb_Message ***************************************************************/
/* Internal members of a upb_Message that track unknown fields and/or
* extensions. We can change this without breaking binary compatibility. We put
* these before the user's data. The user's upb_Message* points after the
* upb_Message_Internal. */
typedef struct {
/* Total size of this structure, including the data that follows.
* Must be aligned to 8, which is alignof(upb_Message_Extension) */
uint32_t size;
/* Offsets relative to the beginning of this structure.
*
* Unknown data grows forward from the beginning to unknown_end.
* Extension data grows backward from size to ext_begin.
* When the two meet, we're out of data and have to realloc.
*
* If we imagine that the final member of this struct is:
* char data[size - overhead]; // overhead =
* sizeof(upb_Message_InternalData)
*
* Then we have:
* unknown data: data[0 .. (unknown_end - overhead)]
* extensions data: data[(ext_begin - overhead) .. (size - overhead)] */
uint32_t unknown_end;
uint32_t ext_begin;
/* Data follows, as if there were an array:
* char data[size - sizeof(upb_Message_InternalData)]; */
} upb_Message_InternalData;
typedef struct {
upb_Message_InternalData* internal;
/* Message data follows. */
} upb_Message_Internal;
/* Maps upb_CType -> memory size. */
extern char _upb_CTypeo_size[12];
UPB_INLINE size_t upb_msg_sizeof(const upb_MiniTable* l) {
return l->size + sizeof(upb_Message_Internal);
}
/* Inline version upb_Message_New(), for internal use */
UPB_INLINE upb_Message* _upb_Message_New(const upb_MiniTable* mini_table,
upb_Arena* arena) {
size_t size = upb_msg_sizeof(mini_table);
void* mem = upb_Arena_Malloc(arena, size + sizeof(upb_Message_Internal));
if (UPB_UNLIKELY(!mem)) return NULL;
upb_Message* msg = UPB_PTR_AT(mem, sizeof(upb_Message_Internal), upb_Message);
memset(mem, 0, size);
return msg;
}
UPB_INLINE upb_Message_Internal* upb_Message_Getinternal(upb_Message* msg) {
ptrdiff_t size = sizeof(upb_Message_Internal);
return (upb_Message_Internal*)((char*)msg - size);
}
/* Clears the given message. */
void _upb_Message_Clear(upb_Message* msg, const upb_MiniTable* l);
/* Discards the unknown fields for this message only. */
void _upb_Message_DiscardUnknown_shallow(upb_Message* msg);
/* Adds unknown data (serialized protobuf data) to the given message. The data
* is copied into the message instance. */
bool _upb_Message_AddUnknown(upb_Message* msg, const char* data, size_t len,
upb_Arena* arena);
/** upb_Message_Extension *****************************************************/
/* The internal representation of an extension is self-describing: it contains
* enough information that we can serialize it to binary format without needing
* to look it up in a upb_ExtensionRegistry.
*
* This representation allocates 16 bytes to data on 64-bit platforms. This is
* rather wasteful for scalars (in the extreme case of bool, it wastes 15
* bytes). We accept this because we expect messages to be the most common
* extension type. */
typedef struct {
const upb_MiniTable_Extension* ext;
union {
upb_StringView str;
void* ptr;
char scalar_data[8];
} data;
} upb_Message_Extension;
/* Adds the given extension data to the given message. |ext| is copied into the
* message instance. This logically replaces any previously-added extension with
* this number */
upb_Message_Extension* _upb_Message_GetOrCreateExtension(
upb_Message* msg, const upb_MiniTable_Extension* ext, upb_Arena* arena);
/* Returns an array of extensions for this message. Note: the array is
* ordered in reverse relative to the order of creation. */
const upb_Message_Extension* _upb_Message_Getexts(const upb_Message* msg,
size_t* count);
/* Returns an extension for the given field number, or NULL if no extension
* exists for this field number. */
const upb_Message_Extension* _upb_Message_Getext(
const upb_Message* msg, const upb_MiniTable_Extension* ext);
void _upb_Message_Clearext(upb_Message* msg,
const upb_MiniTable_Extension* ext);
/** Hasbit access *************************************************************/
UPB_INLINE bool _upb_hasbit(const upb_Message* msg, size_t idx) {
return (*UPB_PTR_AT(msg, idx / 8, const char) & (1 << (idx % 8))) != 0;
}
UPB_INLINE void _upb_sethas(const upb_Message* msg, size_t idx) {
(*UPB_PTR_AT(msg, idx / 8, char)) |= (char)(1 << (idx % 8));
}
UPB_INLINE void _upb_clearhas(const upb_Message* msg, size_t idx) {
(*UPB_PTR_AT(msg, idx / 8, char)) &= (char)(~(1 << (idx % 8)));
}
UPB_INLINE size_t _upb_Message_Hasidx(const upb_MiniTable_Field* f) {
UPB_ASSERT(f->presence > 0);
return f->presence;
}
UPB_INLINE bool _upb_hasbit_field(const upb_Message* msg,
const upb_MiniTable_Field* f) {
return _upb_hasbit(msg, _upb_Message_Hasidx(f));
}
UPB_INLINE void _upb_sethas_field(const upb_Message* msg,
const upb_MiniTable_Field* f) {
_upb_sethas(msg, _upb_Message_Hasidx(f));
}
UPB_INLINE void _upb_clearhas_field(const upb_Message* msg,
const upb_MiniTable_Field* f) {
_upb_clearhas(msg, _upb_Message_Hasidx(f));
}
/** Oneof case access *********************************************************/
UPB_INLINE uint32_t* _upb_oneofcase(upb_Message* msg, size_t case_ofs) {
return UPB_PTR_AT(msg, case_ofs, uint32_t);
}
UPB_INLINE uint32_t _upb_getoneofcase(const void* msg, size_t case_ofs) {
return *UPB_PTR_AT(msg, case_ofs, uint32_t);
}
UPB_INLINE size_t _upb_oneofcase_ofs(const upb_MiniTable_Field* f) {
UPB_ASSERT(f->presence < 0);
return ~(ptrdiff_t)f->presence;
}
UPB_INLINE uint32_t* _upb_oneofcase_field(upb_Message* msg,
const upb_MiniTable_Field* f) {
return _upb_oneofcase(msg, _upb_oneofcase_ofs(f));
}
UPB_INLINE uint32_t _upb_getoneofcase_field(const upb_Message* msg,
const upb_MiniTable_Field* f) {
return _upb_getoneofcase(msg, _upb_oneofcase_ofs(f));
}
UPB_INLINE bool _upb_has_submsg_nohasbit(const upb_Message* msg, size_t ofs) {
return *UPB_PTR_AT(msg, ofs, const upb_Message*) != NULL;
}
/* Map entries aren't actually stored, they are only used during parsing. For
* parsing, it helps a lot if all map entry messages have the same layout.
* The compiler and def.c must ensure that all map entries have this layout. */
typedef struct {
upb_Message_Internal internal;
union {
upb_StringView str; /* For str/bytes. */
upb_value val; /* For all other types. */
} k;
union {
upb_StringView str; /* For str/bytes. */
upb_value val; /* For all other types. */
} v;
} upb_MapEntry;
#ifdef __cplusplus
} /* extern "C" */
#endif
#include "upb/port_undef.inc"
#endif /* UPB_MSG_INT_H_ */