More work on inttable/strtable (not finished).

pull/13171/head
Joshua Haberman 16 years ago
parent 9c21992286
commit 0124bfd8fe
  1. 12
      upb.h
  2. 12
      upb_struct.h
  3. 257
      upb_table.c
  4. 107
      upb_table.h

12
upb.h

@ -10,6 +10,7 @@
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h> /* for size_t. */
#include <string.h>
#ifdef __cplusplus
extern "C" {
@ -28,6 +29,17 @@ extern "C" {
/* The maximum that any submessages can be nested. Matches proto2's limit. */
#define UPB_MAX_NESTING 64
/* Represents a string or bytes. */
struct upb_string {
void *data;       /* The bytes themselves; length is byte_len (no NUL terminator implied). */
uint32_t byte_len;  /* Length of data, in bytes. */
};
/* Returns true iff s1 and s2 hold identical byte sequences. */
INLINE bool upb_string_eql(struct upb_string *s1, struct upb_string *s2) {
  if(s1->byte_len != s2->byte_len) return false;
  return memcmp(s1->data, s2->data, s1->byte_len) == 0;
}
/* A list of types as they are encoded on-the-wire. */
enum upb_wire_type {
UPB_WIRE_TYPE_VARINT = 0,

@ -67,29 +67,23 @@ struct upb_struct_field *upb_struct_find_field_by_number(
/* Variable-length data (strings and arrays).**********************************/
/* Represents a string or bytes. */
struct upb_string {
size_t byte_len;
void *data;
};
/* Represents an array (a repeated field) of any type. The interpretation of
* the data in the array depends on the type. */
struct upb_array {
size_t len; /* Measured in elements. */
void *data; /* Size of individual elements is based on type. */
uint32_t len; /* Measured in elements. */
};
/* A generic array of structs, using void* instead of specific types. */
struct upb_struct_array {
size_t len;
void **elements;
uint32_t len;
};
/* An array of strings. */
struct upb_string_array {
size_t len;
struct upb_string **elements;
uint32_t len;
};
/* Specific arrays of all the primitive types. */

@ -10,9 +10,153 @@
#include <stdlib.h>
#include <string.h>
/* NOTE(review): 0 is the empty-slot sentinel, which means key 0 itself
 * cannot be stored in an inttable -- confirm callers never use key 0. */
static const upb_inttable_key_t EMPTYENT = 0;
static const double MAX_LOAD = 0.85;  /* Resize when count/size exceeds this. */
static uint32_t MurmurHash2(const void *key, size_t len, uint32_t seed);  /* Defined at end of file. */
/* Returns the larger of a and b. */
static uint32_t max(uint32_t a, uint32_t b) {
  return (a < b) ? b : a;
}
/* Returns the i'th entry of the int table.  Entries are t->t.entry_size
 * bytes each, so indexing must be done in bytes, not in struct units. */
static struct upb_inttable_entry *intent(struct upb_inttable *t, uint32_t i) {
return (struct upb_inttable_entry*)((char*)t->t.entries + i*t->t.entry_size);
}
/* Returns the i'th entry of the string table (byte-granularity indexing,
 * mirroring intent() above). */
static struct upb_strtable_entry *strent(struct upb_strtable *t, uint32_t i) {
return (struct upb_strtable_entry*)((char*)t->t.entries + i*t->t.entry_size);
}
/* Initializes t for roughly "size" entries of "entry_size" bytes each.
 * NOTE(review): the loop computes the bit-width of size, so the table is
 * sized to the smallest power of two strictly greater than size (e.g.
 * size=16 yields a 32-slot table) -- confirm that headroom is intended. */
void upb_table_init(struct upb_table *t, uint32_t size, uint16_t entry_size)
{
t->count = 0;
t->entry_size = entry_size;
t->size_lg2 = 1;
while(size >>= 1) t->size_lg2++;
t->size_lg2 = max(t->size_lg2, 4); /* Min size of 16. */
/* NOTE(review): malloc is unchecked, and the buffer is uninitialized --
 * upb_inttable_init/upb_strtable_init must mark every slot empty. */
t->entries = malloc(upb_table_size(t) * t->entry_size);
}
/* Sets up an integer-keyed table and marks every slot as empty. */
void upb_inttable_init(struct upb_inttable *t, uint32_t size, uint16_t entsize)
{
  upb_table_init(&t->t, size, entsize);
  uint32_t n = upb_table_size(&t->t);
  for(uint32_t i = 0; i < n; ++i) {
    intent(t, i)->key = EMPTYENT;
  }
}
/* Sets up a string-keyed table; a NULL key.data marks an empty slot. */
void upb_strtable_init(struct upb_strtable *t, uint32_t size, uint16_t entsize)
{
  upb_table_init(&t->t, size, entsize);
  uint32_t n = upb_table_size(&t->t);
  for(uint32_t i = 0; i < n; ++i) {
    strent(t, i)->key.data = NULL;
  }
}
/* Free the entry storage allocated by the corresponding _init functions.
 * Any data the entries themselves point to is the caller's to manage. */
void upb_table_free(struct upb_table *t) { free(t->entries); }
void upb_inttable_free(struct upb_inttable *t) { upb_table_free(&t->t); }
void upb_strtable_free(struct upb_strtable *t) { upb_table_free(&t->t); }
/* Looks up key in the table; returns the matching entry or NULL.
 *
 * Fix: the slot address must be computed with byte-granularity arithmetic
 * *before* casting back to an entry pointer.  The previous expression cast
 * t->t.entries to (struct upb_strtable_entry*) first, so the subsequent
 * "+ hash*entry_size" was scaled again by sizeof(struct upb_strtable_entry)
 * and indexed far past the intended slot. */
void *upb_strtable_lookup(struct upb_strtable *t, struct upb_string *key)
{
  uint32_t hash =
      MurmurHash2(key->data, key->byte_len, 0) & (upb_strtable_size(t)-1);
  while(1) {
    struct upb_strtable_entry *e =
        (struct upb_strtable_entry*)((char*)t->t.entries + hash*t->t.entry_size);
    if(e->key.data == NULL) return NULL;          /* Empty slot: not present. */
    else if(upb_string_eql(&e->key, key)) return e;
    hash = e->next;                               /* Follow the internal chain. */
  }
}
/* Linear-scans the table for an empty slot.  The table is kept below
 * MAX_LOAD, so an empty slot must exist (asserted below). */
static struct upb_inttable_entry *find_empty_slot(struct upb_inttable *table)
{
/* TODO: does it matter that this is biased towards the front of the table? */
for(uint32_t i = 0; i < upb_inttable_size(table); i++) {
struct upb_inttable_entry *e = intent(table, i);
if(e->key == EMPTYENT) return e;
}
assert(false);
return NULL;
}
/* Bumps t->count for an upcoming insertion and, if the load factor would
 * exceed MAX_LOAD, doubles the table.  Returns the *old* entries buffer
 * (which the caller must rehash from and then free), or NULL if no resize
 * was needed.
 * NOTE(review): the freshly malloc'd buffer is uninitialized -- callers
 * must mark every new slot empty before reinserting.  malloc is unchecked
 * here, as elsewhere in this file.  Work in progress per the commit. */
static void *maybe_resize(struct upb_table *t) {
if((double)++t->count / upb_table_size(t) > MAX_LOAD) {
void *old_entries = t->entries;
t->size_lg2++; /* Double the table size. */
t->entries = malloc(upb_table_size(t) * t->entry_size);
return old_entries;
} else {
return NULL; /* No resize necessary. */
}
}
/* Inserts e into the raw entries buffer "table" using internal chaining.
 * Not yet implemented; the #if 0 block below is the algorithm being ported
 * from the old array-based init code.
 * NOTE(review): in the sketched while(1) relink loop the cursor is never
 * advanced, so any chain longer than one link would spin forever -- an
 * "else colliding_key_mainpos = colliding_key_mainpos->next;" is needed
 * when this is finished. */
static void intinsert(void *table, struct upb_inttable_entry *e, uint32_t size)
{
/* TODO */
#if 0
struct upb_inttable_entry *e, *table_e;
e = upb_inttable_entry_get(entries, i, entry_size);
table_e = upb_inttable_mainpos2(table, e->key);
if(table_e->key != UPB_EMPTY_ENTRY) { /* Collision. */
if(table_e == upb_inttable_mainpos2(table, table_e->key)) {
/* Existing element is in its main position. Find an empty slot to
 * place our new element and append it to this key's chain. */
struct upb_inttable_entry *empty = find_empty_slot(table);
while (table_e->next) table_e = table_e->next;
table_e->next = empty;
table_e = empty;
} else {
/* Existing element is not in its main position. Move it to an empty
 * slot and put our element in its main position. */
struct upb_inttable_entry *empty, *colliding_key_mainpos;
empty = find_empty_slot(table);
colliding_key_mainpos = upb_inttable_mainpos2(table, table_e->key);
assert(colliding_key_mainpos->key != UPB_EMPTY_ENTRY);
assert(colliding_key_mainpos->next);
memcpy(empty, table_e, entry_size); /* next is copied also. */
while(1) {
assert(colliding_key_mainpos->next);
if(colliding_key_mainpos->next == table_e) {
colliding_key_mainpos->next = empty;
break;
}
}
/* table_e remains set to our mainpos. */
}
}
memcpy(table_e, e, entry_size);
table_e->next = NULL;
#endif
}
void upb_inttable_insert(struct upb_inttable *t, struct upb_inttable_entry *e)
{
void *new_entries = maybe_resize(&t->t);
if(new_entries) { /* Are we doing a resize? */
for(uint32_t i = 0; i < (upb_inttable_size(t)>>1); i++) {
struct upb_inttable_entry *old_e = intent(t, i);
if(old_e->key != EMPTYENT) intinsert(new_entries, old_e, t->t.entry_size);
}
}
intinsert(t->t.entries, e, t->t.entry_size);
}
/* Inserts e into the raw entries buffer "table" (string-keyed variant).
 * Not yet implemented -- see intinsert for the planned algorithm. */
static void strinsert(void *table, struct upb_strtable_entry *e, uint32_t size)
{
/* TODO */
}
void upb_strtable_insert(struct upb_strtable *t, struct upb_strtable_entry *e)
{
void *new_entries = maybe_resize(&t->t);
if(new_entries) { /* Are we doing a resize? */
for(uint32_t i = 0; i < (upb_strtable_size(t)>>1); i++) {
struct upb_strtable_entry *old_e = strent(t, i);
if(old_e->key.data != NULL) strinsert(new_entries, old_e, t->t.entry_size);
}
}
strinsert(t->t.entries, e, t->t.entry_size);
}
#ifdef UPB_UNALIGNED_READS_OK
//-----------------------------------------------------------------------------
// MurmurHash2, by Austin Appleby
// MurmurHash2, by Austin Appleby (released as public domain).
// Reformatted and C99-ified by Joshua Haberman.
// Note - This code makes a few assumptions about how your machine behaves -
// 1. We can read a 4-byte value from any address without crashing
@ -183,114 +327,3 @@ static uint32_t MurmurHash2(const void * key, size_t len, uint32_t seed)
#undef MIX
#endif // UPB_UNALIGNED_READS_OK
static int compare_entries(const void *f1, const void *f2)
{
return ((struct upb_inttable_entry*)f1)->key -
((struct upb_inttable_entry*)f2)->key;
}
/* Returns the larger of two unsigned values. */
static uint32_t max(uint32_t a, uint32_t b) { return a > b ? a : b; }
/* Rounds v up to the nearest power of two (v itself when already a power
 * of two).  Note v == 0 yields 0, as does any v above 2^31. */
static uint32_t round_up_to_pow2(uint32_t v)
{
  /* cf. Bit Twiddling Hacks:
   * http://www-graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */
  v -= 1;
  for(int shift = 1; shift <= 16; shift *= 2) v |= v >> shift;
  return v + 1;
}
/* Linear-scans the table for a slot whose key is UPB_EMPTY_ENTRY.  The
 * table is sized with load-factor headroom, so one must exist (asserted). */
static struct upb_inttable_entry *find_empty_slot(struct upb_inttable *table)
{
/* TODO: does it matter that this is biased towards the front of the table? */
for(uint32_t i = 0; i < table->size; i++) {
struct upb_inttable_entry *e =
upb_inttable_entry_get(table->entries, i, table->entry_size);
if(e->key == UPB_EMPTY_ENTRY) return e;
}
assert(false);
return NULL;
}
/* Builds a table from "entries" (num_entries items of entry_size bytes).
 * Entries are sorted by key, a power-of-two size is chosen so that small
 * dense keys land at their natural index, and each entry is inserted with
 * internal chaining (Brent-style: an element found squatting outside its
 * main position is evicted to an empty slot).
 *
 * Fix: the chain-relink loop below never advanced its cursor, so any chain
 * longer than one link spun forever; it now walks ->next each iteration. */
void upb_inttable_init(struct upb_inttable *table, void *entries,
int num_entries, int entry_size)
{
qsort(entries, num_entries, entry_size, compare_entries);

/* Find the largest n for which at least half the keys <n are used.  We
 * make sure our table size is at least n.  This allows all keys <n to be
 * in their main position (as if it were an array) and only numbers >n might
 * possibly have collisions.  Start at 8 to avoid noise of small numbers. */
upb_inttable_key_t n = 0, maybe_n;
bool all_in_array = true;
for(int i = 0; i < num_entries; i++) {
struct upb_inttable_entry *e =
upb_inttable_entry_get(entries, i, entry_size);
maybe_n = e->key;
if(maybe_n > 8 && maybe_n/(i+1) >= 2) {
all_in_array = false;
break;
}
n = maybe_n;
}

/* TODO: measure, tweak, optimize this choice of table size.  Possibly test
 * (at runtime) maximum chain length for each proposed size. */
uint32_t min_size_by_load = all_in_array ? n : (double)num_entries / 0.85;
uint32_t min_size = max(n, min_size_by_load);
table->size = round_up_to_pow2(min_size);
table->entry_size = entry_size;
table->entries = malloc(table->size * entry_size);

/* Initialize all slots to empty. */
for(size_t i = 0; i < table->size; i++) {
struct upb_inttable_entry *e =
upb_inttable_entry_get(table->entries, i, entry_size);
e->key = UPB_EMPTY_ENTRY;
e->next = NULL;
}

/* Insert the elements. */
for(int i = 0; i < num_entries; i++) {
struct upb_inttable_entry *e, *table_e;
e = upb_inttable_entry_get(entries, i, entry_size);
table_e = upb_inttable_mainpos(table, e->key);
if(table_e->key != UPB_EMPTY_ENTRY) { /* Collision. */
if(table_e == upb_inttable_mainpos(table, table_e->key)) {
/* Existing element is in its main position.  Find an empty slot to
 * place our new element and append it to this key's chain. */
struct upb_inttable_entry *empty = find_empty_slot(table);
while (table_e->next) table_e = table_e->next;
table_e->next = empty;
table_e = empty;
} else {
/* Existing element is not in its main position.  Move it to an empty
 * slot and put our element in its main position. */
struct upb_inttable_entry *empty, *colliding_key_mainpos;
empty = find_empty_slot(table);
colliding_key_mainpos = upb_inttable_mainpos(table, table_e->key);
assert(colliding_key_mainpos->key != UPB_EMPTY_ENTRY);
assert(colliding_key_mainpos->next);
memcpy(empty, table_e, entry_size); /* next is copied also. */
while(1) {
assert(colliding_key_mainpos->next);
if(colliding_key_mainpos->next == table_e) {
colliding_key_mainpos->next = empty;
break;
}
colliding_key_mainpos = colliding_key_mainpos->next; /* FIX: walk the chain. */
}
/* table_e remains set to our mainpos. */
}
}
memcpy(table_e, e, entry_size);
table_e->next = NULL;
}
}
/* Releases the entry storage allocated by upb_inttable_init. */
void upb_inttable_free(struct upb_inttable *table)
{
free(table->entries);
}

@ -21,70 +21,85 @@
extern "C" {
#endif
/* Inttables are keyed by 32-bit integer. */
typedef int32_t upb_inttable_key_t;
#define UPB_EMPTY_ENTRY (upb_inttable_key_t)-1
typedef uint32_t upb_inttable_key_t;
struct upb_inttable_entry {
upb_inttable_key_t key;
struct upb_inttable_entry *next; /* Internal chaining. */
uint32_t next; /* Internal chaining. */
};
struct upb_inttable {
uint32_t size; /* Is a power of two. */
uint32_t entry_size; /* How big is each entry? */
/* TODO: consider storing the hash in the entry. This would avoid the need to
* rehash on table resizes, but more importantly could possibly improve lookup
* performance by letting us compare hashes before comparing lengths or the
* strings themselves. */
/* Header that string-keyed entry structs embed first. */
struct upb_strtable_entry {
struct upb_string key;  /* key.data == NULL marks an empty slot (see upb_strtable_init). */
uint32_t next; /* Internal chaining; appears to be a slot index (see upb_strtable_lookup). */
};
/* State shared by int- and string-keyed tables. */
struct upb_table {
void *entries;  /* Flat buffer of (1 << size_lg2) entries, entry_size bytes each. */
uint32_t count; /* How many elements are currently in the table? */
uint16_t entry_size; /* How big is each entry? */
uint8_t size_lg2; /* The table is 2^size_lg2 in size. */
};
/* Builds an int32_t -> <entry> table, optimized for very fast lookup by
* number. table is a pointer to a previously allocated upb_inttable.
* entries points to an array of the desired entries themselves, each of size
* entry_size. The table is allocated in dynamic memory, and does not reference
* the data in entries. Entries may be modified by the function.
*
* The table must be freed with upb_inttable_free. */
void upb_inttable_init(struct upb_inttable *table, void *entries,
int num_entries, int entry_size);
/* A table keyed by upb_string. */
struct upb_strtable {
struct upb_table t;
};
/* Frees any data that was allocated by upb_inttable_init. */
/* A table keyed by 32-bit integer. */
struct upb_inttable {
struct upb_table t;
};
/* Initialize and free a table, respectively. Specify the initial size
* with 'size' (the size will be increased as necessary). Entry size
* specifies how many bytes each entry in the table is. */
void upb_inttable_init(struct upb_inttable *table,
uint32_t size, uint16_t entry_size);
void upb_inttable_free(struct upb_inttable *table);
void upb_strtable_init(struct upb_strtable *table,
uint32_t size, uint16_t entry_size);
void upb_strtable_free(struct upb_strtable *table);
INLINE struct upb_inttable_entry *upb_inttable_entry_get(
void *entries, int32_t pos, int entry_size) {
return (struct upb_inttable_entry*)(((char*)entries) + pos*entry_size);
INLINE uint32_t upb_table_size(struct upb_table *t) { return 1 << t->size_lg2; }
INLINE uint32_t upb_inttable_size(struct upb_inttable *t) {
return upb_table_size(&t->t);
}
INLINE struct upb_inttable_entry *upb_inttable_mainpos2(
struct upb_inttable *table, upb_inttable_key_t key, int32_t entry_size) {
/* Identity hash for integers. */
int32_t pos = key & (table->size-1);
return upb_inttable_entry_get(table->entries, pos, entry_size);
INLINE uint32_t upb_strtable_size(struct upb_strtable *t) {
return upb_table_size(&t->t);
}
INLINE struct upb_inttable_entry *upb_inttable_mainpos(
struct upb_inttable *table, upb_inttable_key_t key) {
return upb_inttable_mainpos2(table, key, table->entry_size);
/* Inserts the given key into the hashtable with the given value. The key must
* not already exist in the hash table. The data will be copied from e into
* the hashtable (the amount of data copied comes from entry_size when the
* table was constructed). Therefore the data at val may be freed once the
* call returns. */
void upb_inttable_insert(struct upb_inttable *t, struct upb_inttable_entry *e);
void upb_strtable_insert(struct upb_strtable *t, struct upb_strtable_entry *e);
/* Main position of an integer key: identity hash masked to the table size
 * (which is always a power of two). */
INLINE uint32_t upb_inttable_hash(struct upb_inttable *t, upb_inttable_key_t k) {
return k & (upb_inttable_size(t)-1); /* Identity hash for ints. */
}
/* Lookups up key in this table. Inlined because this is in the critical path
* of parsing. */
INLINE void *upb_inttable_lookup(struct upb_inttable *table,
int32_t key,
int32_t entry_size) {
/* TODO: experiment with Cuckoo Hashing, which can perform lookups without
* any branches, cf:
* http://www.cs.cmu.edu/~damon2006/pdf/zukowski06archconscioushashing.pdf.
* The tradeoff is having to calculate a second hash for every lookup, which
* will hurt the simple array case. */
struct upb_inttable_entry *e = upb_inttable_mainpos2(table, key, entry_size);
do {
if (key == e->key) return e;
e = e->next;
} while (e);
return NULL; /* Not found. */
/* Looks up key in this table. Inlined because this is in the critical path
* of parsing. We have the caller specify the entry_size because fixing
* this as a literal (instead of reading table->entry_size) gives the
* compiler more ability to optimize. */
/* Looks up key; returns the entry or NULL.  entry_size is passed by the
 * caller (usually a literal) so the compiler can optimize the indexing.
 *
 * Fix: offset in bytes first, then cast.  Previously the cast bound tighter
 * than the addition, so "+ hash*entry_size" was scaled again by
 * sizeof(struct upb_inttable_entry) and addressed the wrong slot. */
INLINE void *upb_inttable_lookup(struct upb_inttable *t,
                                 uint32_t key, uint32_t entry_size) {
  uint32_t hash = upb_inttable_hash(t, key);
  while(1) {
    struct upb_inttable_entry *e =
        (struct upb_inttable_entry*)((char*)t->t.entries + hash*entry_size);
    if(e->key == 0) return NULL;  /* NOTE(review): 0 doubles as the empty
                                   * sentinel -- keys must be nonzero. */
    else if(e->key == key) return e;
    hash = e->next;  /* Follow the internal chain. */
  }
}
void *upb_strtable_lookup(struct upb_strtable *t, struct upb_string *key);
#ifdef __cplusplus
} /* extern "C" */
#endif

Loading…
Cancel
Save