From 0124bfd8fe87cb38fe367478d18f449edce8a0ae Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Sun, 21 Jun 2009 17:58:23 -0700 Subject: [PATCH] More work on inttable/strtable (not finished). --- upb.h | 12 +++ upb_struct.h | 12 +-- upb_table.c | 257 +++++++++++++++++++++++++++++---------------------- upb_table.h | 107 ++++++++++++--------- 4 files changed, 221 insertions(+), 167 deletions(-) diff --git a/upb.h b/upb.h index 36abcaa2ac..a144218f35 100644 --- a/upb.h +++ b/upb.h @@ -10,6 +10,7 @@ #include #include #include /* for size_t. */ +#include #ifdef __cplusplus extern "C" { @@ -28,6 +29,17 @@ extern "C" { /* The maximum that any submessages can be nested. Matches proto2's limit. */ #define UPB_MAX_NESTING 64 +/* Represents a string or bytes. */ +struct upb_string { + void *data; + uint32_t byte_len; +}; + +INLINE bool upb_string_eql(struct upb_string *s1, struct upb_string *s2) { + return s1->byte_len == s2->byte_len && + memcmp(s1->data, s2->data, s1->byte_len) == 0; +} + /* A list of types as they are encoded on-the-wire. */ enum upb_wire_type { UPB_WIRE_TYPE_VARINT = 0, diff --git a/upb_struct.h b/upb_struct.h index d46b007966..72a1d21855 100644 --- a/upb_struct.h +++ b/upb_struct.h @@ -67,29 +67,23 @@ struct upb_struct_field *upb_struct_find_field_by_number( /* Variable-length data (strings and arrays).**********************************/ -/* Represents a string or bytes. */ -struct upb_string { - size_t byte_len; - void *data; -}; - /* Represents an array (a repeated field) of any type. The interpretation of * the data in the array depends on the type. */ struct upb_array { - size_t len; /* Measured in elements. */ void *data; /* Size of individual elements is based on type. */ + uint32_t len; /* Measured in elements. */ }; /* A generic array of structs, using void* instead of specific types. */ struct upb_struct_array { - size_t len; void **elements; + uint32_t len; }; /* An array of strings. */ struct upb_string_array { - size_t len; struct upb_string **elements; + uint32_t len; }; /* Specific arrays of all the primitive types. */ diff --git a/upb_table.c b/upb_table.c index 00adc01263..13ebb70e83 100644 --- a/upb_table.c +++ b/upb_table.c @@ -10,9 +10,153 @@ #include #include +static const upb_inttable_key_t EMPTYENT = 0; +static const double MAX_LOAD = 0.85; + +static uint32_t MurmurHash2(const void *key, size_t len, uint32_t seed); + +static uint32_t max(uint32_t a, uint32_t b) { return a > b ? a : b; } +static struct upb_inttable_entry *intent(struct upb_inttable *t, uint32_t i) { + return (struct upb_inttable_entry*)((char*)t->t.entries + i*t->t.entry_size); +} +static struct upb_strtable_entry *strent(struct upb_strtable *t, uint32_t i) { + return (struct upb_strtable_entry*)((char*)t->t.entries + i*t->t.entry_size); +} + +void upb_table_init(struct upb_table *t, uint32_t size, uint16_t entry_size) +{ + t->count = 0; + t->entry_size = entry_size; + t->size_lg2 = 1; + while(size >>= 1) t->size_lg2++; + t->size_lg2 = max(t->size_lg2, 4); /* Min size of 16. */ + t->entries = malloc(upb_table_size(t) * t->entry_size); +} + +void upb_inttable_init(struct upb_inttable *t, uint32_t size, uint16_t entsize) +{ + upb_table_init(&t->t, size, entsize); + for(uint32_t i = 0; i < upb_table_size(&t->t); i++) + intent(t, i)->key = EMPTYENT; +} + +void upb_strtable_init(struct upb_strtable *t, uint32_t size, uint16_t entsize) +{ + upb_table_init(&t->t, size, entsize); + for(uint32_t i = 0; i < upb_table_size(&t->t); i++) + strent(t, i)->key.data = NULL; +} + +void upb_table_free(struct upb_table *t) { free(t->entries); } +void upb_inttable_free(struct upb_inttable *t) { upb_table_free(&t->t); } +void upb_strtable_free(struct upb_strtable *t) { upb_table_free(&t->t); } + +void *upb_strtable_lookup(struct upb_strtable *t, struct upb_string *key) +{ + uint32_t hash = + MurmurHash2(key->data, key->byte_len, 0) & (upb_strtable_size(t)-1); + while(1) { + struct upb_strtable_entry *e = + (struct upb_strtable_entry*)(char*)t->t.entries + hash*t->t.entry_size; + if(e->key.data == NULL) return NULL; + else if(upb_string_eql(&e->key, key)) return e; + hash = e->next; + } +} + +static struct upb_inttable_entry *find_empty_slot(struct upb_inttable *table) +{ + /* TODO: does it matter that this is biased towards the front of the table? */ + for(uint32_t i = 0; i < upb_inttable_size(table); i++) { + struct upb_inttable_entry *e = intent(table, i); + if(e->key == EMPTYENT) return e; + } + assert(false); + return NULL; +} + +static void *maybe_resize(struct upb_table *t) { + if((double)++t->count / upb_table_size(t) > MAX_LOAD) { + void *old_entries = t->entries; + t->size_lg2++; /* Double the table size. */ + t->entries = malloc(upb_table_size(t) * t->entry_size); + return old_entries; + } else { + return NULL; /* No resize necessary. */ + } +} + +static void intinsert(void *table, struct upb_inttable_entry *e, uint32_t size) +{ + /* TODO */ +#if 0 + struct upb_inttable_entry *e, *table_e; + e = upb_inttable_entry_get(entries, i, entry_size); + table_e = upb_inttable_mainpos2(table, e->key); + if(table_e->key != UPB_EMPTY_ENTRY) { /* Collision. */ + if(table_e == upb_inttable_mainpos2(table, table_e->key)) { + /* Existing element is in its main posisiton. Find an empty slot to + * place our new element and append it to this key's chain. */ + struct upb_inttable_entry *empty = find_empty_slot(table); + while (table_e->next) table_e = table_e->next; + table_e->next = empty; + table_e = empty; + } else { + /* Existing element is not in its main position. Move it to an empty + * slot and put our element in its main position. */ + struct upb_inttable_entry *empty, *colliding_key_mainpos; + empty = find_empty_slot(table); + colliding_key_mainpos = upb_inttable_mainpos2(table, table_e->key); + assert(colliding_key_mainpos->key != UPB_EMPTY_ENTRY); + assert(colliding_key_mainpos->next); + memcpy(empty, table_e, entry_size); /* next is copied also. */ + while(1) { + assert(colliding_key_mainpos->next); + if(colliding_key_mainpos->next == table_e) { + colliding_key_mainpos->next = empty; + break; + } + } + /* table_e remains set to our mainpos. */ + } + } + memcpy(table_e, e, entry_size); + table_e->next = NULL; +#endif +} + +void upb_inttable_insert(struct upb_inttable *t, struct upb_inttable_entry *e) +{ + void *new_entries = maybe_resize(&t->t); + if(new_entries) { /* Are we doing a resize? */ + for(uint32_t i = 0; i < (upb_inttable_size(t)>>1); i++) { + struct upb_inttable_entry *old_e = intent(t, i); + if(old_e->key != EMPTYENT) intinsert(new_entries, old_e, t->t.entry_size); + } + } + intinsert(t->t.entries, e, t->t.entry_size); +} + +static void strinsert(void *table, struct upb_strtable_entry *e, uint32_t size) +{ + /* TODO */ +} + +void upb_strtable_insert(struct upb_strtable *t, struct upb_strtable_entry *e) +{ + void *new_entries = maybe_resize(&t->t); + if(new_entries) { /* Are we doing a resize? */ + for(uint32_t i = 0; i < (upb_strtable_size(t)>>1); i++) { + struct upb_strtable_entry *old_e = strent(t, i); + if(old_e->key.data != NULL) strinsert(new_entries, old_e, t->t.entry_size); + } + } + strinsert(t->t.entries, e, t->t.entry_size); +} + #ifdef UPB_UNALIGNED_READS_OK //----------------------------------------------------------------------------- -// MurmurHash2, by Austin Appleby +// MurmurHash2, by Austin Appleby (released as public domain). // Reformatted and C99-ified by Joshua Haberman. // Note - This code makes a few assumptions about how your machine behaves - // 1. We can read a 4-byte value from any address without crashing @@ -183,114 +327,3 @@ static uint32_t MurmurHash2(const void * key, size_t len, uint32_t seed) #undef MIX #endif // UPB_UNALIGNED_READS_OK - -static int compare_entries(const void *f1, const void *f2) -{ - return ((struct upb_inttable_entry*)f1)->key - - ((struct upb_inttable_entry*)f2)->key; -} - -static uint32_t max(uint32_t a, uint32_t b) { return a > b ? a : b; } - -static uint32_t round_up_to_pow2(uint32_t v) -{ - /* cf. Bit Twiddling Hacks: - * http://www-graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */ - v--; - v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; - v++; - return v; -} - -static struct upb_inttable_entry *find_empty_slot(struct upb_inttable *table) -{ - /* TODO: does it matter that this is biased towards the front of the table? */ - for(uint32_t i = 0; i < table->size; i++) { - struct upb_inttable_entry *e = - upb_inttable_entry_get(table->entries, i, table->entry_size); - if(e->key == UPB_EMPTY_ENTRY) return e; - } - assert(false); - return NULL; -} - -void upb_inttable_init(struct upb_inttable *table, void *entries, - int num_entries, int entry_size) -{ - qsort(entries, num_entries, entry_size, compare_entries); - - /* Find the largest n for which at least half the keys n might - * possibly have collisions. Start at 8 to avoid noise of small numbers. */ - upb_inttable_key_t n = 0, maybe_n; - bool all_in_array = true; - for(int i = 0; i < num_entries; i++) { - struct upb_inttable_entry *e = - upb_inttable_entry_get(entries, i, entry_size); - maybe_n = e->key; - if(maybe_n > 8 && maybe_n/(i+1) >= 2) { - all_in_array = false; - break; - } - n = maybe_n; - } - - /* TODO: measure, tweak, optimize this choice of table size. Possibly test - * (at runtime) maximum chain length for each proposed size. */ - uint32_t min_size_by_load = all_in_array ? n : (double)num_entries / 0.85; - uint32_t min_size = max(n, min_size_by_load); - table->size = round_up_to_pow2(min_size); - table->entry_size = entry_size; - table->entries = malloc(table->size * entry_size); - - /* Initialize to empty. */ - for(size_t i = 0; i < table->size; i++) { - struct upb_inttable_entry *e = - upb_inttable_entry_get(table->entries, i, entry_size); - e->key = UPB_EMPTY_ENTRY; - e->next = NULL; - } - - /* Insert the elements. */ - for(int i = 0; i < num_entries; i++) { - struct upb_inttable_entry *e, *table_e; - e = upb_inttable_entry_get(entries, i, entry_size); - table_e = upb_inttable_mainpos(table, e->key); - if(table_e->key != UPB_EMPTY_ENTRY) { /* Collision. */ - if(table_e == upb_inttable_mainpos(table, table_e->key)) { - /* Existing element is in its main posisiton. Find an empty slot to - * place our new element and append it to this key's chain. */ - struct upb_inttable_entry *empty = find_empty_slot(table); - while (table_e->next) table_e = table_e->next; - table_e->next = empty; - table_e = empty; - } else { - /* Existing element is not in its main position. Move it to an empty - * slot and put our element in its main position. */ - struct upb_inttable_entry *empty, *colliding_key_mainpos; - empty = find_empty_slot(table); - colliding_key_mainpos = upb_inttable_mainpos(table, table_e->key); - assert(colliding_key_mainpos->key != UPB_EMPTY_ENTRY); - assert(colliding_key_mainpos->next); - memcpy(empty, table_e, entry_size); /* next is copied also. */ - while(1) { - assert(colliding_key_mainpos->next); - if(colliding_key_mainpos->next == table_e) { - colliding_key_mainpos->next = empty; - break; - } - } - /* table_e remains set to our mainpos. */ - } - } - memcpy(table_e, e, entry_size); - table_e->next = NULL; - } -} - -void upb_inttable_free(struct upb_inttable *table) -{ - free(table->entries); -} - diff --git a/upb_table.h b/upb_table.h index 2a8e4f232a..f405e050eb 100644 --- a/upb_table.h +++ b/upb_table.h @@ -21,70 +21,85 @@ extern "C" { #endif -/* Inttables are keyed by 32-bit integer. */ -typedef int32_t upb_inttable_key_t; - -#define UPB_EMPTY_ENTRY (upb_inttable_key_t)-1 +typedef uint32_t upb_inttable_key_t; struct upb_inttable_entry { upb_inttable_key_t key; - struct upb_inttable_entry *next; /* Internal chaining. */ + uint32_t next; /* Internal chaining. */ }; -struct upb_inttable { - uint32_t size; /* Is a power of two. */ - uint32_t entry_size; /* How big is each entry? */ +/* TODO: consider storing the hash in the entry. This would avoid the need to + * rehash on table resizes, but more importantly could possibly improve lookup + * performance by letting us compare hashes before comparing lengths or the + * strings themselves. */ +struct upb_strtable_entry { + struct upb_string key; + uint32_t next; /* Internal chaining. */ +}; + +struct upb_table { void *entries; + uint32_t count; /* How many elements are currently in the table? */ + uint16_t entry_size; /* How big is each entry? */ + uint8_t size_lg2; /* The table is 2^size_lg2 in size. */ }; -/* Builds an int32_t -> table, optimized for very fast lookup by - * number. table is a pointer to a previously allocated upb_inttable. - * entries points to an array of the desired entries themselves, each of size - * entry_size. The table is allocated in dynamic memory, and does not reference - * the data in entries. Entries may be modified by the function. - * - * The table must be freed with upb_inttable_free. */ -void upb_inttable_init(struct upb_inttable *table, void *entries, - int num_entries, int entry_size); +struct upb_strtable { + struct upb_table t; +}; -/* Frees any data that was allocated by upb_inttable_init. */ +struct upb_inttable { + struct upb_table t; +}; + +/* Initialize and free a table, respectively. Specify the initial size + * with 'size' (the size will be increased as necessary). Entry size + * specifies how many bytes each entry in the table is. */ +void upb_inttable_init(struct upb_inttable *table, + uint32_t size, uint16_t entry_size); void upb_inttable_free(struct upb_inttable *table); +void upb_strtable_init(struct upb_strtable *table, + uint32_t size, uint16_t entry_size); +void upb_strtable_free(struct upb_strtable *table); -INLINE struct upb_inttable_entry *upb_inttable_entry_get( - void *entries, int32_t pos, int entry_size) { - return (struct upb_inttable_entry*)(((char*)entries) + pos*entry_size); +INLINE uint32_t upb_table_size(struct upb_table *t) { return 1 << t->size_lg2; } +INLINE uint32_t upb_inttable_size(struct upb_inttable *t) { + return upb_table_size(&t->t); } - -INLINE struct upb_inttable_entry *upb_inttable_mainpos2( - struct upb_inttable *table, upb_inttable_key_t key, int32_t entry_size) { - /* Identity hash for integers. */ - int32_t pos = key & (table->size-1); - return upb_inttable_entry_get(table->entries, pos, entry_size); +INLINE uint32_t upb_strtable_size(struct upb_strtable *t) { + return upb_table_size(&t->t); } -INLINE struct upb_inttable_entry *upb_inttable_mainpos( - struct upb_inttable *table, upb_inttable_key_t key) { - return upb_inttable_mainpos2(table, key, table->entry_size); +/* Inserts the given key into the hashtable with the given value. The key must + * not already exist in the hash table. The data will be copied from e into + * the hashtable (the amount of data copied comes from entry_size when the + * table was constructed). Therefore the data at val may be freed once the + * call returns. */ +void upb_inttable_insert(struct upb_inttable *t, struct upb_inttable_entry *e); +void upb_strtable_insert(struct upb_strtable *t, struct upb_strtable_entry *e); + +INLINE uint32_t upb_inttable_hash(struct upb_inttable *t, upb_inttable_key_t k) { + return k & (upb_inttable_size(t)-1); /* Identity hash for ints. */ } -/* Lookups up key in this table. Inlined because this is in the critical path - * of parsing. */ -INLINE void *upb_inttable_lookup(struct upb_inttable *table, - int32_t key, - int32_t entry_size) { - /* TODO: experiment with Cuckoo Hashing, which can perform lookups without - * any branches, cf: - * http://www.cs.cmu.edu/~damon2006/pdf/zukowski06archconscioushashing.pdf. - * The tradeoff is having to calculate a second hash for every lookup, which - * will hurt the simple array case. */ - struct upb_inttable_entry *e = upb_inttable_mainpos2(table, key, entry_size); - do { - if (key == e->key) return e; - e = e->next; - } while (e); - return NULL; /* Not found. */ +/* Looks up key in this table. Inlined because this is in the critical path + * of parsing. We have the caller specify the entry_size because fixing + * this as a literal (instead of reading table->entry_size) gives the + * compiler more ability to optimize. */ +INLINE void *upb_inttable_lookup(struct upb_inttable *t, + uint32_t key, uint32_t entry_size) { + uint32_t hash = upb_inttable_hash(t, key); + while(1) { + struct upb_inttable_entry *e = + (struct upb_inttable_entry*)(char*)t->t.entries + hash*entry_size; + if(e->key == 0) return NULL; + else if(e->key == key) return e; + hash = e->next; + } } +void *upb_strtable_lookup(struct upb_strtable *t, struct upb_string *key); + #ifdef __cplusplus } /* extern "C" */ #endif