Added MurmurHash for string hashing (not used yet).

16 years ago · 9c21992286
parent 9f80d0d60e
commit 9c21992286
2 changed files with 182 additions and 1 deletions
--- a/upb_table.c
+++ b/upb_table.c
@ -10,6 +10,180 @@
 #include <stdlib.h>
 #include <string.h>
 #ifdef UPB_UNALIGNED_READS_OK
 //-----------------------------------------------------------------------------
 // MurmurHash2, by Austin Appleby
 // Reformatted and C99-ified by Joshua Haberman.
 //
 // Returns a 32-bit hash of the `len` bytes starting at `key`, perturbed by
 // `seed`.  Only the low 32 bits of `len` take part in the hash.
 //
 // Note - This code makes a few assumptions about how your machine behaves -
 //   1. We can read a 4-byte value from any address without crashing
 //   2. sizeof(int) == 4 (in upb this limitation is removed by using uint32_t)
 // And it has a few limitations -
 //   1. It will not work incrementally.
 //   2. It will not produce the same results on little-endian and big-endian
 //      machines.
 //
 // Words are fetched with memcpy() rather than by casting the byte pointer to
 // uint32_t*: the cast discarded const and read the buffer through an
 // incompatible (and possibly misaligned) lvalue, which is undefined behavior
 // even on hardware that tolerates unaligned loads.
 static uint32_t MurmurHash2(const void *key, size_t len, uint32_t seed)
 {
   // 'm' and 'r' are mixing constants generated offline.
   // They're not really 'magic', they just happen to work well.
   const uint32_t m = 0x5bd1e995;
   const int32_t r = 24;
   const uint8_t *data = (const uint8_t *)key;
   // Initialize the hash to a 'random' value.
   uint32_t h = seed ^ (uint32_t)len;
   // Mix 4 bytes at a time into the hash.
   while (len >= 4) {
     uint32_t k;
     memcpy(&k, data, sizeof(k));
     k *= m;
     k ^= k >> r;
     k *= m;
     h *= m;
     h ^= k;
     data += 4;
     len -= 4;
   }
   // Handle the last few bytes of the input array.  The cases cascade on
   // purpose so that every remaining byte is folded in before the multiply.
   switch (len) {
     case 3: h ^= (uint32_t)data[2] << 16;  /* fallthrough */
     case 2: h ^= (uint32_t)data[1] << 8;   /* fallthrough */
     case 1: h ^= data[0];
             h *= m;
             break;
     default: break;
   }
   // Do a few final mixes of the hash to ensure the last few
   // bytes are well-incorporated.
   h ^= h >> 13;
   h *= m;
   h ^= h >> 15;
   return h;
 }
 #else // !UPB_UNALIGNED_READS_OK
 //-----------------------------------------------------------------------------
 // MurmurHashAligned2, by Austin Appleby
 // Same algorithm as MurmurHash2, but only does aligned reads - should be safer
 // on certain platforms.
 // Performance will be lower than MurmurHash2
 //
 // Mixing constants shared by the helpers below.
 static const uint32_t MURMUR_M = 0x5bd1e995;
 enum { MURMUR_R = 24 };
 // One mixing round: scrambles the 32-bit word k and folds it into hash h.
 static inline uint32_t murmur_mix(uint32_t h, uint32_t k)
 {
   k *= MURMUR_M;
   k ^= k >> MURMUR_R;
   k *= MURMUR_M;
   h *= MURMUR_M;
   h ^= k;
   return h;
 }
 // Loads one 32-bit word in native byte order from p.  memcpy() is used
 // instead of a uint32_t* cast so the access neither discards const nor reads
 // the byte buffer through an incompatible lvalue type.
 static inline uint32_t murmur_load_word(const uint8_t *p)
 {
   uint32_t w;
   memcpy(&w, p, sizeof(w));
   return w;
 }
 // Assembles the first n (0..3) bytes at p into a word, p[0] in the low byte.
 // Values of n above 3 are not expected and yield 0.
 static inline uint32_t murmur_load_partial(const uint8_t *p, size_t n)
 {
   uint32_t w = 0;
   switch (n) {
     case 3: w |= (uint32_t)p[2] << 16;  /* fallthrough */
     case 2: w |= (uint32_t)p[1] << 8;   /* fallthrough */
     case 1: w |= p[0];
             break;
     default: break;
   }
   return w;
 }
 // Folds the trailing len (0..3) bytes into h.  An empty tail leaves h as is.
 static inline uint32_t murmur_tail(uint32_t h, const uint8_t *data, size_t len)
 {
   if (len == 0) return h;
   h ^= murmur_load_partial(data, len);
   return h * MURMUR_M;
 }
 // Final avalanche so that the last few bytes are well-incorporated.
 static inline uint32_t murmur_finish(uint32_t h)
 {
   h ^= h >> 13;
   h *= MURMUR_M;
   h ^= h >> 15;
   return h;
 }
 // Returns a 32-bit hash of the `len` bytes starting at `key`, perturbed by
 // `seed`.  Only the low 32 bits of `len` take part in the hash.  Whole-word
 // loads are issued only at addresses that are a multiple of 4; when `key` is
 // not so aligned, each hashed word is stitched together from two neighbouring
 // loads.
 static uint32_t MurmurHash2(const void * key, size_t len, uint32_t seed)
 {
   const uint8_t *data = (const uint8_t *)key;
   uint32_t h = seed ^ (uint32_t)len;
   // Distance of key past the previous 4-byte boundary (0 when aligned).
   const unsigned align = (unsigned)((uintptr_t)data & 3);
   if (align != 0 && len >= 4) {
     // Number of bytes before the next 4-byte boundary.
     const unsigned lead = 4 - align;
     const unsigned sl = 8 * lead;
     const unsigned sr = 8 * align;
     // Pre-load the leading bytes into the top of the carry word t, so that
     // (t >> sr) always yields the bytes still waiting to be hashed.
     uint32_t t = murmur_load_partial(data, lead) << sr;
     data += lead;
     len -= lead;
     // Mix: each hashed word is the pending bytes of t followed by the low
     // bytes of the next aligned word d; d then becomes the new carry.
     while (len >= 4) {
       const uint32_t d = murmur_load_word(data);
       h = murmur_mix(h, (t >> sr) | (d << sl));
       t = d;
       data += 4;
       len -= 4;
     }
     // Handle leftover data in the carry word.
     if (len >= align) {
       // Enough bytes remain to complete one more full word.
       const uint32_t d = murmur_load_partial(data, align);
       h = murmur_mix(h, (t >> sr) | (d << sl));
       data += align;
       len -= align;
       h = murmur_tail(h, data, len);
     } else {
       // Carry plus remainder is shorter than a word: treat it all as tail.
       // The carry always holds at least one byte, so the multiply is
       // unconditional here.
       const uint32_t d = murmur_load_partial(data, len);
       h ^= (t >> sr) | (d << sl);
       h *= MURMUR_M;
     }
   } else {
     // Either key is aligned, or the input is too short for a word load.
     while (len >= 4) {
       h = murmur_mix(h, murmur_load_word(data));
       data += 4;
       len -= 4;
     }
     h = murmur_tail(h, data, len);
   }
   return murmur_finish(h);
 }
 #endif // UPB_UNALIGNED_READS_OK
 static int compare_entries(const void *f1, const void *f2)
 {
  return ((struct upb_inttable_entry*)f1)->key -
@ -120,4 +294,3 @@ void upb_inttable_free(struct upb_inttable *table)
  free(table->entries);
 }
 /* Emit definition for inline functions. */
--- a/upb_table.h
+++ b/upb_table.h
@ -2,6 +2,14 @@
 * upb - a minimalist implementation of protocol buffers.
 *
 * Copyright (c) 2009 Joshua Haberman.  See LICENSE for details.
 *
 * This file defines very fast int->struct (inttable) and string->struct
 * (strtable) hash tables.  The struct can be of any size, and it is stored
 * in the table itself, for cache-friendly performance.
 *
 * The table uses internal chaining with Brent's variation (inspired by the
 * Lua implementation of hash tables).  The hash function for strings is
 * Austin Appleby's "MurmurHash."
 */
 #ifndef UPB_TABLE_H_