|
|
|
@ -147,6 +147,7 @@ public: |
|
|
|
|
LshTable() |
|
|
|
|
{ |
|
|
|
|
key_size_ = 0; |
|
|
|
|
feature_size_ = 0; |
|
|
|
|
speed_level_ = kArray; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -157,7 +158,7 @@ public: |
|
|
|
|
*/ |
|
|
|
|
LshTable(unsigned int feature_size, unsigned int key_size) |
|
|
|
|
{ |
|
|
|
|
(void)feature_size; |
|
|
|
|
feature_size_ = feature_size; |
|
|
|
|
(void)key_size; |
|
|
|
|
std::cerr << "LSH is not implemented for that type" << std::endl; |
|
|
|
|
assert(0); |
|
|
|
@ -332,6 +333,8 @@ private: |
|
|
|
|
*/ |
|
|
|
|
unsigned int key_size_; |
|
|
|
|
|
|
|
|
|
unsigned int feature_size_; |
|
|
|
|
|
|
|
|
|
// Members only used for the unsigned char specialization
|
|
|
|
|
/** The mask to apply to a feature to get the hash key
|
|
|
|
|
* Only used in the unsigned char case |
|
|
|
@ -345,9 +348,10 @@ private: |
|
|
|
|
template<> |
|
|
|
|
inline LshTable<unsigned char>::LshTable(unsigned int feature_size, unsigned int subsignature_size) |
|
|
|
|
{ |
|
|
|
|
feature_size_ = feature_size; |
|
|
|
|
initialize(subsignature_size); |
|
|
|
|
// Allocate the mask
|
|
|
|
|
mask_ = std::vector<size_t>((size_t)ceil((float)(feature_size * sizeof(char)) / (float)sizeof(size_t)), 0); |
|
|
|
|
mask_ = std::vector<size_t>((feature_size * sizeof(char) + sizeof(size_t) - 1) / sizeof(size_t), 0); |
|
|
|
|
|
|
|
|
|
// A bit brutal but fast to code
|
|
|
|
|
std::vector<size_t> indices(feature_size * CHAR_BIT); |
|
|
|
@ -392,6 +396,7 @@ inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) cons |
|
|
|
|
{ |
|
|
|
|
// no need to check if T is dividable by sizeof(size_t) like in the Hamming
|
|
|
|
|
// distance computation as we have a mask
|
|
|
|
|
// FIXIT: This is bad assumption, because we reading tail bytes after of the allocated features buffer
|
|
|
|
|
const size_t* feature_block_ptr = reinterpret_cast<const size_t*> ((const void*)feature); |
|
|
|
|
|
|
|
|
|
// Figure out the subsignature of the feature
|
|
|
|
@ -400,10 +405,20 @@ inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) cons |
|
|
|
|
size_t subsignature = 0; |
|
|
|
|
size_t bit_index = 1; |
|
|
|
|
|
|
|
|
|
for (std::vector<size_t>::const_iterator pmask_block = mask_.begin(); pmask_block != mask_.end(); ++pmask_block) { |
|
|
|
|
for (unsigned i = 0; i < feature_size_; i += sizeof(size_t)) { |
|
|
|
|
// get the mask and signature blocks
|
|
|
|
|
size_t feature_block = *feature_block_ptr; |
|
|
|
|
size_t mask_block = *pmask_block; |
|
|
|
|
size_t feature_block; |
|
|
|
|
if (i <= feature_size_ - sizeof(size_t)) |
|
|
|
|
{ |
|
|
|
|
feature_block = *feature_block_ptr; |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
{ |
|
|
|
|
size_t tmp = 0; |
|
|
|
|
memcpy(&tmp, feature_block_ptr, feature_size_ - i); // preserve bytes order
|
|
|
|
|
feature_block = tmp; |
|
|
|
|
} |
|
|
|
|
size_t mask_block = mask_[i / sizeof(size_t)]; |
|
|
|
|
while (mask_block) { |
|
|
|
|
// Get the lowest set bit in the mask block
|
|
|
|
|
size_t lowest_bit = mask_block & (-(ptrdiff_t)mask_block); |
|
|
|
|