|
|
@ -44,11 +44,6 @@ |
|
|
|
#include <algorithm> |
|
|
|
#include <algorithm> |
|
|
|
#include <vector> |
|
|
|
#include <vector> |
|
|
|
|
|
|
|
|
|
|
|
#if ANDROID && HAVE_NEON |
|
|
|
|
|
|
|
#include <cpu-features.h> |
|
|
|
|
|
|
|
#include <arm_neon.h> |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <iostream> |
|
|
|
#include <iostream> |
|
|
|
#include <iomanip> |
|
|
|
#include <iomanip> |
|
|
|
|
|
|
|
|
|
|
@ -115,9 +110,8 @@ Hamming::ResultType Hamming::operator()(const unsigned char* a, const unsigned c |
|
|
|
{ |
|
|
|
{ |
|
|
|
#if __GNUC__ |
|
|
|
#if __GNUC__ |
|
|
|
ResultType result = 0; |
|
|
|
ResultType result = 0; |
|
|
|
#if ANDROID && HAVE_NEON |
|
|
|
#if CV_NEON |
|
|
|
static uint64_t features = android_getCpuFeatures(); |
|
|
|
if (CPU_HAS_NEON_FEATURE) |
|
|
|
if ((features & ANDROID_CPU_ARM_FEATURE_NEON)) |
|
|
|
|
|
|
|
{ |
|
|
|
{ |
|
|
|
for (size_t i = 0; i < size; i += 16) |
|
|
|
for (size_t i = 0; i < size; i += 16) |
|
|
|
{ |
|
|
|
{ |
|
|
@ -126,7 +120,7 @@ Hamming::ResultType Hamming::operator()(const unsigned char* a, const unsigned c |
|
|
|
//uint8x16_t veorq_u8 (uint8x16_t, uint8x16_t)
|
|
|
|
//uint8x16_t veorq_u8 (uint8x16_t, uint8x16_t)
|
|
|
|
uint8x16_t AxorB = veorq_u8 (A_vec, B_vec); |
|
|
|
uint8x16_t AxorB = veorq_u8 (A_vec, B_vec); |
|
|
|
|
|
|
|
|
|
|
|
uint8x16_t bitsSet += vcntq_u8 (AxorB); |
|
|
|
uint8x16_t bitsSet = vcntq_u8 (AxorB); |
|
|
|
//uint16x8_t vpadalq_u8 (uint16x8_t, uint8x16_t)
|
|
|
|
//uint16x8_t vpadalq_u8 (uint16x8_t, uint8x16_t)
|
|
|
|
uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet); |
|
|
|
uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet); |
|
|
|
uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8); |
|
|
|
uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8); |
|
|
@ -138,24 +132,26 @@ Hamming::ResultType Hamming::operator()(const unsigned char* a, const unsigned c |
|
|
|
} |
|
|
|
} |
|
|
|
else |
|
|
|
else |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
//for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll)
|
|
|
|
{ |
|
|
|
typedef unsigned long long pop_t; |
|
|
|
//for portability just use unsigned long -- and use the __builtin_popcountll (see docs for __builtin_popcountll)
|
|
|
|
const size_t modulo = size % sizeof(pop_t); |
|
|
|
typedef unsigned long long pop_t; |
|
|
|
const pop_t * a2 = reinterpret_cast<const pop_t*> (a); |
|
|
|
const size_t modulo = size % sizeof(pop_t); |
|
|
|
const pop_t * b2 = reinterpret_cast<const pop_t*> (b); |
|
|
|
const pop_t * a2 = reinterpret_cast<const pop_t*> (a); |
|
|
|
const pop_t * a2_end = a2 + (size/sizeof(pop_t)); |
|
|
|
const pop_t * b2 = reinterpret_cast<const pop_t*> (b); |
|
|
|
|
|
|
|
const pop_t * a2_end = a2 + (size/sizeof(pop_t)); |
|
|
|
|
|
|
|
|
|
|
|
for (; a2 != a2_end; ++a2, ++b2) |
|
|
|
for (; a2 != a2_end; ++a2, ++b2) |
|
|
|
result += __builtin_popcountll((*a2) ^ (*b2)); |
|
|
|
result += __builtin_popcountll((*a2) ^ (*b2)); |
|
|
|
|
|
|
|
|
|
|
|
if (modulo) |
|
|
|
if (modulo) |
|
|
|
{ |
|
|
|
{ |
|
|
|
//in the case where size is not divisible by sizeof(size_t)
|
|
|
|
//in the case where size is not divisible by sizeof(size_t)
|
|
|
|
//need to mask off the bits at the end
|
|
|
|
//need to mask off the bits at the end
|
|
|
|
pop_t a_final=0,b_final=0; |
|
|
|
pop_t a_final=0,b_final=0; |
|
|
|
memcpy(&a_final,a2,modulo); |
|
|
|
memcpy(&a_final,a2,modulo); |
|
|
|
memcpy(&b_final,b2,modulo); |
|
|
|
memcpy(&b_final,b2,modulo); |
|
|
|
result += __builtin_popcountll(a_final ^ b_final); |
|
|
|
result += __builtin_popcountll(a_final ^ b_final); |
|
|
|
|
|
|
|
}
|
|
|
|
} |
|
|
|
} |
|
|
|
return result; |
|
|
|
return result; |
|
|
|
#else |
|
|
|
#else |
|
|
|