@@ -98,43 +98,15 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
 float normL2Sqr_(const float* a, const float* b, int n)
 {
     int j = 0; float d = 0.f;
-#if CV_AVX2
-    float CV_DECL_ALIGNED(32) buf[8];
-    __m256 d0 = _mm256_setzero_ps();
-
-    for( ; j <= n - 8; j += 8 )
+#if CV_SIMD
+    v_float32 v_d = vx_setzero_f32();
+    for (; j <= n - v_float32::nlanes; j += v_float32::nlanes)
     {
-        __m256 t0 = _mm256_sub_ps(_mm256_loadu_ps(a + j), _mm256_loadu_ps(b + j));
-#if CV_FMA3
-        d0 = _mm256_fmadd_ps(t0, t0, d0);
-#else
-        d0 = _mm256_add_ps(d0, _mm256_mul_ps(t0, t0));
-#endif
+        v_float32 t = vx_load(a + j) - vx_load(b + j);
+        v_d = v_muladd(t, t, v_d);
     }
-    _mm256_store_ps(buf, d0);
-    d = buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7];
-#elif CV_SSE
-    float CV_DECL_ALIGNED(16) buf[4];
-    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
-
-    for( ; j <= n - 8; j += 8 )
-    {
-        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
-        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
-        d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
-        d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
-    }
-    _mm_store_ps(buf, _mm_add_ps(d0, d1));
-    d = buf[0] + buf[1] + buf[2] + buf[3];
+    d = v_reduce_sum(v_d);
 #endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
-            d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
-        }
-    }
-
     for( ; j < n; j++ )
     {
         float t = a[j] - b[j];
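
For readers unfamiliar with OpenCV's wide universal intrinsics, the replacement code above can be read as the following standalone, commented sketch (my own illustration, not part of the patch; the helper name l2sqr_sketch is hypothetical). It assumes the public header opencv2/core/hal/intrin.hpp and a CV_SIMD-enabled build.

#include <opencv2/core/hal/intrin.hpp>

// Squared L2 distance with the universal-intrinsics reduction pattern.
// v_float32 holds v_float32::nlanes floats (4 with SSE/NEON, 8 with AVX2,
// 16 with AVX-512), so one loop body covers every instruction set.
static float l2sqr_sketch(const float* a, const float* b, int n)
{
    int j = 0; float d = 0.f;
#if CV_SIMD
    cv::v_float32 v_d = cv::vx_setzero_f32();
    for (; j <= n - cv::v_float32::nlanes; j += cv::v_float32::nlanes)
    {
        cv::v_float32 t = cv::vx_load(a + j) - cv::vx_load(b + j);
        v_d = cv::v_muladd(t, t, v_d);   // v_d += t*t, fused where FMA is available
    }
    d = cv::v_reduce_sum(v_d);           // horizontal sum of the accumulator
#endif
    for (; j < n; j++)                   // scalar tail for leftover elements
    {
        float t = a[j] - b[j];
        d += t*t;
    }
    return d;
}
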
@@ -147,38 +119,12 @@ float normL2Sqr_(const float* a, const float* b, int n)
 float normL1_(const float* a, const float* b, int n)
 {
     int j = 0; float d = 0.f;
-#if CV_SSE
-    float CV_DECL_ALIGNED(16) buf[4];
-    static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
-    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
-    __m128 absmask = _mm_load_ps((const float*)absbuf);
-
-    for( ; j <= n - 8; j += 8 )
-    {
-        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
-        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
-        d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
-        d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
-    }
-    _mm_store_ps(buf, _mm_add_ps(d0, d1));
-    d = buf[0] + buf[1] + buf[2] + buf[3];
-#elif CV_NEON
-    float32x4_t v_sum = vdupq_n_f32(0.0f);
-    for ( ; j <= n - 4; j += 4)
-        v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));
-
-    float CV_DECL_ALIGNED(16) buf[4];
-    vst1q_f32(buf, v_sum);
-    d = buf[0] + buf[1] + buf[2] + buf[3];
+#if CV_SIMD
+    v_float32 v_d = vx_setzero_f32();
+    for (; j <= n - v_float32::nlanes; j += v_float32::nlanes)
+        v_d += v_absdiff(vx_load(a + j), vx_load(b + j));
+    d = v_reduce_sum(v_d);
 #endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
-                    std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
-        }
-    }
-
     for( ; j < n; j++ )
         d += std::abs(a[j] - b[j]);
     return d;
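
The L1 counterpart follows the same shape; the only real change is that v_absdiff computes the lane-wise |a - b| that the removed code emulated with an SSE sign-mask AND (absmask) on x86 and with vabdq_f32 on NEON. A minimal sketch under the same assumptions as above (l1_sketch is a hypothetical name):

#include <opencv2/core/hal/intrin.hpp>
#include <cmath>

// L1 distance of two float arrays using the universal-intrinsics reduction.
static float l1_sketch(const float* a, const float* b, int n)
{
    int j = 0; float d = 0.f;
#if CV_SIMD
    cv::v_float32 v_d = cv::vx_setzero_f32();
    for (; j <= n - cv::v_float32::nlanes; j += cv::v_float32::nlanes)
        v_d += cv::v_absdiff(cv::vx_load(a + j), cv::vx_load(b + j));  // lane-wise |a - b|
    d = cv::v_reduce_sum(v_d);           // horizontal sum of the accumulator
#endif
    for (; j < n; j++)                   // scalar tail
        d += std::abs(a[j] - b[j]);
    return d;
}
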
@@ -187,46 +133,10 @@ float normL1_(const float* a, const float* b, int n)
 int normL1_(const uchar* a, const uchar* b, int n)
 {
     int j = 0, d = 0;
-#if CV_SSE
-    __m128i d0 = _mm_setzero_si128();
-
-    for( ; j <= n - 16; j += 16 )
-    {
-        __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
-        __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
-
-        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
-    }
-
-    for( ; j <= n - 4; j += 4 )
-    {
-        __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
-        __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
-
-        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
-    }
-    d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
-#elif CV_NEON
-    uint32x4_t v_sum = vdupq_n_u32(0.0f);
-    for ( ; j <= n - 16; j += 16)
-    {
-        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
-        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
-        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
-        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
-    }
-
-    uint CV_DECL_ALIGNED(16) buf[4];
-    vst1q_u32(buf, v_sum);
-    d = buf[0] + buf[1] + buf[2] + buf[3];
+#if CV_SIMD
+    for (; j <= n - v_uint8::nlanes; j += v_uint8::nlanes)
+        d += v_reduce_sad(vx_load(a + j), vx_load(b + j));
 #endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
-                    std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
-        }
-    }
     for( ; j < n; j++ )
         d += std::abs(a[j] - b[j]);
     return d;
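
For the uchar overload the pattern collapses even further: v_reduce_sad already folds the absolute differences of a whole v_uint8 register and their horizontal sum into a single scalar, which is what the removed _mm_sad_epu8 accumulation and the NEON vabdq_u8/widening-add sequence did by hand. A sketch under the same assumptions as above (l1_u8_sketch is a hypothetical name):

#include <opencv2/core/hal/intrin.hpp>
#include <cstdlib>

// L1 distance of two byte arrays; one v_reduce_sad call handles a full
// v_uint8 register per iteration.
static int l1_u8_sketch(const unsigned char* a, const unsigned char* b, int n)
{
    int j = 0, d = 0;
#if CV_SIMD
    for (; j <= n - cv::v_uint8::nlanes; j += cv::v_uint8::nlanes)
        d += cv::v_reduce_sad(cv::vx_load(a + j), cv::vx_load(b + j));  // SAD of one register
#endif
    for (; j < n; j++)                   // scalar tail (bytes promote to int)
        d += std::abs(a[j] - b[j]);
    return d;
}
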