@@ -98,43 +98,15 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
 float normL2Sqr_(const float* a, const float* b, int n)
 {
     int j = 0; float d = 0.f;
-#if CV_AVX2
-    float CV_DECL_ALIGNED(32) buf[8];
-    __m256 d0 = _mm256_setzero_ps();
-    for( ; j <= n - 8; j += 8 )
+#if CV_SIMD
+    v_float32 v_d = vx_setzero_f32();
+    for( ; j <= n - v_float32::nlanes; j += v_float32::nlanes )
     {
-        __m256 t0 = _mm256_sub_ps(_mm256_loadu_ps(a + j), _mm256_loadu_ps(b + j));
-#if CV_FMA3
-        d0 = _mm256_fmadd_ps(t0, t0, d0);
-#else
-        d0 = _mm256_add_ps(d0, _mm256_mul_ps(t0, t0));
-#endif
+        v_float32 t = vx_load(a + j) - vx_load(b + j);
+        v_d = v_muladd(t, t, v_d);
     }
-    _mm256_store_ps(buf, d0);
-    d = buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7];
-#elif CV_SSE
-    float CV_DECL_ALIGNED(16) buf[4];
-    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
-    for( ; j <= n - 8; j += 8 )
-    {
-        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
-        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
-        d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
-        d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
-    }
-    _mm_store_ps(buf, _mm_add_ps(d0, d1));
-    d = buf[0] + buf[1] + buf[2] + buf[3];
+    d = v_reduce_sum(v_d);
 #endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
-            d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
-        }
-    }
     for( ; j < n; j++ )
     {
         float t = a[j] - b[j];
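// Illustrative scalar sketch (not part of the patch; the helper name below is
// hypothetical). The CV_SIMD path above computes the same value: each vector
// iteration forms t = a[j] - b[j] for v_float32::nlanes lanes, v_muladd(t, t, v_d)
// accumulates t*t lane-wise, and v_reduce_sum(v_d) folds the lanes into d before
// the scalar tail loop handles the leftover elements.
static float normL2Sqr_scalar_ref(const float* a, const float* b, int n)
{
    float d = 0.f;
    for( int j = 0; j < n; j++ )
    {
        float t = a[j] - b[j];
        d += t*t;               // accumulate squared differences
    }
    return d;
}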
@@ -147,38 +119,12 @@ float normL2Sqr_(const float* a, const float* b, int n)
 float normL1_(const float* a, const float* b, int n)
 {
     int j = 0; float d = 0.f;
-#if CV_SSE
-    float CV_DECL_ALIGNED(16) buf[4];
-    static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
-    __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
-    __m128 absmask = _mm_load_ps((const float*)absbuf);
-    for( ; j <= n - 8; j += 8 )
-    {
-        __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
-        __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
-        d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
-        d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
-    }
-    _mm_store_ps(buf, _mm_add_ps(d0, d1));
-    d = buf[0] + buf[1] + buf[2] + buf[3];
-#elif CV_NEON
-    float32x4_t v_sum = vdupq_n_f32(0.0f);
-    for( ; j <= n - 4; j += 4 )
-        v_sum = vaddq_f32(v_sum, vabdq_f32(vld1q_f32(a + j), vld1q_f32(b + j)));
-    float CV_DECL_ALIGNED(16) buf[4];
-    vst1q_f32(buf, v_sum);
-    d = buf[0] + buf[1] + buf[2] + buf[3];
+#if CV_SIMD
+    v_float32 v_d = vx_setzero_f32();
+    for( ; j <= n - v_float32::nlanes; j += v_float32::nlanes )
+        v_d += v_absdiff(vx_load(a + j), vx_load(b + j));
+    d = v_reduce_sum(v_d);
 #endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
-                    std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
-        }
-    }
     for( ; j < n; j++ )
         d += std::abs(a[j] - b[j]);
     return d;
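// Illustrative scalar sketch (not part of the patch; the helper name below is
// hypothetical). The universal-intrinsics path above replaces both the SSE
// sign-bit-mask trick and the NEON vabdq_f32 path with v_absdiff, which yields
// |a[j] - b[j]| per lane; v_reduce_sum then collapses the accumulator into d.
static float normL1_scalar_ref(const float* a, const float* b, int n)
{
    float d = 0.f;
    for( int j = 0; j < n; j++ )
        d += std::abs(a[j] - b[j]);   // L1 norm: sum of absolute differences
    return d;
}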
@@ -187,46 +133,10 @@ float normL1_(const float* a, const float* b, int n)
 int normL1_(const uchar* a, const uchar* b, int n)
 {
     int j = 0, d = 0;
-#if CV_SSE
-    __m128i d0 = _mm_setzero_si128();
-    for( ; j <= n - 16; j += 16 )
-    {
-        __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
-        __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
-        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
-    }
-    for( ; j <= n - 4; j += 4 )
-    {
-        __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
-        __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
-        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
-    }
-    d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
-#elif CV_NEON
-    uint32x4_t v_sum = vdupq_n_u32(0.0f);
-    for( ; j <= n - 16; j += 16 )
-    {
-        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
-        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
-        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
-        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
-    }
-    uint CV_DECL_ALIGNED(16) buf[4];
-    vst1q_u32(buf, v_sum);
-    d = buf[0] + buf[1] + buf[2] + buf[3];
+#if CV_SIMD
+    for( ; j <= n - v_uint8::nlanes; j += v_uint8::nlanes )
+        d += v_reduce_sad(vx_load(a + j), vx_load(b + j));
 #endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
-                    std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
-        }
-    }
     for( ; j < n; j++ )
         d += std::abs(a[j] - b[j]);
     return d;
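// Illustrative scalar sketch (not part of the patch; the helper name below is
// hypothetical, and uchar is OpenCV's unsigned char typedef). In the uchar overload
// above, v_reduce_sad(vx_load(a + j), vx_load(b + j)) returns the sum of absolute
// differences of v_uint8::nlanes bytes as one integer, so each vector iteration adds
// straight into d; the scalar tail does the same element by element.
static int normL1_u8_scalar_ref(const uchar* a, const uchar* b, int n)
{
    int d = 0;
    for( int j = 0; j < n; j++ )
        d += std::abs(a[j] - b[j]);   // uchar operands promote to int before subtraction
    return d;
}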