From c370cc10e9e8542eee06671b74f73b614de4bdf7 Mon Sep 17 00:00:00 2001 From: Woody Chow Date: Thu, 16 Mar 2017 14:20:41 +0900 Subject: [PATCH] Optimize normL2Sqr_ with AVX2 --- modules/core/src/stat.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 3c5fb73de4..a2986cc88b 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -4422,7 +4422,22 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize) float normL2Sqr_(const float* a, const float* b, int n) { int j = 0; float d = 0.f; -#if CV_SSE +#if CV_AVX2 + float CV_DECL_ALIGNED(32) buf[8]; + __m256 d0 = _mm256_setzero_ps(); + + for( ; j <= n - 8; j += 8 ) + { + __m256 t0 = _mm256_sub_ps(_mm256_loadu_ps(a + j), _mm256_loadu_ps(b + j)); +#ifdef CV_FMA3 + d0 = _mm256_fmadd_ps(t0, t0, d0); +#else + d0 = _mm256_add_ps(d0, _mm256_mul_ps(t0, t0)); +#endif + } + _mm256_store_ps(buf, d0); + d = buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7]; +#elif CV_SSE float CV_DECL_ALIGNED(16) buf[4]; __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();