From 0a5bd0ac8b575918962775ac4c89b10545cdc6ed Mon Sep 17 00:00:00 2001
From: Vitaly Tuzov
Date: Tue, 4 Sep 2018 16:37:39 +0300
Subject: [PATCH] sum() implementation updated to use wide universal intrinsics

---
 modules/core/src/sum.cpp | 319 ++++++++++++++++++++-------------------
 1 file changed, 167 insertions(+), 152 deletions(-)

diff --git a/modules/core/src/sum.cpp b/modules/core/src/sum.cpp
index 660e176777..519ab1ee0f 100644
--- a/modules/core/src/sum.cpp
+++ b/modules/core/src/sum.cpp
@@ -19,260 +19,275 @@ struct Sum_SIMD
     }
 };
 
-template <typename ST, typename DT>
-inline void addChannels(DT * dst, ST * buf, int cn)
-{
-    for (int i = 0; i < 4; ++i)
-        dst[i % cn] += buf[i];
-}
-
-#if CV_SSE2
+#if CV_SIMD
 
 template <>
-struct Sum_SIMD<schar, int>
+struct Sum_SIMD<uchar, int>
 {
-    int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
+    int operator () (const uchar * src0, const uchar * mask, int * dst, int len, int cn) const
     {
-        if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
+        if (mask || (cn != 1 && cn != 2 && cn != 4))
             return 0;
+        len *= cn;
         int x = 0;
-        __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero;
+        v_uint32 v_sum = vx_setzero_u32();
 
-        for ( ; x <= len - 16; x += 16)
+        int len0 = len & -v_uint8::nlanes;
+        while (x < len0)
         {
-            __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
-            __m128i v_half = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8);
-
-            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
-            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
-
-            v_half = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8);
-            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
-            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
+            const int len_tmp = min(x + 256*v_uint16::nlanes, len0);
+            v_uint16 v_sum16 = vx_setzero_u16();
+            for (; x < len_tmp; x += v_uint8::nlanes)
+            {
+                v_uint16 v_src0, v_src1;
+                v_expand(vx_load(src0 + x), v_src0, v_src1);
+                v_sum16 += v_src0 + v_src1;
+            }
+            v_uint32 v_half0, v_half1;
+            v_expand(v_sum16, v_half0, v_half1);
+            v_sum += v_half0 + v_half1;
         }
-
-        for ( ; x <= len - 8; x += 8)
+        if (x <= len - v_uint16::nlanes)
         {
-            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8);
-
-            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
-            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
+            v_uint32 v_half0, v_half1;
+            v_expand(vx_load_expand(src0 + x), v_half0, v_half1);
+            v_sum += v_half0 + v_half1;
+            x += v_uint16::nlanes;
         }
-
-        int CV_DECL_ALIGNED(16) ar[4];
-        _mm_store_si128((__m128i*)ar, v_sum);
-
-        addChannels(dst, ar, cn);
-
-        return x / cn;
-    }
-};
-
-template <>
-struct Sum_SIMD<int, double>
-{
-    int operator () (const int * src0, const uchar * mask, double * dst, int len, int cn) const
-    {
-        if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
-            return 0;
-
-        int x = 0;
-        __m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero;
-
-        for ( ; x <= len - 4; x += 4)
+        if (x <= len - v_uint32::nlanes)
         {
-            __m128i v_src = _mm_loadu_si128((__m128i const *)(src0 + x));
-            v_sum0 = _mm_add_pd(v_sum0, _mm_cvtepi32_pd(v_src));
-            v_sum1 = _mm_add_pd(v_sum1, _mm_cvtepi32_pd(_mm_srli_si128(v_src, 8)));
+            v_sum += vx_load_expand_q(src0 + x);
+            x += v_uint32::nlanes;
         }
 
-        double CV_DECL_ALIGNED(16) ar[4];
-        _mm_store_pd(ar, v_sum0);
-        _mm_store_pd(ar + 2, v_sum1);
-
-        addChannels(dst, ar, cn);
+        if (cn == 1)
+            *dst += v_reduce_sum(v_sum);
+        else
+        {
+            uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_uint32::nlanes];
+            v_store_aligned(ar, v_sum);
+            for (int i = 0; i < v_uint32::nlanes; ++i)
+                dst[i % cn] += ar[i];
+        }
+        v_cleanup();
         return x / cn;
     }
 };
 
 template <>
-struct Sum_SIMD<float, double>
+struct Sum_SIMD<schar, int>
 {
-    int operator () (const float * src0, const uchar * mask, double * dst, int len, int cn) const
+    int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
     {
-        if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
+        if (mask || (cn != 1 && cn != 2 && cn != 4))
             return 0;
+        len *= cn;
         int x = 0;
-        __m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero;
+        v_int32 v_sum = vx_setzero_s32();
 
-        for ( ; x <= len - 4; x += 4)
+        int len0 = len & -v_int8::nlanes;
+        while (x < len0)
         {
-            __m128 v_src = _mm_loadu_ps(src0 + x);
-            v_sum0 = _mm_add_pd(v_sum0, _mm_cvtps_pd(v_src));
-            v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
-            v_sum1 = _mm_add_pd(v_sum1, _mm_cvtps_pd(v_src));
+            const int len_tmp = min(x + 256*v_int16::nlanes, len0);
+            v_int16 v_sum16 = vx_setzero_s16();
+            for (; x < len_tmp; x += v_int8::nlanes)
+            {
+                v_int16 v_src0, v_src1;
+                v_expand(vx_load(src0 + x), v_src0, v_src1);
+                v_sum16 += v_src0 + v_src1;
+            }
+            v_int32 v_half0, v_half1;
+            v_expand(v_sum16, v_half0, v_half1);
+            v_sum += v_half0 + v_half1;
+        }
+        if (x <= len - v_int16::nlanes)
+        {
+            v_int32 v_half0, v_half1;
+            v_expand(vx_load_expand(src0 + x), v_half0, v_half1);
+            v_sum += v_half0 + v_half1;
+            x += v_int16::nlanes;
+        }
+        if (x <= len - v_int32::nlanes)
+        {
+            v_sum += vx_load_expand_q(src0 + x);
+            x += v_int32::nlanes;
         }
 
-        double CV_DECL_ALIGNED(16) ar[4];
-        _mm_store_pd(ar, v_sum0);
-        _mm_store_pd(ar + 2, v_sum1);
-
-        addChannels(dst, ar, cn);
+        if (cn == 1)
+            *dst += v_reduce_sum(v_sum);
+        else
+        {
+            int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes];
+            v_store_aligned(ar, v_sum);
+            for (int i = 0; i < v_int32::nlanes; ++i)
+                dst[i % cn] += ar[i];
+        }
+        v_cleanup();
         return x / cn;
     }
 };
 
-#elif CV_NEON
-
 template <>
-struct Sum_SIMD<uchar, int>
+struct Sum_SIMD<ushort, int>
 {
-    int operator () (const uchar * src0, const uchar * mask, int * dst, int len, int cn) const
+    int operator () (const ushort * src0, const uchar * mask, int * dst, int len, int cn) const
     {
         if (mask || (cn != 1 && cn != 2 && cn != 4))
             return 0;
+        len *= cn;
         int x = 0;
-        uint32x4_t v_sum = vdupq_n_u32(0u);
+        v_uint32 v_sum = vx_setzero_u32();
 
-        for ( ; x <= len - 16; x += 16)
+        for (; x <= len - v_uint16::nlanes; x += v_uint16::nlanes)
         {
-            uint8x16_t v_src = vld1q_u8(src0 + x);
-            uint16x8_t v_half = vmovl_u8(vget_low_u8(v_src));
-
-            v_sum = vaddw_u16(v_sum, vget_low_u16(v_half));
-            v_sum = vaddw_u16(v_sum, vget_high_u16(v_half));
-
-            v_half = vmovl_u8(vget_high_u8(v_src));
-            v_sum = vaddw_u16(v_sum, vget_low_u16(v_half));
-            v_sum = vaddw_u16(v_sum, vget_high_u16(v_half));
+            v_uint32 v_src0, v_src1;
+            v_expand(vx_load(src0 + x), v_src0, v_src1);
+            v_sum += v_src0 + v_src1;
        }
-
-        for ( ; x <= len - 8; x += 8)
+        if (x <= len - v_uint32::nlanes)
        {
-            uint16x8_t v_src = vmovl_u8(vld1_u8(src0 + x));
-
-            v_sum = vaddw_u16(v_sum, vget_low_u16(v_src));
-            v_sum = vaddw_u16(v_sum, vget_high_u16(v_src));
+            v_sum += vx_load_expand(src0 + x);
+            x += v_uint32::nlanes;
        }
 
-        unsigned int CV_DECL_ALIGNED(16) ar[4];
-        vst1q_u32(ar, v_sum);
-
-        addChannels(dst, ar, cn);
+        if (cn == 1)
+            *dst += v_reduce_sum(v_sum);
+        else
+        {
+            uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_uint32::nlanes];
+            v_store_aligned(ar, v_sum);
+            for (int i = 0; i < v_uint32::nlanes; ++i)
+                dst[i % cn] += ar[i];
+        }
+        v_cleanup();
         return x / cn;
     }
 };
 
 template <>
-struct Sum_SIMD<schar, int>
+struct Sum_SIMD<short, int>
 {
-    int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
+    int operator () (const short * src0, const uchar * mask, int * dst, int len, int cn) const
     {
         if (mask || (cn != 1 && cn != 2 && cn != 4))
             return 0;
+        len *= cn;
         int x = 0;
-        int32x4_t v_sum = vdupq_n_s32(0);
+        v_int32 v_sum = vx_setzero_s32();
 
-        for ( ; x <= len - 16; x += 16)
+        for (; x <= len - v_int16::nlanes; x += v_int16::nlanes)
         {
-            int8x16_t v_src = vld1q_s8(src0 + x);
-            int16x8_t v_half = vmovl_s8(vget_low_s8(v_src));
-
-            v_sum = vaddw_s16(v_sum, vget_low_s16(v_half));
-            v_sum = vaddw_s16(v_sum, vget_high_s16(v_half));
-
-            v_half = vmovl_s8(vget_high_s8(v_src));
-            v_sum = vaddw_s16(v_sum, vget_low_s16(v_half));
-            v_sum = vaddw_s16(v_sum, vget_high_s16(v_half));
+            v_int32 v_src0, v_src1;
+            v_expand(vx_load(src0 + x), v_src0, v_src1);
+            v_sum += v_src0 + v_src1;
         }
-
-        for ( ; x <= len - 8; x += 8)
+        if (x <= len - v_int32::nlanes)
         {
-            int16x8_t v_src = vmovl_s8(vld1_s8(src0 + x));
-
-            v_sum = vaddw_s16(v_sum, vget_low_s16(v_src));
-            v_sum = vaddw_s16(v_sum, vget_high_s16(v_src));
+            v_sum += vx_load_expand(src0 + x);
+            x += v_int32::nlanes;
         }
 
-        int CV_DECL_ALIGNED(16) ar[4];
-        vst1q_s32(ar, v_sum);
-
-        addChannels(dst, ar, cn);
+        if (cn == 1)
+            *dst += v_reduce_sum(v_sum);
+        else
+        {
+            int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes];
+            v_store_aligned(ar, v_sum);
+            for (int i = 0; i < v_int32::nlanes; ++i)
+                dst[i % cn] += ar[i];
+        }
+        v_cleanup();
         return x / cn;
     }
 };
 
+#if CV_SIMD_64F
 template <>
-struct Sum_SIMD<ushort, int>
+struct Sum_SIMD<int, double>
 {
-    int operator () (const ushort * src0, const uchar * mask, int * dst, int len, int cn) const
+    int operator () (const int * src0, const uchar * mask, double * dst, int len, int cn) const
     {
         if (mask || (cn != 1 && cn != 2 && cn != 4))
             return 0;
+        len *= cn;
         int x = 0;
-        uint32x4_t v_sum = vdupq_n_u32(0u);
+        v_float64 v_sum0 = vx_setzero_f64();
+        v_float64 v_sum1 = vx_setzero_f64();
 
-        for ( ; x <= len - 8; x += 8)
+        for (; x <= len - 2 * v_int32::nlanes; x += 2 * v_int32::nlanes)
         {
-            uint16x8_t v_src = vld1q_u16(src0 + x);
-
-            v_sum = vaddw_u16(v_sum, vget_low_u16(v_src));
-            v_sum = vaddw_u16(v_sum, vget_high_u16(v_src));
+            v_int32 v_src0 = vx_load(src0 + x);
+            v_int32 v_src1 = vx_load(src0 + x + v_int32::nlanes);
+            v_sum0 += v_cvt_f64(v_src0) + v_cvt_f64(v_src1);
+            v_sum1 += v_cvt_f64_high(v_src0) + v_cvt_f64_high(v_src1);
         }
 
-        for ( ; x <= len - 4; x += 4)
-            v_sum = vaddw_u16(v_sum, vld1_u16(src0 + x));
-
-        unsigned int CV_DECL_ALIGNED(16) ar[4];
-        vst1q_u32(ar, v_sum);
-
-        addChannels(dst, ar, cn);
+#if CV_SIMD256 || CV_SIMD512
+        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_float64::nlanes];
+        v_store_aligned(ar, v_sum0 + v_sum1);
+        for (int i = 0; i < v_float64::nlanes; ++i)
+            dst[i % cn] += ar[i];
+#else
+        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_float64::nlanes];
+        v_store_aligned(ar, v_sum0);
+        v_store_aligned(ar + v_float64::nlanes, v_sum1);
+        for (int i = 0; i < 2 * v_float64::nlanes; ++i)
+            dst[i % cn] += ar[i];
+#endif
+        v_cleanup();
         return x / cn;
     }
 };
 
 template <>
-struct Sum_SIMD<short, int>
+struct Sum_SIMD<float, double>
 {
-    int operator () (const short * src0, const uchar * mask, int * dst, int len, int cn) const
+    int operator () (const float * src0, const uchar * mask, double * dst, int len, int cn) const
     {
         if (mask || (cn != 1 && cn != 2 && cn != 4))
             return 0;
+        len *= cn;
         int x = 0;
-        int32x4_t v_sum = vdupq_n_s32(0u);
+        v_float64 v_sum0 = vx_setzero_f64();
+        v_float64 v_sum1 = vx_setzero_f64();
 
-        for ( ; x <= len - 8; x += 8)
+        for (; x <= len - 2 * v_float32::nlanes; x += 2 * v_float32::nlanes)
         {
-            int16x8_t v_src = vld1q_s16(src0 + x);
-
-            v_sum = vaddw_s16(v_sum, vget_low_s16(v_src));
-            v_sum = vaddw_s16(v_sum, vget_high_s16(v_src));
+            v_float32 v_src0 = vx_load(src0 + x);
+            v_float32 v_src1 = vx_load(src0 + x + v_float32::nlanes);
+            v_sum0 += v_cvt_f64(v_src0) + v_cvt_f64(v_src1);
+            v_sum1 += v_cvt_f64_high(v_src0) + v_cvt_f64_high(v_src1);
         }
 
-        for ( ; x <= len - 4; x += 4)
-            v_sum = vaddw_s16(v_sum, vld1_s16(src0 + x));
-
-        int CV_DECL_ALIGNED(16) ar[4];
-        vst1q_s32(ar, v_sum);
-
-        addChannels(dst, ar, cn);
+#if CV_SIMD256 || CV_SIMD512
+        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_float64::nlanes];
+        v_store_aligned(ar, v_sum0 + v_sum1);
+        for (int i = 0; i < v_float64::nlanes; ++i)
+            dst[i % cn] += ar[i];
+#else
+        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_float64::nlanes];
+        v_store_aligned(ar, v_sum0);
+        v_store_aligned(ar + v_float64::nlanes, v_sum1);
+        for (int i = 0; i < 2 * v_float64::nlanes; ++i)
+            dst[i % cn] += ar[i];
+#endif
+        v_cleanup();
         return x / cn;
     }
 };
-
+#endif
 #endif
 
 template
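
Note for reviewers (not part of the patch): every specialization above follows the same load -> expand -> accumulate -> reduce pattern from OpenCV's wide universal intrinsics (opencv2/core/hal/intrin.hpp). The standalone sketch below illustrates that pattern for a single-channel uchar buffer. It is a minimal sketch, not the patch's code: the function name sumOfBytes and the plain scalar tail are illustrative assumptions, and it widens straight to 32-bit lanes instead of blocking the inner loop in 16-bit lanes as the patch does.

// Illustrative sketch only, not part of the patch. Assumes an OpenCV build
// where CV_SIMD is enabled; otherwise only the scalar tail runs.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

static int sumOfBytes(const uchar* src, int len)   // hypothetical helper name
{
    int x = 0;
    int total = 0;                                 // assumes the total fits in int
#if CV_SIMD
    v_uint32 v_sum = vx_setzero_u32();             // per-lane 32-bit accumulators
    for (; x <= len - v_uint8::nlanes; x += v_uint8::nlanes)
    {
        v_uint16 lo, hi;
        v_expand(vx_load(src + x), lo, hi);        // widen uchar lanes to 16 bits
        v_uint32 a, b, c, d;
        v_expand(lo, a, b);                        // widen again to 32 bits
        v_expand(hi, c, d);
        v_sum += a + b + c + d;                    // accumulate without overflow
    }
    total += (int)v_reduce_sum(v_sum);             // horizontal sum of the lanes
    v_cleanup();
#endif
    for (; x < len; ++x)                           // scalar tail for the remainder
        total += src[x];
    return total;
}

The patch adds two refinements on top of this basic shape: the 8-bit paths accumulate in 16-bit lanes for up to 256 iterations before widening, which saves expand operations, and the multi-channel case stores the lanes to an aligned buffer and folds them into dst[i % cn] instead of calling v_reduce_sum.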