From d17507052ef4973ca56df7d56946d6ad8c69cb4b Mon Sep 17 00:00:00 2001
From: Liutong HAN
Date: Wed, 28 Jun 2023 17:12:37 +0800
Subject: [PATCH] Rewrite SIMD code by using new Universal Intrinsic API.

---
 modules/core/src/mean.simd.hpp | 66 +++++++++++++++++-----------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/modules/core/src/mean.simd.hpp b/modules/core/src/mean.simd.hpp
index d94c887223..bb815adc1c 100644
--- a/modules/core/src/mean.simd.hpp
+++ b/modules/core/src/mean.simd.hpp
@@ -24,7 +24,7 @@ struct SumSqr_SIMD
     }
 };
 
-#if CV_SIMD
+#if CV_SIMD || CV_SIMD_SCALABLE
 
 template <>
 struct SumSqr_SIMD<uchar, int, int>
@@ -39,37 +39,37 @@ struct SumSqr_SIMD
         v_int32 v_sum = vx_setzero_s32();
         v_int32 v_sqsum = vx_setzero_s32();
 
-        const int len0 = len & -v_uint8::nlanes;
+        const int len0 = len & -VTraits<v_uint8>::vlanes();
         while(x < len0)
         {
-            const int len_tmp = min(x + 256*v_uint16::nlanes, len0);
+            const int len_tmp = min(x + 256*VTraits<v_uint16>::vlanes(), len0);
             v_uint16 v_sum16 = vx_setzero_u16();
-            for ( ; x < len_tmp; x += v_uint8::nlanes)
+            for ( ; x < len_tmp; x += VTraits<v_uint8>::vlanes())
             {
                 v_uint16 v_src0 = vx_load_expand(src0 + x);
-                v_uint16 v_src1 = vx_load_expand(src0 + x + v_uint16::nlanes);
-                v_sum16 += v_src0 + v_src1;
+                v_uint16 v_src1 = vx_load_expand(src0 + x + VTraits<v_uint16>::vlanes());
+                v_sum16 = v_add(v_sum16, v_add(v_src0, v_src1));
                 v_int16 v_tmp0, v_tmp1;
                 v_zip(v_reinterpret_as_s16(v_src0), v_reinterpret_as_s16(v_src1), v_tmp0, v_tmp1);
-                v_sqsum += v_dotprod(v_tmp0, v_tmp0) + v_dotprod(v_tmp1, v_tmp1);
+                v_sqsum = v_add(v_sqsum, v_add(v_dotprod(v_tmp0, v_tmp0), v_dotprod(v_tmp1, v_tmp1)));
             }
             v_uint32 v_half0, v_half1;
             v_expand(v_sum16, v_half0, v_half1);
-            v_sum += v_reinterpret_as_s32(v_half0 + v_half1);
+            v_sum = v_add(v_sum, v_reinterpret_as_s32(v_add(v_half0, v_half1)));
        }
-        if (x <= len - v_uint16::nlanes)
+        if (x <= len - VTraits<v_uint16>::vlanes())
        {
             v_uint16 v_src = vx_load_expand(src0 + x);
             v_uint16 v_half = v_combine_high(v_src, v_src);
 
             v_uint32 v_tmp0, v_tmp1;
-            v_expand(v_src + v_half, v_tmp0, v_tmp1);
-            v_sum += v_reinterpret_as_s32(v_tmp0);
+            v_expand(v_add(v_src, v_half), v_tmp0, v_tmp1);
+            v_sum = v_add(v_sum, v_reinterpret_as_s32(v_tmp0));
 
             v_int16 v_tmp2, v_tmp3;
             v_zip(v_reinterpret_as_s16(v_src), v_reinterpret_as_s16(v_half), v_tmp2, v_tmp3);
-            v_sqsum += v_dotprod(v_tmp2, v_tmp2);
-            x += v_uint16::nlanes;
+            v_sqsum = v_add(v_sqsum, v_dotprod(v_tmp2, v_tmp2));
+            x += VTraits<v_uint16>::vlanes();
        }
 
         if (cn == 1)
@@ -79,13 +79,13 @@ struct SumSqr_SIMD
        }
         else
        {
-            int CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_int32::nlanes];
+            int CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * VTraits<v_int32>::max_nlanes];
             v_store(ar, v_sum);
-            v_store(ar + v_int32::nlanes, v_sqsum);
-            for (int i = 0; i < v_int32::nlanes; ++i)
+            v_store(ar + VTraits<v_int32>::vlanes(), v_sqsum);
+            for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
             {
                 sum[i % cn] += ar[i];
-                sqsum[i % cn] += ar[v_int32::nlanes + i];
+                sqsum[i % cn] += ar[VTraits<v_int32>::vlanes() + i];
             }
        }
         v_cleanup();
@@ -106,37 +106,37 @@ struct SumSqr_SIMD
         v_int32 v_sum = vx_setzero_s32();
         v_int32 v_sqsum = vx_setzero_s32();
 
-        const int len0 = len & -v_int8::nlanes;
+        const int len0 = len & -VTraits<v_int8>::vlanes();
         while (x < len0)
         {
-            const int len_tmp = min(x + 256 * v_int16::nlanes, len0);
+            const int len_tmp = min(x + 256 * VTraits<v_int16>::vlanes(), len0);
             v_int16 v_sum16 = vx_setzero_s16();
-            for (; x < len_tmp; x += v_int8::nlanes)
+            for (; x < len_tmp; x += VTraits<v_int8>::vlanes())
             {
                 v_int16 v_src0 = vx_load_expand(src0 + x);
-                v_int16 v_src1 = vx_load_expand(src0 + x + v_int16::nlanes);
-                v_sum16 += v_src0 + v_src1;
+                v_int16 v_src1 = vx_load_expand(src0 + x + VTraits<v_int16>::vlanes());
+                v_sum16 = v_add(v_sum16, v_add(v_src0, v_src1));
                 v_int16 v_tmp0, v_tmp1;
                 v_zip(v_src0, v_src1, v_tmp0, v_tmp1);
-                v_sqsum += v_dotprod(v_tmp0, v_tmp0) + v_dotprod(v_tmp1, v_tmp1);
+                v_sqsum = v_add(v_sqsum, v_add(v_dotprod(v_tmp0, v_tmp0), v_dotprod(v_tmp1, v_tmp1)));
             }
             v_int32 v_half0, v_half1;
             v_expand(v_sum16, v_half0, v_half1);
-            v_sum += v_half0 + v_half1;
+            v_sum = v_add(v_sum, v_add(v_half0, v_half1));
        }
-        if (x <= len - v_int16::nlanes)
+        if (x <= len - VTraits<v_int16>::vlanes())
        {
             v_int16 v_src = vx_load_expand(src0 + x);
             v_int16 v_half = v_combine_high(v_src, v_src);
 
             v_int32 v_tmp0, v_tmp1;
-            v_expand(v_src + v_half, v_tmp0, v_tmp1);
-            v_sum += v_tmp0;
+            v_expand(v_add(v_src, v_half), v_tmp0, v_tmp1);
+            v_sum = v_add(v_sum, v_tmp0);
 
             v_int16 v_tmp2, v_tmp3;
             v_zip(v_src, v_half, v_tmp2, v_tmp3);
-            v_sqsum += v_dotprod(v_tmp2, v_tmp2);
-            x += v_int16::nlanes;
+            v_sqsum = v_add(v_sqsum, v_dotprod(v_tmp2, v_tmp2));
+            x += VTraits<v_int16>::vlanes();
        }
 
         if (cn == 1)
@@ -146,13 +146,13 @@ struct SumSqr_SIMD
        }
         else
        {
-            int CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_int32::nlanes];
+            int CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * VTraits<v_int32>::max_nlanes];
             v_store(ar, v_sum);
-            v_store(ar + v_int32::nlanes, v_sqsum);
-            for (int i = 0; i < v_int32::nlanes; ++i)
+            v_store(ar + VTraits<v_int32>::vlanes(), v_sqsum);
+            for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
             {
                 sum[i % cn] += ar[i];
-                sqsum[i % cn] += ar[v_int32::nlanes + i];
+                sqsum[i % cn] += ar[VTraits<v_int32>::vlanes() + i];
             }
        }
         v_cleanup();
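
The change is mechanical and follows the migration rules for OpenCV's new Universal
Intrinsic API, the form that also supports scalable vector ISAs such as RISC-V RVV
(enabled via CV_SIMD_SCALABLE): operator overloads like a + b and a += b become
explicit calls (v_add), the compile-time lane count v_uint8::nlanes becomes the
possibly run-time VTraits<v_uint8>::vlanes(), and stack buffers are sized with the
compile-time upper bound VTraits<...>::max_nlanes. Below is a minimal standalone
sketch of the same pattern, assuming an OpenCV build with universal intrinsics
enabled; the function sum_ints is illustrative and not part of this patch.

#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

// Sum an int array with the new-style universal intrinsics.
int sum_ints(const int* src, int len)
{
    int x = 0;
    v_int32 v_sum = vx_setzero_s32();

    // vlanes() is a function call, not a constant: on scalable targets the
    // vector length is only known at run time.
    const int step = VTraits<v_int32>::vlanes();
    for (; x <= len - step; x += step)
        v_sum = v_add(v_sum, vx_load(src + x));   // was: v_sum += vx_load(...)

    // Stack storage must use max_nlanes, the largest lane count any build of
    // this code can produce, because vlanes() cannot size an array.
    int CV_DECL_ALIGNED(CV_SIMD_WIDTH) buf[VTraits<v_int32>::max_nlanes];
    v_store(buf, v_sum);

    int sum = 0;
    for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
        sum += buf[i];
    for (; x < len; ++x)      // scalar tail
        sum += src[x];
    return sum;
}

The same rewrite rules explain every hunk above: each += on a vector value becomes
a v_add assignment, and each nlanes use becomes vlanes() except in the aligned
array declarations, where only max_nlanes is a constant expression.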