From 45b9398d683c1a488b3011d6a067a409d846ff83 Mon Sep 17 00:00:00 2001
From: Letu Ren
Date: Fri, 27 Sep 2024 01:35:04 +0800
Subject: [PATCH] Use generic SIMD in warpAffineBlocklineNN

Replace the runtime-dispatched SSE4.1 helper and the fixed-width
CV_SIMD128 path in warpAffineBlocklineNN with a single vector loop
written against the v_int32/v_int16 universal intrinsic types and
guarded by CV_SIMD || CV_SIMD_SCALABLE. The scalar tail loop now always
runs after the vector loop and handles the remaining elements.

WarpAffineInvoker_Blockline_SSE41 is no longer referenced, so its
declaration and definition are removed.

---
 modules/imgproc/src/imgwarp.cpp        | 47 +++++++++++---------------
 modules/imgproc/src/imgwarp.hpp        |  1 -
 modules/imgproc/src/imgwarp.sse4_1.cpp | 36 --------------------
 3 files changed, 19 insertions(+), 65 deletions(-)

diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index 289d09febd..6f018640a3 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -2703,39 +2703,30 @@ void warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0,
 {
     CALL_HAL(warpAffineBlocklineNN, cv_hal_warpAffineBlocklineNN, adelta, bdelta, xy, X0, Y0, bw);

-    const int AB_BITS = MAX(10, (int)INTER_BITS);
+    constexpr int AB_BITS = MAX(10, static_cast<int>(INTER_BITS));
     int x1 = 0;
-    #if CV_TRY_SSE4_1
-    bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
-    if( useSSE4_1 )
-        opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta, bdelta, xy, X0, Y0, bw);
-    else
-    #endif
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     {
-        #if CV_SIMD128
+        const v_int32 v_X0 = vx_setall_s32(X0);
+        const v_int32 v_Y0 = vx_setall_s32(Y0);
+        const int step = VTraits<v_int16>::vlanes();
+        for (; x1 <= bw - step; x1 += step)
         {
-            v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
-            int span = VTraits<v_uint16x8>::vlanes();
-            for( ; x1 <= bw - span; x1 += span )
-            {
-                v_int16x8 v_dst[2];
-                #define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset))),\
-                                                                v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset + 4))))
-                v_dst[0] = CV_CONVERT_MAP(adelta, x1, v_X0);
-                v_dst[1] = CV_CONVERT_MAP(bdelta, x1, v_Y0);
-                #undef CV_CONVERT_MAP
-                v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]);
-            }
-        }
-        #endif
-        for( ; x1 < bw; x1++ )
-        {
-            int X = (X0 + adelta[x1]) >> AB_BITS;
-            int Y = (Y0 + bdelta[x1]) >> AB_BITS;
-            xy[x1*2] = saturate_cast<short>(X);
-            xy[x1*2+1] = saturate_cast<short>(Y);
+            v_int16 v_X = v_pack(v_shr<AB_BITS>(v_add(v_X0, vx_load(adelta + x1))),
+                                 v_shr<AB_BITS>(v_add(v_X0, vx_load(adelta + x1 + step / 2))));
+            v_int16 v_Y = v_pack(v_shr<AB_BITS>(v_add(v_Y0, vx_load(bdelta + x1))),
+                                 v_shr<AB_BITS>(v_add(v_Y0, vx_load(bdelta + x1 + step / 2))));
+            v_store_interleave(xy + 2 * x1, v_X, v_Y);
         }
     }
+#endif
+    for (; x1 < bw; x1++)
+    {
+        const int X = (X0 + adelta[x1]) >> AB_BITS;
+        const int Y = (Y0 + bdelta[x1]) >> AB_BITS;
+        xy[x1 * 2] = saturate_cast<short>(X);
+        xy[x1 * 2 + 1] = saturate_cast<short>(Y);
+    }
 }

 void warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)
diff --git a/modules/imgproc/src/imgwarp.hpp b/modules/imgproc/src/imgwarp.hpp
index 35e9f7bb02..c5b02c5a58 100644
--- a/modules/imgproc/src/imgwarp.hpp
+++ b/modules/imgproc/src/imgwarp.hpp
@@ -74,7 +74,6 @@ namespace opt_SSE4_1
 void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width);
 void convertMaps_32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, ushort* dst2, int width);
 void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, int width);
-void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw);

 class WarpPerspectiveLine_SSE4
 {
diff --git a/modules/imgproc/src/imgwarp.sse4_1.cpp b/modules/imgproc/src/imgwarp.sse4_1.cpp
index a2ec9396da..26ece533bc 100644
--- a/modules/imgproc/src/imgwarp.sse4_1.cpp
+++ b/modules/imgproc/src/imgwarp.sse4_1.cpp
@@ -173,42 +173,6 @@ void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, i
     }
 }

-void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw)
-{
-    const int AB_BITS = MAX(10, (int)INTER_BITS);
-    int x1 = 0;
-
-    __m128i v_X0 = _mm_set1_epi32(X0);
-    __m128i v_Y0 = _mm_set1_epi32(Y0);
-    for (; x1 <= bw - 16; x1 += 16)
-    {
-        __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1))), AB_BITS),
-                                       _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 4))), AB_BITS));
-        __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 8))), AB_BITS),
-                                       _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 12))), AB_BITS));
-
-        __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1))), AB_BITS),
-                                       _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 4))), AB_BITS));
-        __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 8))), AB_BITS),
-                                       _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 12))), AB_BITS));
-
-        _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);
-
-        _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0);
-        _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1);
-        _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);
-        _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);
-    }
-    for (; x1 < bw; x1++)
-    {
-        int X = (X0 + adelta[x1]) >> AB_BITS;
-        int Y = (Y0 + bdelta[x1]) >> AB_BITS;
-        xy[x1 * 2] = saturate_cast<short>(X);
-        xy[x1 * 2 + 1] = saturate_cast<short>(Y);
-    }
-}
-
-
 class WarpPerspectiveLine_SSE4_Impl CV_FINAL : public WarpPerspectiveLine_SSE4
 {
 public: