Merge pull request #26203 from FantasqueX:generic-simd-warpAffineBlocklineNN

Use generic SIMD in warpAffineBlocklineNN
pull/23689/merge
Alexander Smorkalov 2 weeks ago committed by GitHub
commit ee95bfe244
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 47
      modules/imgproc/src/imgwarp.cpp
  2. 1
      modules/imgproc/src/imgwarp.hpp
  3. 36
      modules/imgproc/src/imgwarp.sse4_1.cpp

@ -2703,39 +2703,30 @@ void warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0,
{
// NOTE(review): this body looks like a diff rendering whose +/- markers were
// stripped: statements from two variants of the same function are interleaved
// (AB_BITS is declared on two consecutive lines, and two vector loops plus two
// scalar tail loops appear). Confirm against the real imgwarp.cpp before
// relying on the statement order shown here.
//
// In every variant the result written is, for each index i in [0, bw):
//   xy[2*i]     = saturate_cast<short>((X0 + adelta[i]) >> AB_BITS)
//   xy[2*i + 1] = saturate_cast<short>((Y0 + bdelta[i]) >> AB_BITS)

// Passes the arguments to the cv_hal_warpAffineBlocklineNN hook; presumably
// returns early when the hook handles the call — verify against CALL_HAL.
CALL_HAL(warpAffineBlocklineNN, cv_hal_warpAffineBlocklineNN, adelta, bdelta, xy, X0, Y0, bw);
// Right-shift amount applied to the sums below; never less than 10.
const int AB_BITS = MAX(10, (int)INTER_BITS);
constexpr int AB_BITS = MAX(10, static_cast<int>(INTER_BITS));
// Index of the next element to process; shared by the vector loop and the
// scalar tail so the tail resumes where the vector loop stopped.
int x1 = 0;
// Variant with a dedicated SSE4.1 path: when CV_CPU_HAS_SUPPORT_SSE4_1 is
// true the whole line is delegated to opt_SSE4_1, otherwise the `else`
// falls through to the block that follows.
#if CV_TRY_SSE4_1
bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
if( useSSE4_1 )
opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta, bdelta, xy, X0, Y0, bw);
else
#endif
// Variant using width-agnostic vector types (v_int32 / v_int16): each
// iteration consumes `step` elements, where step is the lane count of v_int16.
#if (CV_SIMD || CV_SIMD_SCALABLE)
{
#if CV_SIMD128
const v_int32 v_X0 = vx_setall_s32(X0);
const v_int32 v_Y0 = vx_setall_s32(Y0);
const int step = VTraits<v_int16>::vlanes();
for (; x1 <= bw - step; x1 += step)
{
// Variant using fixed 128-bit types (v_int32x4 / v_int16x8): each iteration
// consumes `span` elements, where span is the lane count of v_uint16x8.
v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
int span = VTraits<v_uint16x8>::vlanes();
for( ; x1 <= bw - span; x1 += span )
{
v_int16x8 v_dst[2];
// CV_CONVERT_MAP: loads two groups of four ints (ptr+offset and ptr+offset+4),
// adds `shift` to each, shifts right by AB_BITS and packs both halves into one
// 16-bit vector. The macro is local to this loop and undefined right after use.
#define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset))),\
v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset + 4))))
v_dst[0] = CV_CONVERT_MAP(adelta, x1, v_X0);
v_dst[1] = CV_CONVERT_MAP(bdelta, x1, v_Y0);
#undef CV_CONVERT_MAP
// Stores the X and Y vectors starting at xy + 2*x1; presumably alternating
// lanes (x, y, x, y, ...) to match the scalar layout — confirm in the
// v_store_interleave documentation.
v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]);
}
}
#endif
// Scalar tail: handles the elements the vector loop did not cover.
for( ; x1 < bw; x1++ )
{
int X = (X0 + adelta[x1]) >> AB_BITS;
int Y = (Y0 + bdelta[x1]) >> AB_BITS;
xy[x1*2] = saturate_cast<short>(X);
xy[x1*2+1] = saturate_cast<short>(Y);
// Width-agnostic vector body: two v_int32 loads taken step/2 elements apart
// are offset by X0 (resp. Y0), shifted right by AB_BITS and packed into one
// v_int16 vector, then X and Y are stored together starting at xy + 2*x1.
v_int16 v_X = v_pack(v_shr<AB_BITS>(v_add(v_X0, vx_load(adelta + x1))),
v_shr<AB_BITS>(v_add(v_X0, vx_load(adelta + x1 + step / 2))));
v_int16 v_Y = v_pack(v_shr<AB_BITS>(v_add(v_Y0, vx_load(bdelta + x1))),
v_shr<AB_BITS>(v_add(v_Y0, vx_load(bdelta + x1 + step / 2))));
v_store_interleave(xy + 2 * x1, v_X, v_Y);
}
}
#endif
// Scalar tail: handles the elements the vector loop did not cover (all of
// them when the SIMD block above is compiled out).
for (; x1 < bw; x1++)
{
const int X = (X0 + adelta[x1]) >> AB_BITS;
const int Y = (Y0 + bdelta[x1]) >> AB_BITS;
xy[x1 * 2] = saturate_cast<short>(X);
xy[x1 * 2 + 1] = saturate_cast<short>(Y);
}
}
void warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)

@ -74,7 +74,6 @@ namespace opt_SSE4_1
// SSE4.1 helper declarations; the hunk header places them in namespace
// opt_SSE4_1. The bodies of the convertMaps_* helpers are not fully shown in
// this view, so their exact contracts are not described here.
void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width);
void convertMaps_32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, ushort* dst2, int width);
void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, int width);
// Fills xy with interleaved pairs for every i in [0, bw):
//   xy[2*i]     = saturate_cast<short>((X0 + adelta[i]) >> AB_BITS)
//   xy[2*i + 1] = saturate_cast<short>((Y0 + bdelta[i]) >> AB_BITS)
// where AB_BITS = MAX(10, INTER_BITS) inside the definition.
void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw);
class WarpPerspectiveLine_SSE4
{

@ -173,42 +173,6 @@ void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, i
}
}
// Computes one line of interleaved (X, Y) short coordinates.
//
// For every i in [0, bw):
//   xy[2*i]     = saturate_cast<short>((X0 + adelta[i]) >> AB_BITS)
//   xy[2*i + 1] = saturate_cast<short>((Y0 + bdelta[i]) >> AB_BITS)
//
// adelta, bdelta : per-element offsets added to X0 and Y0; at least bw ints each.
// xy             : output buffer; receives 2 * bw shorts.
// X0, Y0         : base values shared by the whole line.
// bw             : number of elements to produce.
//
// Elements are handled sixteen at a time with SSE intrinsics; whatever is
// left over is finished by a scalar loop.
void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw)
{
    const int AB_BITS = MAX(10, (int)INTER_BITS);
    const int kBlock = 16;   // elements consumed per vector iteration

    __m128i baseX = _mm_set1_epi32(X0);
    __m128i baseY = _mm_set1_epi32(Y0);

    // Adds `base` to the eight ints starting at `src`, shifts each sum right
    // by AB_BITS and packs the results into eight shorts with signed saturation.
    const auto packEight = [&](const int* src, __m128i base) -> __m128i
    {
        const __m128i lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
        const __m128i hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + 4));
        return _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(base, lo), AB_BITS),
                               _mm_srai_epi32(_mm_add_epi32(base, hi), AB_BITS));
    };

    int i = 0;
    while (i <= bw - kBlock)
    {
        __m128i xLo = packEight(adelta + i, baseX);       // X for elements i   .. i+7
        __m128i xHi = packEight(adelta + i + 8, baseX);   // X for elements i+8 .. i+15
        __m128i yLo = packEight(bdelta + i, baseY);       // Y for elements i   .. i+7
        __m128i yHi = packEight(bdelta + i + 8, baseY);   // Y for elements i+8 .. i+15

        // Rearranges the four registers in place so that, stored back to back
        // in argument order, they form the x,y,x,y,... output sequence.
        _mm_interleave_epi16(xLo, xHi, yLo, yHi);

        __m128i* out = reinterpret_cast<__m128i*>(xy + i * 2);
        _mm_storeu_si128(out, xLo);
        _mm_storeu_si128(out + 1, xHi);
        _mm_storeu_si128(out + 2, yLo);
        _mm_storeu_si128(out + 3, yHi);

        i += kBlock;
    }

    // Scalar remainder (fewer than sixteen elements left).
    while (i < bw)
    {
        const int shiftedX = (X0 + adelta[i]) >> AB_BITS;
        const int shiftedY = (Y0 + bdelta[i]) >> AB_BITS;
        xy[2 * i] = saturate_cast<short>(shiftedX);
        xy[2 * i + 1] = saturate_cast<short>(shiftedY);
        ++i;
    }
}
class WarpPerspectiveLine_SSE4_Impl CV_FINAL : public WarpPerspectiveLine_SSE4
{
public:

Loading…
Cancel
Save