From c96f48e7c91748f896e99f5bf943faf08cd98f0e Mon Sep 17 00:00:00 2001
From: Vincent Rabaud
Date: Thu, 19 Oct 2023 13:06:50 +0200
Subject: [PATCH] Merge pull request #24412 from vrabaud:inter_area1

Speed up line merging in INTER_AREA #24412

This provides a 10 to 20% speed-up.
Related perf test fix: https://github.com/opencv/opencv/pull/24417

This is a split of https://github.com/opencv/opencv/pull/23525 that will be
updated to only deal with column merging.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code
      under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra
      repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake --- modules/imgproc/src/resize.cpp | 123 +++++++++++++++++++++++++++++---- 1 file changed, 109 insertions(+), 14 deletions(-) diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp index 1ad8e8932d..5cfc86308b 100644 --- a/modules/imgproc/src/resize.cpp +++ b/modules/imgproc/src/resize.cpp @@ -3019,6 +3019,111 @@ struct DecimateAlpha }; +namespace inter_area { +#if (CV_SIMD || CV_SIMD_SCALABLE) +inline void saturate_store(const float* src, uchar* dst) { + const v_int32 tmp0 = v_round(vx_load(src + 0 * VTraits::vlanes())); + const v_int32 tmp1 = v_round(vx_load(src + 1 * VTraits::vlanes())); + const v_int32 tmp2 = v_round(vx_load(src + 2 * VTraits::vlanes())); + const v_int32 tmp3 = v_round(vx_load(src + 3 * VTraits::vlanes())); + v_store(dst, v_pack(v_pack_u(tmp0, tmp1), v_pack_u(tmp2, tmp3))); +} + +inline void saturate_store(const float* src, ushort* dst) { + const v_int32 tmp0 = v_round(vx_load(src + 0 * VTraits::vlanes())); + const v_int32 tmp1 = v_round(vx_load(src + 1 * VTraits::vlanes())); + v_store(dst, v_pack_u(tmp0, tmp1)); +} + +inline void saturate_store(const float* src, short* dst) { + const v_int32 tmp0 = v_round(vx_load(src + 0 * VTraits::vlanes())); + const v_int32 tmp1 = v_round(vx_load(src + 1 * VTraits::vlanes())); + v_store(dst, v_pack(tmp0, tmp1)); +} + +static inline v_float32 vx_setall(float coeff) { return vx_setall_f32(coeff); } + +template +struct VArea {}; + +template <> +struct VArea { + typedef v_float32 vWT; +}; +#endif + +#if (CV_SIMD128_64F || CV_SIMD_SCALABLE_64F) +static inline v_float64 vx_setall(double coeff) { return vx_setall_f64(coeff); } + +template <> +struct VArea { + typedef v_float64 vWT; +}; + +#else +inline void mul(const double* buf, int width, double beta, double* sum) { + for (int dx = 0; dx < width; ++dx) { + sum[dx] = beta * buf[dx]; + } +} + +inline void muladd(const double* buf, int width, double beta, 
double* sum) { + for (int dx = 0; dx < width; ++dx) { + sum[dx] += beta * buf[dx]; + } +} +#endif + +template +inline void saturate_store(const WT* sum, int width, T* D) { + int dx = 0; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int step = VTraits::vWT>::vlanes() * sizeof(WT) / sizeof(T); + for (; dx + step < width; dx += step) { + saturate_store(sum + dx, D + dx); + } +#endif + for (; dx < width; ++dx) { + D[dx] = saturate_cast(sum[dx]); + } +} + +// Optimization when T == WT. +template +inline void saturate_store(const WT* sum, int width, WT* D) { + std::copy(sum, sum + width, D); +} + +template +inline void mul(const WT* buf, int width, WT beta, WT* sum) { + int dx = 0; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int step = VTraits::vWT>::vlanes(); + for (; dx + step < width; dx += step) { + vx_store(sum + dx, v_mul(vx_setall(beta), vx_load(buf + dx))); + } +#endif + for (; dx < width; ++dx) { + sum[dx] = beta * buf[dx]; + } +} + +template +inline void muladd(const WT* buf, int width, WT beta, WT* sum) { + int dx = 0; +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int step = VTraits::vWT>::vlanes(); + for (; dx + step < width; dx += step) { + vx_store(sum + dx, v_add(vx_load(sum + dx), v_mul(vx_setall(beta), vx_load(buf + dx)))); + } +#endif + for (; dx < width; ++dx) { + sum[dx] += beta * buf[dx]; + } +} + +} // namespace inter_area + template class ResizeArea_Invoker : public ParallelLoopBody { @@ -3120,27 +3225,17 @@ public: if( dy != prev_dy ) { - T* D = dst->template ptr(prev_dy); - - for( dx = 0; dx < dsize.width; dx++ ) - { - D[dx] = saturate_cast(sum[dx]); - sum[dx] = beta*buf[dx]; - } + inter_area::saturate_store(sum, dsize.width, dst->template ptr(prev_dy)); + inter_area::mul(buf, dsize.width, beta, sum); prev_dy = dy; } else { - for( dx = 0; dx < dsize.width; dx++ ) - sum[dx] += beta*buf[dx]; + inter_area::muladd(buf, dsize.width, beta, sum); } } - { - T* D = dst->template ptr(prev_dy); - for( dx = 0; dx < dsize.width; dx++ ) - D[dx] = 
saturate_cast(sum[dx]); - } + inter_area::saturate_store(sum, dsize.width, dst->template ptr(prev_dy)); } private: