Merge pull request #24412 from vrabaud:inter_area1

Speed up line merging in INTER_AREA #24412

This provides a 10 to 20% speed-up.

Related perf test fix: https://github.com/opencv/opencv/pull/24417
This is split out of https://github.com/opencv/opencv/pull/23525, which will be updated to only deal with column merging.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under the GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is an accuracy test, a performance test, and test data in the opencv_extra repository, if applicable
      The patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
pull/24429/head
Vincent Rabaud 1 year ago committed by GitHub
parent a9664abb57
commit c96f48e7c9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 123
      modules/imgproc/src/resize.cpp

@ -3019,6 +3019,111 @@ struct DecimateAlpha
};
namespace inter_area {
#if (CV_SIMD || CV_SIMD_SCALABLE)
inline void saturate_store(const float* src, uchar* dst) {
const v_int32 tmp0 = v_round(vx_load(src + 0 * VTraits<v_float32>::vlanes()));
const v_int32 tmp1 = v_round(vx_load(src + 1 * VTraits<v_float32>::vlanes()));
const v_int32 tmp2 = v_round(vx_load(src + 2 * VTraits<v_float32>::vlanes()));
const v_int32 tmp3 = v_round(vx_load(src + 3 * VTraits<v_float32>::vlanes()));
v_store(dst, v_pack(v_pack_u(tmp0, tmp1), v_pack_u(tmp2, tmp3)));
}
inline void saturate_store(const float* src, ushort* dst) {
const v_int32 tmp0 = v_round(vx_load(src + 0 * VTraits<v_float32>::vlanes()));
const v_int32 tmp1 = v_round(vx_load(src + 1 * VTraits<v_float32>::vlanes()));
v_store(dst, v_pack_u(tmp0, tmp1));
}
inline void saturate_store(const float* src, short* dst) {
const v_int32 tmp0 = v_round(vx_load(src + 0 * VTraits<v_float32>::vlanes()));
const v_int32 tmp1 = v_round(vx_load(src + 1 * VTraits<v_float32>::vlanes()));
v_store(dst, v_pack(tmp0, tmp1));
}
// Type-dispatched broadcast: lets templated code write vx_setall(coeff)
// without spelling out the element type. The float overload forwards to
// vx_setall_f32.
static inline v_float32 vx_setall(float coeff)
{
    return vx_setall_f32(coeff);
}
// Maps a scalar working type to the vector type used to process it.
// The primary template has no members, so VArea<T>::vWT only exists for the
// types that are explicitly specialized.
template <typename T>
struct VArea
{
};

// float working type -> v_float32 vectors.
template <>
struct VArea<float>
{
    using vWT = v_float32;
};
#endif
#if (CV_SIMD128_64F || CV_SIMD_SCALABLE_64F)
// Type-dispatched broadcast for double: forwards to vx_setall_f64 so that
// templated code can call vx_setall(coeff) for either working type.
static inline v_float64 vx_setall(double coeff)
{
    return vx_setall_f64(coeff);
}
// double working type -> v_float64 vectors (only defined in the branch where
// 64-bit float SIMD is available).
template <>
struct VArea<double>
{
    using vWT = v_float64;
};
#else
// Scalar double overload, compiled only in the branch without 64-bit float
// SIMD. Being a non-template function, it is preferred over the generic
// mul<WT> template for double arguments.
// Writes sum[i] = beta * buf[i] for the first `width` elements; does nothing
// when width <= 0.
inline void mul(const double* buf, int width, double beta, double* sum) {
    for (int remaining = width; remaining > 0; --remaining, ++buf, ++sum) {
        *sum = beta * *buf;
    }
}
// Scalar double overload, compiled only in the branch without 64-bit float
// SIMD. Being a non-template function, it is preferred over the generic
// muladd<WT> template for double arguments.
// Accumulates sum[i] += beta * buf[i] for the first `width` elements; does
// nothing when width <= 0.
inline void muladd(const double* buf, int width, double beta, double* sum) {
    for (int remaining = width; remaining > 0; --remaining, ++buf, ++sum) {
        *sum += beta * *buf;
    }
}
#endif
// Converts `width` accumulated values of working type WT to the destination
// type T and writes them to D.
//
//   sum   - source row of accumulated values (at least `width` elements)
//   width - number of elements to convert
//   D     - destination row (at least `width` elements)
//
// With SIMD enabled, full groups of `step` elements are converted by the
// vector saturate_store(const float*, T*) overloads; the remaining elements
// (or all of them without SIMD) go through saturate_cast<T>.
template <typename T, typename WT>
inline void saturate_store(const WT* sum, int width, T* D) {
    int dx = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    // Elements handled per vector call: one full output vector of T, i.e. the
    // WT lane count scaled by the element size ratio.
    const int step = VTraits<typename VArea<WT>::vWT>::vlanes() * sizeof(WT) / sizeof(T);
    // `<=` (not `<`): a final complete group of `step` elements lies entirely
    // inside [0, width) and can be vectorized too instead of being left to
    // the scalar tail.
    for (; dx + step <= width; dx += step) {
        saturate_store(sum + dx, D + dx);
    }
#endif
    // Scalar tail (or the whole row when SIMD is disabled).
    for (; dx < width; ++dx) {
        D[dx] = saturate_cast<T>(sum[dx]);
    }
}
// Fast path for T == WT: source and destination share the same element type,
// so no conversion is needed and the row is copied verbatim.
template <typename WT>
inline void saturate_store(const WT* sum, int width, WT* D) {
    std::copy_n(sum, width, D);
}
// Starts a new accumulation row: sum[i] = beta * buf[i] for i in [0, width).
//
//   buf   - source row (at least `width` elements)
//   width - number of elements to process
//   beta  - weight applied to every element of buf
//   sum   - destination row, overwritten (at least `width` elements)
//
// With SIMD enabled, full vectors are processed first and the scalar loop
// handles whatever is left.
template <typename WT>
inline void mul(const WT* buf, int width, WT beta, WT* sum) {
    int dx = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int step = VTraits<typename VArea<WT>::vWT>::vlanes();
    // The broadcast weight does not change across iterations; build it once.
    const typename VArea<WT>::vWT v_beta = vx_setall(beta);
    // `<=` (not `<`): a final complete vector lies entirely inside
    // [0, width), so it is vectorized instead of falling to the scalar tail.
    for (; dx + step <= width; dx += step) {
        vx_store(sum + dx, v_mul(v_beta, vx_load(buf + dx)));
    }
#endif
    // Scalar tail (or the whole row when SIMD is disabled).
    for (; dx < width; ++dx) {
        sum[dx] = beta * buf[dx];
    }
}
// Accumulates a weighted row: sum[i] += beta * buf[i] for i in [0, width).
//
//   buf   - source row (at least `width` elements)
//   width - number of elements to process
//   beta  - weight applied to every element of buf
//   sum   - accumulator row, updated in place (at least `width` elements)
//
// With SIMD enabled, full vectors are processed first and the scalar loop
// handles whatever is left.
template <typename WT>
inline void muladd(const WT* buf, int width, WT beta, WT* sum) {
    int dx = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    const int step = VTraits<typename VArea<WT>::vWT>::vlanes();
    // The broadcast weight does not change across iterations; build it once.
    const typename VArea<WT>::vWT v_beta = vx_setall(beta);
    // `<=` (not `<`): a final complete vector lies entirely inside
    // [0, width), so it is vectorized instead of falling to the scalar tail.
    for (; dx + step <= width; dx += step) {
        vx_store(sum + dx, v_add(vx_load(sum + dx), v_mul(v_beta, vx_load(buf + dx))));
    }
#endif
    // Scalar tail (or the whole row when SIMD is disabled).
    for (; dx < width; ++dx) {
        sum[dx] += beta * buf[dx];
    }
}
} // namespace inter_area
template<typename T, typename WT> class ResizeArea_Invoker :
public ParallelLoopBody
{
@ -3120,27 +3225,17 @@ public:
if( dy != prev_dy )
{
T* D = dst->template ptr<T>(prev_dy);
for( dx = 0; dx < dsize.width; dx++ )
{
D[dx] = saturate_cast<T>(sum[dx]);
sum[dx] = beta*buf[dx];
}
inter_area::saturate_store(sum, dsize.width, dst->template ptr<T>(prev_dy));
inter_area::mul(buf, dsize.width, beta, sum);
prev_dy = dy;
}
else
{
for( dx = 0; dx < dsize.width; dx++ )
sum[dx] += beta*buf[dx];
inter_area::muladd(buf, dsize.width, beta, sum);
}
}
{
T* D = dst->template ptr<T>(prev_dy);
for( dx = 0; dx < dsize.width; dx++ )
D[dx] = saturate_cast<T>(sum[dx]);
}
inter_area::saturate_store(sum, dsize.width, dst->template ptr<T>(prev_dy));
}
private:

Loading…
Cancel
Save