From be38d4ea932bc3a0d06845ed1a2de84acc2a09de Mon Sep 17 00:00:00 2001
From: Anna Khakimova
Date: Wed, 30 Mar 2022 00:14:01 +0300
Subject: [PATCH] Merge pull request #21777 from anna-khakimova:ak/convertto_simd

GAPI Fluid: SIMD for ConvertTo.

* GAPI Fluid: SIMD for convertto.

* Applied comments
---
 .../perf/cpu/gapi_core_perf_tests_fluid.cpp |   2 +-
 .../gapi/src/backends/fluid/gfluidcore.cpp  | 131 ++-----
 .../fluid/gfluidcore_func.dispatch.cpp      |  59 ++-
 .../src/backends/fluid/gfluidcore_func.hpp  |  41 ++
 .../backends/fluid/gfluidcore_func.simd.hpp | 354 ++++++++++++++++++
 5 files changed, 485 insertions(+), 102 deletions(-)

diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
index e4b8c0b490..83de793a81 100644
--- a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
+++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
@@ -324,7 +324,7 @@ INSTANTIATE_TEST_CASE_P(ConvertToPerfTestFluid, ConvertToPerfTest,
                         Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1),
                         Values(CV_8U, CV_16U, CV_16S, CV_32F),
                         Values(szSmall128, szVGA, sz720p, sz1080p),
-                        Values(2.5, 1.0),
+                        Values(1.0, 2.5),
                         Values(0.0),
                         Values(cv::compile_args(CORE_FLUID))));
 
diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp
index c5cfc19d48..7a8f1f5ed8 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp
@@ -1555,102 +1555,43 @@ GAPI_FLUID_KERNEL(GFluidLUT, cv::gapi::core::GLUT, false)
 //
 //-------------------------
 
-#if CV_SIMD128
-template<typename DST, typename SRC>
-CV_ALWAYS_INLINE int run_convertto_simd(DST*, const SRC*, int)
-{
-    return 0;
-}
-CV_ALWAYS_INLINE int run_convertto_simd(uchar *out, const float *in, const int length)
-{
-    int l = 0;
-    for (; l <= length - 16; l += 16)
-    {
-        v_int32x4 i0, i1, i2, i3;
-        i0 = v_round( v_load( (float*)& in[l     ] ) );
-        i1 = v_round( v_load( (float*)& in[l +  4] ) );
-        i2 = v_round( v_load( (float*)& in[l +  8] ) );
-        i3 = v_round( v_load( (float*)& in[l + 12] ) );
-
-        v_uint16x8 us0, us1;
-        us0 = v_pack_u(i0, i1);
-        us1 = v_pack_u(i2, i3);
-
-        v_uint8x16 uc;
-        uc = v_pack(us0, us1);
-        v_store((uchar*)& out[l], uc);
-    }
-    return l;
-}
-CV_ALWAYS_INLINE int run_convertto_simd(ushort *out, const float *in, const int length)
+template<typename T>
+CV_ALWAYS_INLINE void convertto_impl(const T in[], T out[], const int length)
 {
-    int l = 0;
-    for (; l <= length - 8; l += 8)
-    {
-        v_int32x4 i0, i1;
-        i0 = v_round( v_load( (float*)& in[l     ] ) );
-        i1 = v_round( v_load( (float*)& in[l +  4] ) );
-
-        v_uint16x8 us;
-        us = v_pack_u(i0, i1);
-        v_store((ushort*)& out[l], us);
-    }
-    return l;
+    memcpy(out, in, length * sizeof(T));
 }
-#endif
 
-template <typename DST, typename SRC,
-          cv::util::enable_if_t<std::is_integral<DST>::value &&
-                                std::is_floating_point<SRC>::value, bool> = true >
-CV_ALWAYS_INLINE void run_convertto(DST *out, const SRC *in, const int length)
+template<typename SRC, typename DST>
+CV_ALWAYS_INLINE void convertto_impl(const SRC in[], DST out[], const int length)
 {
-    // manual SIMD if need rounding
-    static_assert(std::is_same<SRC, float>::value, "64-bit floating-point source is not supported");
-    int l = 0; // cycle index
-#if CV_SIMD128
-    l = run_convertto_simd(out, in, length);
+    int x = 0;
+#if CV_SIMD
+    x = convertto_simd(in, out, length);
 #endif
     // tail of SIMD cycle
-    for (; l < length; l++)
-    {
-        out[l] = saturate<DST>(in[l], rintf);
-    }
-}
-template <typename DST, typename SRC,
-          cv::util::enable_if_t<std::is_integral<DST>::value &&
-                                std::is_integral<SRC>::value , bool> = true >
-CV_ALWAYS_INLINE void run_convertto(DST *out, const SRC *in, const int length)
-{
-    for (int l = 0; l < length; l++)
-    {
-        out[l] = saturate<DST>(in[l]);
-    }
-}
-template <typename DST, typename SRC,
-          cv::util::enable_if_t<std::is_floating_point<DST>::value, bool> = true >
-CV_ALWAYS_INLINE void run_convertto(DST *out, const SRC *in, const int length)
-{
-    static_assert(!std::is_same<SRC, double>::value, "64-bit floating-point source is not supported");
-    for (int l = 0; l < length; l++)
+    for (; x < length; ++x)
     {
-        out[l] = static_cast<DST>(in[l]);
+        out[x] = saturate<DST>(in[x], rintf);
     }
 }
 
-template <typename DST, typename SRC>
-CV_ALWAYS_INLINE void run_convertto(DST *out, const SRC *in, const float alpha, const float beta,
-                                    const int length)
+template<typename SRC, typename DST>
+CV_ALWAYS_INLINE void convertto_impl(const SRC *in, DST* out, const float alpha, const float beta,
+                                     const int length)
 {
-    static_assert(!std::is_same<SRC, double>::value, "64-bit floating-point source is not supported");
-    // TODO: optimize if alpha and beta and data are integral
-    for (int l = 0; l < length; l++)
+    int x = 0;
+#if CV_SIMD
+    x = convertto_scaled_simd(in, out, alpha, beta, length);
+#endif
+
+    for (; x < length; ++x)
     {
-        out[l] = saturate<DST>(in[l] * alpha + beta, rintf);
+        out[x] = saturate<DST>(in[x] * alpha + beta, rintf);
     }
 }
 
 template<typename DST, typename SRC>
-static void run_convertto(Buffer &dst, const View &src, double _alpha, double _beta)
+CV_ALWAYS_INLINE void run_convertto(Buffer &dst, const View &src, double _alpha, double _beta)
 {
     const auto *in  = src.InLine<SRC>(0);
     auto *out = dst.OutLine<DST>();
@@ -1664,13 +1605,13 @@ static void run_convertto(Buffer &dst, const View &src, double _alpha, double _b
     const auto beta  = static_cast<float>( _beta  );
 
     // compute faster if no alpha no beta
-    if (1.f == alpha && 0.f == beta)
+    if ((std::fabs(alpha - 1.f) < FLT_EPSILON) && (std::fabs(beta) < FLT_EPSILON))
     {
-        run_convertto(out, in, length);
+        convertto_impl(in, out, length);
     }
     else // if alpha or beta is non-trivial
     {
-        run_convertto(out, in, alpha, beta, length);
+        convertto_impl(in, out, alpha, beta, length);
     }
 }
 
@@ -1681,22 +1622,22 @@ GAPI_FLUID_KERNEL(GFluidConvertTo, cv::gapi::core::GConvertTo, false)
     static void run(const View &src, int /*rtype*/, double alpha, double beta, Buffer &dst)
     {
         //      DST     SRC     OP             __VA_ARGS__
-        UNARY_(uchar , uchar , run_convertto, dst, src, alpha, beta);
-        UNARY_(uchar , ushort, run_convertto, dst, src, alpha, beta);
-        UNARY_(uchar , short, run_convertto, dst, src, alpha, beta);
-        UNARY_(uchar , float, run_convertto, dst, src, alpha, beta);
+        UNARY_(uchar, uchar , run_convertto, dst, src, alpha, beta);
+        UNARY_(uchar, ushort, run_convertto, dst, src, alpha, beta);
+        UNARY_(uchar, short, run_convertto, dst, src, alpha, beta);
+        UNARY_(uchar, float, run_convertto, dst, src, alpha, beta);
         UNARY_(ushort, uchar , run_convertto, dst, src, alpha, beta);
         UNARY_(ushort, ushort, run_convertto, dst, src, alpha, beta);
         UNARY_(ushort, short, run_convertto, dst, src, alpha, beta);
         UNARY_(ushort, float, run_convertto, dst, src, alpha, beta);
-        UNARY_( short, uchar , run_convertto, dst, src, alpha, beta);
-        UNARY_( short, ushort, run_convertto, dst, src, alpha, beta);
-        UNARY_( short, short, run_convertto, dst, src, alpha, beta);
-        UNARY_( short, float, run_convertto, dst, src, alpha, beta);
-        UNARY_( float, uchar , run_convertto, dst, src, alpha, beta);
-        UNARY_( float, ushort, run_convertto, dst, src, alpha, beta);
-        UNARY_( float, short, run_convertto, dst, src, alpha, beta);
-        UNARY_( float, float, run_convertto, dst, src, alpha, beta);
+        UNARY_(short, uchar , run_convertto, dst, src, alpha, beta);
+        UNARY_(short, ushort, run_convertto, dst, src, alpha, beta);
+        UNARY_(short, short, run_convertto, dst, src, alpha, beta);
+        UNARY_(short, float, run_convertto, dst, src, alpha, beta);
+        UNARY_(float, uchar , run_convertto, dst, src, alpha, beta);
+        UNARY_(float, ushort, run_convertto, dst, src, alpha, beta);
+        UNARY_(float, short, run_convertto, dst, src, alpha, beta);
+        UNARY_(float, float, run_convertto, dst, src, alpha, beta);
 
         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }
 
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
index c235991fba..c9d329b2ff 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
@@ -293,9 +293,8 @@ int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
 #define ADD_SIMD(SRC, DST)                                                    \
 int add_simd(const SRC in1[], const SRC in2[], DST out[], const int length)  \
 {                                                                             \
-                                                                              \
-    CV_CPU_DISPATCH(add_simd, (in1, in2, out, length),                        \
-                    CV_CPU_DISPATCH_MODES_ALL);                               \
+    CV_CPU_DISPATCH(add_simd, (in1, in2, out, length),                        \
+                    CV_CPU_DISPATCH_MODES_ALL);                               \
 }
 
 ADD_SIMD(uchar, uchar)
@@ -320,9 +319,8 @@ ADD_SIMD(float, float)
 #define SUB_SIMD(SRC, DST)                                                    \
 int sub_simd(const SRC in1[], const SRC in2[], DST out[], const int length)  \
 {                                                                             \
-                                                                              \
-    CV_CPU_DISPATCH(sub_simd, (in1, in2, out, length),                        \
-                    CV_CPU_DISPATCH_MODES_ALL);                               \
+    CV_CPU_DISPATCH(sub_simd, (in1, in2, out, length),                        \
+                    CV_CPU_DISPATCH_MODES_ALL);                               \
 }
 
 SUB_SIMD(uchar, uchar)
@@ -344,6 +342,55 @@ SUB_SIMD(float, float)
 
 #undef SUB_SIMD
 
+#define CONVERTTO_NOCOEF_SIMD(SRC, DST)                                       \
+int convertto_simd(const SRC in[], DST out[], const int length)              \
+{                                                                             \
+    CV_CPU_DISPATCH(convertto_simd, (in, out, length),                        \
+                    CV_CPU_DISPATCH_MODES_ALL);                               \
+}
+
+CONVERTTO_NOCOEF_SIMD(ushort, uchar)
+CONVERTTO_NOCOEF_SIMD(short, uchar)
+CONVERTTO_NOCOEF_SIMD(float, uchar)
+CONVERTTO_NOCOEF_SIMD(ushort, short)
+CONVERTTO_NOCOEF_SIMD(uchar, short)
+CONVERTTO_NOCOEF_SIMD(float, short)
+CONVERTTO_NOCOEF_SIMD(uchar, ushort)
+CONVERTTO_NOCOEF_SIMD(short, ushort)
+CONVERTTO_NOCOEF_SIMD(float, ushort)
+CONVERTTO_NOCOEF_SIMD(uchar, float)
+CONVERTTO_NOCOEF_SIMD(ushort, float)
+CONVERTTO_NOCOEF_SIMD(short, float)
+
+#undef CONVERTTO_NOCOEF_SIMD
+
+#define CONVERTTO_SCALED_SIMD(SRC, DST)                                       \
+int convertto_scaled_simd(const SRC in[], DST out[], const float alpha,      \
+                          const float beta, const int length)                \
+{                                                                             \
+    CV_CPU_DISPATCH(convertto_scaled_simd, (in, out, alpha, beta, length),    \
+                    CV_CPU_DISPATCH_MODES_ALL);                               \
+}
+
+CONVERTTO_SCALED_SIMD(uchar, uchar)
+CONVERTTO_SCALED_SIMD(ushort, uchar)
+CONVERTTO_SCALED_SIMD(short, uchar)
+CONVERTTO_SCALED_SIMD(float, uchar)
+CONVERTTO_SCALED_SIMD(short, short)
+CONVERTTO_SCALED_SIMD(ushort, short)
+CONVERTTO_SCALED_SIMD(uchar, short)
+CONVERTTO_SCALED_SIMD(float, short)
+CONVERTTO_SCALED_SIMD(ushort, ushort)
+CONVERTTO_SCALED_SIMD(uchar, ushort)
+CONVERTTO_SCALED_SIMD(short, ushort)
+CONVERTTO_SCALED_SIMD(float, ushort)
+CONVERTTO_SCALED_SIMD(uchar, float)
+CONVERTTO_SCALED_SIMD(ushort, float)
+CONVERTTO_SCALED_SIMD(short, float)
+CONVERTTO_SCALED_SIMD(float, float)
+
+#undef CONVERTTO_SCALED_SIMD
+
 } // namespace fluid
 } // namespace gapi
 } // namespace cv
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
index 3a5d70a045..81aa098b64 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
@@ -266,6 +266,47 @@ SUB_SIMD(float, float)
 
 #undef SUB_SIMD
 
+#define CONVERTTO_NOCOEF_SIMD(SRC, DST)                            \
+int convertto_simd(const SRC in[], DST out[], const int length);
+
+CONVERTTO_NOCOEF_SIMD(ushort, uchar)
+CONVERTTO_NOCOEF_SIMD(short, uchar)
+CONVERTTO_NOCOEF_SIMD(float, uchar)
+CONVERTTO_NOCOEF_SIMD(ushort, short)
+CONVERTTO_NOCOEF_SIMD(uchar, short)
+CONVERTTO_NOCOEF_SIMD(float, short)
+CONVERTTO_NOCOEF_SIMD(uchar, ushort)
+CONVERTTO_NOCOEF_SIMD(short, ushort)
+CONVERTTO_NOCOEF_SIMD(float, ushort)
+CONVERTTO_NOCOEF_SIMD(uchar, float)
+CONVERTTO_NOCOEF_SIMD(ushort, float)
+CONVERTTO_NOCOEF_SIMD(short, float)
+
+#undef CONVERTTO_NOCOEF_SIMD
+
+#define CONVERTTO_SCALED_SIMD(SRC, DST)                                    \
+int convertto_scaled_simd(const SRC in[], DST out[], const float alpha,   \
+                          const float beta, const int length);
+
+CONVERTTO_SCALED_SIMD(uchar, uchar)
+CONVERTTO_SCALED_SIMD(ushort, uchar)
+CONVERTTO_SCALED_SIMD(short, uchar)
+CONVERTTO_SCALED_SIMD(float, uchar)
+CONVERTTO_SCALED_SIMD(short, short)
+CONVERTTO_SCALED_SIMD(ushort, short)
+CONVERTTO_SCALED_SIMD(uchar, short)
+CONVERTTO_SCALED_SIMD(float, short)
+CONVERTTO_SCALED_SIMD(ushort, ushort)
+CONVERTTO_SCALED_SIMD(uchar, ushort)
+CONVERTTO_SCALED_SIMD(short, ushort)
+CONVERTTO_SCALED_SIMD(float, ushort)
+CONVERTTO_SCALED_SIMD(uchar, float)
+CONVERTTO_SCALED_SIMD(ushort, float)
+CONVERTTO_SCALED_SIMD(short, float)
+CONVERTTO_SCALED_SIMD(float, float)
+
+#undef CONVERTTO_SCALED_SIMD
+
 } // namespace fluid
 } // namespace gapi
 } // namespace cv
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
index c148f81e77..d1fe33fa2e 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
@@ -275,6 +275,47 @@ SUB_SIMD(float, float)
 
 #undef SUB_SIMD
 
+#define CONVERTTO_NOCOEF_SIMD(SRC, DST)                            \
+int convertto_simd(const SRC in[], DST out[], const int length);
+
+CONVERTTO_NOCOEF_SIMD(ushort, uchar)
+CONVERTTO_NOCOEF_SIMD(short, uchar)
+CONVERTTO_NOCOEF_SIMD(float, uchar)
+CONVERTTO_NOCOEF_SIMD(ushort, short)
+CONVERTTO_NOCOEF_SIMD(uchar, short)
+CONVERTTO_NOCOEF_SIMD(float, short)
+CONVERTTO_NOCOEF_SIMD(uchar, ushort)
+CONVERTTO_NOCOEF_SIMD(short, ushort)
+CONVERTTO_NOCOEF_SIMD(float, ushort)
+CONVERTTO_NOCOEF_SIMD(uchar, float)
+CONVERTTO_NOCOEF_SIMD(ushort, float)
+CONVERTTO_NOCOEF_SIMD(short, float)
+
+#undef CONVERTTO_NOCOEF_SIMD
+
+#define CONVERTTO_SCALED_SIMD(SRC, DST)                                    \
+int convertto_scaled_simd(const SRC in[], DST out[], const float alpha,   \
+                          const float beta, const int length);
+
+CONVERTTO_SCALED_SIMD(uchar, uchar)
+CONVERTTO_SCALED_SIMD(ushort, uchar)
+CONVERTTO_SCALED_SIMD(short, uchar)
+CONVERTTO_SCALED_SIMD(float, uchar)
+CONVERTTO_SCALED_SIMD(short, short)
+CONVERTTO_SCALED_SIMD(ushort, short)
+CONVERTTO_SCALED_SIMD(uchar, short)
+CONVERTTO_SCALED_SIMD(float, short)
+CONVERTTO_SCALED_SIMD(ushort, ushort)
+CONVERTTO_SCALED_SIMD(uchar, ushort)
+CONVERTTO_SCALED_SIMD(short, ushort)
+CONVERTTO_SCALED_SIMD(float, ushort)
+CONVERTTO_SCALED_SIMD(uchar, float)
+CONVERTTO_SCALED_SIMD(ushort, float)
+CONVERTTO_SCALED_SIMD(short, float)
+CONVERTTO_SCALED_SIMD(float, float)
+
+#undef CONVERTTO_SCALED_SIMD
+
 int split3_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[],
                 const int width);
 
@@ -289,6 +330,11 @@ int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
 
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
+#define SRC_SHORT_OR_USHORT std::is_same<SRC, short>::value || std::is_same<SRC, ushort>::value
+#define DST_SHORT_OR_USHORT std::is_same<DST, short>::value || std::is_same<DST, ushort>::value
+#define SRC_DST_SHORT_AND_USHORT (std::is_same<SRC, short>::value && std::is_same<DST, ushort>::value) || (std::is_same<SRC, ushort>::value && std::is_same<DST, short>::value)
+#define SRC_DST_SHORT_OR_USHORT (std::is_same<SRC, short>::value && std::is_same<DST, short>::value) || (std::is_same<SRC, ushort>::value && std::is_same<DST, ushort>::value)
+
 struct scale_tag {};
 struct not_scale_tag {};
 
@@ -2778,6 +2824,314 @@ SUB_SIMD(float, float)
 
 #undef SUB_SIMD
 
+//-------------------------
+//
+// Fluid kernels: ConvertTo
+//
+//-------------------------
+
+CV_ALWAYS_INLINE void store_i16(ushort* outx, const v_uint16& res)
+{
+    vx_store(outx, res);
+}
+
+CV_ALWAYS_INLINE void store_i16(short* outx, const v_uint16& res)
+{
+    vx_store(outx, v_reinterpret_as_s16(res));
+}
+
+CV_ALWAYS_INLINE void store_i16(ushort* outx, const v_int16& res)
+{
+    vx_store(outx, v_reinterpret_as_u16(res));
+}
+
+CV_ALWAYS_INLINE void store_i16(short* outx, const v_int16& res)
+{
+    vx_store(outx, res);
+}
+
+CV_ALWAYS_INLINE void convertto_simd_nocoeff_impl(const float* inx, uchar* outx)
+{
+    constexpr int nlanes = v_uint8::nlanes;
+
+    v_int32 a1 = v_round(vx_load(inx));
+    v_int32 a2 = v_round(vx_load(&inx[nlanes/4]));
+    v_int32 a3 = v_round(vx_load(&inx[nlanes/2]));
+    v_int32 a4 = v_round(vx_load(&inx[3*nlanes/4]));
+
+    v_int16 r1 = v_pack(a1, a2);
+    v_int16 r2 = v_pack(a3, a4);
+
+    vx_store(outx, v_pack_u(r1, r2));
+}
+
+template<typename SRC>
+CV_ALWAYS_INLINE
+typename std::enable_if<SRC_SHORT_OR_USHORT, void>::type
+convertto_simd_nocoeff_impl(const SRC* inx, uchar* outx)
+{
+    constexpr int nlanes = v_uint8::nlanes;
+
+    vector_type_of_t<SRC> a1 = vx_load(inx);
+    vector_type_of_t<SRC> a2 = vx_load(&inx[nlanes/2]);
+
+    pack_store_uchar(outx, a1, a2);
+}
+
+//---------------------------------------------------------------------------------------
+
+template<typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<DST_SHORT_OR_USHORT, void>::type
+convertto_simd_nocoeff_impl(const float* inx, DST* outx)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+
+    v_int32 a1 = v_round(vx_load(inx));
+    v_int32 a2 = v_round(vx_load(&inx[nlanes/2]));
+
+    v_store_i16(outx, a1, a2);
+}
+
+template<typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<DST_SHORT_OR_USHORT, void>::type
+convertto_simd_nocoeff_impl(const uchar* inx, DST* outx)
+{
+    v_uint8 a = vx_load(inx);
+    v_uint16 res = v_expand_low(a);
+
+    store_i16(outx, res);
+}
+
+template<typename SRC, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<SRC_DST_SHORT_AND_USHORT, void>::type
+convertto_simd_nocoeff_impl(const SRC* inx, DST* outx)
+{
+    vector_type_of_t<SRC> a = vx_load(inx);
+    store_i16(outx, a);
+}
+
+//---------------------------------------------------------------------------------------
+
+template<typename SRC>
+CV_ALWAYS_INLINE void convertto_simd_nocoeff_impl(const SRC* inx, float* outx)
+{
+    v_float32 a = vg_load_f32(inx);
+    vx_store(outx, a);
+}
+
+#define CONVERTTO_NOCOEF_SIMD(SRC, DST)                              \
+int convertto_simd(const SRC in[], DST out[], const int length)     \
+{                                                                    \
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;            \
+                                                                     \
+    int x = 0;                                                       \
+    for (;;)                                                         \
+    {                                                                \
+        for (; x <= length - nlanes; x += nlanes)                    \
+        {                                                            \
+            convertto_simd_nocoeff_impl(&in[x], &out[x]);            \
+        }                                                            \
+        if (x < length)                                              \
+        {                                                            \
+            x = length - nlanes;                                     \
+            continue;                                                \
+        }                                                            \
+        break;                                                       \
+    }                                                                \
+    return x;                                                        \
+}
+
+CONVERTTO_NOCOEF_SIMD(ushort, uchar)
+CONVERTTO_NOCOEF_SIMD(short, uchar)
+CONVERTTO_NOCOEF_SIMD(float, uchar)
+CONVERTTO_NOCOEF_SIMD(ushort, short)
+CONVERTTO_NOCOEF_SIMD(uchar, short)
+CONVERTTO_NOCOEF_SIMD(float, short)
+CONVERTTO_NOCOEF_SIMD(uchar, ushort)
+CONVERTTO_NOCOEF_SIMD(short, ushort)
+CONVERTTO_NOCOEF_SIMD(float, ushort)
+CONVERTTO_NOCOEF_SIMD(uchar, float)
+CONVERTTO_NOCOEF_SIMD(ushort, float)
+CONVERTTO_NOCOEF_SIMD(short, float)
+
+#undef CONVERTTO_NOCOEF_SIMD
+
+CV_ALWAYS_INLINE void convertto_scaled_simd_impl(const float* inx, uchar* outx,
+                                                 const v_float32& v_alpha,
+                                                 const v_float32& v_beta)
+{
+    constexpr int nlanes = v_uint8::nlanes;
+
+    v_float32 a1 = vx_load(inx);
+    v_float32 a2 = vx_load(&inx[nlanes / 4]);
+    v_float32 a3 = vx_load(&inx[nlanes / 2]);
+    v_float32 a4 = vx_load(&inx[3 * nlanes / 4]);
+
+    v_int32 r1 = v_round(v_fma(a1, v_alpha, v_beta));
+    v_int32 r2 = v_round(v_fma(a2, v_alpha, v_beta));
+    v_int32 r3 = v_round(v_fma(a3, v_alpha, v_beta));
+    v_int32 r4 = v_round(v_fma(a4, v_alpha, v_beta));
+
+    vx_store(outx, v_pack_u(v_pack(r1, r2), v_pack(r3, r4)));
+}
+
+template<typename SRC>
+CV_ALWAYS_INLINE
+typename std::enable_if<SRC_SHORT_OR_USHORT, void>::type
+convertto_scaled_simd_impl(const SRC* inx, uchar* outx, const v_float32& v_alpha,
+                           const v_float32& v_beta)
+{
+    constexpr int nlanes = v_uint8::nlanes;
+
+    v_int16 a = v_reinterpret_as_s16(vx_load(inx));
+    v_int16 b = v_reinterpret_as_s16(vx_load(&inx[nlanes / 2]));
+
+    v_float32 a1 = v_cvt_f32(v_expand_low(a));
+    v_float32 a2 = v_cvt_f32(v_expand_high(a));
+    v_float32 b1 = v_cvt_f32(v_expand_low(b));
+    v_float32 b2 = v_cvt_f32(v_expand_high(b));
+
+    v_int32 r1 = v_round(v_fma(a1, v_alpha, v_beta));
+    v_int32 r2 = v_round(v_fma(a2, v_alpha, v_beta));
+    v_int32 r3 = v_round(v_fma(b1, v_alpha, v_beta));
+    v_int32 r4 = v_round(v_fma(b2, v_alpha, v_beta));
+
+    vx_store(outx, v_pack_u(v_pack(r1, r2), v_pack(r3, r4)));
+}
+
+CV_ALWAYS_INLINE void convertto_scaled_simd_impl(const uchar* inx, uchar* outx,
+                                                 const v_float32& v_alpha,
+                                                 const v_float32& v_beta)
+{
+    v_uint8 a = vx_load(inx);
+    v_int16 a1 = v_reinterpret_as_s16(v_expand_low(a));
+    v_int16 a2 = v_reinterpret_as_s16(v_expand_high(a));
+
+    v_float32 f1 = v_cvt_f32(v_expand_low(a1));
+    v_float32 f2 = v_cvt_f32(v_expand_high(a1));
+
+    v_float32 f3 = v_cvt_f32(v_expand_low(a2));
+    v_float32 f4 = v_cvt_f32(v_expand_high(a2));
+
+    v_int32 r1 = v_round(v_fma(f1, v_alpha, v_beta));
+    v_int32 r2 = v_round(v_fma(f2, v_alpha, v_beta));
+    v_int32 r3 = v_round(v_fma(f3, v_alpha, v_beta));
+    v_int32 r4 = v_round(v_fma(f4, v_alpha, v_beta));
+
+    vx_store(outx, v_pack_u(v_pack(r1, r2), v_pack(r3, r4)));
+}
+
+template<typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<DST_SHORT_OR_USHORT, void>::type
+convertto_scaled_simd_impl(const float* inx, DST* outx,
+                           const v_float32& v_alpha,
+                           const v_float32& v_beta)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+
+    v_float32 a1 = vx_load(inx);
+    v_float32 a2 = vx_load(&inx[nlanes / 2]);
+
+    v_int32 r1 = v_round(v_fma(a1, v_alpha, v_beta));
+    v_int32 r2 = v_round(v_fma(a2, v_alpha, v_beta));
+
+    v_store_i16(outx, r1, r2);
+}
+
+template<typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<DST_SHORT_OR_USHORT, void>::type
+convertto_scaled_simd_impl(const uchar* inx, DST* outx,
+                           const v_float32& v_alpha,
+                           const v_float32& v_beta)
+{
+    v_int16 a = v_reinterpret_as_s16(vx_load_expand(inx));
+
+    v_float32 a1 = v_cvt_f32(v_expand_low(a));
+    v_float32 a2 = v_cvt_f32(v_expand_high(a));
+
+    v_int32 r1 = v_round(v_fma(a1, v_alpha, v_beta));
+    v_int32 r2 = v_round(v_fma(a2, v_alpha, v_beta));
+
+    v_store_i16(outx, r1, r2);
+}
+
+template<typename SRC, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<SRC_DST_SHORT_AND_USHORT || SRC_DST_SHORT_OR_USHORT, void>::type
+convertto_scaled_simd_impl(const SRC* inx, DST* outx,
+                           const v_float32& v_alpha,
+                           const v_float32& v_beta)
+{
+    v_int16 a = v_reinterpret_as_s16(vx_load(inx));
+
+    v_float32 a1 = v_cvt_f32(v_expand_low(a));
+    v_float32 a2 = v_cvt_f32(v_expand_high(a));
+
+    v_int32 r1 = v_round(v_fma(a1, v_alpha, v_beta));
+    v_int32 r2 = v_round(v_fma(a2, v_alpha, v_beta));
+
+    v_store_i16(outx, r1, r2);
+}
+
+template<typename SRC>
+CV_ALWAYS_INLINE void convertto_scaled_simd_impl(const SRC* inx, float* outx,
+                                                 const v_float32& v_alpha,
+                                                 const v_float32& v_beta)
+{
+    v_float32 a = vg_load_f32(inx);
+    vx_store(outx, v_fma(a, v_alpha, v_beta));
+}
+
+#define CONVERTTO_SCALED_SIMD(SRC, DST)                                       \
+int convertto_scaled_simd(const SRC in[], DST out[], const float alpha,      \
+                          const float beta, const int length)                \
+{                                                                             \
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;                     \
+    v_float32 v_alpha = vx_setall_f32(alpha);                                 \
+    v_float32 v_beta = vx_setall_f32(beta);                                   \
+                                                                              \
+    int x = 0;                                                                \
+    for (;;)                                                                  \
+    {                                                                         \
+        for (; x <= length - nlanes; x += nlanes)                             \
+        {                                                                     \
+            convertto_scaled_simd_impl(&in[x], &out[x], v_alpha, v_beta);     \
+        }                                                                     \
+        if (x < length)                                                       \
+        {                                                                     \
+            x = length - nlanes;                                              \
+            continue;                                                         \
+        }                                                                     \
+        break;                                                                \
+    }                                                                         \
+    return x;                                                                 \
+}
+
+CONVERTTO_SCALED_SIMD(uchar, uchar)
+CONVERTTO_SCALED_SIMD(ushort, uchar)
+CONVERTTO_SCALED_SIMD(short, uchar)
+CONVERTTO_SCALED_SIMD(float, uchar)
+CONVERTTO_SCALED_SIMD(short, short)
+CONVERTTO_SCALED_SIMD(ushort, short)
+CONVERTTO_SCALED_SIMD(uchar, short)
+CONVERTTO_SCALED_SIMD(float, short)
+CONVERTTO_SCALED_SIMD(ushort, ushort)
+CONVERTTO_SCALED_SIMD(uchar, ushort)
+CONVERTTO_SCALED_SIMD(short, ushort)
+CONVERTTO_SCALED_SIMD(float, ushort)
+CONVERTTO_SCALED_SIMD(uchar, float)
+CONVERTTO_SCALED_SIMD(ushort, float)
+CONVERTTO_SCALED_SIMD(short, float)
+CONVERTTO_SCALED_SIMD(float, float)
+
+#undef CONVERTTO_SCALED_SIMD
+
 #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
 CV_CPU_OPTIMIZATION_NAMESPACE_END
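
Note on the loop structure shared by CONVERTTO_NOCOEF_SIMD and CONVERTTO_SCALED_SIMD above: both process full vectors of vector_type_of_t<DST>::nlanes elements and, if a remainder is left, step back to length - nlanes and redo one overlapping vector, so no scalar tail remains whenever a whole vector fits into the row. The standalone sketch below illustrates that control flow in plain C++ with scalar stand-ins; process_block, the 8-lane width, and the explicit short-row guard are illustrative assumptions, not part of the patch (short rows in the patch are instead finished by the scalar tail loop in convertto_impl).

#include <cmath>
#include <cstdio>
#include <vector>

// Illustrative stand-in for one SIMD iteration (the role played in the patch
// by convertto_simd_nocoeff_impl() / convertto_scaled_simd_impl()).
constexpr int Lanes = 8; // hypothetical vector width

static void process_block(const float* in, short* out)
{
    for (int i = 0; i < Lanes; ++i)
        out[i] = static_cast<short>(std::lrintf(in[i])); // stand-in for v_round + pack
}

// Same control flow as the CONVERTTO_* macros: run full blocks; if a remainder
// is left, back up to length - Lanes and redo one overlapping block, so the
// whole row is covered without a scalar loop. The early return for short rows
// is added here for safety in this sketch.
static int convert_row(const float* in, short* out, int length)
{
    if (length < Lanes)
        return 0; // caller's scalar tail handles rows narrower than one vector

    int x = 0;
    for (;;)
    {
        for (; x <= length - Lanes; x += Lanes)
            process_block(&in[x], &out[x]);
        if (x < length)
        {
            x = length - Lanes; // overlap with already-written elements
            continue;
        }
        break;
    }
    return x; // == length here, so nothing is left for a scalar tail
}

int main()
{
    std::vector<float> in(21, 3.7f);
    std::vector<short> out(21, 0);
    int processed = convert_row(in.data(), out.data(), static_cast<int>(in.size()));
    std::printf("processed %d of %zu elements, out[20] = %d\n",
                processed, in.size(), static_cast<int>(out[20]));
    return 0;
}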