diff --git a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
index 72837da199..c644fd1587 100644
--- a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
+++ b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
@@ -528,6 +528,10 @@ PERF_TEST_P_(DivRCPerfTest, TestPerformance)
     // FIXIT Unstable input data for divide
     initMatsRandU(type, sz, dtype, false);
 
+    // This condition is needed as a workaround for a bug in OpenCV:
+    // it reinitializes the divider matrix without zero values for the CV_16S DST type.
+    if (dtype == CV_16S || (type == CV_16S && dtype == -1))
+        cv::randu(in_mat1, cv::Scalar::all(1), cv::Scalar::all(255));
 
     // OpenCV code ///////////////////////////////////////////////////////////
     cv::divide(sc, in_mat1, out_mat_ocv, scale, dtype);
diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
index 796d05101e..a142109315 100644
--- a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
+++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
@@ -101,8 +101,8 @@ INSTANTIATE_TEST_CASE_P(DivCPerfTestFluid, DivCPerfTest,
 INSTANTIATE_TEST_CASE_P(DivRCPerfTestFluid, DivRCPerfTest,
     Combine(Values(Tolerance_FloatRel_IntAbs(1e-5, 1).to_compare_f()),
         Values(szSmall128, szVGA, sz720p, sz1080p),
-        Values(CV_8UC1, CV_8UC3, CV_32FC1),
-        Values(-1, CV_8U, CV_32F),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(-1, CV_8U, CV_16U, CV_16S, CV_32F),
         Values(1.0),
         Values(cv::compile_args(CORE_FLUID))));
diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp
index 22f73e553c..bb33c45d85 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp
@@ -936,8 +936,8 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
 }
 
 template<typename DST, typename SRC>
-static void run_arithm_rs(Buffer &dst, const View &src, const float scalar[4], Arithm arithm,
-                          float scale=1)
+CV_ALWAYS_INLINE void run_arithm_rs(Buffer &dst, const View &src, const float scalar[],
+                                    Arithm arithm, float scale=1)
 {
     const auto *in  = src.InLine<SRC>(0);
           auto *out = dst.OutLine<DST>();
@@ -955,15 +955,23 @@ static void run_arithm_rs(Buffer &dst, const View &src, const float scalar[4], A
             w = subrc_simd(scalar, in, out, length, chan);
 #endif
             for (; w < length; ++w)
+            {
                 out[w] = subr(in[w], scalar[w % chan]);
+            }
             break;
         }
-        // TODO: optimize division
         case ARITHM_DIVIDE:
-            for (int w=0; w < width; w++)
-                for (int c=0; c < chan; c++)
-                    out[chan*w + c] = div(scalar[c], in[chan*w + c], scale);
+        {
+            int w = 0;
+#if CV_SIMD
+            w = divrc_simd(scalar, in, out, length, chan, scale);
+#endif
+            for (; w < length; ++w)
+            {
+                out[w] = div(scalar[w % chan], in[w], scale);
+            }
             break;
+        }
         default: CV_Error(cv::Error::StsBadArg, "unsupported arithmetic operation");
     }
 }
@@ -1319,7 +1327,9 @@ CV_ALWAYS_INLINE void run_divc(Buffer& dst, const View& src, Buffer& scratch,
 #endif
     for (; w < length; ++w)
+    {
         out[w] = div(in[w], scalar[w % chan], scale);
+    }
 }
 
 GAPI_FLUID_KERNEL(GFluidDivC, cv::gapi::core::GDivC, true)
 {
@@ -1402,32 +1412,55 @@ GAPI_FLUID_KERNEL(GFluidDivC, cv::gapi::core::GDivC, true)
     }
 };
 
-GAPI_FLUID_KERNEL(GFluidDivRC, cv::gapi::core::GDivRC, false)
+GAPI_FLUID_KERNEL(GFluidDivRC, cv::gapi::core::GDivRC, true)
 {
     static const int Window = 1;
 
-    static void run(const cv::Scalar &_scalar, const View &src, double _scale, int /*dtype*/,
-                    Buffer &dst)
+    static void run(const cv::Scalar& _scalar, const View& src, double _scale, int /*dtype*/,
+                    Buffer& dst, Buffer& scratch)
     {
-        const float scalar[4] = {
-            static_cast<float>(_scalar[0]),
-            static_cast<float>(_scalar[1]),
-            static_cast<float>(_scalar[2]),
-            static_cast<float>(_scalar[3])
-        };
+        GAPI_Assert(src.meta().chan <= 4);
+
+        if (dst.y() == 0)
+        {
+            const int chan = src.meta().chan;
+            float* _scratch = scratch.OutLine<float>();
+
+            scalar_to_scratch(_scalar, _scratch, scratch.length(), chan);
+        }
+
+        const float* scalar = scratch.OutLine<float>();
         const float scale = static_cast<float>(_scale);
 
         //     DST     SRC     OP             __VA_ARGS__
-        UNARY_(uchar , uchar , run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
-        UNARY_(uchar ,  short, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
-        UNARY_(uchar ,  float, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
-        UNARY_( short,  short, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
-        UNARY_( float, uchar , run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
-        UNARY_( float,  short, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
-        UNARY_( float,  float, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(uchar, uchar, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(uchar, ushort, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(uchar, short, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(uchar, float, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(ushort, ushort, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(ushort, uchar, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(ushort, short, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(ushort, float, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(short, short, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(short, uchar, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(short, ushort, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(short, float, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(float, uchar, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(float, ushort, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(float, short, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(float, float, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
 
         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }
+
+    static void initScratch(const GScalarDesc&, const GMatDesc&, double, int, Buffer& scratch)
+    {
+        initScratchBuffer(scratch);
+    }
+
+    static void resetScratch(Buffer& /*scratch*/)
+    {
+    }
 };
 
 //-------------------
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
index 9afac9ceb4..a682fb7dbb 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
@@ -235,6 +235,33 @@ ABSDIFFC_SIMD(float)
 
 #undef ABSDIFFC_SIMD
 
+#define DIVRC_SIMD(SRC, DST)                                             \
+int divrc_simd(const float scalar[], const SRC in[], DST out[],          \
+               const int length, const int chan, const float scale)      \
+{                                                                        \
+    CV_CPU_DISPATCH(divrc_simd, (scalar, in, out, length, chan, scale),  \
+                    CV_CPU_DISPATCH_MODES_ALL);                          \
+}
+
+DIVRC_SIMD(uchar, uchar)
+DIVRC_SIMD(ushort, uchar)
+DIVRC_SIMD(short, uchar)
+DIVRC_SIMD(float, uchar)
+DIVRC_SIMD(short, short)
+DIVRC_SIMD(ushort, short)
+DIVRC_SIMD(uchar, short)
+DIVRC_SIMD(float, short)
+DIVRC_SIMD(ushort, ushort)
+DIVRC_SIMD(uchar, ushort)
+DIVRC_SIMD(short, ushort)
+DIVRC_SIMD(float, ushort)
+DIVRC_SIMD(uchar, float)
+DIVRC_SIMD(ushort, float)
+DIVRC_SIMD(short, float)
+DIVRC_SIMD(float, float)
+
+#undef DIVRC_SIMD
+
 int split3_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[],
                 const int width)
 {
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
index 868923932d..975383a8d9 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
@@ -187,6 +187,29 @@ ABSDIFFC_SIMD(float)
 
 #undef ABSDIFFC_SIMD
 
+#define DIVRC_SIMD(SRC, DST)                                         \
+int divrc_simd(const float scalar[], const SRC in[], DST out[],      \
+               const int length, const int chan, const float scale);
+
+DIVRC_SIMD(uchar, uchar)
+DIVRC_SIMD(ushort, uchar)
+DIVRC_SIMD(short, uchar)
+DIVRC_SIMD(float, uchar)
+DIVRC_SIMD(short, short)
+DIVRC_SIMD(ushort, short)
+DIVRC_SIMD(uchar, short)
+DIVRC_SIMD(float, short)
+DIVRC_SIMD(ushort, ushort)
+DIVRC_SIMD(uchar, ushort)
+DIVRC_SIMD(short, ushort)
+DIVRC_SIMD(float, ushort)
+DIVRC_SIMD(uchar, float)
+DIVRC_SIMD(ushort, float)
+DIVRC_SIMD(short, float)
+DIVRC_SIMD(float, float)
+
+#undef DIVRC_SIMD
+
 int split3_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[],
                 const int width);
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
index 2424a57677..b92d92d0cf 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
@@ -150,8 +150,8 @@ SUBRC_SIMD(float, float)
 
 #undef SUBRC_SIMD
 
-#define MULC_SIMD(SRC, DST)                                            \
-int mulc_simd(const SRC in[], const float scalar[], DST out[],         \
+#define MULC_SIMD(SRC, DST)                                           \
+int mulc_simd(const SRC in[], const float scalar[], DST out[],        \
               const int length, const int chan, const float scale);
 
 MULC_SIMD(uchar, uchar)
@@ -173,9 +173,9 @@ MULC_SIMD(float, float)
 
 #undef MULC_SIMD
 
-#define DIVC_SIMD(SRC, DST)                                            \
-int divc_simd(const SRC in[], const float scalar[], DST out[],         \
-              const int length, const int chan, const float scale,     \
+#define DIVC_SIMD(SRC, DST)                                           \
+int divc_simd(const SRC in[], const float scalar[], DST out[],        \
+              const int length, const int chan, const float scale,    \
               const int set_mask_flag);
 
 DIVC_SIMD(uchar, uchar)
@@ -208,6 +208,29 @@ ABSDIFFC_SIMD(float)
 
 #undef ABSDIFFC_SIMD
 
+#define DIVRC_SIMD(SRC, DST)                                         \
+int divrc_simd(const float scalar[], const SRC in[], DST out[],      \
+               const int length, const int chan, const float scale);
+
+DIVRC_SIMD(uchar, uchar)
+DIVRC_SIMD(ushort, uchar)
+DIVRC_SIMD(short, uchar)
+DIVRC_SIMD(float, uchar)
+DIVRC_SIMD(short, short)
+DIVRC_SIMD(ushort, short)
+DIVRC_SIMD(uchar, short)
+DIVRC_SIMD(float, short)
+DIVRC_SIMD(ushort, ushort)
+DIVRC_SIMD(uchar, ushort)
+DIVRC_SIMD(short, ushort)
+DIVRC_SIMD(float, ushort)
+DIVRC_SIMD(uchar, float)
+DIVRC_SIMD(ushort, float)
+DIVRC_SIMD(short, float)
+DIVRC_SIMD(float, float)
+
+#undef DIVRC_SIMD
+
 int split3_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[],
                 const int width);
@@ -236,6 +259,28 @@ template<> struct vector_type_of<ushort> { using type = v_uint16; };
 template<> struct vector_type_of<short>  { using type = v_int16;  };
 template<> struct vector_type_of<float>  { using type = v_float32; };
 
+template<typename T>
+struct zero_vec_type_of;
+
+template<typename T>
+using zero_vec_type_of_t = typename zero_vec_type_of<T>::type;
+
+template<> struct zero_vec_type_of<uchar>  { using type = v_int16;   };
+template<> struct zero_vec_type_of<ushort> { using type = v_int16;   };
+template<> struct zero_vec_type_of<short>  { using type = v_int16;   };
+template<> struct zero_vec_type_of<float>  { using type = v_float32; };
+
+template<typename T>
+struct univ_zero_vec_type_of;
+
+template<typename T>
+using univ_zero_vec_type_of_t = typename univ_zero_vec_type_of<T>::type;
+
+template<> struct univ_zero_vec_type_of<uchar>  { using type = v_uint8;   };
+template<> struct univ_zero_vec_type_of<ushort> { using type = v_int16;   };
+template<> struct univ_zero_vec_type_of<short>  { using type = v_int16;   };
+template<> struct univ_zero_vec_type_of<float>  { using type = v_float32; };
+
 CV_ALWAYS_INLINE v_float32 vg_load_f32(const float* in)
 {
     return vx_load(in);
@@ -295,165 +340,111 @@ CV_ALWAYS_INLINE void v_store_select(short* dst, const v_int16& div, const v_int
 CV_ALWAYS_INLINE void v_store_select(ushort* dst, const v_int16& div, const v_int16& v_zero,
                                      const v_int32& res1, const v_int32& res2)
 {
-    v_uint16 sel = v_reinterpret_as_u16(v_select(div == v_zero, v_zero, v_pack(res1, res2)));
-    vx_store(dst, sel);
+    vx_store(dst, v_select(v_reinterpret_as_u16(div == v_zero),
+                           v_reinterpret_as_u16(v_zero), v_pack_u(res1, res2)));
 }
 
-//=================================================================================================
+//=============================================================================
 
-template<typename scale_tag_t, typename SRC, typename DST>
+template<typename scale_tag_t>
 CV_ALWAYS_INLINE
-typename std::enable_if<(std::is_same<SRC, short>::value  && std::is_same<DST, ushort>::value) ||
-                        (std::is_same<SRC, ushort>::value && std::is_same<DST, short>::value)  ||
-                        (std::is_same<SRC, ushort>::value && std::is_same<DST, ushort>::value), int>::type
-div_hal(scale_tag_t t, const SRC in1[], const SRC in2[], DST out[], const int length, double _scale)
+void div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
+                   const v_float32& a3, const v_float32& a4, const uchar* in2x,
+                   uchar* outx, const v_float32& v_scale, const v_int16& v_zero)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
-
-    if (length < nlanes)
-        return 0;
-
-    v_int16 v_zero = vx_setall_s16(0);
-    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
-
-    int x = 0;
-    for (;;)
-    {
-        for (; x <= length - nlanes; x += nlanes)
-        {
-            v_float32 a1 = vg_load_f32(&in1[x]);
-            v_float32 a2 = vg_load_f32(&in1[x + nlanes / 2]);
+    constexpr int nlanes = v_uint8::nlanes;
 
-            v_int16 div = v_reinterpret_as_s16(vx_load(&in2[x]));
+    v_int16 div1 = v_reinterpret_as_s16(vx_load_expand(in2x));
+    v_int16 div2 = v_reinterpret_as_s16(vx_load_expand(&in2x[nlanes/2]));
 
-            v_float32 fdiv1 = v_cvt_f32(v_expand_low(div));
-            v_float32 fdiv2 = v_cvt_f32(v_expand_high(div));
+    v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1));
+    v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1));
+    v_float32 fdiv3 = v_cvt_f32(v_expand_low(div2));
+    v_float32 fdiv4 = v_cvt_f32(v_expand_high(div2));
 
-            v_int32 r1 = v_round(div_op(t, a1, fdiv1, scale));
-            v_int32 r2 = v_round(div_op(t, a2, fdiv2, scale));
+    v_int32 sum1 = v_round(div_op(s_tag, a1, fdiv1, v_scale)),
+            sum2 = v_round(div_op(s_tag, a2, fdiv2, v_scale)),
+            sum3 = v_round(div_op(s_tag, a3, fdiv3, v_scale)),
+            sum4 = v_round(div_op(s_tag, a4, fdiv4, v_scale));
 
-            v_store_select(&out[x], div, v_zero, r1, r2);
-        }
+    v_int16 res1 = v_select((div1 == v_zero), v_zero, v_pack(sum1, sum2));
+    v_int16 res2 = v_select((div2 == v_zero), v_zero, v_pack(sum3, sum4));
 
-        if (x < length)
-        {
-            x = length - nlanes;
-            continue; // process one more time (unaligned tail)
-        }
-        break;
-    }
-    return x;
+    vx_store(outx, v_pack_u(res1, res2));
 }
 
-//-------------------------------------------------------------------------------------------------
-
 template<typename scale_tag_t, typename SRC>
 CV_ALWAYS_INLINE
 typename std::enable_if<std::is_same<SRC, short>::value ||
-                        std::is_same<SRC, ushort>::value, int>::type
-div_hal(scale_tag_t t, const SRC in1[], const SRC in2[], uchar out[], const int length, double _scale)
+                        std::is_same<SRC, ushort>::value, void>::type
+div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
+              const v_float32& a3, const v_float32& a4, const SRC* in2x,
+              uchar* outx, const v_float32& v_scale, const v_int16& v_zero)
 {
     constexpr int nlanes = v_uint8::nlanes;
 
-    if (length < nlanes)
-        return 0;
+    v_int16 div1 = v_reinterpret_as_s16(vx_load(in2x));
+    v_int16 div2 = v_reinterpret_as_s16(vx_load(&in2x[nlanes/2]));
 
-    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
-    v_int16 v_zero = vx_setall_s16(0);
+    v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1));
+    v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1));
+    v_float32 fdiv3 = v_cvt_f32(v_expand_low(div2));
+    v_float32 fdiv4 = v_cvt_f32(v_expand_high(div2));
 
-    int x = 0;
-    for (;;)
-    {
-        for (; x <= length - nlanes; x += nlanes)
-        {
-            v_float32 a1 = vg_load_f32(&in1[x]);
-            v_float32 a2 = vg_load_f32(&in1[x + nlanes / 4]);
-            v_float32 a3 = vg_load_f32(&in1[x + nlanes / 2]);
-            v_float32 a4 = vg_load_f32(&in1[x + 3 * nlanes / 4]);
-
-            v_int16 div1 = v_reinterpret_as_s16(vx_load(&in2[x]));
-            v_int16 div2 = v_reinterpret_as_s16(vx_load(&in2[x + nlanes/2]));
-
-            v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1));
-            v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1));
-            v_float32 fdiv3 = v_cvt_f32(v_expand_low(div2));
-            v_float32 fdiv4 = v_cvt_f32(v_expand_high(div2));
+    v_int32 sum1 = v_round(div_op(s_tag, a1, fdiv1, v_scale)),
+            sum2 = v_round(div_op(s_tag, a2, fdiv2, v_scale)),
+            sum3 = v_round(div_op(s_tag, a3, fdiv3, v_scale)),
+            sum4 = v_round(div_op(s_tag, a4, fdiv4, v_scale));
 
-            v_int32 sum1 = v_round(div_op(t, a1, fdiv1, scale)),
-                    sum2 = v_round(div_op(t, a2, fdiv2, scale)),
-                    sum3 = v_round(div_op(t, a3, fdiv3, scale)),
-                    sum4 = v_round(div_op(t, a4, fdiv4, scale));
+    v_int16 res1 = v_select((div1 == v_zero), v_zero, v_pack(sum1, sum2));
+    v_int16 res2 = v_select((div2 == v_zero), v_zero, v_pack(sum3, sum4));
 
-            v_int16 res1 = v_select((div1 == v_zero), v_zero, v_pack(sum1, sum2));
-            v_int16 res2 = v_select((div2 == v_zero), v_zero, v_pack(sum3, sum4));
-
-            vx_store(&out[x], v_pack_u(res1, res2));
-        }
-
-        if (x < length)
-        {
-            x = length - nlanes;
-            continue; // process one more time (unaligned tail)
-        }
-        break;
-    }
-    return x;
+    vx_store(outx, v_pack_u(res1, res2));
 }
 
-//-------------------------------------------------------------------------------------------------
-
 template<typename scale_tag_t>
-CV_ALWAYS_INLINE int div_hal(scale_tag_t t, const float in1[], const float in2[], uchar out[],
-                             const int length, double _scale)
+CV_ALWAYS_INLINE void div_simd_impl(scale_tag_t s_tag, const v_float32& a1,
+                                    const v_float32& a2, const v_float32& a3,
+                                    const v_float32& a4, const float* in2x, uchar* outx,
+                                    const v_float32& v_scale, const v_float32& v_zero)
 {
     constexpr int nlanes = v_uint8::nlanes;
 
-    if (length < nlanes)
-        return 0;
+    v_float32 div1 = vg_load_f32(in2x);
+    v_float32 div2 = vg_load_f32(&in2x[nlanes / 4]);
+    v_float32 div3 = vg_load_f32(&in2x[nlanes / 2]);
+    v_float32 div4 = vg_load_f32(&in2x[3 * nlanes / 4]);
 
-    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
-    v_float32 v_zero = vx_setall_f32(0);
-    int x = 0;
-    for (;;)
-    {
-        for (; x <= length - nlanes; x += nlanes)
-        {
-            v_float32 a1 = vg_load_f32(&in1[x]);
-            v_float32 a2 = vg_load_f32(&in1[x + nlanes / 4]);
-            v_float32 a3 = vg_load_f32(&in1[x + nlanes / 2]);
-            v_float32 a4 = vg_load_f32(&in1[x + 3 * nlanes / 4]);
+    v_float32 r1 = div_op(s_tag, a1, div1, v_scale);
+    v_float32 r2 = div_op(s_tag, a2, div2, v_scale);
+    v_float32 r3 = div_op(s_tag, a3, div3, v_scale);
+    v_float32 r4 = div_op(s_tag, a4, div4, v_scale);
 
-            v_float32 div1 = vg_load_f32(&in2[x]);
-            v_float32 div2 = vg_load_f32(&in2[x + nlanes / 4]);
-            v_float32 div3 = vg_load_f32(&in2[x + nlanes / 2]);
-            v_float32 div4 = vg_load_f32(&in2[x + 3 * nlanes / 4]);
+    v_float32 sel1 = v_select((div1 == v_zero), v_zero, r1);
+    v_float32 sel2 = v_select((div2 == v_zero), v_zero, r2);
+    v_float32 sel3 = v_select((div3 == v_zero), v_zero, r3);
+    v_float32 sel4 = v_select((div4 == v_zero), v_zero, r4);
 
-            v_float32 r1 = div_op(t, a1, div1, scale);
-            v_float32 r2 = div_op(t, a2, div2, scale);
-            v_float32 r3 = div_op(t, a3, div3, scale);
-            v_float32 r4 = div_op(t, a4, div4, scale);
+    v_int32 res1 = v_round(sel1);
+    v_int32 res2 = v_round(sel2);
+    v_int32 res3 = v_round(sel3);
+    v_int32 res4 = v_round(sel4);
 
-            v_float32 sel1 = v_select((div1 == v_zero), v_zero, r1);
-            v_float32 sel2 = v_select((div2 == v_zero), v_zero, r2);
-            v_float32 sel3 = v_select((div3 == v_zero), v_zero, r3);
-            v_float32 sel4 = v_select((div4 == v_zero), v_zero, r4);
+    vx_store(outx, v_pack_u(v_pack(res1, res2), v_pack(res3, res4)));
+}
 
-            v_int32 res1 = v_round(sel1);
-            v_int32 res2 = v_round(sel2);
-            v_int32 res3 = v_round(sel3);
-            v_int32 res4 = v_round(sel4);
+template<typename scale_tag_t, typename SRC, typename Vtype>
+CV_ALWAYS_INLINE void div_hal(scale_tag_t s_tag, const SRC* in1x, const SRC* in2x, uchar* outx,
+                              const v_float32& v_scale, const Vtype& v_zero)
+{
+    constexpr int nlanes = v_uint8::nlanes;
 
-            vx_store(&out[x], v_pack_u(v_pack(res1, res2), v_pack(res3, res4)));
-        }
+    v_float32 a1 = vg_load_f32(in1x);
+    v_float32 a2 = vg_load_f32(&in1x[nlanes / 4]);
+    v_float32 a3 = vg_load_f32(&in1x[nlanes / 2]);
+    v_float32 a4 = vg_load_f32(&in1x[3 * nlanes / 4]);
 
-        if (x < length)
-        {
-            x = length - nlanes;
-            continue; // process one more time (unaligned tail)
-        }
-        break;
-    }
-    return x;
+    div_simd_impl(s_tag, a1, a2, a3, a4, in2x, outx, v_scale, v_zero);
 }
 
 //-------------------------------------------------------------------------------------------------
@@ -461,113 +452,117 @@ CV_ALWAYS_INLINE int div_hal(scale_tag_t t, const float in1[], const float in2[]
 template<typename scale_tag_t, typename DST>
 CV_ALWAYS_INLINE
 typename std::enable_if<std::is_same<DST, short>::value ||
-                        std::is_same<DST, ushort>::value, int>::type
-div_hal(scale_tag_t t, const uchar in1[], const uchar in2[], DST out[], const int length, double _scale)
+                        std::is_same<DST, ushort>::value, void>::type
+div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
+              const uchar* in2x, DST* outx, const v_float32& v_scale,
+              const v_int16& v_zero)
 {
-    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
-
-    if (length < nlanes)
-        return 0;
+    v_int16 div = v_reinterpret_as_s16(vx_load_expand(in2x));
 
-    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
-    v_int16 v_zero = vx_setall_s16(0);
+    v_float32 fdiv1 = v_cvt_f32(v_expand_low(div));
+    v_float32 fdiv2 = v_cvt_f32(v_expand_high(div));
 
-    int x = 0;
-    for (;;)
-    {
-        for (; x <= length - nlanes; x += nlanes)
-        {
-            v_float32 a1 = vg_load_f32(&in1[x]);
-            v_float32 a2 = vg_load_f32(&in1[x + nlanes / 2]);
+    v_int32 r1 = v_round(div_op(s_tag, a1, fdiv1, v_scale));
+    v_int32 r2 = v_round(div_op(s_tag, a2, fdiv2, v_scale));
 
-            v_int16 div = v_reinterpret_as_s16(vx_load_expand(&in2[x]));
+    v_store_select(outx, div, v_zero, r1, r2);
+}
 
-            v_float32 fdiv1 = v_cvt_f32(v_expand_low(div));
-            v_float32 fdiv2 = v_cvt_f32(v_expand_high(div));
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<(std::is_same<SRC, short>::value  && std::is_same<DST, short>::value)  ||
+                        (std::is_same<SRC, short>::value  && std::is_same<DST, ushort>::value) ||
+                        (std::is_same<SRC, ushort>::value && std::is_same<DST, short>::value)  ||
+                        (std::is_same<SRC, ushort>::value && std::is_same<DST, ushort>::value), void>::type
+div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
+              const SRC* in2x, DST* outx, const v_float32& v_scale, const v_int16& v_zero)
+{
+    v_int16 div = v_reinterpret_as_s16(vx_load(in2x));
 
-            v_int32 r1 = v_round(div_op(t, a1, fdiv1, scale));
-            v_int32 r2 = v_round(div_op(t, a2, fdiv2, scale));
+    v_float32 fdiv1 = v_cvt_f32(v_expand_low(div));
+    v_float32 fdiv2 = v_cvt_f32(v_expand_high(div));
 
-            v_store_select(&out[x], div, v_zero, r1, r2);
-        }
+    v_int32 r1 = v_round(div_op(s_tag, a1, fdiv1, v_scale));
+    v_int32 r2 = v_round(div_op(s_tag, a2, fdiv2, v_scale));
 
-        if (x < length)
-        {
-            x = length - nlanes;
-            continue; // process one more time (unaligned tail)
-        }
-        break;
-    }
-    return x;
+    v_store_select(outx, div, v_zero, r1, r2);
 }
 
-//-------------------------------------------------------------------------------------------------
-
 template<typename scale_tag_t, typename DST>
 CV_ALWAYS_INLINE
 typename std::enable_if<std::is_same<DST, short>::value ||
-                        std::is_same<DST, ushort>::value, int>::type
-div_hal(scale_tag_t t, const float in1[], const float in2[], DST out[], const int length, double _scale)
+                        std::is_same<DST, ushort>::value, void>::type
+div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const v_float32& a2,
+              const float* in2x, DST* outx, const v_float32& v_scale,
+              const v_float32& v_zero)
 {
     constexpr int nlanes = vector_type_of_t<DST>::nlanes;
 
-    if (length < nlanes)
-        return 0;
+    v_float32 fdiv1 = vg_load_f32(in2x);
+    v_float32 fdiv2 = vg_load_f32(&in2x[nlanes / 2]);
 
-    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
-    v_float32 v_zero = vx_setall_f32(0);
-    int x = 0;
-    for (;;)
-    {
-        for (; x <= length - nlanes; x += nlanes)
-        {
-            v_float32 a1 = vg_load_f32(&in1[x]);
-            v_float32 a2 = vg_load_f32(&in1[x + nlanes / 2]);
+    v_float32 r1 = div_op(s_tag, a1, fdiv1, v_scale);
+    v_float32 r2 = div_op(s_tag, a2, fdiv2, v_scale);
 
-            v_float32 fdiv1 = vg_load_f32(&in2[x]);
-            v_float32 fdiv2 = vg_load_f32(&in2[x + nlanes / 2]);
+    v_int32 res1 = v_round(v_select((fdiv1 == v_zero), v_zero, r1));
+    v_int32 res2 = v_round(v_select((fdiv2 == v_zero), v_zero, r2));
 
-            v_float32 r1 = div_op(t, a1, fdiv1, scale);
-            v_float32 r2 = div_op(t, a2, fdiv2, scale);
+    v_store_i16(outx, res1, res2);
+}
 
-            v_int32 res1 = v_round(v_select((fdiv1 == v_zero), v_zero, r1));
-            v_int32 res2 = v_round(v_select((fdiv2 == v_zero), v_zero, r2));
+template<typename scale_tag_t, typename SRC, typename DST, typename Vtype>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+                        std::is_same<DST, ushort>::value, void>::type
+div_hal(scale_tag_t s_tag, const SRC* in1x, const SRC* in2x, DST* outx,
+        const v_float32& v_scale, const Vtype& v_zero)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
 
-            v_store_i16(&out[x], res1, res2);
-        }
+    v_float32 a1 = vg_load_f32(in1x);
+    v_float32 a2 = vg_load_f32(&in1x[nlanes / 2]);
 
-        if (x < length)
-        {
-            x = length - nlanes;
-            continue; // process one more time (unaligned tail)
-        }
-        break;
-    }
-    return x;
+    div_simd_impl(s_tag, a1, a2, in2x, outx, v_scale, v_zero);
 }
 
 //-------------------------------------------------------------------------------------------------
 
 template<typename scale_tag_t, typename SRC>
-CV_ALWAYS_INLINE int div_hal(scale_tag_t t, const SRC in1[], const SRC in2[], float out[],
-                             const int length, double _scale)
+CV_ALWAYS_INLINE void div_simd_impl(scale_tag_t s_tag, const v_float32& a1, const SRC* in2x,
+                                    float* outx, const v_float32& v_scale)
 {
-    constexpr int nlanes = v_float32::nlanes;
+    v_float32 b1 = vg_load_f32(in2x);
+    vx_store(outx, div_op(s_tag, a1, b1, v_scale));
+}
+
+template<typename scale_tag_t, typename SRC, typename Tvec>
+CV_ALWAYS_INLINE void div_hal(scale_tag_t s_tag, const SRC* in1x, const SRC* in2x, float* outx,
+                              const v_float32& v_scale, const Tvec&)
+{
+    v_float32 a1 = vg_load_f32(in1x);
+    div_simd_impl(s_tag, a1, in2x, outx, v_scale);
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE int div_simd_common(scale_tag_t s_tag, const SRC in1[], const SRC in2[],
+                                     DST out[], const int length, float scale)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
 
     if (length < nlanes)
         return 0;
 
-    v_float32 scale = vx_setall_f32(static_cast<float>(_scale));
+    const zero_vec_type_of_t<SRC> v_zero =
+        vx_setall<typename zero_vec_type_of_t<SRC>::lane_type>(0);
+    v_float32 v_scale = vx_setall_f32(scale);
 
     int x = 0;
     for (;;)
     {
         for (; x <= length - nlanes; x += nlanes)
         {
-            v_float32 a1 = vg_load_f32(&in1[x]);
-            v_float32 b1 = vg_load_f32(&in2[x]);
-
-            vx_store(&out[x], div_op(t, a1, b1, scale));
+            div_hal(s_tag, &in1[x], &in2[x], &out[x], v_scale, v_zero);
         }
 
         if (x < length)
@@ -580,28 +575,6 @@ CV_ALWAYS_INLINE int div_hal(scale_tag_t t, const SRC in1[], const SRC in2[], fl
     return x;
 }
 
-//-------------------------------------------------------------------------------------------------
-
-template<typename scale_tag_t>
-CV_ALWAYS_INLINE int div_hal(scale_tag_t, const uchar in1[], const uchar in2[], uchar out[],
-                             const int length, double scale)
-{
-    hal::div8u(in1, static_cast<size_t>(length), in2, static_cast<size_t>(length),
-               out, static_cast<size_t>(length), length, 1, &scale);
-    return length;
-}
-
-template<typename scale_tag_t>
-CV_ALWAYS_INLINE int div_hal(scale_tag_t, const short in1[], const short in2[], short out[],
-                             const int length, double scale)
-{
-    hal::div16s(in1, static_cast<size_t>(length), in2, static_cast<size_t>(length),
-                out, static_cast<size_t>(length), length, 1, &scale);
-    return length;
-}
-
-//-------------------------------------------------------------------------------------------------
-
 #define DIV_SIMD(SRC, DST)                                                      \
 int div_simd(const SRC in1[], const SRC in2[], DST out[],                       \
              const int length, double _scale)                                   \
 {                                                                               \
     float fscale = static_cast<float>(_scale);                                  \
     if (std::fabs(fscale - 1.0f) <= FLT_EPSILON)                                \
     {                                                                           \
-        not_scale_tag t;                                                        \
-        x = div_hal(t, in1, in2, out, length, _scale);                          \
+        x = div_simd_common(not_scale_tag{}, in1, in2, out, length, fscale);    \
     }                                                                           \
     else                                                                        \
     {                                                                           \
-        scale_tag t;                                                            \
-        x = div_hal(t, in1, in2, out, length, _scale);                          \
+        x = div_simd_common(scale_tag{}, in1, in2, out, length, fscale);        \
     }                                                                           \
     return x;                                                                   \
 }
@@ -1553,7 +1524,7 @@ int mulc_simd(const SRC in[], const float scalar[], DST out[], \
         else                                                                    \
         {                                                                       \
             return arithmOpScalarScaled_simd_common(op_t, in, scalar, out,      \
-                                                  length, scale);               \
+                                                    length, scale);             \
         }                                                                       \
     }                                                                           \
     case 3:                                                                     \
@@ -1743,11 +1714,11 @@ divc_simd_c3_impl(scale_tag_t s_tag, SRC in[], DST out[], const v_float32& s1,
             v_float32 a6 = vg_load_f32(&in[x + 5 * nlanes / 2]);
 
             arithmOpScalar_pack_store_c3(&out[x], v_round(v_select(v_mask1, v_zero, div_op(s_tag, a1, s1, v_scale))),
-                                                    v_round(v_select(v_mask2, v_zero, div_op(s_tag, a2, s2, v_scale))),
-                                                    v_round(v_select(v_mask3, v_zero, div_op(s_tag, a3, s3, v_scale))),
-                                                    v_round(v_select(v_mask1, v_zero, div_op(s_tag, a4, s1, v_scale))),
-                                                    v_round(v_select(v_mask2, v_zero, div_op(s_tag, a5, s2, v_scale))),
-                                                    v_round(v_select(v_mask3, v_zero, div_op(s_tag, a6, s3, v_scale))));
+                                         v_round(v_select(v_mask2, v_zero, div_op(s_tag, a2, s2, v_scale))),
+                                         v_round(v_select(v_mask3, v_zero, div_op(s_tag, a3, s3, v_scale))),
+                                         v_round(v_select(v_mask1, v_zero, div_op(s_tag, a4, s1, v_scale))),
+                                         v_round(v_select(v_mask2, v_zero, div_op(s_tag, a5, s2, v_scale))),
+                                         v_round(v_select(v_mask3, v_zero, div_op(s_tag, a6, s3, v_scale))));
         }
 
         if (x < length)
@@ -1976,14 +1947,432 @@ ABSDIFFC_SIMD(float)
 
 #undef ABSDIFFC_SIMD
 
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename SRC, typename DST, typename Tvec>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+                        std::is_same<DST, ushort>::value, void>::type
+divrc_simd_common_impl(scale_tag_t s_tag, const SRC* inx,
+                       const v_float32& v_scalar, DST* outx,
+                       const v_float32& v_scale, const Tvec& v_zero)
+{
+    div_simd_impl(s_tag, v_scalar, v_scalar, inx, outx, v_scale, v_zero);
+}
+
+template<typename scale_tag_t, typename SRC, typename DST, typename Tvec>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, uchar>::value, void>::type
+divrc_simd_common_impl(scale_tag_t s_tag, const SRC* inx,
+                       const v_float32& v_scalar, DST* outx,
+                       const v_float32& v_scale, const Tvec& v_zero)
+{
+    div_simd_impl(s_tag, v_scalar, v_scalar, v_scalar, v_scalar, inx, outx, v_scale, v_zero);
+}
+
+template<typename scale_tag_t, typename SRC, typename DST, typename Tvec>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, float>::value, void>::type
+divrc_simd_common_impl(scale_tag_t s_tag, const SRC* inx,
+                       const v_float32& v_scalar, DST* outx,
+                       const v_float32& v_scale, const Tvec&)
+{
+    div_simd_impl(s_tag, v_scalar, inx, outx, v_scale);
+}
+
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE int divrc_simd_common(scale_tag_t s_tag, const SRC in[],
+                                       const float scalar[], DST out[],
+                                       const int length, const float scale)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+
+    if (length < nlanes)
+        return 0;
+
+    v_float32 v_scalar = vx_load(scalar);
+    v_float32 v_scale = vx_setall_f32(scale);
+    zero_vec_type_of_t<SRC> v_zero =
+        vx_setall<typename zero_vec_type_of_t<SRC>::lane_type>(0);
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            divrc_simd_common_impl(s_tag, &in[x], v_scalar, &out[x], v_scale, v_zero);
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue; // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t>
+CV_ALWAYS_INLINE void divrc_simd_c3_calc(scale_tag_t s_tag, const uchar* inx, uchar* outx,
+                                         const v_float32& s1, const v_float32& s2,
+                                         const v_float32& s3, const v_float32& v_scale,
+                                         const v_uint8& v_zero)
+{
+    v_uint8 div = vx_load(inx);
+    v_uint8 v_mask = (div == v_zero);
+
+    v_uint16 div1 = v_expand_low(div);
+    v_uint16 div2 = v_expand_high(div);
+
+    v_float32 fdiv1 = v_cvt_f32(v_reinterpret_as_s32(v_expand_low(div1)));
+    v_float32 fdiv2 = v_cvt_f32(v_reinterpret_as_s32(v_expand_high(div1)));
+    v_float32 fdiv3 = v_cvt_f32(v_reinterpret_as_s32(v_expand_low(div2)));
+    v_float32 fdiv4 = v_cvt_f32(v_reinterpret_as_s32(v_expand_high(div2)));
+
+    vx_store(outx,
+             v_select(v_mask, v_zero, v_pack_u(v_pack(v_round(div_op(s_tag, s1, fdiv1, v_scale)),
+                                                      v_round(div_op(s_tag, s2, fdiv2, v_scale))),
+                                               v_pack(v_round(div_op(s_tag, s3, fdiv3, v_scale)),
+                                                      v_round(div_op(s_tag, s1, fdiv4, v_scale))))));
+}
+
+template<typename scale_tag_t, typename SRC>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<SRC, short>::value ||
+                        std::is_same<SRC, ushort>::value, void>::type
+divrc_simd_c3_calc(scale_tag_t s_tag, const SRC* inx, uchar* outx,
+                   const v_float32& s1, const v_float32& s2,
+                   const v_float32& s3, const v_float32& v_scale,
+                   const v_int16& v_zero)
+{
+    constexpr int nlanes = v_uint8::nlanes;
+
+    v_int16 div1 = v_reinterpret_as_s16(vx_load(inx));
+    v_int16 div2 = v_reinterpret_as_s16(vx_load(&inx[nlanes / 2]));
+
+    v_int16 v_mask1 = (div1 == v_zero);
+    v_int16 v_mask2 = (div2 == v_zero);
+
+    v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1));
+    v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1));
+    v_float32 fdiv3 = v_cvt_f32(v_expand_low(div2));
+    v_float32 fdiv4 = v_cvt_f32(v_expand_high(div2));
+
+    vx_store(outx,
+             v_pack_u(v_select(v_mask1, v_zero,
+                               v_pack(v_round(div_op(s_tag, s1, fdiv1, v_scale)),
+                                      v_round(div_op(s_tag, s2, fdiv2, v_scale)))),
+                      v_select(v_mask2, v_zero,
+                               v_pack(v_round(div_op(s_tag, s3, fdiv3, v_scale)),
+                                      v_round(div_op(s_tag, s1, fdiv4, v_scale))))));
+}
+
+template<typename scale_tag_t>
+CV_ALWAYS_INLINE void divrc_simd_c3_calc(scale_tag_t s_tag, const float* inx, uchar* outx,
+                                         const v_float32& s1, const v_float32& s2,
+                                         const v_float32& s3, const v_float32& v_scale,
+                                         const v_float32& v_zero)
+{
+    constexpr int nlanes = v_uint8::nlanes;
+
+    v_float32 fdiv1 = vg_load_f32(inx);
+    v_float32 fdiv2 = vg_load_f32(&inx[nlanes / 4]);
+    v_float32 fdiv3 = vg_load_f32(&inx[nlanes / 2]);
+    v_float32 fdiv4 = vg_load_f32(&inx[3 * nlanes / 4]);
+
+    v_float32 v_mask1 = (fdiv1 == v_zero);
+    v_float32 v_mask2 = (fdiv2 == v_zero);
+    v_float32 v_mask3 = (fdiv3 == v_zero);
+    v_float32 v_mask4 = (fdiv4 == v_zero);
+
+    vx_store(outx,
+             v_pack_u(v_pack(v_round(v_select(v_mask1, v_zero, div_op(s_tag, s1, fdiv1, v_scale))),
+                             v_round(v_select(v_mask2, v_zero, div_op(s_tag, s2, fdiv2, v_scale)))),
+                      v_pack(v_round(v_select(v_mask3, v_zero, div_op(s_tag, s3, fdiv3, v_scale))),
+                             v_round(v_select(v_mask4, v_zero, div_op(s_tag, s1, fdiv4, v_scale))))));
+
+}
+
+template<typename scale_tag_t, typename SRC>
+CV_ALWAYS_INLINE int divrc_simd_c3_impl(scale_tag_t s_tag, const SRC in[], uchar out[],
+                                        const v_float32& s1, const v_float32& s2,
+                                        const v_float32& s3, const v_float32& v_scale,
+                                        const int length, const int nlanes, const int lanes)
+{
+    univ_zero_vec_type_of_t<SRC> v_zero =
+        vx_setall<typename univ_zero_vec_type_of_t<SRC>::lane_type>(0);
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - lanes; x += lanes)
+        {
+            divrc_simd_c3_calc(s_tag, &in[x], &out[x], s1, s2, s3, v_scale, v_zero);
+            divrc_simd_c3_calc(s_tag, &in[x + nlanes], &out[x + nlanes], s2, s3, s1, v_scale, v_zero);
+            divrc_simd_c3_calc(s_tag, &in[x + 2 * nlanes], &out[x + 2 * nlanes], s3, s1, s2, v_scale, v_zero);
+        }
+
+        if (x < length)
+        {
+            x = length - lanes;
+            continue; // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+//---------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+                        std::is_same<DST, ushort>::value, void>::type
+divrc_simd_c3_calc(scale_tag_t s_tag, const uchar* inx, DST* outx,
+                   const v_float32& s1, const v_float32& s2,
+                   const v_float32& s3, const v_float32& v_scale,
+                   const v_int16& v_zero)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    v_uint8 div = vx_load(inx);
+
+    v_int16 div1 = v_reinterpret_as_s16(v_expand_low(div));
+    v_int16 div2 = v_reinterpret_as_s16(v_expand_high(div));
+    v_int16 div3 = v_reinterpret_as_s16(vx_load_expand(&inx[2 * nlanes]));
+
+    v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1));
+    v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1));
+    v_float32 fdiv3 = v_cvt_f32(v_expand_low(div2));
+    v_float32 fdiv4 = v_cvt_f32(v_expand_high(div2));
+    v_float32 fdiv5 = v_cvt_f32(v_expand_low(div3));
+    v_float32 fdiv6 = v_cvt_f32(v_expand_high(div3));
+
+    v_store_select(outx, div1, v_zero, v_round(div_op(s_tag, s1, fdiv1, v_scale)),
+                   v_round(div_op(s_tag, s2, fdiv2, v_scale)));
+    v_store_select(&outx[nlanes], div2, v_zero, v_round(div_op(s_tag, s3, fdiv3, v_scale)),
+                   v_round(div_op(s_tag, s1, fdiv4, v_scale)));
+    v_store_select(&outx[2*nlanes], div3, v_zero, v_round(div_op(s_tag, s2, fdiv5, v_scale)),
+                   v_round(div_op(s_tag, s3, fdiv6, v_scale)));
+}
+
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<(std::is_same<SRC, short>::value  && std::is_same<DST, short>::value)  ||
+                        (std::is_same<SRC, short>::value  && std::is_same<DST, ushort>::value) ||
+                        (std::is_same<SRC, ushort>::value && std::is_same<DST, short>::value)  ||
+                        (std::is_same<SRC, ushort>::value && std::is_same<DST, ushort>::value), void>::type
+divrc_simd_c3_calc(scale_tag_t s_tag, const SRC* inx, DST* outx,
+                   const v_float32& s1, const v_float32& s2,
+                   const v_float32& s3, const v_float32& v_scale,
+                   const v_int16& v_zero)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+
+    v_int16 div1 = v_reinterpret_as_s16(vx_load(inx));
+    v_int16 div2 = v_reinterpret_as_s16(vx_load(&inx[nlanes]));
+    v_int16 div3 = v_reinterpret_as_s16(vx_load(&inx[2*nlanes]));
+
+    v_float32 fdiv1 = v_cvt_f32(v_expand_low(div1));
+    v_float32 fdiv2 = v_cvt_f32(v_expand_high(div1));
+    v_float32 fdiv3 = v_cvt_f32(v_expand_low(div2));
+    v_float32 fdiv4 = v_cvt_f32(v_expand_high(div2));
+    v_float32 fdiv5 = v_cvt_f32(v_expand_low(div3));
+    v_float32 fdiv6 = v_cvt_f32(v_expand_high(div3));
+
+    v_store_select(outx, div1, v_zero, v_round(div_op(s_tag, s1, fdiv1, v_scale)),
+                   v_round(div_op(s_tag, s2, fdiv2, v_scale)));
+    v_store_select(&outx[nlanes], div2, v_zero, v_round(div_op(s_tag, s3, fdiv3, v_scale)),
+                   v_round(div_op(s_tag, s1, fdiv4, v_scale)));
+    v_store_select(&outx[2*nlanes], div3, v_zero, v_round(div_op(s_tag, s2, fdiv5, v_scale)),
+                   v_round(div_op(s_tag, s3, fdiv6, v_scale)));
+}
+
+template<typename scale_tag_t, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+                        std::is_same<DST, ushort>::value, void>::type
+divrc_simd_c3_calc(scale_tag_t s_tag, const float* inx, DST* outx,
+                   const v_float32& s1, const v_float32& s2,
+                   const v_float32& s3, const v_float32& v_scale,
+                   const v_float32& v_zero)
+{
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+
+    v_float32 fdiv1 = vg_load_f32(inx);
+    v_float32 fdiv2 = vg_load_f32(&inx[nlanes/2]);
+    v_float32 fdiv3 = vg_load_f32(&inx[nlanes]);
+    v_float32 fdiv4 = vg_load_f32(&inx[3*nlanes/2]);
+    v_float32 fdiv5 = vg_load_f32(&inx[2*nlanes]);
+    v_float32 fdiv6 = vg_load_f32(&inx[5*nlanes/2]);
+
+    v_store_i16(outx, v_round(v_select(fdiv1 == v_zero, v_zero, div_op(s_tag, s1, fdiv1, v_scale))),
+                v_round(v_select(fdiv2 == v_zero, v_zero, div_op(s_tag, s2, fdiv2, v_scale))));
+    v_store_i16(&outx[nlanes], v_round(v_select(fdiv3 == v_zero, v_zero, div_op(s_tag, s3, fdiv3, v_scale))),
+                v_round(v_select(fdiv4 == v_zero, v_zero, div_op(s_tag, s1, fdiv4, v_scale))));
+    v_store_i16(&outx[2*nlanes], v_round(v_select(fdiv5 == v_zero, v_zero, div_op(s_tag, s2, fdiv5, v_scale))),
+                v_round(v_select(fdiv6 == v_zero, v_zero, div_op(s_tag, s3, fdiv6, v_scale))));
+}
+
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE
+typename std::enable_if<std::is_same<DST, short>::value ||
+                        std::is_same<DST, ushort>::value, int>::type
+divrc_simd_c3_impl(scale_tag_t s_tag, const SRC in[], DST out[], const v_float32& s1,
+                   const v_float32& s2, const v_float32& s3,
+                   const v_float32& v_scale, const int length,
+                   const int, const int lanes)
+{
+    zero_vec_type_of_t<SRC> v_zero =
+        vx_setall<typename zero_vec_type_of_t<SRC>::lane_type>(0);
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - lanes; x += lanes)
+        {
+            divrc_simd_c3_calc(s_tag, &in[x], &out[x], s1, s2, s3, v_scale, v_zero);
+        }
+
+        if (x < length)
+        {
+            x = length - lanes;
+            continue; // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+//---------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename SRC>
+CV_ALWAYS_INLINE int divrc_simd_c3_impl(scale_tag_t s_tag, const SRC* in, float* out,
+                                        const v_float32& s1, const v_float32& s2,
+                                        const v_float32& s3, const v_float32& v_scale,
+                                        const int length, const int nlanes, const int lanes)
+{
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - lanes; x += lanes)
+        {
+            v_float32 div1 = vg_load_f32(&in[x]);
+            v_float32 div2 = vg_load_f32(&in[x + nlanes]);
+            v_float32 div3 = vg_load_f32(&in[x + 2*nlanes]);
+
+            vx_store(&out[x], div_op(s_tag, s1, div1, v_scale));
+            vx_store(&out[x + nlanes], div_op(s_tag, s2, div2, v_scale));
+            vx_store(&out[x + 2*nlanes], div_op(s_tag, s3, div3, v_scale));
+        }
+
+        if (x < length)
+        {
+            x = length - lanes;
+            continue; // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+//-------------------------------------------------------------------------------------------------
+
+template<typename scale_tag_t, typename SRC, typename DST>
+CV_ALWAYS_INLINE int divrc_simd_c3(scale_tag_t s_tag, const SRC in[],
+                                   const float scalar[], DST out[],
+                                   const int length, const float scale)
+{
+    constexpr int chan = 3;
+    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
+    constexpr int lanes = chan * nlanes;
+
+    if (length < lanes)
+        return 0;
+
+    v_float32 v_scale = vx_setall_f32(scale);
+
+    v_float32 s1 = vx_load(scalar);
+#if CV_SIMD_WIDTH == 32
+    v_float32 s2 = vx_load(&scalar[2]);
+    v_float32 s3 = vx_load(&scalar[1]);
+#else
+    v_float32 s2 = vx_load(&scalar[1]);
+    v_float32 s3 = vx_load(&scalar[2]);
+#endif
+    return divrc_simd_c3_impl(s_tag, in, out, s1, s2, s3, v_scale, length, nlanes, lanes);
+}
+
+#define DIVRC_SIMD(SRC, DST)                                              \
+int divrc_simd(const float scalar[], const SRC in[], DST out[],           \
+               const int length, const int chan, const float scale)       \
+{                                                                         \
+    switch (chan)                                                         \
+    {                                                                     \
+    case 1:                                                               \
+    case 2:                                                               \
+    case 4:                                                               \
+    {                                                                     \
+        if (std::fabs(scale - 1.0f) <= FLT_EPSILON)                       \
+        {                                                                 \
+            return divrc_simd_common(not_scale_tag{}, in, scalar,         \
+                                     out, length, scale);                 \
+        }                                                                 \
+        else                                                              \
+        {                                                                 \
+            return divrc_simd_common(scale_tag{}, in, scalar, out,        \
+                                     length, scale);                      \
+        }                                                                 \
+    }                                                                     \
+    case 3:                                                               \
+    {                                                                     \
+        if (std::fabs(scale - 1.0f) <= FLT_EPSILON)                       \
+        {                                                                 \
+            return divrc_simd_c3(not_scale_tag{}, in, scalar,             \
+                                 out, length, scale);                     \
+        }                                                                 \
+        else                                                              \
+        {                                                                 \
+            return divrc_simd_c3(scale_tag{}, in, scalar, out,            \
+                                 length, scale);                          \
+        }                                                                 \
+    }                                                                     \
+    default:                                                              \
+        GAPI_Assert(chan <= 4);                                           \
+        break;                                                            \
+    }                                                                     \
+    return 0;                                                             \
+}
+
+DIVRC_SIMD(uchar, uchar)
+DIVRC_SIMD(ushort, uchar)
+DIVRC_SIMD(short, uchar)
+DIVRC_SIMD(float, uchar)
+DIVRC_SIMD(short, short)
+DIVRC_SIMD(ushort, short)
+DIVRC_SIMD(uchar, short)
+DIVRC_SIMD(float, short)
+DIVRC_SIMD(ushort, ushort)
+DIVRC_SIMD(uchar, ushort)
+DIVRC_SIMD(short, ushort)
+DIVRC_SIMD(float, ushort)
+DIVRC_SIMD(uchar, float)
+DIVRC_SIMD(ushort, float)
+DIVRC_SIMD(short, float)
+DIVRC_SIMD(float, float)
+
+#undef DIVRC_SIMD
+
 //-------------------------
 //
 // Fluid kernels: Split3
 //
 //-------------------------
 
-int split3_simd(const uchar in[], uchar out1[], uchar out2[],
-                uchar out3[], const int width)
+int split3_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[],
+                const int width)
 {
     constexpr int nlanes = v_uint8::nlanes;
     if (width < nlanes)