diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp index a142109315..d91ce65fff 100644 --- a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp +++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp @@ -28,8 +28,8 @@ INSTANTIATE_TEST_CASE_P(SqrtPerfTestFluid, SqrtPerfTest, INSTANTIATE_TEST_CASE_P(AddPerfTestFluid, AddPerfTest, Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()), Values(szSmall128, szVGA, sz720p, sz1080p), - Values(CV_8UC1, CV_8UC3, CV_16SC1, CV_32FC1), - Values(-1, CV_8U, CV_32F), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(-1, CV_8U, CV_16U, CV_16S, CV_32F), Values(cv::compile_args(CORE_FLUID)))); INSTANTIATE_TEST_CASE_P(AddCPerfTestFluid, AddCPerfTest, diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp index bb33c45d85..866381f39b 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore.cpp +++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp @@ -379,136 +379,6 @@ CV_ALWAYS_INLINE int absdiff_simd(const T in1[], const T in2[], T out[], int len return 0; } -template<typename T, typename VT> -CV_ALWAYS_INLINE int add_simd_sametype(const T in1[], const T in2[], T out[], int length) -{ - constexpr int nlanes = static_cast<int>(VT::nlanes); - - if (length < nlanes) - return 0; - - int x = 0; - for (;;) - { - for (; x <= length - nlanes; x += nlanes) - { - VT a = vx_load(&in1[x]); - VT b = vx_load(&in2[x]); - vx_store(&out[x], a + b); - } - - if (x < length && (in1 != out) && (in2 != out)) - { - x = length - nlanes; - continue; // process one more time (unaligned tail) - } - break; - } - - return x; -} - -template<typename SRC, typename DST> -CV_ALWAYS_INLINE int add_simd(const SRC in1[], const SRC in2[], DST out[], int length) -{ - if (std::is_same<DST, float>::value && !std::is_same<SRC, float>::value) - return 0; - - if (std::is_same<DST, SRC>::value) - { - if (std::is_same<DST, uchar>::value) - { - return add_simd_sametype<uchar, v_uint8>(reinterpret_cast<const uchar*>(in1), - reinterpret_cast<const uchar*>(in2), - reinterpret_cast<uchar*>(out), length); - } - else if (std::is_same<DST, short>::value) - { - return add_simd_sametype<short, v_int16>(reinterpret_cast<const short*>(in1), - reinterpret_cast<const short*>(in2), - reinterpret_cast<short*>(out), length); - } - else if (std::is_same<DST, float>::value) - { - return add_simd_sametype<float, v_float32>(reinterpret_cast<const float*>(in1), - reinterpret_cast<const float*>(in2), - reinterpret_cast<float*>(out), length); - } - } - else if (std::is_same<SRC, short>::value && std::is_same<DST, uchar>::value) - { - constexpr int nlanes = static_cast<int>(v_uint8::nlanes); - - if (length < nlanes) - return 0; - - int x = 0; - for (;;) - { - for (; x <= length - nlanes; x += nlanes) - { - v_int16 a1 = vx_load(reinterpret_cast<const short*>(&in1[x])); - v_int16 a2 = vx_load(reinterpret_cast<const short*>(&in1[x + nlanes / 2])); - v_int16 b1 = vx_load(reinterpret_cast<const short*>(&in2[x])); - v_int16 b2 = vx_load(reinterpret_cast<const short*>(&in2[x + nlanes / 2])); - - vx_store(reinterpret_cast<uchar*>(&out[x]), v_pack_u(a1 + b1, a2 + b2)); - } - - if (x < length) - { - CV_DbgAssert((reinterpret_cast<const short*>(in1) != reinterpret_cast<const short*>(out)) && - (reinterpret_cast<const short*>(in2) != reinterpret_cast<const short*>(out))); - x = length - nlanes; - continue; // process one more time (unaligned tail) - } - break; - } - - return x; - } - else if (std::is_same<SRC, float>::value && std::is_same<DST, uchar>::value) - { - constexpr int nlanes = static_cast<int>(v_uint8::nlanes); - - if (length < nlanes) - return 0; - - int x = 0; - for (;;) - { - for (; x <= length - nlanes; x += nlanes) - { - v_float32 a1 = vx_load(reinterpret_cast<const float*>(&in1[x])); - v_float32 a2 = vx_load(reinterpret_cast<const float*>(&in1[x + nlanes / 4])); - v_float32 a3 = vx_load(reinterpret_cast<const float*>(&in1[x + 2 * nlanes / 4])); - v_float32 a4 = vx_load(reinterpret_cast<const float*>(&in1[x + 3 * nlanes / 4])); - - v_float32 b1 = vx_load(reinterpret_cast<const float*>(&in2[x])); - v_float32 b2 = vx_load(reinterpret_cast<const float*>(&in2[x + nlanes / 4])); - v_float32 b3 = vx_load(reinterpret_cast<const float*>(&in2[x + 2 * nlanes / 4])); - v_float32 b4 = vx_load(reinterpret_cast<const float*>(&in2[x + 3 * nlanes / 4])); - - vx_store(reinterpret_cast<uchar*>(&out[x]), v_pack_u(v_pack(v_round(a1 + b1), v_round(a2 + b2)), - v_pack(v_round(a3 + b3), v_round(a4 + b4)))); - } - - if (x < length) - { - CV_DbgAssert((reinterpret_cast<const float*>(in1) != reinterpret_cast<const float*>(out)) && - (reinterpret_cast<const float*>(in2) != reinterpret_cast<const float*>(out))); - x = length - nlanes; - continue; // process one more time (unaligned tail) - } - break; - } - - return x; - } - - return 0; -} - template<typename T, typename VT> CV_ALWAYS_INLINE int sub_simd_sametype(const T in1[], const T in2[], T out[], int length) { @@ -641,7 +511,7 @@ CV_ALWAYS_INLINE int sub_simd(const SRC in1[], const SRC in2[], DST out[], int l #endif // CV_SIMD template<typename DST, typename SRC1, typename SRC2> -static CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2, +CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const View &src2, Arithm arithm, double scale=1) { static_assert(std::is_same<SRC1, SRC2>::value, "wrong types"); @@ -652,7 +522,7 @@ static CV_ALWAYS_INLINE void run_arithm(Buffer &dst, const View &src1, const Vie int width = dst.length(); int chan = dst.meta().chan; - int length = width * chan; + const int length = width * chan; // NB: assume in/out types are not 64-bits float _scale = static_cast<float>( scale ); @@ -708,13 +578,22 @@ GAPI_FLUID_KERNEL(GFluidAdd, cv::gapi::core::GAdd, false) static void run(const View &src1, const View &src2, int /*dtype*/, Buffer &dst) { // DST SRC1 SRC2 OP __VA_ARGS__ - BINARY_(uchar , uchar , uchar , run_arithm, dst, src1, src2, ARITHM_ADD); - BINARY_(uchar , short, short, run_arithm, dst, src1, src2, ARITHM_ADD); - BINARY_(uchar , float, float, run_arithm, dst, src1, src2, ARITHM_ADD); - BINARY_( short, short, short, run_arithm, dst, src1, src2, ARITHM_ADD); - BINARY_( float, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_ADD); - BINARY_( float, short, short, run_arithm, dst, src1, src2, ARITHM_ADD); - BINARY_( float, float, float, run_arithm, dst, src1, src2, ARITHM_ADD); + BINARY_(uchar, uchar, uchar, run_arithm, dst, src1, src2, ARITHM_ADD); + BINARY_(uchar, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_ADD); + BINARY_(uchar, short, short, run_arithm, dst, src1, src2, ARITHM_ADD); + BINARY_(uchar, float, float, run_arithm, dst, src1, src2, ARITHM_ADD); + BINARY_(short, short, short, run_arithm, dst, src1, src2, ARITHM_ADD); + BINARY_(short, uchar, uchar, run_arithm, dst, src1, src2, ARITHM_ADD); + BINARY_(short, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_ADD); + BINARY_(short, float, float, run_arithm, dst, src1, src2, ARITHM_ADD); + BINARY_(ushort, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_ADD); + BINARY_(ushort, uchar, uchar, run_arithm, dst, src1, src2, ARITHM_ADD); + BINARY_(ushort, short, short, run_arithm, dst, src1, src2, ARITHM_ADD); + BINARY_(ushort, float, float, run_arithm, dst, src1, src2, ARITHM_ADD); + BINARY_(float, uchar, uchar, run_arithm, dst, src1, src2, ARITHM_ADD); + BINARY_(float, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_ADD); + BINARY_(float, short, short, run_arithm, dst, src1, src2, ARITHM_ADD); + BINARY_(float, float, float, run_arithm, dst, src1, src2, ARITHM_ADD); CV_Error(cv::Error::StsBadArg, "unsupported combination of types"); } diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp index a682fb7dbb..d80a6b29c0 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp @@ -290,6 +290,33 @@ int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[], CV_CPU_DISPATCH_MODES_ALL); } +#define ADD_SIMD(SRC, DST) \ +int add_simd(const SRC in1[], const SRC in2[], DST out[], const int length) \ +{ \ + \ + CV_CPU_DISPATCH(add_simd, (in1, in2, out, length), \ + CV_CPU_DISPATCH_MODES_ALL); \ +} + +ADD_SIMD(uchar, uchar) +ADD_SIMD(ushort, uchar) +ADD_SIMD(short, uchar) +ADD_SIMD(float, uchar) +ADD_SIMD(short, short) +ADD_SIMD(ushort, short) +ADD_SIMD(uchar, short) +ADD_SIMD(float, short) +ADD_SIMD(ushort, ushort) +ADD_SIMD(uchar, ushort) +ADD_SIMD(short, ushort) +ADD_SIMD(float, ushort) +ADD_SIMD(uchar, float) +ADD_SIMD(ushort, float) +ADD_SIMD(short, float) +ADD_SIMD(float, float) + +#undef ADD_SIMD + } // namespace fluid } // namespace gapi } // namespace cv diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp index 975383a8d9..052adbe2fd 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp @@ -222,6 +222,28 @@ int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[], int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[], const uchar in4[], uchar out[], const int width); +#define ADD_SIMD(SRC, DST) \ +int add_simd(const SRC in1[], const SRC in2[], DST out[], const int length); + +ADD_SIMD(uchar, uchar) +ADD_SIMD(ushort, uchar) +ADD_SIMD(short, uchar) +ADD_SIMD(float, uchar) +ADD_SIMD(short, short) +ADD_SIMD(ushort, short) +ADD_SIMD(uchar, short) +ADD_SIMD(float, short) +ADD_SIMD(ushort, ushort) +ADD_SIMD(uchar, ushort) +ADD_SIMD(short, ushort) +ADD_SIMD(float, ushort) +ADD_SIMD(uchar, float) +ADD_SIMD(ushort, float) +ADD_SIMD(short, float) +ADD_SIMD(float, float) + +#undef ADD_SIMD + } // namespace fluid } // namespace gapi } // namespace cv diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp index b92d92d0cf..4c324daa25 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp @@ -231,6 +231,28 @@ DIVRC_SIMD(float, float) #undef DIVRC_SIMD +#define ADD_SIMD(SRC, DST) \ +int add_simd(const SRC in1[], const SRC in2[], DST out[], const int length); + +ADD_SIMD(uchar, uchar) +ADD_SIMD(ushort, uchar) +ADD_SIMD(short, uchar) +ADD_SIMD(float, uchar) +ADD_SIMD(short, short) +ADD_SIMD(ushort, short) +ADD_SIMD(uchar, short) +ADD_SIMD(float, short) +ADD_SIMD(ushort, ushort) +ADD_SIMD(uchar, ushort) +ADD_SIMD(short, ushort) +ADD_SIMD(float, ushort) +ADD_SIMD(uchar, float) +ADD_SIMD(ushort, float) +ADD_SIMD(short, float) +ADD_SIMD(float, float) + +#undef ADD_SIMD + int split3_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[], const int width); @@ -2503,6 +2525,178 @@ int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[], return x; } +//------------------------- +// +// Fluid kernels: Add +// +//------------------------- + +CV_ALWAYS_INLINE void add_uchar_store(uchar* outx, const v_uint16& c1, const v_uint16& c2) +{ + vx_store(outx, v_pack(c1, c2)); +} + +CV_ALWAYS_INLINE void add_uchar_store(uchar* outx, const v_int16& c1, const v_int16& c2) +{ + vx_store(outx, v_pack_u(c1, c2)); +} + +template<typename SRC, typename DST> +CV_ALWAYS_INLINE +typename std::enable_if<std::is_same<SRC, DST>::value, void>::type +add_simd_impl(const SRC* in1x, const SRC* in2x, DST* outx) +{ + vector_type_of_t<SRC> a = vx_load(in1x); + vector_type_of_t<SRC> b = vx_load(in2x); + vx_store(outx, a + b); +} + +template<typename SRC> +CV_ALWAYS_INLINE +typename std::enable_if<std::is_same<SRC, short>::value || + std::is_same<SRC, ushort>::value, void>::type +add_simd_impl(const SRC* in1x, const SRC* in2x, uchar* outx) +{ + constexpr int nlanes = v_uint8::nlanes; + + vector_type_of_t<SRC> a1 = vx_load(in1x); + vector_type_of_t<SRC> a2 = vx_load(&in1x[nlanes / 2]); + vector_type_of_t<SRC> b1 = vx_load(in2x); + vector_type_of_t<SRC> b2 = vx_load(&in2x[nlanes / 2]); + + add_uchar_store(outx, a1 + b1, a2 + b2); +} + +CV_ALWAYS_INLINE void add_simd_impl(const float* in1x, const float* in2x, uchar* outx) +{ + constexpr int nlanes = v_uint8::nlanes; + + v_float32 a1 = vx_load(in1x); + v_float32 a2 = vx_load(&in1x[nlanes / 4]); + v_float32 a3 = vx_load(&in1x[2 * nlanes / 4]); + v_float32 a4 = vx_load(&in1x[3 * nlanes / 4]); + + v_float32 b1 = vx_load(in2x); + v_float32 b2 = vx_load(&in2x[nlanes / 4]); + v_float32 b3 = vx_load(&in2x[2 * nlanes / 4]); + v_float32 b4 = vx_load(&in2x[3 * nlanes / 4]); + + vx_store(outx, v_pack_u(v_pack(v_round(a1 + b1), v_round(a2 + b2)), + v_pack(v_round(a3 + b3), v_round(a4 + b4)))); +} + +CV_ALWAYS_INLINE void add_simd_impl(const uchar* in1x, const uchar* in2x, short* outx) +{ + v_int16 a = v_reinterpret_as_s16(vx_load_expand(in1x)); + v_int16 b = v_reinterpret_as_s16(vx_load_expand(in2x)); + + vx_store(outx, a + b); +} + +CV_ALWAYS_INLINE void add_simd_impl(const uchar* in1x, const uchar* in2x, ushort* outx) +{ + v_uint16 a = vx_load_expand(in1x); + v_uint16 b = vx_load_expand(in2x); + + vx_store(outx, a + b); +} + +template<typename DST> +CV_ALWAYS_INLINE +typename std::enable_if<std::is_same<DST, short>::value || + std::is_same<DST, ushort>::value, void>::type +add_simd_impl(const float* in1x, const float* in2x, DST* outx) +{ + constexpr int nlanes = vector_type_of_t<DST>::nlanes; + v_float32 a1 = vx_load(in1x); + v_float32 a2 = vx_load(&in1x[nlanes/2]); + v_float32 b1 = vx_load(in2x); + v_float32 b2 = vx_load(&in2x[nlanes/2]); + + v_store_i16(outx, v_round(a1 + b1), v_round(a2 + b2)); +} + +CV_ALWAYS_INLINE void add_simd_impl(const short* in1x, const short* in2x, ushort* outx) +{ + v_int16 a = vx_load(in1x); + v_int32 a1 = v_expand_low(a); + v_int32 a2 = v_expand_high(a); + + v_int16 b = vx_load(in2x); + v_int32 b1 = v_expand_low(b); + v_int32 b2 = v_expand_high(b); + + vx_store(outx, v_pack_u(a1 + b1, a2 + b2)); +} + +CV_ALWAYS_INLINE void add_simd_impl(const ushort* in1x, const ushort* in2x, short* outx) +{ + v_uint16 a = vx_load(in1x); + v_uint32 a1 = v_expand_low(a); + v_uint32 a2 = v_expand_high(a); + + v_uint16 b = vx_load(in2x); + v_uint32 b1 = v_expand_low(b); + v_uint32 b2 = v_expand_high(b); + + vx_store(outx, v_reinterpret_as_s16(v_pack(a1 + b1, a2 + b2))); +} + +template<typename SRC> +CV_ALWAYS_INLINE void add_simd_impl(const SRC* in1x, const SRC* in2x, float* outx) +{ + v_float32 a = vg_load_f32(in1x); + v_float32 b = vg_load_f32(in2x); + + vx_store(outx, a + b); +} + +#define ADD_SIMD(SRC, DST) \ +int add_simd(const SRC in1[], const SRC in2[], DST out[], const int length) \ +{ \ + constexpr int nlanes = vector_type_of_t<DST>::nlanes; \ + \ + if (length < nlanes) \ + return 0; \ + \ + int x = 0; \ + for (;;) \ + { \ + for (; x <= length - nlanes; x += nlanes) \ + { \ + add_simd_impl(&in1[x], &in2[x], &out[x]); \ + } \ + \ + if (x < length) \ + { \ + x = length - nlanes; \ + continue; \ + } \ + break; \ + } \ + \ + return x; \ +} + +ADD_SIMD(uchar, uchar) +ADD_SIMD(ushort, uchar) +ADD_SIMD(short, uchar) +ADD_SIMD(float, uchar) +ADD_SIMD(short, short) +ADD_SIMD(ushort, short) +ADD_SIMD(uchar, short) +ADD_SIMD(float, short) +ADD_SIMD(ushort, ushort) +ADD_SIMD(uchar, ushort) +ADD_SIMD(short, ushort) +ADD_SIMD(float, ushort) +ADD_SIMD(uchar, float) +ADD_SIMD(ushort, float) +ADD_SIMD(short, float) +ADD_SIMD(float, float) + +#undef ADD_SIMD + #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY CV_CPU_OPTIMIZATION_NAMESPACE_END