diff --git a/modules/gapi/perf/common/gapi_core_perf_tests.hpp b/modules/gapi/perf/common/gapi_core_perf_tests.hpp index 4084ed3e88..f3f251167b 100644 --- a/modules/gapi/perf/common/gapi_core_perf_tests.hpp +++ b/modules/gapi/perf/common/gapi_core_perf_tests.hpp @@ -50,7 +50,7 @@ namespace opencv_test class MinPerfTest : public TestPerfParams> {}; class MaxPerfTest : public TestPerfParams> {}; class AbsDiffPerfTest : public TestPerfParams> {}; - class AbsDiffCPerfTest : public TestPerfParams> {}; + class AbsDiffCPerfTest : public TestPerfParams> {}; class SumPerfTest : public TestPerfParams> {}; class CountNonZeroPerfTest : public TestPerfParams> {}; class AddWeightedPerfTest : public TestPerfParams> {}; diff --git a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp index d4144cd71a..96ce369081 100644 --- a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp +++ b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp @@ -970,9 +970,10 @@ PERF_TEST_P_(AbsDiffPerfTest, TestPerformance) PERF_TEST_P_(AbsDiffCPerfTest, TestPerformance) { - cv::Size sz_in = get<0>(GetParam()); - MatType type = get<1>(GetParam()); - cv::GCompileArgs compile_args = get<2>(GetParam()); + compare_f cmpF = get<0>(GetParam()); + cv::Size sz_in = get<1>(GetParam()); + MatType type = get<2>(GetParam()); + cv::GCompileArgs compile_args = get<3>(GetParam()); initMatsRandU(type, sz_in, type, false); @@ -997,8 +998,9 @@ PERF_TEST_P_(AbsDiffCPerfTest, TestPerformance) } // Comparison //////////////////////////////////////////////////////////// - // FIXIT unrealiable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv)); - EXPECT_EQ(out_mat_gapi.size(), sz_in); + { + EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv)); + } SANITY_CHECK_NOTHING(); } diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp index 1255f5ca52..c110de4fdd 100644 --- 
a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp +++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp @@ -156,7 +156,8 @@ INSTANTIATE_TEST_CASE_P(AbsDiffPerfTestCPU, AbsDiffPerfTest, Values(cv::compile_args(CORE_CPU)))); INSTANTIATE_TEST_CASE_P(AbsDiffCPerfTestCPU, AbsDiffCPerfTest, - Combine(Values(szSmall128, szVGA, sz720p, sz1080p), + Combine(Values(AbsExact().to_compare_f()), + Values(szSmall128, szVGA, sz720p, sz1080p), Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), Values(cv::compile_args(CORE_CPU)))); diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp index 058cff69ac..442d9efa7a 100644 --- a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp +++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp @@ -153,10 +153,9 @@ INSTANTIATE_TEST_CASE_P(AbsDiffPerfTestFluid, AbsDiffPerfTest, Values(cv::compile_args(CORE_FLUID)))); INSTANTIATE_TEST_CASE_P(AbsDiffCPerfTestFluid, AbsDiffCPerfTest, - Combine(Values(szSmall128, szVGA, sz720p, sz1080p), - Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_8UC2, - CV_16UC2, CV_16SC2, CV_8UC3, CV_16UC3, - CV_16SC3, CV_8UC4, CV_16UC4, CV_16SC4), + Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()), + Values(szSmall128, szVGA, sz720p, sz1080p), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), Values(cv::compile_args(CORE_FLUID)))); // INSTANTIATE_TEST_CASE_P(SumPerfTestFluid, SumPerfTest, diff --git a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp index fa63f1e208..fd00f1caea 100644 --- a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp +++ b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp @@ -154,7 +154,8 @@ INSTANTIATE_TEST_CASE_P(AbsDiffPerfTestGPU, AbsDiffPerfTest, Values(cv::compile_args(CORE_GPU)))); INSTANTIATE_TEST_CASE_P(AbsDiffCPerfTestGPU, AbsDiffCPerfTest, - Combine(Values( szSmall128, szVGA, sz720p, sz1080p ), + Combine(Values(AbsExact().to_compare_f()), 
+ Values( szSmall128, szVGA, sz720p, sz1080p ), Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ), Values(cv::compile_args(CORE_GPU)))); diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp index a0513a09cd..8342a26d0d 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore.cpp +++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp @@ -994,244 +994,6 @@ static void run_arithm_s(DST out[], const SRC in[], int width, int chan, CV_Error(cv::Error::StsBadArg, "unsupported number of channels"); } -#if CV_SIMD -CV_ALWAYS_INLINE void absdiffc_short_store_c1c2c4(short* out_ptr, const v_int32& c1, const v_int32& c2) -{ - vx_store(out_ptr, v_pack(c1, c2)); -} - -CV_ALWAYS_INLINE void absdiffc_short_store_c1c2c4(ushort* out_ptr, const v_int32& c1, const v_int32& c2) -{ - vx_store(out_ptr, v_pack_u(c1, c2)); -} - -template -CV_ALWAYS_INLINE int absdiffc_simd_c1c2c4(const T in[], T out[], - const v_float32& s, const int length) -{ - static_assert((std::is_same::value) || (std::is_same::value), - "This templated overload is only for short or ushort type combinations."); - - constexpr int nlanes = (std::is_same::value) ? 
static_cast(v_uint16::nlanes) : - static_cast(v_int16::nlanes); - if (length < nlanes) - return 0; - - int x = 0; - for (;;) - { - for (; x <= length - nlanes; x += nlanes) - { - v_float32 a1 = v_load_f32(in + x); - v_float32 a2 = v_load_f32(in + x + nlanes / 2); - - absdiffc_short_store_c1c2c4(&out[x], v_round(v_absdiff(a1, s)), - v_round(v_absdiff(a2, s))); - } - - if (x < length && (in != out)) - { - x = length - nlanes; - continue; // process unaligned tail - } - break; - } - return x; -} - -template<> -CV_ALWAYS_INLINE int absdiffc_simd_c1c2c4(const uchar in[], uchar out[], - const v_float32& s, const int length) -{ - constexpr int nlanes = static_cast(v_uint8::nlanes); - - if (length < nlanes) - return 0; - - int x = 0; - for (;;) - { - for (; x <= length - nlanes; x += nlanes) - { - v_float32 a1 = v_load_f32(in + x); - v_float32 a2 = v_load_f32(in + x + nlanes / 4); - v_float32 a3 = v_load_f32(in + x + nlanes / 2); - v_float32 a4 = v_load_f32(in + x + 3 * nlanes / 4); - - vx_store(&out[x], v_pack_u(v_pack(v_round(v_absdiff(a1, s)), - v_round(v_absdiff(a2, s))), - v_pack(v_round(v_absdiff(a3, s)), - v_round(v_absdiff(a4, s))))); - } - - if (x < length && (in != out)) - { - x = length - nlanes; - continue; // process unaligned tail - } - break; - } - return x; -} - -CV_ALWAYS_INLINE void absdiffc_short_store_c3(short* out_ptr, const v_int32& c1, - const v_int32& c2, const v_int32& c3, - const v_int32& c4, const v_int32& c5, - const v_int32& c6) -{ - constexpr int nlanes = static_cast(v_int16::nlanes); - vx_store(out_ptr, v_pack(c1, c2)); - vx_store(out_ptr + nlanes, v_pack(c3, c4)); - vx_store(out_ptr + 2*nlanes, v_pack(c5, c6)); -} - -CV_ALWAYS_INLINE void absdiffc_short_store_c3(ushort* out_ptr, const v_int32& c1, - const v_int32& c2, const v_int32& c3, - const v_int32& c4, const v_int32& c5, - const v_int32& c6) -{ - constexpr int nlanes = static_cast(v_uint16::nlanes); - vx_store(out_ptr, v_pack_u(c1, c2)); - vx_store(out_ptr + nlanes, v_pack_u(c3, c4)); - 
vx_store(out_ptr + 2*nlanes, v_pack_u(c5, c6)); -} - -template -CV_ALWAYS_INLINE int absdiffc_simd_c3_impl(const T in[], T out[], - const v_float32& s1, const v_float32& s2, - const v_float32& s3, const int length) -{ - static_assert((std::is_same::value) || (std::is_same::value), - "This templated overload is only for short or ushort type combinations."); - - constexpr int nlanes = (std::is_same::value) ? static_cast(v_uint16::nlanes): - static_cast(v_int16::nlanes); - - if (length < 3 * nlanes) - return 0; - - int x = 0; - for (;;) - { - for (; x <= length - 3 * nlanes; x += 3 * nlanes) - { - v_float32 a1 = v_load_f32(in + x); - v_float32 a2 = v_load_f32(in + x + nlanes / 2); - v_float32 a3 = v_load_f32(in + x + nlanes); - v_float32 a4 = v_load_f32(in + x + 3 * nlanes / 2); - v_float32 a5 = v_load_f32(in + x + 2 * nlanes); - v_float32 a6 = v_load_f32(in + x + 5 * nlanes / 2); - - absdiffc_short_store_c3(&out[x], v_round(v_absdiff(a1, s1)), - v_round(v_absdiff(a2, s2)), - v_round(v_absdiff(a3, s3)), - v_round(v_absdiff(a4, s1)), - v_round(v_absdiff(a5, s2)), - v_round(v_absdiff(a6, s3))); - } - - if (x < length && (in != out)) - { - x = length - 3 * nlanes; - continue; // process unaligned tail - } - break; - } - return x; -} - -template<> -CV_ALWAYS_INLINE int absdiffc_simd_c3_impl(const uchar in[], uchar out[], - const v_float32& s1, const v_float32& s2, - const v_float32& s3, const int length) -{ - constexpr int nlanes = static_cast(v_uint8::nlanes); - - if (length < 3 * nlanes) - return 0; - - int x = 0; - - for (;;) - { - for (; x <= length - 3 * nlanes; x += 3 * nlanes) - { - vx_store(&out[x], - v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x), s1)), - v_round(v_absdiff(v_load_f32(in + x + nlanes/4), s2))), - v_pack(v_round(v_absdiff(v_load_f32(in + x + nlanes/2), s3)), - v_round(v_absdiff(v_load_f32(in + x + 3*nlanes/4), s1))))); - - vx_store(&out[x + nlanes], - v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x + nlanes), s2)), - 
v_round(v_absdiff(v_load_f32(in + x + 5*nlanes/4), s3))), - v_pack(v_round(v_absdiff(v_load_f32(in + x + 3*nlanes/2), s1)), - v_round(v_absdiff(v_load_f32(in + x + 7*nlanes/4), s2))))); - - vx_store(&out[x + 2 * nlanes], - v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x + 2*nlanes), s3)), - v_round(v_absdiff(v_load_f32(in + x + 9*nlanes/4), s1))), - v_pack(v_round(v_absdiff(v_load_f32(in + x + 5*nlanes/2), s2)), - v_round(v_absdiff(v_load_f32(in + x + 11*nlanes/4), s3))))); - } - - if (x < length && (in != out)) - { - x = length - 3 * nlanes; - continue; // process unaligned tail - } - break; - } - return x; -} - -template -CV_ALWAYS_INLINE int absdiffc_simd_channels(const T in[], const float scalar[], T out[], - const int width, int chan) -{ - int length = width * chan; - v_float32 s = vx_load(scalar); - - return absdiffc_simd_c1c2c4(in, out, s, length); -} - -template -CV_ALWAYS_INLINE int absdiffc_simd_c3(const T in[], const float scalar[], T out[], int width) -{ - constexpr int chan = 3; - int length = width * chan; - - v_float32 s1 = vx_load(scalar); -#if CV_SIMD_WIDTH == 32 - v_float32 s2 = vx_load(scalar + 2); - v_float32 s3 = vx_load(scalar + 1); -#else - v_float32 s2 = vx_load(scalar + 1); - v_float32 s3 = vx_load(scalar + 2); -#endif - - return absdiffc_simd_c3_impl(in, out, s1, s2, s3, length); -} - -template -CV_ALWAYS_INLINE int absdiffc_simd(const T in[], const float scalar[], T out[], int width, int chan) -{ - switch (chan) - { - case 1: - case 2: - case 4: - return absdiffc_simd_channels(in, scalar, out, width, chan); - case 3: - return absdiffc_simd_c3(in, scalar, out, width); - default: - break; - } - - return 0; -} -#endif // CV_SIMD - template static void run_absdiffc(Buffer &dst, const View &src, const float scalar[]) { @@ -1240,13 +1002,14 @@ static void run_absdiffc(Buffer &dst, const View &src, const float scalar[]) int width = dst.length(); int chan = dst.meta().chan; + const int length = width * chan; int w = 0; #if CV_SIMD - w = 
absdiffc_simd(in, scalar, out, width, chan); + w = absdiffc_simd(in, scalar, out, length, chan); #endif - for (; w < width*chan; ++w) + for (; w < length; ++w) out[w] = absdiff(in[w], scalar[w%chan]); } @@ -1349,6 +1112,32 @@ static void run_arithm_rs(Buffer &dst, const View &src, const float scalar[4], A } } +CV_ALWAYS_INLINE void initScratchBuffer(Buffer& scratch) +{ +#if CV_SIMD + // 512 bits / 32 bits = 16 elements of float32 can fill an AVX-512 SIMD vector. + constexpr int maxNlanes = 16; + + // +2 is the offset for the 3-channel case. + // The offset is needed to correctly load coefficients from the scalar array into SIMD vectors for the 3-channel case. + // Scalar array looks like: scalar[] = {C1, C2, C3, C1, C2, C3, ...} + // The first scalar SIMD vector should look like: + // C1 C2 C3 C1 + // The second: + // C2 C3 C1 C2 + // The third: + // C3 C1 C2 C3 + constexpr int offset = 2; + constexpr int buflen = maxNlanes + offset; +#else + constexpr int buflen = 4; +#endif + cv::Size bufsize(buflen, 1); + GMatDesc bufdesc = { CV_32F, 1, bufsize }; + Buffer buffer(bufdesc); + scratch = std::move(buffer); +} + GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true) { static const int Window = 1; @@ -1370,21 +1159,14 @@ GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true) UNARY_(uchar, uchar, run_absdiffc, dst, src, scalar); UNARY_(ushort, ushort, run_absdiffc, dst, src, scalar); UNARY_(short, short, run_absdiffc, dst, src, scalar); + UNARY_(float, float, run_absdiffc, dst, src, scalar); CV_Error(cv::Error::StsBadArg, "unsupported combination of types"); } static void initScratch(const GMatDesc&, const GScalarDesc&, Buffer& scratch) { -#if CV_SIMD - constexpr int buflen = static_cast(v_float32::nlanes) + 2; // buffer size -#else - constexpr int buflen = 4; -#endif - cv::Size bufsize(buflen, 1); - GMatDesc bufdesc = { CV_32F, 1, bufsize }; - Buffer buffer(bufdesc); - scratch = std::move(buffer); + initScratchBuffer(scratch); } static void resetScratch(Buffer& /* 
scratch */) @@ -1392,32 +1174,6 @@ GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true) } }; -CV_ALWAYS_INLINE void initScratchBuffer(Buffer& scratch) -{ -#if CV_SIMD - // 512 bits / 32 bits = 16 elements of float32 can contain a AVX 512 SIMD vector. - constexpr int maxNlanes = 16; - - // +2 is offset for 3-channel case. - // Offset is need to right load coefficients from scalar array to SIMD vectors for 3-channel case. - // Scalar array looks like: scalar[] = {C1, C2, C3, C1, C2, C3, ...} - // The first scalar SIMD vector should looks like: - // C1 C2 C3 C1 - // The second: - // C2 C3 C1 C2 - // The third: - // C3 C1 C2 C3 - constexpr int offset = 2; - constexpr int buflen = maxNlanes + offset; -#else - constexpr int buflen = 4; -#endif - cv::Size bufsize(buflen, 1); - GMatDesc bufdesc = { CV_32F, 1, bufsize }; - Buffer buffer(bufdesc); - scratch = std::move(buffer); -} - GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, true) { static const int Window = 1; diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp index f596779286..ab6b013694 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp @@ -165,6 +165,21 @@ MULC_SIMD(float, float) #undef MULC_SIMD +#define ABSDIFFC_SIMD(SRC) \ +int absdiffc_simd(const SRC in[], const float scalar[], SRC out[], \ + const int length, const int chan) \ +{ \ + CV_CPU_DISPATCH(absdiffc_simd, (in, scalar, out, length, chan), \ + CV_CPU_DISPATCH_MODES_ALL); \ +} + +ABSDIFFC_SIMD(uchar) +ABSDIFFC_SIMD(short) +ABSDIFFC_SIMD(ushort) +ABSDIFFC_SIMD(float) + +#undef ABSDIFFC_SIMD + } // namespace fluid } // namespace gapi } // namespace cv diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp index 541870e548..522d7b8b44 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp +++ 
b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp @@ -60,8 +60,8 @@ MUL_SIMD(float, float) #undef MUL_SIMD -#define ADDC_SIMD(SRC, DST) \ -int addc_simd(const SRC in[], const float scalar[], DST out[], \ +#define ADDC_SIMD(SRC, DST) \ +int addc_simd(const SRC in[], const float scalar[], DST out[], \ const int length, const int chan); ADDC_SIMD(uchar, uchar) @@ -83,8 +83,8 @@ ADDC_SIMD(float, float) #undef ADDC_SIMD -#define SUBC_SIMD(SRC, DST) \ -int subc_simd(const SRC in[], const float scalar[], DST out[], \ +#define SUBC_SIMD(SRC, DST) \ +int subc_simd(const SRC in[], const float scalar[], DST out[], \ const int length, const int chan); SUBC_SIMD(uchar, uchar) @@ -106,8 +106,8 @@ SUBC_SIMD(float, float) #undef SUBC_SIMD -#define MULC_SIMD(SRC, DST) \ -int mulc_simd(const SRC in[], const float scalar[], DST out[], \ +#define MULC_SIMD(SRC, DST) \ +int mulc_simd(const SRC in[], const float scalar[], DST out[], \ const int length, const int chan, const float scale); MULC_SIMD(uchar, uchar) @@ -129,6 +129,17 @@ MULC_SIMD(float, float) #undef MULC_SIMD +#define ABSDIFFC_SIMD(T) \ +int absdiffc_simd(const T in[], const float scalar[], T out[], \ + const int length, const int chan); + +ABSDIFFC_SIMD(uchar) +ABSDIFFC_SIMD(short) +ABSDIFFC_SIMD(ushort) +ABSDIFFC_SIMD(float) + +#undef ABSDIFFC_SIMD + } // namespace fluid } // namespace gapi } // namespace cv diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp index 45974131c3..12b74f8f67 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp @@ -151,6 +151,17 @@ MULC_SIMD(float, float) #undef MULC_SIMD +#define ABSDIFFC_SIMD(T) \ +int absdiffc_simd(const T in[], const float scalar[], T out[], \ + const int length, const int chan); + +ABSDIFFC_SIMD(uchar) +ABSDIFFC_SIMD(short) +ABSDIFFC_SIMD(ushort) +ABSDIFFC_SIMD(float) + +#undef ABSDIFFC_SIMD + #ifndef 
CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY struct scale_tag {}; @@ -901,6 +912,7 @@ MUL_SIMD(float, float) struct add_tag {}; struct sub_tag {}; struct mul_tag {}; +struct absdiff_tag {}; CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(short* outx, const v_int32& c1, const v_int32& c2, const v_int32& c3, @@ -938,6 +950,12 @@ CV_ALWAYS_INLINE v_float32 oper(mul_tag, const v_float32& a, const v_float32& sc { return a * sc; } + +CV_ALWAYS_INLINE v_float32 oper(absdiff_tag, const v_float32& a, const v_float32& sc) +{ + return v_absdiff(a, sc); +} + //------------------------------------------------------------------------------------------------- template @@ -1450,6 +1468,38 @@ MULC_SIMD(float, float) #undef MULC_SIMD +//------------------------- +// +// Fluid kernels: AbsDiffC +// +//------------------------- + +#define ABSDIFFC_SIMD(SRC) \ +int absdiffc_simd(const SRC in[], const float scalar[], SRC out[], \ + const int length, const int chan) \ +{ \ + switch (chan) \ + { \ + case 1: \ + case 2: \ + case 4: \ + return arithmOpScalar_simd_common(absdiff_tag{}, in, scalar, out, length); \ + case 3: \ + return arithmOpScalar_simd_c3(absdiff_tag{}, in, scalar, out, length); \ + default: \ + GAPI_Assert(chan <= 4); \ + break; \ + } \ + return 0; \ +} + +ABSDIFFC_SIMD(uchar) +ABSDIFFC_SIMD(short) +ABSDIFFC_SIMD(ushort) +ABSDIFFC_SIMD(float) + +#undef ABSDIFFC_SIMD + #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY CV_CPU_OPTIMIZATION_NAMESPACE_END