From a4d6bcba0998bdb30eac2e6d333d389906617bed Mon Sep 17 00:00:00 2001
From: Anna Khakimova <anna.khakimova@intel.com>
Date: Mon, 6 Dec 2021 13:59:26 +0300
Subject: [PATCH] GAPI Fluid: Enable dynamic dispatching for AbsDiffC kernel.

---
 .../gapi/perf/common/gapi_core_perf_tests.hpp |   2 +-
 .../perf/common/gapi_core_perf_tests_inl.hpp  |  12 +-
 .../perf/cpu/gapi_core_perf_tests_cpu.cpp     |   3 +-
 .../perf/cpu/gapi_core_perf_tests_fluid.cpp   |   7 +-
 .../perf/gpu/gapi_core_perf_tests_gpu.cpp     |   3 +-
 .../gapi/src/backends/fluid/gfluidcore.cpp    | 306 ++----------------
 .../fluid/gfluidcore_func.dispatch.cpp        |  15 +
 .../src/backends/fluid/gfluidcore_func.hpp    |  23 +-
 .../backends/fluid/gfluidcore_func.simd.hpp   |  50 +++
 9 files changed, 128 insertions(+), 293 deletions(-)

diff --git a/modules/gapi/perf/common/gapi_core_perf_tests.hpp b/modules/gapi/perf/common/gapi_core_perf_tests.hpp
index 4084ed3e88..f3f251167b 100644
--- a/modules/gapi/perf/common/gapi_core_perf_tests.hpp
+++ b/modules/gapi/perf/common/gapi_core_perf_tests.hpp
@@ -50,7 +50,7 @@ namespace opencv_test
     class MinPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
     class MaxPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
     class AbsDiffPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
-    class AbsDiffCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
+    class AbsDiffCPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, cv::GCompileArgs>> {};
     class SumPerfTest : public TestPerfParams<tuple<compare_scalar_f, cv::Size, MatType, cv::GCompileArgs>> {};
     class CountNonZeroPerfTest : public TestPerfParams<tuple<compare_scalar_f, cv::Size, MatType, cv::GCompileArgs>> {};
     class AddWeightedPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};
diff --git a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
index d4144cd71a..96ce369081 100644
--- a/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
+++ b/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
@@ -970,9 +970,10 @@ PERF_TEST_P_(AbsDiffPerfTest, TestPerformance)
 
 PERF_TEST_P_(AbsDiffCPerfTest, TestPerformance)
 {
-    cv::Size sz_in = get<0>(GetParam());
-    MatType type = get<1>(GetParam());
-    cv::GCompileArgs compile_args = get<2>(GetParam());
+    compare_f cmpF = get<0>(GetParam());
+    cv::Size sz_in = get<1>(GetParam());
+    MatType type = get<2>(GetParam());
+    cv::GCompileArgs compile_args = get<3>(GetParam());
 
 
     initMatsRandU(type, sz_in, type, false);
@@ -997,8 +998,9 @@ PERF_TEST_P_(AbsDiffCPerfTest, TestPerformance)
     }
 
     // Comparison ////////////////////////////////////////////////////////////
-    // FIXIT unrealiable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
-    EXPECT_EQ(out_mat_gapi.size(), sz_in);
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+    }
 
     SANITY_CHECK_NOTHING();
 }
diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
index 1255f5ca52..c110de4fdd 100644
--- a/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
+++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
@@ -156,7 +156,8 @@ INSTANTIATE_TEST_CASE_P(AbsDiffPerfTestCPU, AbsDiffPerfTest,
         Values(cv::compile_args(CORE_CPU))));
 
 INSTANTIATE_TEST_CASE_P(AbsDiffCPerfTestCPU, AbsDiffCPerfTest,
-    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
         Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
         Values(cv::compile_args(CORE_CPU))));
 
diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
index 058cff69ac..442d9efa7a 100644
--- a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
+++ b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
@@ -153,10 +153,9 @@ INSTANTIATE_TEST_CASE_P(AbsDiffPerfTestFluid, AbsDiffPerfTest,
             Values(cv::compile_args(CORE_FLUID))));
 
 INSTANTIATE_TEST_CASE_P(AbsDiffCPerfTestFluid, AbsDiffCPerfTest,
-    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
-            Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_8UC2,
-                   CV_16UC2, CV_16SC2, CV_8UC3, CV_16UC3,
-                   CV_16SC3, CV_8UC4, CV_16UC4, CV_16SC4),
+    Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()),
+            Values(szSmall128, szVGA, sz720p, sz1080p),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
             Values(cv::compile_args(CORE_FLUID))));
 
 // INSTANTIATE_TEST_CASE_P(SumPerfTestFluid, SumPerfTest,
diff --git a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
index 025ea5331d..b567f8cd8a 100644
--- a/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
+++ b/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
@@ -154,7 +154,8 @@ INSTANTIATE_TEST_CASE_P(AbsDiffPerfTestGPU, AbsDiffPerfTest,
                                 Values(cv::compile_args(CORE_GPU))));
 
 INSTANTIATE_TEST_CASE_P(AbsDiffCPerfTestGPU, AbsDiffCPerfTest,
-                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                        Combine(Values(AbsExact().to_compare_f()),
+                            Values( szSmall128, szVGA, sz720p, sz1080p ),
                                 Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
                                 Values(cv::compile_args(CORE_GPU))));
 
diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp
index a0513a09cd..8342a26d0d 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp
@@ -994,244 +994,6 @@ static void run_arithm_s(DST out[], const SRC in[], int width, int chan,
         CV_Error(cv::Error::StsBadArg, "unsupported number of channels");
 }
 
-#if CV_SIMD
-CV_ALWAYS_INLINE void absdiffc_short_store_c1c2c4(short* out_ptr, const v_int32& c1, const v_int32& c2)
-{
-    vx_store(out_ptr, v_pack(c1, c2));
-}
-
-CV_ALWAYS_INLINE void absdiffc_short_store_c1c2c4(ushort* out_ptr, const v_int32& c1, const v_int32& c2)
-{
-    vx_store(out_ptr, v_pack_u(c1, c2));
-}
-
-template<typename T>
-CV_ALWAYS_INLINE int absdiffc_simd_c1c2c4(const T in[], T out[],
-                                          const v_float32& s, const int length)
-{
-    static_assert((std::is_same<T, ushort>::value) || (std::is_same<T, short>::value),
-                  "This templated overload is only for short or ushort type combinations.");
-
-    constexpr int nlanes = (std::is_same<T, ushort>::value) ? static_cast<int>(v_uint16::nlanes) :
-                                                              static_cast<int>(v_int16::nlanes);
-    if (length < nlanes)
-        return 0;
-
-    int x = 0;
-    for (;;)
-    {
-        for (; x <= length - nlanes; x += nlanes)
-        {
-            v_float32 a1 = v_load_f32(in + x);
-            v_float32 a2 = v_load_f32(in + x + nlanes / 2);
-
-            absdiffc_short_store_c1c2c4(&out[x], v_round(v_absdiff(a1, s)),
-                                                 v_round(v_absdiff(a2, s)));
-        }
-
-        if (x < length && (in != out))
-        {
-            x = length - nlanes;
-            continue;  // process unaligned tail
-        }
-        break;
-    }
-    return x;
-}
-
-template<>
-CV_ALWAYS_INLINE int absdiffc_simd_c1c2c4<uchar>(const uchar in[], uchar out[],
-                                                 const v_float32& s, const int length)
-{
-    constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
-
-    if (length < nlanes)
-        return 0;
-
-    int x = 0;
-    for (;;)
-    {
-        for (; x <= length - nlanes; x += nlanes)
-        {
-            v_float32 a1 = v_load_f32(in + x);
-            v_float32 a2 = v_load_f32(in + x + nlanes / 4);
-            v_float32 a3 = v_load_f32(in + x + nlanes / 2);
-            v_float32 a4 = v_load_f32(in + x + 3 * nlanes / 4);
-
-            vx_store(&out[x], v_pack_u(v_pack(v_round(v_absdiff(a1, s)),
-                                              v_round(v_absdiff(a2, s))),
-                                       v_pack(v_round(v_absdiff(a3, s)),
-                                              v_round(v_absdiff(a4, s)))));
-        }
-
-        if (x < length && (in != out))
-        {
-            x = length - nlanes;
-            continue;  // process unaligned tail
-        }
-        break;
-    }
-    return x;
-}
-
-CV_ALWAYS_INLINE void absdiffc_short_store_c3(short* out_ptr, const v_int32& c1,
-                                              const v_int32& c2, const v_int32& c3,
-                                              const v_int32& c4, const v_int32& c5,
-                                              const v_int32& c6)
-{
-    constexpr int nlanes = static_cast<int>(v_int16::nlanes);
-    vx_store(out_ptr, v_pack(c1, c2));
-    vx_store(out_ptr + nlanes, v_pack(c3, c4));
-    vx_store(out_ptr + 2*nlanes, v_pack(c5, c6));
-}
-
-CV_ALWAYS_INLINE void absdiffc_short_store_c3(ushort* out_ptr, const v_int32& c1,
-                                              const v_int32& c2, const v_int32& c3,
-                                              const v_int32& c4, const v_int32& c5,
-                                              const v_int32& c6)
-{
-    constexpr int nlanes = static_cast<int>(v_uint16::nlanes);
-    vx_store(out_ptr, v_pack_u(c1, c2));
-    vx_store(out_ptr + nlanes, v_pack_u(c3, c4));
-    vx_store(out_ptr + 2*nlanes, v_pack_u(c5, c6));
-}
-
-template<typename T>
-CV_ALWAYS_INLINE int absdiffc_simd_c3_impl(const T in[], T out[],
-                                           const v_float32& s1, const v_float32& s2,
-                                           const v_float32& s3, const int length)
-{
-    static_assert((std::is_same<T, ushort>::value) || (std::is_same<T, short>::value),
-                  "This templated overload is only for short or ushort type combinations.");
-
-    constexpr int nlanes = (std::is_same<T, ushort>::value) ? static_cast<int>(v_uint16::nlanes):
-                                                              static_cast<int>(v_int16::nlanes);
-
-    if (length < 3 * nlanes)
-        return 0;
-
-    int x = 0;
-    for (;;)
-    {
-        for (; x <= length - 3 * nlanes; x += 3 * nlanes)
-        {
-            v_float32 a1 = v_load_f32(in + x);
-            v_float32 a2 = v_load_f32(in + x + nlanes / 2);
-            v_float32 a3 = v_load_f32(in + x + nlanes);
-            v_float32 a4 = v_load_f32(in + x + 3 * nlanes / 2);
-            v_float32 a5 = v_load_f32(in + x + 2 * nlanes);
-            v_float32 a6 = v_load_f32(in + x + 5 * nlanes / 2);
-
-            absdiffc_short_store_c3(&out[x], v_round(v_absdiff(a1, s1)),
-                                             v_round(v_absdiff(a2, s2)),
-                                             v_round(v_absdiff(a3, s3)),
-                                             v_round(v_absdiff(a4, s1)),
-                                             v_round(v_absdiff(a5, s2)),
-                                             v_round(v_absdiff(a6, s3)));
-        }
-
-        if (x < length && (in != out))
-        {
-            x = length - 3 * nlanes;
-            continue;  // process unaligned tail
-        }
-        break;
-    }
-    return x;
-}
-
-template<>
-CV_ALWAYS_INLINE int absdiffc_simd_c3_impl<uchar>(const uchar in[], uchar out[],
-                                                  const v_float32& s1, const v_float32& s2,
-                                                  const v_float32& s3, const int length)
-{
-    constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
-
-    if (length < 3 * nlanes)
-        return 0;
-
-    int x = 0;
-
-    for (;;)
-    {
-        for (; x <= length - 3 * nlanes; x += 3 * nlanes)
-        {
-            vx_store(&out[x],
-                     v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x), s1)),
-                                     v_round(v_absdiff(v_load_f32(in + x + nlanes/4), s2))),
-                              v_pack(v_round(v_absdiff(v_load_f32(in + x + nlanes/2), s3)),
-                                     v_round(v_absdiff(v_load_f32(in + x + 3*nlanes/4), s1)))));
-
-            vx_store(&out[x + nlanes],
-                     v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x + nlanes), s2)),
-                                     v_round(v_absdiff(v_load_f32(in + x + 5*nlanes/4), s3))),
-                              v_pack(v_round(v_absdiff(v_load_f32(in + x + 3*nlanes/2), s1)),
-                                     v_round(v_absdiff(v_load_f32(in + x + 7*nlanes/4), s2)))));
-
-            vx_store(&out[x + 2 * nlanes],
-                     v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x + 2*nlanes), s3)),
-                                     v_round(v_absdiff(v_load_f32(in + x + 9*nlanes/4), s1))),
-                              v_pack(v_round(v_absdiff(v_load_f32(in + x + 5*nlanes/2), s2)),
-                                     v_round(v_absdiff(v_load_f32(in + x + 11*nlanes/4), s3)))));
-        }
-
-        if (x < length && (in != out))
-        {
-            x = length - 3 * nlanes;
-            continue;  // process unaligned tail
-        }
-        break;
-    }
-    return x;
-}
-
-template<typename T>
-CV_ALWAYS_INLINE int absdiffc_simd_channels(const T in[], const float scalar[], T out[],
-                                            const int width, int chan)
-{
-    int length = width * chan;
-    v_float32 s = vx_load(scalar);
-
-    return absdiffc_simd_c1c2c4(in, out, s, length);
-}
-
-template<typename T>
-CV_ALWAYS_INLINE int absdiffc_simd_c3(const T in[], const float scalar[], T out[], int width)
-{
-    constexpr int chan = 3;
-    int length = width * chan;
-
-    v_float32 s1 = vx_load(scalar);
-#if CV_SIMD_WIDTH == 32
-    v_float32 s2 = vx_load(scalar + 2);
-    v_float32 s3 = vx_load(scalar + 1);
-#else
-    v_float32 s2 = vx_load(scalar + 1);
-    v_float32 s3 = vx_load(scalar + 2);
-#endif
-
-    return absdiffc_simd_c3_impl(in, out, s1, s2, s3, length);
-}
-
-template<typename T>
-CV_ALWAYS_INLINE int absdiffc_simd(const T in[], const float scalar[], T out[], int width, int chan)
-{
-    switch (chan)
-    {
-    case 1:
-    case 2:
-    case 4:
-        return absdiffc_simd_channels(in, scalar, out, width, chan);
-    case 3:
-        return absdiffc_simd_c3(in, scalar, out, width);
-    default:
-        break;
-    }
-
-    return 0;
-}
-#endif  // CV_SIMD
-
 template<typename DST, typename SRC>
 static void run_absdiffc(Buffer &dst, const View &src, const float scalar[])
 {
@@ -1240,13 +1002,14 @@ static void run_absdiffc(Buffer &dst, const View &src, const float scalar[])
 
     int width = dst.length();
     int chan = dst.meta().chan;
+    const int length = width * chan;
 
     int w = 0;
 #if CV_SIMD
-    w = absdiffc_simd(in, scalar, out, width, chan);
+    w = absdiffc_simd(in, scalar, out, length, chan);
 #endif
 
-    for (; w < width*chan; ++w)
+    for (; w < length; ++w)
         out[w] = absdiff<DST>(in[w], scalar[w%chan]);
 }
 
@@ -1349,6 +1112,32 @@ static void run_arithm_rs(Buffer &dst, const View &src, const float scalar[4], A
     }
 }
 
+CV_ALWAYS_INLINE void initScratchBuffer(Buffer& scratch)
+{
+#if CV_SIMD
+    // 512 bits / 32 bits = 16 float32 elements fit into one AVX-512 SIMD vector.
+    constexpr int maxNlanes = 16;
+
+    // +2 is the offset for the 3-channel case.
+    // The offset is needed to correctly load coefficients from the scalar array into SIMD vectors for the 3-channel case.
+    // The scalar array looks like: scalar[] = {C1, C2, C3, C1, C2, C3, ...}
+    // The first scalar SIMD vector should look like:
+    // C1 C2 C3 C1
+    // The second:
+    // C2 C3 C1 C2
+    // The third:
+    // C3 C1 C2 C3
+    constexpr int offset = 2;
+    constexpr int buflen = maxNlanes + offset;
+#else
+    constexpr int buflen = 4;
+#endif
+    cv::Size bufsize(buflen, 1);
+    GMatDesc bufdesc = { CV_32F, 1, bufsize };
+    Buffer buffer(bufdesc);
+    scratch = std::move(buffer);
+}
+
 GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true)
 {
     static const int Window = 1;
@@ -1370,21 +1159,14 @@ GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true)
         UNARY_(uchar, uchar, run_absdiffc, dst, src, scalar);
         UNARY_(ushort, ushort, run_absdiffc, dst, src, scalar);
         UNARY_(short, short, run_absdiffc, dst, src, scalar);
+        UNARY_(float, float, run_absdiffc, dst, src, scalar);
 
         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }
 
     static void initScratch(const GMatDesc&, const GScalarDesc&, Buffer& scratch)
     {
-#if CV_SIMD
-        constexpr int buflen = static_cast<int>(v_float32::nlanes) + 2; // buffer size
-#else
-        constexpr int buflen = 4;
-#endif
-        cv::Size bufsize(buflen, 1);
-        GMatDesc bufdesc = { CV_32F, 1, bufsize };
-        Buffer buffer(bufdesc);
-        scratch = std::move(buffer);
+        initScratchBuffer(scratch);
     }
 
     static void resetScratch(Buffer& /* scratch */)
@@ -1392,32 +1174,6 @@ GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true)
     }
 };
 
-CV_ALWAYS_INLINE void initScratchBuffer(Buffer& scratch)
-{
-#if CV_SIMD
-    // 512 bits / 32 bits = 16 elements of float32 can contain a AVX 512 SIMD vector.
-    constexpr int maxNlanes = 16;
-
-    // +2 is offset for 3-channel case.
-    // Offset is need to right load coefficients from scalar array to SIMD vectors for 3-channel case.
-    // Scalar array looks like: scalar[] = {C1, C2, C3, C1, C2, C3, ...}
-    // The first scalar SIMD vector should looks like:
-    // C1 C2 C3 C1
-    // The second:
-    // C2 C3 C1 C2
-    // The third:
-    // C3 C1 C2 C3
-    constexpr int offset = 2;
-    constexpr int buflen = maxNlanes + offset;
-#else
-    constexpr int buflen = 4;
-#endif
-    cv::Size bufsize(buflen, 1);
-    GMatDesc bufdesc = { CV_32F, 1, bufsize };
-    Buffer buffer(bufdesc);
-    scratch = std::move(buffer);
-}
-
 GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, true)
 {
     static const int Window = 1;
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
index f596779286..ab6b013694 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
@@ -165,6 +165,21 @@ MULC_SIMD(float, float)
 
 #undef MULC_SIMD
 
+#define ABSDIFFC_SIMD(SRC)                                               \
+int absdiffc_simd(const SRC in[], const float scalar[], SRC out[],       \
+                  const int length, const int chan)                      \
+{                                                                        \
+    CV_CPU_DISPATCH(absdiffc_simd, (in, scalar, out, length, chan),      \
+                    CV_CPU_DISPATCH_MODES_ALL);                          \
+}
+
+ABSDIFFC_SIMD(uchar)
+ABSDIFFC_SIMD(short)
+ABSDIFFC_SIMD(ushort)
+ABSDIFFC_SIMD(float)
+
+#undef ABSDIFFC_SIMD
+
 } // namespace fluid
 } // namespace gapi
 } // namespace cv
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
index 541870e548..522d7b8b44 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp
@@ -60,8 +60,8 @@ MUL_SIMD(float, float)
 
 #undef MUL_SIMD
 
-#define ADDC_SIMD(SRC, DST)                                                              \
-int addc_simd(const SRC in[], const float scalar[], DST out[],                           \
+#define ADDC_SIMD(SRC, DST)                                             \
+int addc_simd(const SRC in[], const float scalar[], DST out[],          \
               const int length, const int chan);
 
 ADDC_SIMD(uchar, uchar)
@@ -83,8 +83,8 @@ ADDC_SIMD(float, float)
 
 #undef ADDC_SIMD
 
-#define SUBC_SIMD(SRC, DST)                                                              \
-int subc_simd(const SRC in[], const float scalar[], DST out[],                           \
+#define SUBC_SIMD(SRC, DST)                                        \
+int subc_simd(const SRC in[], const float scalar[], DST out[],     \
               const int length, const int chan);
 
 SUBC_SIMD(uchar, uchar)
@@ -106,8 +106,8 @@ SUBC_SIMD(float, float)
 
 #undef SUBC_SIMD
 
-#define MULC_SIMD(SRC, DST)                                                              \
-int mulc_simd(const SRC in[], const float scalar[], DST out[],                           \
+#define MULC_SIMD(SRC, DST)                                                 \
+int mulc_simd(const SRC in[], const float scalar[], DST out[],              \
               const int length, const int chan, const float scale);
 
 MULC_SIMD(uchar, uchar)
@@ -129,6 +129,17 @@ MULC_SIMD(float, float)
 
 #undef MULC_SIMD
 
+#define ABSDIFFC_SIMD(T)                                            \
+int absdiffc_simd(const T in[], const float scalar[], T out[],      \
+                  const int length, const int chan);
+
+ABSDIFFC_SIMD(uchar)
+ABSDIFFC_SIMD(short)
+ABSDIFFC_SIMD(ushort)
+ABSDIFFC_SIMD(float)
+
+#undef ABSDIFFC_SIMD
+
 }  // namespace fluid
 }  // namespace gapi
 }  // namespace cv
diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
index 45974131c3..12b74f8f67 100644
--- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
@@ -151,6 +151,17 @@ MULC_SIMD(float, float)
 
 #undef MULC_SIMD
 
+#define ABSDIFFC_SIMD(T)                                            \
+int absdiffc_simd(const T in[], const float scalar[], T out[],      \
+                  const int length, const int chan);
+
+ABSDIFFC_SIMD(uchar)
+ABSDIFFC_SIMD(short)
+ABSDIFFC_SIMD(ushort)
+ABSDIFFC_SIMD(float)
+
+#undef ABSDIFFC_SIMD
+
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
 struct scale_tag {};
@@ -901,6 +912,7 @@ MUL_SIMD(float, float)
 struct add_tag {};
 struct sub_tag {};
 struct mul_tag {};
+struct absdiff_tag {};
 
 CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(short* outx,       const v_int32& c1,
                                                    const v_int32& c2, const v_int32& c3,
@@ -938,6 +950,12 @@ CV_ALWAYS_INLINE v_float32 oper(mul_tag, const v_float32& a, const v_float32& sc
 {
     return a * sc;
 }
+
+CV_ALWAYS_INLINE v_float32 oper(absdiff_tag, const v_float32& a, const v_float32& sc)
+{
+    return v_absdiff(a, sc);
+}
+
 //-------------------------------------------------------------------------------------------------
 
 template<typename oper_tag, typename SRC, typename DST>
@@ -1450,6 +1468,38 @@ MULC_SIMD(float, float)
 
 #undef MULC_SIMD
 
+//-------------------------
+//
+// Fluid kernels: AbsDiffC
+//
+//-------------------------
+
+#define ABSDIFFC_SIMD(SRC)                                                          \
+int absdiffc_simd(const SRC in[], const float scalar[], SRC out[],                  \
+              const int length, const int chan)                                     \
+{                                                                                   \
+    switch (chan)                                                                   \
+    {                                                                               \
+    case 1:                                                                         \
+    case 2:                                                                         \
+    case 4:                                                                         \
+        return arithmOpScalar_simd_common(absdiff_tag{}, in, scalar, out, length);  \
+    case 3:                                                                         \
+        return arithmOpScalar_simd_c3(absdiff_tag{}, in, scalar, out, length);      \
+    default:                                                                        \
+        GAPI_Assert(chan <= 4);                                                     \
+        break;                                                                      \
+    }                                                                               \
+    return 0;                                                                       \
+}
+
+ABSDIFFC_SIMD(uchar)
+ABSDIFFC_SIMD(short)
+ABSDIFFC_SIMD(ushort)
+ABSDIFFC_SIMD(float)
+
+#undef ABSDIFFC_SIMD
+
 #endif  // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
 CV_CPU_OPTIMIZATION_NAMESPACE_END