@@ -127,6 +127,30 @@ SUBC_SIMD(float, float)

#undef SUBC_SIMD

#define MULC_SIMD(SRC, DST)                                                  \
int mulc_simd(const SRC in[], const float scalar[], DST out[],               \
              const int length, const int chan, const float scale);

MULC_SIMD(uchar, uchar)
MULC_SIMD(ushort, uchar)
MULC_SIMD(short, uchar)
MULC_SIMD(float, uchar)
MULC_SIMD(short, short)
MULC_SIMD(ushort, short)
MULC_SIMD(uchar, short)
MULC_SIMD(float, short)
MULC_SIMD(ushort, ushort)
MULC_SIMD(uchar, ushort)
MULC_SIMD(short, ushort)
MULC_SIMD(float, ushort)
MULC_SIMD(uchar, float)
MULC_SIMD(ushort, float)
MULC_SIMD(short, float)
MULC_SIMD(float, float)

#undef MULC_SIMD
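
// Each MULC_SIMD(SRC, DST) line above stamps out one mulc_simd() overload;
// e.g. MULC_SIMD(uchar, uchar) expands to the declaration
//   int mulc_simd(const uchar in[], const float scalar[], uchar out[],
//                 const int length, const int chan, const float scale);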
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

struct scale_tag {};

@@ -870,12 +894,13 @@ MUL_SIMD(float, float)

//-------------------------
//
// Fluid kernels: AddC, SubC
//
//-------------------------

struct add_tag {};
struct sub_tag {};
struct mul_tag {};

CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(short* outx, const v_int32& c1,
                                                   const v_int32& c2, const v_int32& c3,

@@ -909,6 +934,12 @@ CV_ALWAYS_INLINE v_float32 oper(sub_tag, const v_float32& a, const v_float32& sc
    return a - sc;
}

CV_ALWAYS_INLINE v_float32 oper(mul_tag, const v_float32& a, const v_float32& sc)
{
    return a * sc;
}
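
// The empty add_tag / sub_tag / mul_tag structs select one of these oper()
// overloads at compile time, so the shared arithmOpScalar_* helpers can be
// reused for AddC, SubC and MulC without a runtime branch.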
//-------------------------------------------------------------------------------------------------

template<typename oper_tag, typename SRC, typename DST>
CV_ALWAYS_INLINE
typename std::enable_if<(std::is_same<DST, ushort>::value ||

@@ -957,7 +988,7 @@ CV_ALWAYS_INLINE
typename std::enable_if<std::is_same<DST, short>::value ||
                        std::is_same<DST, ushort>::value, void>::type
arithmOpScalar_simd_c3_impl(oper_tag t, const SRC* inx, DST* outx, const v_float32& s1, const v_float32& s2,
                            const v_float32& s3, const int nlanes)
{
    v_float32 a1 = vg_load_f32(inx);
    v_float32 a2 = vg_load_f32(&inx[nlanes / 2]);

@@ -1089,7 +1120,7 @@ CV_ALWAYS_INLINE int arithmOpScalar_simd_common(oper_tag t, const SRC in[],
    return x;
}

//-------------------------------------------------------------------------------------------------

#define ADDC_SIMD(SRC, DST)                                                  \
int addc_simd(const SRC in[], const float scalar[], DST out[],               \

@@ -1129,6 +1160,8 @@ ADDC_SIMD(float, float)

#undef ADDC_SIMD

//-------------------------------------------------------------------------------------------------

#define SUBC_SIMD(SRC, DST)                                                  \
int subc_simd(const SRC in[], const float scalar[], DST out[],               \
              const int length, const int chan)                              \

@@ -1167,6 +1200,256 @@ SUBC_SIMD(float, float)

#undef SUBC_SIMD

//-------------------------
//
// Fluid kernels: MulC
//
//-------------------------

template<typename SRC, typename DST>
CV_ALWAYS_INLINE
typename std::enable_if<std::is_same<DST, short>::value ||
                        std::is_same<DST, ushort>::value, void>::type
mulc_scale_simd_c3_impl(const SRC* inx, DST* outx, const v_float32& s1, const v_float32& s2,
                        const v_float32& s3, const v_float32& scale, const int nlanes)
{
    v_float32 a1 = vg_load_f32(inx);
    v_float32 a2 = vg_load_f32(&inx[nlanes / 2]);
    v_float32 a3 = vg_load_f32(&inx[nlanes]);
    v_float32 a4 = vg_load_f32(&inx[3 * nlanes / 2]);
    v_float32 a5 = vg_load_f32(&inx[2 * nlanes]);
    v_float32 a6 = vg_load_f32(&inx[5 * nlanes / 2]);

    arithmOpScalar_pack_store_c3(outx, v_round(scale * a1 * s1),
                                       v_round(scale * a2 * s2),
                                       v_round(scale * a3 * s3),
                                       v_round(scale * a4 * s1),
                                       v_round(scale * a5 * s2),
                                       v_round(scale * a6 * s3));
}
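
// The three scalar vectors s1, s2, s3 are rotations of the same per-channel
// constants: the consecutive half-vector loads a1..a6 cycle through them, so
// the multiply stays aligned with the repeating 3-channel pixel layout
// without any per-pixel shuffling.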
//-------------------------------------------------------------------------------------------------

template<typename SRC>
CV_ALWAYS_INLINE void mulc_scale_simd_c3_impl(const SRC* inx, uchar* outx,
                                              const v_float32& s1, const v_float32& s2,
                                              const v_float32& s3, const v_float32& scale, const int nlanes)
{
    vx_store(outx,
             v_pack_u(v_pack(v_round(scale * vg_load_f32(inx) * s1),
                             v_round(scale * vg_load_f32(&inx[nlanes/4]) * s2)),
                      v_pack(v_round(scale * vg_load_f32(&inx[nlanes/2]) * s3),
                             v_round(scale * vg_load_f32(&inx[3*nlanes/4]) * s1))));

    vx_store(&outx[nlanes],
             v_pack_u(v_pack(v_round(scale * vg_load_f32(&inx[nlanes]) * s2),
                             v_round(scale * vg_load_f32(&inx[5*nlanes/4]) * s3)),
                      v_pack(v_round(scale * vg_load_f32(&inx[3*nlanes/2]) * s1),
                             v_round(scale * vg_load_f32(&inx[7*nlanes/4]) * s2))));

    vx_store(&outx[2 * nlanes],
             v_pack_u(v_pack(v_round(scale * vg_load_f32(&inx[2*nlanes]) * s3),
                             v_round(scale * vg_load_f32(&inx[9*nlanes/4]) * s1)),
                      v_pack(v_round(scale * vg_load_f32(&inx[5*nlanes/2]) * s2),
                             v_round(scale * vg_load_f32(&inx[11*nlanes/4]) * s3))));
}
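
// uchar output needs a double narrowing step: v_round() yields 32-bit ints,
// v_pack() narrows them to 16 bits and v_pack_u() to unsigned 8 bits with
// saturation, so each vx_store() above writes one full uchar vector built
// from four float loads.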
//-------------------------------------------------------------------------------------------------

template<typename SRC>
CV_ALWAYS_INLINE void mulc_scale_simd_c3_impl(const SRC* in, float* out,
                                              const v_float32& s1, const v_float32& s2,
                                              const v_float32& s3, const v_float32& scale, const int nlanes)
{
    v_float32 a1 = vg_load_f32(in);
    v_float32 a2 = vg_load_f32(&in[nlanes]);
    v_float32 a3 = vg_load_f32(&in[2*nlanes]);

    vx_store(out, scale * a1 * s1);
    vx_store(&out[nlanes], scale * a2 * s2);
    vx_store(&out[2*nlanes], scale * a3 * s3);
}

//-------------------------------------------------------------------------------------------------

template<typename SRC, typename DST>
CV_ALWAYS_INLINE int mulc_scale_simd_c3(const SRC in[],
                                        const float scalar[], DST out[],
                                        const int length, const float _scale)
{
    constexpr int chan = 3;
    constexpr int nlanes = vector_type_of_t<DST>::nlanes;
    constexpr int lanes = chan * nlanes;

    if (length < lanes)
        return 0;

    v_float32 scale = vx_setall_f32(_scale);

    v_float32 s1 = vx_load(scalar);
#if CV_SIMD_WIDTH == 32
    v_float32 s2 = vx_load(&scalar[2]);
    v_float32 s3 = vx_load(&scalar[1]);
#else
    v_float32 s2 = vx_load(&scalar[1]);
    v_float32 s3 = vx_load(&scalar[2]);
#endif
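
    // Note on the #if above: scalar[] is assumed to hold the three channel
    // values repeated periodically. Each successive vector load in the c3
    // kernels advances by SIMD_WIDTH/4 floats (8 for 32-byte vectors, 4 for
    // 16-byte ones); since 8 % 3 == 2 while 4 % 3 == 1, the offsets of the
    // second and third scalar loads swap with the vector width.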
    int x = 0;
    for (;;)
    {
        for (; x <= length - lanes; x += lanes)
        {
            mulc_scale_simd_c3_impl(&in[x], &out[x], s1, s2, s3, scale, nlanes);
        }

        if (x < length)
        {
            x = length - lanes;
            continue;  // process unaligned tail
        }
        break;
    }
    return x;
}
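
// Tail handling above: when length is not a multiple of `lanes`, x is
// rewound to length - lanes and one more full-vector pass runs over the
// last pixels. Some outputs are written twice, which is harmless because
// the operation is pure per-pixel.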
//-------------------------------------------------------------------------------------------------

template<typename SRC, typename DST>
CV_ALWAYS_INLINE
typename std::enable_if<(std::is_same<DST, ushort>::value ||
                         std::is_same<DST, short>::value), void>::type
mulc_scale_simd_common_impl(const SRC* inx, DST* outx,
                            const v_float32& sc, const v_float32& scale,
                            const int nlanes)
{
    v_float32 a1 = vg_load_f32(inx);
    v_float32 a2 = vg_load_f32(&inx[nlanes/2]);

    v_store_i16(outx, v_round(scale * a1 * sc), v_round(scale * a2 * sc));
}
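
// v_store_i16() packs the two rounded 32-bit vectors down to 16 bits and
// stores them; it is presumably overloaded elsewhere in this file to use
// signed (v_pack) or unsigned (v_pack_u) saturation depending on whether
// DST is short or ushort.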
//-------------------------------------------------------------------------------------------------

template<typename SRC>
CV_ALWAYS_INLINE void mulc_scale_simd_common_impl(const SRC* inx,
                                                  uchar* outx, const v_float32& sc,
                                                  const v_float32& scale, const int nlanes)
{
    v_float32 a1 = vg_load_f32(inx);
    v_float32 a2 = vg_load_f32(&inx[nlanes/4]);
    v_float32 a3 = vg_load_f32(&inx[nlanes/2]);
    v_float32 a4 = vg_load_f32(&inx[3 * nlanes/4]);

    vx_store(outx, v_pack_u(v_pack(v_round(scale * a1 * sc),
                                   v_round(scale * a2 * sc)),
                            v_pack(v_round(scale * a3 * sc),
                                   v_round(scale * a4 * sc))));
}

//-------------------------------------------------------------------------------------------------

template<typename SRC>
CV_ALWAYS_INLINE void mulc_scale_simd_common_impl(const SRC* inx,
                                                  float* outx, const v_float32& sc,
                                                  const v_float32& scale, const int)
{
    v_float32 a1 = vg_load_f32(inx);
    vx_store(outx, scale * a1 * sc);
}

//-------------------------------------------------------------------------------------------------

template<typename SRC, typename DST>
CV_ALWAYS_INLINE int mulc_scale_simd_common(const SRC in[],
                                            const float scalar[], DST out[],
                                            const int length, const float _scale)
{
    constexpr int nlanes = vector_type_of_t<DST>::nlanes;

    if (length < nlanes)
        return 0;

    v_float32 _scalar = vx_load(scalar);
    v_float32 scale = vx_setall_f32(_scale);

    int x = 0;
    for (;;)
    {
        for (; x <= length - nlanes; x += nlanes)
        {
            mulc_scale_simd_common_impl(&in[x], &out[x], _scalar, scale, nlanes);
        }

        if (x < length)
        {
            x = length - nlanes;
            continue;  // process unaligned tail
        }
        break;
    }
    return x;
}
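
// Like mulc_scale_simd_c3(), this returns the number of elements actually
// processed (0 when the row is shorter than one vector), so the caller can
// finish the remaining tail with scalar code.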
#define MULC_SIMD(SRC, DST)                                                   \
int mulc_simd(const SRC in[], const float scalar[], DST out[],                \
              const int length, const int chan, const float scale)            \
{                                                                             \
    mul_tag op_t;                                                             \
    switch (chan)                                                             \
    {                                                                         \
    case 1:                                                                   \
    case 2:                                                                   \
    case 4:                                                                   \
    {                                                                         \
        if (std::fabs(scale - 1.0f) <= FLT_EPSILON)                           \
        {                                                                     \
            return arithmOpScalar_simd_common(op_t, in, scalar,               \
                                              out, length);                   \
        }                                                                     \
        else                                                                  \
        {                                                                     \
            return mulc_scale_simd_common(in, scalar, out, length, scale);    \
        }                                                                     \
    }                                                                         \
    case 3:                                                                   \
    {                                                                         \
        if (std::fabs(scale - 1.0f) <= FLT_EPSILON)                           \
        {                                                                     \
            return arithmOpScalar_simd_c3(op_t, in, scalar,                   \
                                          out, length);                       \
        }                                                                     \
        else                                                                  \
        {                                                                     \
            return mulc_scale_simd_c3(in, scalar, out, length, scale);        \
        }                                                                     \
    }                                                                         \
    default:                                                                  \
        GAPI_Assert(chan <= 4);                                               \
        break;                                                                \
    }                                                                         \
    return 0;                                                                 \
}
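
// Dispatch in the generated mulc_simd(): a scale within FLT_EPSILON of 1.0
// is treated as "no scaling" and routed to the generic arithmOpScalar_*
// path shared with AddC/SubC; otherwise the mulc_scale_* variants fold the
// extra multiply into the same pass. Channel counts other than 1..4 hit
// GAPI_Assert.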
MULC_SIMD(uchar, uchar)
MULC_SIMD(ushort, uchar)
MULC_SIMD(short, uchar)
MULC_SIMD(float, uchar)
MULC_SIMD(short, short)
MULC_SIMD(ushort, short)
MULC_SIMD(uchar, short)
MULC_SIMD(float, short)
MULC_SIMD(ushort, ushort)
MULC_SIMD(uchar, ushort)
MULC_SIMD(short, ushort)
MULC_SIMD(float, ushort)
MULC_SIMD(uchar, float)
MULC_SIMD(ushort, float)
MULC_SIMD(short, float)
MULC_SIMD(float, float)

#undef MULC_SIMD

#endif  // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

CV_CPU_OPTIMIZATION_NAMESPACE_END