diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp index 403bcf252d..4072bb2c65 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore.cpp +++ b/modules/gapi/src/backends/fluid/gfluidcore.cpp @@ -2509,10 +2509,10 @@ GAPI_FLUID_KERNEL(GFluidSplit3, cv::gapi::core::GSplit3, false) static void run(const View &src, Buffer &dst1, Buffer &dst2, Buffer &dst3) { - const auto *in = src.InLine(0); - auto *out1 = dst1.OutLine(); - auto *out2 = dst2.OutLine(); - auto *out3 = dst3.OutLine(); + const auto *in = src.InLine(0); + auto *out1 = dst1.OutLine(); + auto *out2 = dst2.OutLine(); + auto *out3 = dst3.OutLine(); GAPI_Assert(3 == src.meta().chan); int width = src.length(); @@ -2537,11 +2537,11 @@ GAPI_FLUID_KERNEL(GFluidSplit4, cv::gapi::core::GSplit4, false) static void run(const View &src, Buffer &dst1, Buffer &dst2, Buffer &dst3, Buffer &dst4) { - const auto *in = src.InLine(0); - auto *out1 = dst1.OutLine(); - auto *out2 = dst2.OutLine(); - auto *out3 = dst3.OutLine(); - auto *out4 = dst4.OutLine(); + const auto *in = src.InLine(0); + auto *out1 = dst1.OutLine(); + auto *out2 = dst2.OutLine(); + auto *out3 = dst3.OutLine(); + auto *out4 = dst4.OutLine(); GAPI_Assert(4 == src.meta().chan); int width = src.length(); @@ -2574,18 +2574,10 @@ GAPI_FLUID_KERNEL(GFluidMerge3, cv::gapi::core::GMerge3, false) GAPI_Assert(3 == dst.meta().chan); int width = dst.length(); + int w = 0; - int w = 0; // cycle counter - - #if CV_SIMD128 - for (; w <= width-16; w+=16) - { - v_uint8x16 a, b, c; - a = v_load(&in1[w]); - b = v_load(&in2[w]); - c = v_load(&in3[w]); - v_store_interleave(&out[3*w], a, b, c); - } + #if CV_SIMD + w = merge3_simd(in1, in2, in3, out, width); #endif for (; w < width; w++) diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp index 30e3d1f5ea..45ef6143d3 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp @@ -221,6 +221,13 @@ int split4_simd(const uchar in[], uchar out1[], uchar out2[], CV_CPU_DISPATCH_MODES_ALL); } +int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[], + uchar out[], const int width) +{ + CV_CPU_DISPATCH(merge3_simd, (in1, in2, in3, out, width), + CV_CPU_DISPATCH_MODES_ALL); +} + } // namespace fluid } // namespace gapi } // namespace cv diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp index e0fdf812f2..0fe8e04687 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.hpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.hpp @@ -169,6 +169,9 @@ int split3_simd(const uchar in[], uchar out1[], uchar out2[], int split4_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[], uchar out4[], const int width); +int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[], + uchar out[], const int width); + } // namespace fluid } // namespace gapi } // namespace cv diff --git a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp index 9f7886f9b0..c6c8d7b8b1 100644 --- a/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp +++ b/modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp @@ -190,6 +190,9 @@ int split3_simd(const uchar in[], uchar out1[], uchar out2[], int split4_simd(const uchar in[], uchar out1[], uchar out2[], uchar out3[], uchar out4[], const int width); +int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[], + uchar out[], const int width); + #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY struct scale_tag {}; @@ -1643,6 +1646,40 @@ int split4_simd(const uchar in[], uchar out1[], uchar out2[], return x; } +//------------------------- +// +// Fluid kernels: Merge3 +// +//------------------------- + +int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[], + uchar out[], const int width) +{ + constexpr int nlanes = v_uint8::nlanes; + if (width < nlanes) + return 0; + + int x = 0; + for (;;) + { + for (; x <= width - nlanes; x += nlanes) + { + v_uint8 a, b, c; + a = vx_load(&in1[x]); + b = vx_load(&in2[x]); + c = vx_load(&in3[x]); + v_store_interleave(&out[3 * x], a, b, c); + } + if (x < width) + { + x = width - nlanes; + continue; + } + break; + } + return x; +} + #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY CV_CPU_OPTIMIZATION_NAMESPACE_END