Merge pull request #21520 from alexgiving:atrutnev/simd_for_split4

GAPI FLUID: Enable dynamic dispatching for Split4

* Enable dynamic dispatching for split4

* Add tail processing for split3 and split4
pull/21545/head
Trutnev Aleksei 3 years ago committed by GitHub
parent 870c8d3c4e
commit 245f6273bd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 25
      modules/gapi/src/backends/fluid/gfluidcore.cpp
  2. 7
      modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
  3. 3
      modules/gapi/src/backends/fluid/gfluidcore_func.hpp
  4. 62
      modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp

@ -2537,27 +2537,18 @@ GAPI_FLUID_KERNEL(GFluidSplit4, cv::gapi::core::GSplit4, false)
static void run(const View &src, Buffer &dst1, Buffer &dst2, Buffer &dst3, Buffer &dst4)
{
const auto *in = src.InLine<uchar>(0);
auto *out1 = dst1.OutLine<uchar>();
auto *out2 = dst2.OutLine<uchar>();
auto *out3 = dst3.OutLine<uchar>();
auto *out4 = dst4.OutLine<uchar>();
const auto *in = src.InLine<uchar>(0);
auto *out1 = dst1.OutLine<uchar>();
auto *out2 = dst2.OutLine<uchar>();
auto *out3 = dst3.OutLine<uchar>();
auto *out4 = dst4.OutLine<uchar>();
GAPI_Assert(4 == src.meta().chan);
int width = src.length();
int w = 0;
int w = 0; // cycle counter
#if CV_SIMD128
for (; w <= width-16; w+=16)
{
v_uint8x16 a, b, c, d;
v_load_deinterleave(&in[4*w], a, b, c, d);
v_store(&out1[w], a);
v_store(&out2[w], b);
v_store(&out3[w], c);
v_store(&out4[w], d);
}
#if CV_SIMD
w = split4_simd(in, out1, out2, out3, out4, width);
#endif
for (; w < width; w++)

@ -214,6 +214,13 @@ int split3_simd(const uchar in[], uchar out1[], uchar out2[],
CV_CPU_DISPATCH_MODES_ALL);
}
// Dispatching entry point for the 4-channel split.
// Forwards (in, out1, out2, out3, out4, width) unchanged to an implementation
// named split4_simd via the CV_CPU_DISPATCH macro, using the mode list
// CV_CPU_DISPATCH_MODES_ALL.
// NOTE(review): the macro presumably picks the variant built for the best CPU
// feature set available at run time and returns its result - confirm against
// OpenCV's CPU dispatch headers.
int split4_simd(const uchar in[], uchar out1[], uchar out2[],
uchar out3[], uchar out4[], const int width)
{
CV_CPU_DISPATCH(split4_simd, (in, out1, out2, out3, out4, width),
CV_CPU_DISPATCH_MODES_ALL);
}
} // namespace fluid
} // namespace gapi
} // namespace cv

@ -166,6 +166,9 @@ ABSDIFFC_SIMD(float)
// Splits a row of interleaved 3-channel 8-bit pixels (`in`) into three
// per-channel rows (`out1`..`out3`); `width` is the row length in pixels.
// Returns an int pixel count/index.
// NOTE(review): presumably the position from which the caller must continue
// with scalar code - confirm against the call site.
int split3_simd(const uchar in[], uchar out1[], uchar out2[],
uchar out3[], const int width);
// Same as split3_simd, but for 4-channel input and four output rows
// (`out1`..`out4`).
int split4_simd(const uchar in[], uchar out1[], uchar out2[],
uchar out3[], uchar out4[], const int width);
} // namespace fluid
} // namespace gapi
} // namespace cv

@ -187,6 +187,9 @@ ABSDIFFC_SIMD(float)
// Vectorized 3-channel split: de-interleaves `width` pixels from `in` into
// `out1`..`out3`. Returns the number of pixels handled by the vector code.
// NOTE(review): this header is presumably compiled once per CPU optimization
// target by the dispatch machinery - confirm.
int split3_simd(const uchar in[], uchar out1[], uchar out2[],
uchar out3[], const int width);
// Vectorized 4-channel split: de-interleaves `width` pixels from `in` into
// `out1`..`out4`. Returns the number of pixels handled by the vector code.
int split4_simd(const uchar in[], uchar out1[], uchar out2[],
uchar out3[], uchar out4[], const int width);
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
struct scale_tag {};
@ -1581,14 +1584,61 @@ int split3_simd(const uchar in[], uchar out1[], uchar out2[],
uchar out3[], const int width)
{
// De-interleaves a row of 3-channel 8-bit pixels into three per-channel rows,
// one full vector (nlanes pixels) at a time.
// Returns 0 when the row is narrower than one vector, otherwise `width`.
constexpr int nlanes = v_uint8::nlanes;
// Too short for even one full vector: report that nothing was processed.
if (width < nlanes)
return 0;
int x = 0;
for (;;)
{
// Main pass: process every complete vector starting at x.
for (; x <= width - nlanes; x += nlanes)
{
v_uint8 a, b, c;
v_load_deinterleave(&in[3 * x], a, b, c);
vx_store(&out1[x], a);
vx_store(&out2[x], b);
vx_store(&out3[x], c);
}
// Tail: fewer than nlanes pixels remain. Step back so that the last full
// vector ends exactly at `width` and run the loop body once more; the
// overlapping pixels are simply written again.
// NOTE(review): assumes `in` does not overlap the output rows - confirm.
if (x < width)
{
x = width - nlanes;
continue;
}
break;
}
return x;
}
//-------------------------
//
// Fluid kernels: Split4
//
//-------------------------
// De-interleaves a row of 4-channel 8-bit pixels into four per-channel rows,
// one full vector (nlanes pixels) at a time.
//
//   in          interleaved input row; 4 bytes are read per pixel
//   out1..out4  per-channel output rows; 1 byte is written per pixel
//   width       row length in pixels
//
// Returns the number of pixels handled by the vector code: 0 when the row is
// narrower than one vector (nothing is read or written in that case),
// otherwise `width`.
int split4_simd(const uchar in[], uchar out1[], uchar out2[],
                uchar out3[], uchar out4[], const int width)
{
    constexpr int nlanes = v_uint8::nlanes;

    // Too short for even one full vector: report that nothing was processed.
    if (width < nlanes)
        return 0;

    int x = 0;
    for (;;)
    {
        // Main pass: process every complete vector starting at x.
        for (; x <= width - nlanes; x += nlanes)
        {
            v_uint8 a, b, c, d;
            v_load_deinterleave(&in[4 * x], a, b, c, d);
            vx_store(&out1[x], a);
            vx_store(&out2[x], b);
            vx_store(&out3[x], c);
            vx_store(&out4[x], d);
        }

        // Tail: fewer than nlanes pixels remain. Step back so that the last
        // full vector ends exactly at `width` and run the loop body once
        // more; the overlapping pixels are simply written again.
        // NOTE(review): assumes `in` does not overlap the output rows.
        if (x < width)
        {
            x = width - nlanes;
            continue;
        }
        break;
    }
    return x;
}

Loading…
Cancel
Save