Merge pull request #21797 from anna-khakimova:ak/merge3_extend_supported_types

GAPI Fluid SIMD: Add support for several new types in Merge3

- Added support for several new types.
- Fixed issues in Split/Merge and ConvertTo.
pull/23721/head
Anna Khakimova 1 year ago committed by GitHub
parent fc5d412ba7
commit 6d3dd24622
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 2
      modules/gapi/perf/common/gapi_core_perf_tests.hpp
  2. 7
      modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
  3. 1
      modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
  4. 1
      modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
  5. 1
      modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
  6. 66
      modules/gapi/src/backends/fluid/gfluidcore.cpp
  7. 18
      modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
  8. 12
      modules/gapi/src/backends/fluid/gfluidcore_func.hpp
  9. 80
      modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
  10. 17
      modules/gapi/src/backends/fluid/gfluidutils.hpp

@ -62,7 +62,7 @@ namespace opencv_test
class InRangePerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, cv::GCompileArgs>> {};
class Split3PerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
class Split4PerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
class Merge3PerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
class Merge3PerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, cv::GCompileArgs>> {};
class Merge4PerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
class RemapPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, cv::GCompileArgs>> {};
class FlipPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};

@ -1577,11 +1577,12 @@ PERF_TEST_P_(Merge3PerfTest, TestPerformance)
{
compare_f cmpF;
cv::Size sz;
MatType type = -1;
cv::GCompileArgs compile_args;
std::tie(cmpF, sz, compile_args) = GetParam();
std::tie(cmpF, sz, type, compile_args) = GetParam();
initMatsRandU(CV_8UC1, sz, CV_8UC3);
cv::Mat in_mat3(sz, CV_8UC1);
initMatsRandU(type, sz, CV_MAKETYPE(type, 3));
cv::Mat in_mat3(sz, type);
cv::Scalar mean = cv::Scalar::all(127);
cv::Scalar stddev = cv::Scalar::all(40.f);
cv::randn(in_mat3, mean, stddev);

@ -252,6 +252,7 @@ INSTANTIATE_TEST_CASE_P(Split4PerfTestCPU, Split4PerfTest,
// Merge3 performance cases compiled with CORE_CPU: exact comparison, four frame
// sizes, and CV_8U as the only value supplied for the type parameter.
INSTANTIATE_TEST_CASE_P(Merge3PerfTestCPU, Merge3PerfTest,
Combine(Values(AbsExact().to_compare_f()),
Values(szSmall128, szVGA, sz720p, sz1080p),
Values(CV_8U),
Values(cv::compile_args(CORE_CPU))));
INSTANTIATE_TEST_CASE_P(Merge4PerfTestCPU, Merge4PerfTest,

@ -253,6 +253,7 @@ INSTANTIATE_TEST_CASE_P(Split4PerfTestFluid, Split4PerfTest,
// Merge3 performance cases compiled with CORE_FLUID: exact comparison, four frame
// sizes, and the type parameter swept over CV_8U, CV_16S, CV_16U and CV_32F.
INSTANTIATE_TEST_CASE_P(Merge3PerfTestFluid, Merge3PerfTest,
Combine(Values(AbsExact().to_compare_f()),
Values(szSmall128, szVGA, sz720p, sz1080p),
Values(CV_8U, CV_16S, CV_16U, CV_32F),
Values(cv::compile_args(CORE_FLUID))));
INSTANTIATE_TEST_CASE_P(Merge4PerfTestFluid, Merge4PerfTest,

@ -242,6 +242,7 @@ INSTANTIATE_TEST_CASE_P(Split4PerfTestGPU, Split4PerfTest,
// Merge3 performance cases compiled with CORE_GPU: exact comparison, four frame
// sizes, and CV_8U as the only value supplied for the type parameter.
INSTANTIATE_TEST_CASE_P(Merge3PerfTestGPU, Merge3PerfTest,
Combine(Values(AbsExact().to_compare_f()),
Values( szSmall128, szVGA, sz720p, sz1080p ),
Values(CV_8U),
Values(cv::compile_args(CORE_GPU))));
INSTANTIATE_TEST_CASE_P(Merge4PerfTestGPU, Merge4PerfTest,

@ -2320,12 +2320,15 @@ GAPI_FLUID_KERNEL(GFluidSplit3, cv::gapi::core::GSplit3, false)
static void run(const View &src, Buffer &dst1, Buffer &dst2, Buffer &dst3)
{
GAPI_Assert((src.meta().depth == CV_8U) && (dst1.meta().depth == CV_8U) &&
(dst2.meta().depth == CV_8U) && (dst3.meta().depth == CV_8U) &&
(3 == src.meta().chan));
const auto *in = src.InLine<uchar>(0);
auto *out1 = dst1.OutLine<uchar>();
auto *out2 = dst2.OutLine<uchar>();
auto *out3 = dst3.OutLine<uchar>();
GAPI_Assert(3 == src.meta().chan);
int width = src.length();
int w = 0;
@ -2348,13 +2351,16 @@ GAPI_FLUID_KERNEL(GFluidSplit4, cv::gapi::core::GSplit4, false)
static void run(const View &src, Buffer &dst1, Buffer &dst2, Buffer &dst3, Buffer &dst4)
{
GAPI_Assert((src.meta().depth == CV_8U) && (dst1.meta().depth == CV_8U) &&
(dst2.meta().depth == CV_8U) && (dst3.meta().depth == CV_8U) &&
(dst4.meta().depth == CV_8U) && (4 == src.meta().chan));
const auto *in = src.InLine<uchar>(0);
auto *out1 = dst1.OutLine<uchar>();
auto *out2 = dst2.OutLine<uchar>();
auto *out3 = dst3.OutLine<uchar>();
auto *out4 = dst4.OutLine<uchar>();
GAPI_Assert(4 == src.meta().chan);
int width = src.length();
int w = 0;
@ -2372,31 +2378,46 @@ GAPI_FLUID_KERNEL(GFluidSplit4, cv::gapi::core::GSplit4, false)
}
};
// Interleaves one line from each of the three source views into the output
// line of dst, element by element: out = {src1[0], src2[0], src3[0], src1[1], ...}.
// T is the element type used to read the input lines and write the output line.
// When CV_SIMD is enabled, merge3_simd() handles the elements it can and reports
// how many it processed; the scalar loop below finishes whatever is left.
template<typename T>
CV_ALWAYS_INLINE void run_merge3(Buffer& dst, const View& src1, const View& src2, const View& src3)
{
    const auto* first  = src1.InLine<T>(0);
    const auto* second = src2.InLine<T>(0);
    const auto* third  = src3.InLine<T>(0);
    auto* interleaved  = dst.OutLine<T>();

    const int length = dst.length();

#if CV_SIMD
    int pos = merge3_simd(first, second, third, interleaved, length);
#else
    int pos = 0;
#endif

    // Scalar path: the whole line without SIMD, otherwise only the remainder.
    while (pos < length)
    {
        auto* triple = interleaved + 3 * pos;
        triple[0] = first[pos];
        triple[1] = second[pos];
        triple[2] = third[pos];
        ++pos;
    }
}
GAPI_FLUID_KERNEL(GFluidMerge3, cv::gapi::core::GMerge3, false)
{
static const int Window = 1;
static void run(const View &src1, const View &src2, const View &src3, Buffer &dst)
static void run(const View& src1, const View& src2, const View& src3, Buffer& dst)
{
const auto *in1 = src1.InLine<uchar>(0);
const auto *in2 = src2.InLine<uchar>(0);
const auto *in3 = src3.InLine<uchar>(0);
auto *out = dst.OutLine<uchar>();
GAPI_Assert(3 == dst.meta().chan);
int width = dst.length();
int w = 0;
GAPI_Assert((src1.meta().depth == dst.meta().depth) &&
(src1.meta().depth == src2.meta().depth) &&
(src1.meta().depth == src3.meta().depth));
#if CV_SIMD
w = merge3_simd(in1, in2, in3, out, width);
#endif
// SRC/DST TYPE OP __VA_ARGS__
MERGE3_(uchar, run_merge3, dst, src1, src2, src3);
MERGE3_(ushort, run_merge3, dst, src1, src2, src3);
MERGE3_(short, run_merge3, dst, src1, src2, src3);
MERGE3_(float, run_merge3, dst, src1, src2, src3);
for (; w < width; w++)
{
out[3*w ] = in1[w];
out[3*w + 1] = in2[w];
out[3*w + 2] = in3[w];
}
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
}
};
@ -2407,13 +2428,16 @@ GAPI_FLUID_KERNEL(GFluidMerge4, cv::gapi::core::GMerge4, false)
static void run(const View &src1, const View &src2, const View &src3, const View &src4,
Buffer &dst)
{
GAPI_Assert((dst.meta().depth == CV_8U) && (src1.meta().depth == CV_8U) &&
(src2.meta().depth == CV_8U) && (src3.meta().depth == CV_8U) &&
(4 == dst.meta().chan));
const auto *in1 = src1.InLine<uchar>(0);
const auto *in2 = src2.InLine<uchar>(0);
const auto *in3 = src3.InLine<uchar>(0);
const auto *in4 = src4.InLine<uchar>(0);
auto *out = dst.OutLine<uchar>();
GAPI_Assert(4 == dst.meta().chan);
int width = dst.length();
int w = 0; // cycle counter

@ -277,13 +277,21 @@ int split4_simd(const uchar in[], uchar out1[], uchar out2[],
CV_CPU_DISPATCH_MODES_ALL);
}
int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[],
uchar out[], const int width)
{
CV_CPU_DISPATCH(merge3_simd, (in1, in2, in3, out, width),
CV_CPU_DISPATCH_MODES_ALL);
// Generates the merge3_simd front-end for element type T. Each expansion is a
// function whose whole body is a CV_CPU_DISPATCH of merge3_simd over
// CV_CPU_DISPATCH_MODES_ALL, forwarding (in1, in2, in3, out, width) unchanged.
// Instantiated below for uchar, short, ushort and float; the helper macro is
// undefined right after use so it does not leak further into the file.
#define MERGE3_SIMD(T) \
int merge3_simd(const T in1[], const T in2[], const T in3[], \
T out[], const int width) \
{ \
CV_CPU_DISPATCH(merge3_simd, (in1, in2, in3, out, width), \
CV_CPU_DISPATCH_MODES_ALL); \
}
MERGE3_SIMD(uchar)
MERGE3_SIMD(short)
MERGE3_SIMD(ushort)
MERGE3_SIMD(float)
#undef MERGE3_SIMD
int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
const uchar in4[], uchar out[], const int width)
{

@ -216,8 +216,16 @@ int split3_simd(const uchar in[], uchar out1[], uchar out2[],
int split4_simd(const uchar in[], uchar out1[], uchar out2[],
uchar out3[], uchar out4[], const int width);
int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[],
uchar out[], const int width);
// Declares one merge3_simd overload per supported element type
// (uchar, short, ushort, float). All overloads share the same shape:
// three input arrays, one output array, and the line width; they return int.
// The helper macro is undefined immediately after the declarations.
#define MERGE3_SIMD(T) \
int merge3_simd(const T in1[], const T in2[], const T in3[], \
T out[], const int width);
MERGE3_SIMD(uchar)
MERGE3_SIMD(short)
MERGE3_SIMD(ushort)
MERGE3_SIMD(float)
#undef MERGE3_SIMD
int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
const uchar in4[], uchar out[], const int width);

@ -322,12 +322,21 @@ int split3_simd(const uchar in[], uchar out1[], uchar out2[],
int split4_simd(const uchar in[], uchar out1[], uchar out2[],
uchar out3[], uchar out4[], const int width);
int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[],
uchar out[], const int width);
// Forward declarations of the merge3_simd overloads for uchar, short, ushort
// and float. They are placed before the CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
// guard that follows, so they are emitted whether or not that guard is defined.
// The helper macro is undefined immediately after the declarations.
#define MERGE3_SIMD(T) \
int merge3_simd(const T in1[], const T in2[], const T in3[], \
T out[], const int width);
MERGE3_SIMD(uchar)
MERGE3_SIMD(short)
MERGE3_SIMD(ushort)
MERGE3_SIMD(float)
#undef MERGE3_SIMD
int merge4_simd(const uchar in1[], const uchar in2[], const uchar in3[],
const uchar in4[], uchar out[], const int width);
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#define SRC_SHORT_OR_USHORT std::is_same<SRC, short>::value || std::is_same<SRC, ushort>::value
@ -2530,33 +2539,41 @@ int split4_simd(const uchar in[], uchar out1[], uchar out2[],
//
//-------------------------
// Interleaves three uchar arrays into `out` (out[3*i], out[3*i+1], out[3*i+2]
// take in1[i], in2[i], in3[i]) using vectors of v_uint8::nlanes elements.
// Returns the number of elements processed: 0 when the line is shorter than
// one vector (nothing is written), otherwise `width`.
int merge3_simd(const uchar in1[], const uchar in2[], const uchar in3[],
uchar out[], const int width)
{
constexpr int nlanes = v_uint8::nlanes;
// Too short for even a single vector: report zero processed elements.
if (width < nlanes)
return 0;
int x = 0;
for (;;)
{
// Full vectors, front to back.
for (; x <= width - nlanes; x += nlanes)
{
v_uint8 a, b, c;
a = vx_load(&in1[x]);
b = vx_load(&in2[x]);
c = vx_load(&in3[x]);
v_store_interleave(&out[3 * x], a, b, c);
}
// Leftover tail: step back so the last vector ends exactly at `width`,
// re-processing some already written elements, then run the loop once more.
if (x < width)
{
x = width - nlanes;
continue;
}
break;
}
return x;
}
// Defines merge3_simd for element type T: interleaves in1/in2/in3 into out
// (out[3*i], out[3*i+1], out[3*i+2]) one vector of vector_type_of_t<T>::nlanes
// elements at a time. Returns 0 without writing anything when width is shorter
// than one vector, otherwise returns width. A partial tail is handled by
// clamping the position back to width - nlanes, so the final vector overlaps
// elements that were already written.
#define MERGE3_SIMD(T)                                                  \
int merge3_simd(const T in1[], const T in2[], const T in3[],            \
                T out[], const int width)                               \
{                                                                       \
    constexpr int nlanes = vector_type_of_t<T>::nlanes;                 \
    if (width < nlanes)                                                 \
        return 0;                                                       \
                                                                        \
    int x = 0;                                                          \
    while (x < width)                                                   \
    {                                                                   \
        if (x > width - nlanes)                                         \
            x = width - nlanes;                                         \
                                                                        \
        vector_type_of_t<T> a, b, c;                                    \
        a = vx_load(&in1[x]);                                           \
        b = vx_load(&in2[x]);                                           \
        c = vx_load(&in3[x]);                                           \
        v_store_interleave(&out[3 * x], a, b, c);                       \
        x += nlanes;                                                    \
    }                                                                   \
    return x;                                                           \
}

MERGE3_SIMD(uchar)
MERGE3_SIMD(short)
MERGE3_SIMD(ushort)
MERGE3_SIMD(float)

#undef MERGE3_SIMD
//-------------------------
//
@ -2926,6 +2943,8 @@ CV_ALWAYS_INLINE void convertto_simd_nocoeff_impl(const SRC* inx, float* outx)
int convertto_simd(const SRC in[], DST out[], const int length) \
{ \
constexpr int nlanes = vector_type_of_t<DST>::nlanes; \
if (length < nlanes) \
return 0; \
\
int x = 0; \
for (;;) \
@ -3093,6 +3112,9 @@ int convertto_scaled_simd(const SRC in[], DST out[], const float alpha, \
const float beta, const int length) \
{ \
constexpr int nlanes = vector_type_of_t<DST>::nlanes; \
if (length < nlanes) \
return 0; \
\
v_float32 v_alpha = vx_setall_f32(alpha); \
v_float32 v_beta = vx_setall_f32(beta); \
\

@ -86,6 +86,23 @@ using cv::gapi::own::rintd;
return; \
}
// Type-dispatch helper for a 3-input merge kernel. It expects `dst`, `src1`,
// `src2` and `src3` to be visible at the expansion site.
// If the depth of T matches the depth of both dst and src1, it debug-asserts
// that every source line has the same length as dst, that each source has one
// channel and dst has three, then calls OP<T>(__VA_ARGS__) and returns from the
// enclosing function. Otherwise it falls through to the next statement.
// Only src1's depth is tested here; src2 and src3 are not compared.
#define MERGE3_(T, OP, ...) \
if (cv::DataType<T>::depth == dst.meta().depth && \
cv::DataType<T>::depth == src1.meta().depth) \
{ \
GAPI_DbgAssert(dst.length() == src1.length()); \
GAPI_DbgAssert(dst.length() == src2.length()); \
GAPI_DbgAssert(dst.length() == src3.length()); \
\
GAPI_DbgAssert(1 == src1.meta().chan); \
GAPI_DbgAssert(1 == src2.meta().chan); \
GAPI_DbgAssert(1 == src3.meta().chan); \
GAPI_DbgAssert(3 == dst.meta().chan); \
\
OP<T>(__VA_ARGS__); \
return; \
}
} // namespace fluid
} // namespace gapi
} // namespace cv

Loading…
Cancel
Save