Merge pull request #21204 from anna-khakimova:ak/move_simd_absdiffc

pull/21253/head
Alexander Alekhin 3 years ago
commit f1053d48a2
  1. 2
      modules/gapi/perf/common/gapi_core_perf_tests.hpp
  2. 12
      modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
  3. 3
      modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
  4. 7
      modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
  5. 3
      modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
  6. 306
      modules/gapi/src/backends/fluid/gfluidcore.cpp
  7. 15
      modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
  8. 23
      modules/gapi/src/backends/fluid/gfluidcore_func.hpp
  9. 50
      modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp

@ -50,7 +50,7 @@ namespace opencv_test
class MinPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
class MaxPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
class AbsDiffPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
class AbsDiffCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
class AbsDiffCPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, cv::GCompileArgs>> {};
class SumPerfTest : public TestPerfParams<tuple<compare_scalar_f, cv::Size, MatType, cv::GCompileArgs>> {};
class CountNonZeroPerfTest : public TestPerfParams<tuple<compare_scalar_f, cv::Size, MatType, cv::GCompileArgs>> {};
class AddWeightedPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};

@ -970,9 +970,10 @@ PERF_TEST_P_(AbsDiffPerfTest, TestPerformance)
PERF_TEST_P_(AbsDiffCPerfTest, TestPerformance)
{
cv::Size sz_in = get<0>(GetParam());
MatType type = get<1>(GetParam());
cv::GCompileArgs compile_args = get<2>(GetParam());
compare_f cmpF = get<0>(GetParam());
cv::Size sz_in = get<1>(GetParam());
MatType type = get<2>(GetParam());
cv::GCompileArgs compile_args = get<3>(GetParam());
initMatsRandU(type, sz_in, type, false);
@ -997,8 +998,9 @@ PERF_TEST_P_(AbsDiffCPerfTest, TestPerformance)
}
// Comparison ////////////////////////////////////////////////////////////
// FIXIT unreliable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
EXPECT_EQ(out_mat_gapi.size(), sz_in);
{
EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
}
SANITY_CHECK_NOTHING();
}

@ -156,7 +156,8 @@ INSTANTIATE_TEST_CASE_P(AbsDiffPerfTestCPU, AbsDiffPerfTest,
Values(cv::compile_args(CORE_CPU))));
INSTANTIATE_TEST_CASE_P(AbsDiffCPerfTestCPU, AbsDiffCPerfTest,
Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
Combine(Values(AbsExact().to_compare_f()),
Values(szSmall128, szVGA, sz720p, sz1080p),
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
Values(cv::compile_args(CORE_CPU))));

@ -153,10 +153,9 @@ INSTANTIATE_TEST_CASE_P(AbsDiffPerfTestFluid, AbsDiffPerfTest,
Values(cv::compile_args(CORE_FLUID))));
INSTANTIATE_TEST_CASE_P(AbsDiffCPerfTestFluid, AbsDiffCPerfTest,
Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_8UC2,
CV_16UC2, CV_16SC2, CV_8UC3, CV_16UC3,
CV_16SC3, CV_8UC4, CV_16UC4, CV_16SC4),
Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()),
Values(szSmall128, szVGA, sz720p, sz1080p),
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
Values(cv::compile_args(CORE_FLUID))));
// INSTANTIATE_TEST_CASE_P(SumPerfTestFluid, SumPerfTest,

@ -154,7 +154,8 @@ INSTANTIATE_TEST_CASE_P(AbsDiffPerfTestGPU, AbsDiffPerfTest,
Values(cv::compile_args(CORE_GPU))));
INSTANTIATE_TEST_CASE_P(AbsDiffCPerfTestGPU, AbsDiffCPerfTest,
Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
Combine(Values(AbsExact().to_compare_f()),
Values( szSmall128, szVGA, sz720p, sz1080p ),
Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
Values(cv::compile_args(CORE_GPU))));

@ -994,244 +994,6 @@ static void run_arithm_s(DST out[], const SRC in[], int width, int chan,
CV_Error(cv::Error::StsBadArg, "unsupported number of channels");
}
#if CV_SIMD
// AbsDiffC store helper (signed 16-bit): packs two rounded int32 result
// vectors into one saturating int16 vector and stores it at out_ptr.
CV_ALWAYS_INLINE void absdiffc_short_store_c1c2c4(short* out_ptr, const v_int32& c1, const v_int32& c2)
{
vx_store(out_ptr, v_pack(c1, c2));
}
// AbsDiffC store helper (unsigned 16-bit): packs two rounded int32 result
// vectors into one saturating uint16 vector (v_pack_u) and stores it at out_ptr.
CV_ALWAYS_INLINE void absdiffc_short_store_c1c2c4(ushort* out_ptr, const v_int32& c1, const v_int32& c2)
{
vx_store(out_ptr, v_pack_u(c1, c2));
}
// Vectorized |in[i] - scalar| for interleaved 1-, 2- or 4-channel rows of
// short/ushort. 's' already holds the per-channel scalars replicated across
// all float32 lanes, so channel alignment is automatic for these channel
// counts. Returns the index of the first element NOT processed; the caller
// finishes the remainder with scalar code.
template<typename T>
CV_ALWAYS_INLINE int absdiffc_simd_c1c2c4(const T in[], T out[],
const v_float32& s, const int length)
{
static_assert((std::is_same<T, ushort>::value) || (std::is_same<T, short>::value),
"This templated overload is only for short or ushort type combinations.");
// Lane count of the 16-bit output vector; each iteration emits one of them.
constexpr int nlanes = (std::is_same<T, ushort>::value) ? static_cast<int>(v_uint16::nlanes) :
static_cast<int>(v_int16::nlanes);
if (length < nlanes)
return 0;
int x = 0;
for (;;)
{
for (; x <= length - nlanes; x += nlanes)
{
// Widen two half-vectors of T to float32, take |a - s|, round back to
// int32 and pack the pair into a single 16-bit vector store.
v_float32 a1 = v_load_f32(in + x);
v_float32 a2 = v_load_f32(in + x + nlanes / 2);
absdiffc_short_store_c1c2c4(&out[x], v_round(v_absdiff(a1, s)),
v_round(v_absdiff(a2, s)));
}
if (x < length && (in != out))
{
// Out-of-place only: back x up so one final full vector, aligned to the
// row end, covers the tail (overlapped elements are rewritten with the
// same values). In-place this would re-read already-written results.
x = length - nlanes;
continue; // process unaligned tail
}
break;
}
return x;
}
// uchar specialization: widens four quarter-vectors to float32, computes
// |a - s|, rounds, then packs int32 -> int16 -> uint8 in a single store.
// Returns the index of the first element NOT processed (caller handles the
// scalar tail).
template<>
CV_ALWAYS_INLINE int absdiffc_simd_c1c2c4<uchar>(const uchar in[], uchar out[],
const v_float32& s, const int length)
{
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
if (length < nlanes)
return 0;
int x = 0;
for (;;)
{
for (; x <= length - nlanes; x += nlanes)
{
v_float32 a1 = v_load_f32(in + x);
v_float32 a2 = v_load_f32(in + x + nlanes / 4);
v_float32 a3 = v_load_f32(in + x + nlanes / 2);
v_float32 a4 = v_load_f32(in + x + 3 * nlanes / 4);
vx_store(&out[x], v_pack_u(v_pack(v_round(v_absdiff(a1, s)),
v_round(v_absdiff(a2, s))),
v_pack(v_round(v_absdiff(a3, s)),
v_round(v_absdiff(a4, s)))));
}
if (x < length && (in != out))
{
// Out-of-place only: redo one full vector aligned to the row end to
// cover the tail; in-place this would re-read modified output.
x = length - nlanes;
continue; // process unaligned tail
}
break;
}
return x;
}
// 3-channel AbsDiffC store (signed 16-bit): packs six rounded int32 vectors
// into three consecutive int16 vectors, covering 3*nlanes output elements.
CV_ALWAYS_INLINE void absdiffc_short_store_c3(short* out_ptr, const v_int32& c1,
const v_int32& c2, const v_int32& c3,
const v_int32& c4, const v_int32& c5,
const v_int32& c6)
{
constexpr int nlanes = static_cast<int>(v_int16::nlanes);
vx_store(out_ptr, v_pack(c1, c2));
vx_store(out_ptr + nlanes, v_pack(c3, c4));
vx_store(out_ptr + 2*nlanes, v_pack(c5, c6));
}
// 3-channel AbsDiffC store (unsigned 16-bit): packs six rounded int32 vectors
// into three consecutive saturating uint16 vectors (3*nlanes elements total).
CV_ALWAYS_INLINE void absdiffc_short_store_c3(ushort* out_ptr, const v_int32& c1,
const v_int32& c2, const v_int32& c3,
const v_int32& c4, const v_int32& c5,
const v_int32& c6)
{
constexpr int nlanes = static_cast<int>(v_uint16::nlanes);
vx_store(out_ptr, v_pack_u(c1, c2));
vx_store(out_ptr + nlanes, v_pack_u(c3, c4));
vx_store(out_ptr + 2*nlanes, v_pack_u(c5, c6));
}
// Vectorized 3-channel |in - scalar| for short/ushort rows. s1/s2/s3 hold the
// channel scalars pre-rotated (see absdiffc_simd_c3) so that six consecutive
// half-vector loads stay aligned with the interleaved C1 C2 C3 pattern while
// cycling s1, s2, s3. Returns the index of the first element NOT processed.
template<typename T>
CV_ALWAYS_INLINE int absdiffc_simd_c3_impl(const T in[], T out[],
const v_float32& s1, const v_float32& s2,
const v_float32& s3, const int length)
{
static_assert((std::is_same<T, ushort>::value) || (std::is_same<T, short>::value),
"This templated overload is only for short or ushort type combinations.");
constexpr int nlanes = (std::is_same<T, ushort>::value) ? static_cast<int>(v_uint16::nlanes):
static_cast<int>(v_int16::nlanes);
// Each iteration consumes a whole multiple of 3 channels (3*nlanes elements).
if (length < 3 * nlanes)
return 0;
int x = 0;
for (;;)
{
for (; x <= length - 3 * nlanes; x += 3 * nlanes)
{
v_float32 a1 = v_load_f32(in + x);
v_float32 a2 = v_load_f32(in + x + nlanes / 2);
v_float32 a3 = v_load_f32(in + x + nlanes);
v_float32 a4 = v_load_f32(in + x + 3 * nlanes / 2);
v_float32 a5 = v_load_f32(in + x + 2 * nlanes);
v_float32 a6 = v_load_f32(in + x + 5 * nlanes / 2);
absdiffc_short_store_c3(&out[x], v_round(v_absdiff(a1, s1)),
v_round(v_absdiff(a2, s2)),
v_round(v_absdiff(a3, s3)),
v_round(v_absdiff(a4, s1)),
v_round(v_absdiff(a5, s2)),
v_round(v_absdiff(a6, s3)));
}
if (x < length && (in != out))
{
// Out-of-place only: re-run one 3*nlanes block aligned to the row end
// so the tail is covered; the 3*nlanes step keeps channel phase intact.
x = length - 3 * nlanes;
continue; // process unaligned tail
}
break;
}
return x;
}
// uchar 3-channel specialization: each iteration emits three full uint8
// vectors (3*nlanes elements) via twelve widened quarter-vector loads, with
// the scalar vectors cycling s1 -> s2 -> s3 to preserve channel alignment.
// Returns the index of the first element NOT processed.
template<>
CV_ALWAYS_INLINE int absdiffc_simd_c3_impl<uchar>(const uchar in[], uchar out[],
const v_float32& s1, const v_float32& s2,
const v_float32& s3, const int length)
{
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
if (length < 3 * nlanes)
return 0;
int x = 0;
for (;;)
{
for (; x <= length - 3 * nlanes; x += 3 * nlanes)
{
vx_store(&out[x],
v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x), s1)),
v_round(v_absdiff(v_load_f32(in + x + nlanes/4), s2))),
v_pack(v_round(v_absdiff(v_load_f32(in + x + nlanes/2), s3)),
v_round(v_absdiff(v_load_f32(in + x + 3*nlanes/4), s1)))));
vx_store(&out[x + nlanes],
v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x + nlanes), s2)),
v_round(v_absdiff(v_load_f32(in + x + 5*nlanes/4), s3))),
v_pack(v_round(v_absdiff(v_load_f32(in + x + 3*nlanes/2), s1)),
v_round(v_absdiff(v_load_f32(in + x + 7*nlanes/4), s2)))));
vx_store(&out[x + 2 * nlanes],
v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x + 2*nlanes), s3)),
v_round(v_absdiff(v_load_f32(in + x + 9*nlanes/4), s1))),
v_pack(v_round(v_absdiff(v_load_f32(in + x + 5*nlanes/2), s2)),
v_round(v_absdiff(v_load_f32(in + x + 11*nlanes/4), s3)))));
}
if (x < length && (in != out))
{
// Out-of-place only: re-run one final 3*nlanes block aligned to the
// row end so the tail is covered without a scalar loop here.
x = length - 3 * nlanes;
continue; // process unaligned tail
}
break;
}
return x;
}
// AbsDiffC entry for interleaved 1-, 2- or 4-channel rows: for these channel
// counts the replicated scalar vector lines up with the pixel interleave, so
// one vectorized pass over width*chan elements suffices.
// Returns the number of elements handled by the SIMD kernel.
template<typename T>
CV_ALWAYS_INLINE int absdiffc_simd_channels(const T in[], const float scalar[], T out[],
                                            const int width, int chan)
{
    const v_float32 sc = vx_load(scalar);
    return absdiffc_simd_c1c2c4(in, out, sc, width * chan);
}
// Prepares the three rotated scalar vectors for the 3-channel case and calls
// the vector kernel. The scratch 'scalar' array is pre-filled with the
// repeating pattern {C1, C2, C3, C1, C2, C3, ...}, so loads at offsets 0/1/2
// yield vectors starting with C1, C2 and C3 respectively.
template<typename T>
CV_ALWAYS_INLINE int absdiffc_simd_c3(const T in[], const float scalar[], T out[], int width)
{
constexpr int chan = 3;
int length = width * chan;
v_float32 s1 = vx_load(scalar);
#if CV_SIMD_WIDTH == 32
// NOTE(review): for 256-bit registers the s2/s3 load offsets are swapped
// (scalar+2 then scalar+1) — presumably compensating for how the 256-bit
// pack intrinsics interleave their 128-bit halves. TODO: confirm against
// the AVX2 universal-intrinsics pack semantics.
v_float32 s2 = vx_load(scalar + 2);
v_float32 s3 = vx_load(scalar + 1);
#else
v_float32 s2 = vx_load(scalar + 1);
v_float32 s3 = vx_load(scalar + 2);
#endif
return absdiffc_simd_c3_impl(in, out, s1, s2, s3, length);
}
// Top-level AbsDiffC SIMD dispatcher: picks the per-channel-layout kernel and
// returns how many elements it processed (0 means the caller must fall back
// to the scalar loop for everything).
template<typename T>
CV_ALWAYS_INLINE int absdiffc_simd(const T in[], const float scalar[], T out[], int width, int chan)
{
    // 3-channel data needs the rotated-scalar variant; 1/2/4-channel data
    // shares one path because the replicated scalar vector already lines up.
    if (chan == 3)
        return absdiffc_simd_c3(in, scalar, out, width);
    if (chan == 1 || chan == 2 || chan == 4)
        return absdiffc_simd_channels(in, scalar, out, width, chan);
    return 0; // unsupported layout: scalar code handles the whole row
}
#endif // CV_SIMD
template<typename DST, typename SRC>
static void run_absdiffc(Buffer &dst, const View &src, const float scalar[])
{
@ -1240,13 +1002,14 @@ static void run_absdiffc(Buffer &dst, const View &src, const float scalar[])
int width = dst.length();
int chan = dst.meta().chan;
const int length = width * chan;
int w = 0;
#if CV_SIMD
w = absdiffc_simd(in, scalar, out, width, chan);
w = absdiffc_simd(in, scalar, out, length, chan);
#endif
for (; w < width*chan; ++w)
for (; w < length; ++w)
out[w] = absdiff<DST>(in[w], scalar[w%chan]);
}
@ -1349,6 +1112,32 @@ static void run_arithm_rs(Buffer &dst, const View &src, const float scalar[4], A
}
}
// Allocates the per-kernel scratch row used to hold the replicated scalar
// coefficients for the arithmetic-with-scalar fluid kernels.
CV_ALWAYS_INLINE void initScratchBuffer(Buffer& scratch)
{
#if CV_SIMD
    // An AVX-512 register holds 512 / 32 = 16 float32 lanes — the widest
    // vector any dispatch target can load from this buffer.
    constexpr int maxNlanes = 16;
    // Two extra floats so the 3-channel case can load the rotated scalar
    // patterns from offsets 0, 1 and 2 of the replicated array
    // {C1, C2, C3, C1, C2, C3, ...} without reading past the end:
    //   offset 0 -> C1 C2 C3 C1 ...
    //   offset 1 -> C2 C3 C1 C2 ...
    //   offset 2 -> C3 C1 C2 C3 ...
    constexpr int offset = 2;
    constexpr int buflen = maxNlanes + offset;
#else
    constexpr int buflen = 4; // scalar path: one float per possible channel
#endif
    cv::Size bufsize(buflen, 1);
    GMatDesc desc = { CV_32F, 1, bufsize };
    Buffer allocated(desc);
    scratch = std::move(allocated);
}
GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true)
{
static const int Window = 1;
@ -1370,21 +1159,14 @@ GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true)
UNARY_(uchar, uchar, run_absdiffc, dst, src, scalar);
UNARY_(ushort, ushort, run_absdiffc, dst, src, scalar);
UNARY_(short, short, run_absdiffc, dst, src, scalar);
UNARY_(float, float, run_absdiffc, dst, src, scalar);
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
}
static void initScratch(const GMatDesc&, const GScalarDesc&, Buffer& scratch)
{
#if CV_SIMD
constexpr int buflen = static_cast<int>(v_float32::nlanes) + 2; // buffer size
#else
constexpr int buflen = 4;
#endif
cv::Size bufsize(buflen, 1);
GMatDesc bufdesc = { CV_32F, 1, bufsize };
Buffer buffer(bufdesc);
scratch = std::move(buffer);
initScratchBuffer(scratch);
}
static void resetScratch(Buffer& /* scratch */)
@ -1392,32 +1174,6 @@ GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true)
}
};
// Allocates the scratch row that holds the replicated scalar coefficients
// for the arithmetic-with-scalar fluid kernels.
CV_ALWAYS_INLINE void initScratchBuffer(Buffer& scratch)
{
#if CV_SIMD
// 512 bits / 32 bits = 16: an AVX-512 SIMD vector holds at most 16 float32 elements.
constexpr int maxNlanes = 16;
// +2 is the extra offset for the 3-channel case.
// The offset is needed to load the coefficients correctly from the scalar
// array into SIMD vectors for the 3-channel case.
// The scalar array looks like: scalar[] = {C1, C2, C3, C1, C2, C3, ...}
// The first scalar SIMD vector should look like:
// C1 C2 C3 C1
// The second:
// C2 C3 C1 C2
// The third:
// C3 C1 C2 C3
constexpr int offset = 2;
constexpr int buflen = maxNlanes + offset;
#else
// Scalar fallback: one float per possible channel (up to 4).
constexpr int buflen = 4;
#endif
cv::Size bufsize(buflen, 1);
GMatDesc bufdesc = { CV_32F, 1, bufsize };
Buffer buffer(bufdesc);
scratch = std::move(buffer);
}
GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, true)
{
static const int Window = 1;

@ -165,6 +165,21 @@ MULC_SIMD(float, float)
#undef MULC_SIMD
// AbsDiffC runtime dispatcher: forwards each type's absdiffc_simd call to the
// best CPU-specific implementation available (CV_CPU_DISPATCH selects among
// the per-ISA builds of gfluidcore_func.simd.hpp).
// NOTE: no '//' comments inside the macro body — line splicing would pull the
// next line into the comment.
#define ABSDIFFC_SIMD(SRC) \
int absdiffc_simd(const SRC in[], const float scalar[], SRC out[], \
const int length, const int chan) \
{ \
CV_CPU_DISPATCH(absdiffc_simd, (in, scalar, out, length, chan), \
CV_CPU_DISPATCH_MODES_ALL); \
}
ABSDIFFC_SIMD(uchar)
ABSDIFFC_SIMD(short)
ABSDIFFC_SIMD(ushort)
ABSDIFFC_SIMD(float)
#undef ABSDIFFC_SIMD
} // namespace fluid
} // namespace gapi
} // namespace cv

@ -60,8 +60,8 @@ MUL_SIMD(float, float)
#undef MUL_SIMD
#define ADDC_SIMD(SRC, DST) \
int addc_simd(const SRC in[], const float scalar[], DST out[], \
#define ADDC_SIMD(SRC, DST) \
int addc_simd(const SRC in[], const float scalar[], DST out[], \
const int length, const int chan);
ADDC_SIMD(uchar, uchar)
@ -83,8 +83,8 @@ ADDC_SIMD(float, float)
#undef ADDC_SIMD
#define SUBC_SIMD(SRC, DST) \
int subc_simd(const SRC in[], const float scalar[], DST out[], \
#define SUBC_SIMD(SRC, DST) \
int subc_simd(const SRC in[], const float scalar[], DST out[], \
const int length, const int chan);
SUBC_SIMD(uchar, uchar)
@ -106,8 +106,8 @@ SUBC_SIMD(float, float)
#undef SUBC_SIMD
#define MULC_SIMD(SRC, DST) \
int mulc_simd(const SRC in[], const float scalar[], DST out[], \
#define MULC_SIMD(SRC, DST) \
int mulc_simd(const SRC in[], const float scalar[], DST out[], \
const int length, const int chan, const float scale);
MULC_SIMD(uchar, uchar)
@ -129,6 +129,17 @@ MULC_SIMD(float, float)
#undef MULC_SIMD
// AbsDiffC entry points (defined in gfluidcore_func.dispatch.cpp):
// compute |in[i] - scalar[i % chan]| over 'length' elements and return the
// number of elements processed with SIMD (0 means full scalar fallback).
#define ABSDIFFC_SIMD(T) \
int absdiffc_simd(const T in[], const float scalar[], T out[], \
const int length, const int chan);
ABSDIFFC_SIMD(uchar)
ABSDIFFC_SIMD(short)
ABSDIFFC_SIMD(ushort)
ABSDIFFC_SIMD(float)
#undef ABSDIFFC_SIMD
} // namespace fluid
} // namespace gapi
} // namespace cv

@ -151,6 +151,17 @@ MULC_SIMD(float, float)
#undef MULC_SIMD
// Forward declarations of the per-ISA AbsDiffC kernels; the definitions below
// are compiled once per enabled CPU capability (AVX2, AVX-512, ...).
#define ABSDIFFC_SIMD(T) \
int absdiffc_simd(const T in[], const float scalar[], T out[], \
const int length, const int chan);
ABSDIFFC_SIMD(uchar)
ABSDIFFC_SIMD(short)
ABSDIFFC_SIMD(ushort)
ABSDIFFC_SIMD(float)
#undef ABSDIFFC_SIMD
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
struct scale_tag {};
@ -901,6 +912,7 @@ MUL_SIMD(float, float)
struct add_tag {};
struct sub_tag {};
struct mul_tag {};
struct absdiff_tag {};
CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(short* outx, const v_int32& c1,
const v_int32& c2, const v_int32& c3,
@ -938,6 +950,12 @@ CV_ALWAYS_INLINE v_float32 oper(mul_tag, const v_float32& a, const v_float32& sc
{
return a * sc;
}
// AbsDiffC kernel body: |a - sc| per float32 lane, tag-dispatched the same
// way as the add/sub/mul scalar operations above.
CV_ALWAYS_INLINE v_float32 oper(absdiff_tag, const v_float32& a, const v_float32& sc)
{
return v_absdiff(a, sc);
}
//-------------------------------------------------------------------------------------------------
template<typename oper_tag, typename SRC, typename DST>
@ -1450,6 +1468,38 @@ MULC_SIMD(float, float)
#undef MULC_SIMD
//-------------------------
//
// Fluid kernels: AbsDiffC
//
//-------------------------
// Per-ISA AbsDiffC implementation: interleaved 1/2/4-channel data goes to the
// shared scalar-op helper, 3-channel data to the rotated-scalar variant.
// Returns the number of elements processed (0 -> caller runs the scalar loop).
// NOTE: no '//' comments inside the macro body — line splicing would pull the
// next continuation line into the comment.
#define ABSDIFFC_SIMD(SRC) \
int absdiffc_simd(const SRC in[], const float scalar[], SRC out[], \
const int length, const int chan) \
{ \
switch (chan) \
{ \
case 1: \
case 2: \
case 4: \
return arithmOpScalar_simd_common(absdiff_tag{}, in, scalar, out, length); \
case 3: \
return arithmOpScalar_simd_c3(absdiff_tag{}, in, scalar, out, length); \
default: \
GAPI_Assert(chan <= 4); \
break; \
} \
return 0; \
}
ABSDIFFC_SIMD(uchar)
ABSDIFFC_SIMD(short)
ABSDIFFC_SIMD(ushort)
ABSDIFFC_SIMD(float)
#undef ABSDIFFC_SIMD
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
CV_CPU_OPTIMIZATION_NAMESPACE_END

Loading…
Cancel
Save