Merge pull request #21158 from anna-khakimova:ak/simd_subC

* GAPI Fluid: SIMD for SubC kernel.

* Applied comments
Author: Anna Khakimova, committed by GitHub
parent d9e7c1626a
commit 369b260e12
Changed files (lines changed):

  1. modules/gapi/perf/common/gapi_core_perf_tests.hpp (2)
  2. modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp (16)
  3. modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp (3)
  4. modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp (11)
  5. modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp (3)
  6. modules/gapi/src/backends/fluid/gfluidcore.cpp (152)
  7. modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp (32)
  8. modules/gapi/src/backends/fluid/gfluidcore_func.hpp (25)
  9. modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp (237)
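In short, the commit replaces GFluidSubC's ad-hoc scalar code with the same wide-SIMD dispatch already used for AddC, extends the supported type combinations, and enables the previously commented-out Fluid perf test. For context, SubC is G-API's "subtract a constant" operation: out = in - C, applied per channel. A minimal sketch of a graph that exercises the kernel this commit vectorizes (standard G-API calls; the sizes and scalar values are illustrative only):

    #include <opencv2/gapi.hpp>
    #include <opencv2/gapi/core.hpp>
    #include <opencv2/gapi/fluid/core.hpp>

    int main()
    {
        cv::GMat in;
        cv::GScalar c;
        cv::GComputation graph(cv::GIn(in, c), cv::GOut(cv::gapi::subC(in, c)));

        cv::Mat src(1080, 1920, CV_8UC3, cv::Scalar::all(128)), dst;
        graph.apply(cv::gin(src, cv::Scalar(1, 2, 3)), cv::gout(dst),
                    cv::compile_args(cv::gapi::core::fluid::kernels()));  // pick the Fluid backend
        return 0;
    }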

modules/gapi/perf/common/gapi_core_perf_tests.hpp
@@ -30,7 +30,7 @@ namespace opencv_test
 class AddPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
 class AddCPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};
 class SubPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
-class SubCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
+class SubCPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};
 class SubRCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
 class MulPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, double, cv::GCompileArgs>> {};
 class MulDoublePerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};

modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
@@ -138,10 +138,13 @@ PERF_TEST_P_(SubPerfTest, TestPerformance)
 PERF_TEST_P_(SubCPerfTest, TestPerformance)
 {
-    Size sz = get<0>(GetParam());
-    MatType type = get<1>(GetParam());
-    int dtype = get<2>(GetParam());
-    cv::GCompileArgs compile_args = get<3>(GetParam());
+    compare_f cmpF;
+    cv::Size sz;
+    MatType type = -1;
+    int dtype = -1;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, sz, type, dtype, compile_args) = GetParam();

     initMatsRandU(type, sz, dtype, false);
@@ -165,8 +168,9 @@ PERF_TEST_P_(SubCPerfTest, TestPerformance)
     }

     // Comparison ////////////////////////////////////////////////////////////
-    // FIXIT unrealiable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
-    EXPECT_EQ(out_mat_gapi.size(), sz);
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+    }

     SANITY_CHECK_NOTHING();
 }
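The FIXIT bit-exact check is replaced with a pluggable comparison functor. As used in the G-API test helpers (gapi_tests_common.hpp), compare_f is roughly the shape sketched below; AbsExact demands bit-exact output, while Tolerance_FloatRel_IntAbs tolerates the small rounding differences SIMD reordering can introduce (a sketch of the test plumbing, not the exact helper code):

    // Rough shape of the comparison functors the tests above assume:
    using compare_f = std::function<bool(const cv::Mat& a, const cv::Mat& b)>;

    compare_f exact = AbsExact().to_compare_f();                          // no difference allowed
    compare_f loose = Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f();  // 1e-6 relative (float), +/-1 absolute (int)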

modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
@@ -35,7 +35,8 @@ INSTANTIATE_TEST_CASE_P(SubPerfTestCPU, SubPerfTest,
                                 Values(cv::compile_args(CORE_CPU))));

 INSTANTIATE_TEST_CASE_P(SubCPerfTestCPU, SubCPerfTest,
-                        Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(szSmall128, szVGA, sz720p, sz1080p),
                                 Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
                                 Values(-1, CV_8U, CV_16U, CV_32F),
                                 Values(cv::compile_args(CORE_CPU))));

modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
@@ -31,11 +31,12 @@ INSTANTIATE_TEST_CASE_P(SubPerfTestFluid, SubPerfTest,
                                 Values(-1, CV_8U, CV_32F),
                                 Values(cv::compile_args(CORE_FLUID))));

-// INSTANTIATE_TEST_CASE_P(SubCPerfTestFluid, SubCPerfTest,
-//     Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
-//             Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
-//             Values(-1, CV_8U, CV_16U, CV_32F),
-//             Values(cv::compile_args(CORE_FLUID))));
+INSTANTIATE_TEST_CASE_P(SubCPerfTestFluid, SubCPerfTest,
+                        Combine(Values(Tolerance_FloatRel_IntAbs(1e-6, 1).to_compare_f()),
+                                Values(szSmall128, szVGA, sz720p, sz1080p),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(-1, CV_8U, CV_16U, CV_16S, CV_32F),
+                                Values(cv::compile_args(CORE_FLUID))));

 // INSTANTIATE_TEST_CASE_P(SubRCPerfTestFluid, SubRCPerfTest,
 //     Combine(Values(szSmall128, szVGA, sz720p, sz1080p),

modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
@@ -33,7 +33,8 @@ INSTANTIATE_TEST_CASE_P(SubPerfTestGPU, SubPerfTest,
                                 Values(cv::compile_args(CORE_GPU))));

 INSTANTIATE_TEST_CASE_P(SubCPerfTestGPU, SubCPerfTest,
-                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values( szSmall128, szVGA, sz720p, sz1080p ),
                                 Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
                                 Values( -1, CV_8U, CV_16U, CV_32F ),
                                 Values(cv::compile_args(CORE_GPU))));

modules/gapi/src/backends/fluid/gfluidcore.cpp
@@ -844,16 +844,12 @@ GAPI_FLUID_KERNEL(GFluidAbsDiff, cv::gapi::core::GAbsDiff, false)
 //
 //--------------------------------------

-static inline v_uint16x8 v_sub_16u(const v_uint16x8 &x, const v_uint16x8 &y) { return x - y; }
 static inline v_uint16x8 v_subr_16u(const v_uint16x8 &x, const v_uint16x8 &y) { return y - x; }

-static inline v_float32x4 v_sub_32f(const v_float32x4 &x, const v_float32x4 &y) { return x - y; }
 static inline v_float32x4 v_subr_32f(const v_float32x4 &x, const v_float32x4 &y) { return y - x; }

-static inline int s_sub_8u(uchar x, uchar y) { return x - y; }
 static inline int s_subr_8u(uchar x, uchar y) { return y - x; }

-static inline float s_sub_32f(float x, float y) { return x - y; }
 static inline float s_subr_32f(float x, float y) { return y - x; }

 // manual SIMD if important case 8UC3
@@ -942,21 +938,11 @@ static void run_arithm_s1(uchar out[], const float in[], int width, const float
     }
 }

-static void run_arithm_s_sub3(uchar out[], const uchar in[], int width, const uchar scalar[])
-{
-    run_arithm_s3(out, in, width, scalar, v_sub_16u, s_sub_8u);
-}
-
 static void run_arithm_s_subr3(uchar out[], const uchar in[], int width, const uchar scalar[])
 {
     run_arithm_s3(out, in, width, scalar, v_subr_16u, s_subr_8u); // reverse: subr
 }

-static void run_arithm_s_sub1(uchar out[], const float in[], int width, const float scalar[])
-{
-    run_arithm_s1(out, in, width, scalar, v_sub_32f, s_sub_32f);
-}
-
 static void run_arithm_s_subr1(uchar out[], const float in[], int width, const float scalar[])
 {
     run_arithm_s1(out, in, width, scalar, v_subr_32f, s_subr_32f); // reverse: subr
@@ -1273,6 +1259,7 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
     int width = dst.length();
     int chan = dst.meta().chan;
+    const int length = width * chan;

     switch (arithm)
     {
@@ -1280,37 +1267,21 @@ CV_ALWAYS_INLINE void run_arithm_s(Buffer &dst, const View &src, const float sca
     {
         int w = 0;
 #if CV_SIMD
-        w = addc_simd(in, scalar, out, width, chan);
+        w = addc_simd(in, scalar, out, length, chan);
 #endif
-        for (; w < width * chan; ++w)
+        for (; w < length; ++w)
             out[w] = add<DST>(in[w], scalar[w % chan]);

         break;
     }
     case ARITHM_SUBTRACT:
     {
-        // What if we cast the scalar into the SRC type?
-        const SRC myscal[4] = { static_cast<SRC>(scalar[0]), static_cast<SRC>(scalar[1]),
-                                static_cast<SRC>(scalar[2]), static_cast<SRC>(scalar[3]) };
-        bool usemyscal = (myscal[0] == scalar[0]) && (myscal[1] == scalar[1]) &&
-                         (myscal[2] == scalar[2]) && (myscal[3] == scalar[3]);
-
-        if (usemyscal)
-        {
-            if (std::is_same<DST, uchar>::value &&
-                std::is_same<SRC, uchar>::value &&
-                chan == 3)
-                run_arithm_s_sub3((uchar*)out, (const uchar*)in, width, (const uchar*)myscal);
-            else if (std::is_same<DST, uchar>::value &&
-                     std::is_same<SRC, float>::value &&
-                     chan == 1)
-                run_arithm_s_sub1((uchar*)out, (const float*)in, width, (const float*)myscal);
-            else
-                run_arithm_s(out, in, width, chan, myscal, sub<DST, SRC, SRC>);
-        }
-        else
-            run_arithm_s(out, in, width, chan, scalar, sub<DST, SRC, float>);
+        int w = 0;
+#if CV_SIMD
+        w = subc_simd(in, scalar, out, length, chan);
+#endif
+        for (; w < length; ++w)
+            out[w] = sub<DST>(in[w], scalar[w % chan]);
+
         break;
     }
     // TODO: optimize miltiplication and division
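Both the ADD and SUBTRACT cases now follow the same bulk-plus-tail contract: the *_simd routine consumes as many whole vectors as fit and returns the index of the first unprocessed element, and a scalar loop finishes the remainder. A self-contained sketch of that contract (plain C++, no OpenCV types; simd_width is a stand-in for the real lane count):

    // bulk: pretend-vector loop that reports how far it got
    static int subc_bulk(const float* in, const float* sc, float* out, int length, int chan)
    {
        const int simd_width = 8;                 // placeholder for vector lanes
        int x = 0;
        for (; x <= length - simd_width; x += simd_width)
            for (int i = 0; i < simd_width; ++i)  // stands in for one vector subtract
                out[x + i] = in[x + i] - sc[(x + i) % chan];
        return x;                                 // first element NOT yet processed
    }

    void subc(const float* in, const float* sc, float* out, int length, int chan)
    {
        int w = subc_bulk(in, sc, out, length, chan);
        for (; w < length; ++w)                   // scalar tail, same role as the kernel's fallback loop
            out[w] = in[w] - sc[w % chan];
    }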
@@ -1416,6 +1387,32 @@ GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true)
     }
 };

+CV_ALWAYS_INLINE void initScratchBuffer(Buffer& scratch)
+{
+#if CV_SIMD
+    // 512 bits / 32 bits = 16: an AVX-512 SIMD vector holds up to 16 float32 elements.
+    constexpr int maxNlanes = 16;
+
+    // +2 is an offset for the 3-channel case. The offset is needed to load the
+    // coefficients from the scalar array into SIMD vectors correctly in the
+    // 3-channel case. The scalar array looks like: scalar[] = {C1, C2, C3, C1, C2, C3, ...}
+    // The first scalar SIMD vector should look like:
+    // C1 C2 C3 C1
+    // The second:
+    // C2 C3 C1 C2
+    // The third:
+    // C3 C1 C2 C3
+    constexpr int offset = 2;
+    constexpr int buflen = maxNlanes + offset;
+#else
+    constexpr int buflen = 4;
+#endif
+    cv::Size bufsize(buflen, 1);
+    GMatDesc bufdesc = { CV_32F, 1, bufsize };
+    Buffer buffer(bufdesc);
+    scratch = std::move(buffer);
+}
+
 GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, true)
 {
     static const int Window = 1;
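Why buflen is maxNlanes + 2: for 3-channel images the kernels take three overlapping vector loads from the replicated scratch row, starting at elements 0, 1 and 2, which yields exactly the three rotated constant vectors the comment describes. A self-contained illustration with 4 lanes (values arbitrary):

    #include <cstdio>

    int main()
    {
        // scratch row: scalar replicated as {C1, C2, C3, C1, C2, C3}, length nlanes + 2
        const float sc[6] = {1.f, 2.f, 3.f, 1.f, 2.f, 3.f};
        const int nlanes = 4;
        for (int off = 0; off < 3; ++off)        // three rotated "vector loads"
        {
            std::printf("load at +%d: ", off);   // plays the role of vx_load(&sc[off])
            for (int lane = 0; lane < nlanes; ++lane)
                std::printf("%g ", sc[off + lane]);
            std::printf("\n");                   // prints C1 C2 C3 C1 / C2 C3 C1 C2 / C3 C1 C2 C3
        }
        return 0;
    }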
@@ -1458,59 +1455,62 @@ GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, true)
     static void initScratch(const GMatDesc&, const GScalarDesc&, int, Buffer& scratch)
     {
-#if CV_SIMD
-        // 512 bits / 32 bits = 16 elements of float32 can contain a AVX 512 SIMD vector.
-        constexpr int maxNlanes = 16;
-
-        // +2 is offset for 3-channel case.
-        // Offset is need to right load coefficients from scalar array to SIMD vectors for 3-channel case.
-        // Scalar array looks like: scalar[] = {C1, C2, C3, C1, C2, C3, ...}
-        // The first scalar SIMD vector should looks like:
-        // C1 C2 C3 C1
-        // The second:
-        // C2 C3 C1 C2
-        // The third:
-        // C3 C1 C2 C3
-        constexpr int offset = 2;
-        constexpr int buflen = maxNlanes + offset;
-#else
-        constexpr int buflen = 4;
-#endif
-        cv::Size bufsize(buflen, 1);
-        GMatDesc bufdesc = { CV_32F, 1, bufsize };
-        Buffer buffer(bufdesc);
-        scratch = std::move(buffer);
+        initScratchBuffer(scratch);
     }

-    static void resetScratch(Buffer& /* scratch */)
+    static void resetScratch(Buffer& /*scratch*/)
     {
     }
 };

-GAPI_FLUID_KERNEL(GFluidSubC, cv::gapi::core::GSubC, false)
+GAPI_FLUID_KERNEL(GFluidSubC, cv::gapi::core::GSubC, true)
 {
     static const int Window = 1;

-    static void run(const View &src, const cv::Scalar &_scalar, int /*dtype*/, Buffer &dst)
+    static void run(const View& src, const cv::Scalar& _scalar, int /*dtype*/, Buffer& dst, Buffer& scratch)
     {
-        const float scalar[4] = {
-            static_cast<float>(_scalar[0]),
-            static_cast<float>(_scalar[1]),
-            static_cast<float>(_scalar[2]),
-            static_cast<float>(_scalar[3])
-        };
+        GAPI_Assert(src.meta().chan <= 4);
+
+        if (dst.y() == 0)
+        {
+            const int chan = src.meta().chan;
+            float* sc = scratch.OutLine<float>();
+
+            for (int i = 0; i < scratch.length(); ++i)
+                sc[i] = static_cast<float>(_scalar[i % chan]);
+        }
+
+        const float* scalar = scratch.OutLine<float>();

         //     DST     SRC     OP            __VA_ARGS__
-        UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_(uchar , short , run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_(uchar , float , run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_( short, short , run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_( float, uchar , run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_( float, short , run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
-        UNARY_( float, float , run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar,  uchar,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar,  ushort, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar,  short,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar,  float,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(ushort, ushort, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(ushort, short,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(ushort, uchar,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(ushort, float,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(short,  short,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(short,  ushort, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(short,  uchar,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(short,  float,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(float,  uchar,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(float,  ushort, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(float,  short,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(float,  float,  run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);

         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }
+
+    static void initScratch(const GMatDesc&, const GScalarDesc&, int, Buffer& scratch)
+    {
+        initScratchBuffer(scratch);
+    }
+
+    static void resetScratch(Buffer& /*scratch*/)
+    {
+    }
 };

 GAPI_FLUID_KERNEL(GFluidSubRC, cv::gapi::core::GSubRC, false)
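The kernel now carries a scratch buffer (the trailing `true` in GAPI_FLUID_KERNEL enables it), so the cv::Scalar-to-float conversion and replication happen once per frame, on the first output row (dst.y() == 0), rather than on every row. A self-contained sketch of that once-per-frame pattern (Fluid types replaced with plain ones):

    #include <vector>

    struct FakeScratch { std::vector<float> row; };  // stand-in for Fluid's scratch Buffer

    // Called once per output row, like GFluidSubC::run; y == 0 means "new frame".
    void run_row(int y, const double scalar[4], int chan, FakeScratch& scratch)
    {
        if (y == 0)                                   // fill scratch only on the first row
            for (std::size_t i = 0; i < scratch.row.size(); ++i)
                scratch.row[i] = static_cast<float>(scalar[i % chan]);
        // ... every row then reads the prepared floats from scratch.row ...
    }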

modules/gapi/src/backends/fluid/gfluidcore_func.dispatch.cpp
@@ -65,7 +65,6 @@ int mul_simd(const SRC in1[], const SRC in2[], DST out[],                   \
                     CV_CPU_DISPATCH_MODES_ALL);                              \
 }

-
 MUL_SIMD(uchar, uchar)
 MUL_SIMD(ushort, uchar)
 MUL_SIMD(short, uchar)

@@ -87,9 +86,9 @@ MUL_SIMD(float, float)

 #define ADDC_SIMD(SRC, DST)                                                  \
 int addc_simd(const SRC in[], const float scalar[], DST out[],               \
-              const int width, const int chan)                               \
+              const int length, const int chan)                              \
 {                                                                            \
-    CV_CPU_DISPATCH(addc_simd, (in, scalar, out, width, chan),               \
+    CV_CPU_DISPATCH(addc_simd, (in, scalar, out, length, chan),              \
                     CV_CPU_DISPATCH_MODES_ALL);                              \
 }
@@ -112,6 +111,33 @@ ADDC_SIMD(float, float)

 #undef ADDC_SIMD

+#define SUBC_SIMD(SRC, DST)                                                  \
+int subc_simd(const SRC in[], const float scalar[], DST out[],               \
+              const int length, const int chan)                              \
+{                                                                            \
+    CV_CPU_DISPATCH(subc_simd, (in, scalar, out, length, chan),              \
+                    CV_CPU_DISPATCH_MODES_ALL);                              \
+}
+
+SUBC_SIMD(uchar, uchar)
+SUBC_SIMD(ushort, uchar)
+SUBC_SIMD(short, uchar)
+SUBC_SIMD(float, uchar)
+SUBC_SIMD(short, short)
+SUBC_SIMD(ushort, short)
+SUBC_SIMD(uchar, short)
+SUBC_SIMD(float, short)
+SUBC_SIMD(ushort, ushort)
+SUBC_SIMD(uchar, ushort)
+SUBC_SIMD(short, ushort)
+SUBC_SIMD(float, ushort)
+SUBC_SIMD(uchar, float)
+SUBC_SIMD(ushort, float)
+SUBC_SIMD(short, float)
+SUBC_SIMD(float, float)
+
+#undef SUBC_SIMD
+
 } // namespace fluid
 } // namespace gapi
 } // namespace cv
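CV_CPU_DISPATCH is OpenCV's runtime ISA dispatcher: the .simd.hpp body is compiled once per enabled instruction set into per-ISA namespaces, and the wrapper generated by the macro above picks the most capable variant the CPU supports. A self-contained sketch of the idea (the namespace names mirror OpenCV's convention; has_avx2() is a placeholder for the real capability check):

    #include <cstdio>

    namespace opt_AVX2     { int subc_simd() { return 32; /* bytes per vector */ } }
    namespace cpu_baseline { int subc_simd() { return 16; } }

    static bool has_avx2() { return false; }  // placeholder for the runtime CPU-feature check

    int subc_simd()                           // dispatching wrapper, as the macro generates
    {
        return has_avx2() ? opt_AVX2::subc_simd()
                          : cpu_baseline::subc_simd();
    }

    int main() { std::printf("%d\n", subc_simd()); }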

modules/gapi/src/backends/fluid/gfluidcore_func.hpp
@@ -62,7 +62,7 @@ MUL_SIMD(float, float)

 #define ADDC_SIMD(SRC, DST)                                                  \
 int addc_simd(const SRC in[], const float scalar[], DST out[],               \
-              const int width, const int chan);
+              const int length, const int chan);

 ADDC_SIMD(uchar, uchar)
 ADDC_SIMD(ushort, uchar)

@@ -83,6 +83,29 @@ ADDC_SIMD(float, float)

 #undef ADDC_SIMD

+#define SUBC_SIMD(SRC, DST)                                                  \
+int subc_simd(const SRC in[], const float scalar[], DST out[],               \
+              const int length, const int chan);
+
+SUBC_SIMD(uchar, uchar)
+SUBC_SIMD(ushort, uchar)
+SUBC_SIMD(short, uchar)
+SUBC_SIMD(float, uchar)
+SUBC_SIMD(short, short)
+SUBC_SIMD(ushort, short)
+SUBC_SIMD(uchar, short)
+SUBC_SIMD(float, short)
+SUBC_SIMD(ushort, ushort)
+SUBC_SIMD(uchar, ushort)
+SUBC_SIMD(short, ushort)
+SUBC_SIMD(float, ushort)
+SUBC_SIMD(uchar, float)
+SUBC_SIMD(ushort, float)
+SUBC_SIMD(short, float)
+SUBC_SIMD(float, float)
+
+#undef SUBC_SIMD
+
 } // namespace fluid
 } // namespace gapi
 } // namespace cv

modules/gapi/src/backends/fluid/gfluidcore_func.simd.hpp
@@ -83,7 +83,7 @@ MUL_SIMD(float, float)

 #define ADDC_SIMD(SRC, DST)                                                  \
 int addc_simd(const SRC in[], const float scalar[], DST out[],               \
-              const int width, const int chan);
+              const int length, const int chan);

 ADDC_SIMD(uchar, uchar)
 ADDC_SIMD(ushort, uchar)

@@ -104,6 +104,29 @@ ADDC_SIMD(float, float)

 #undef ADDC_SIMD

+#define SUBC_SIMD(SRC, DST)                                                  \
+int subc_simd(const SRC in[], const float scalar[], DST out[],               \
+              const int length, const int chan);
+
+SUBC_SIMD(uchar, uchar)
+SUBC_SIMD(ushort, uchar)
+SUBC_SIMD(short, uchar)
+SUBC_SIMD(float, uchar)
+SUBC_SIMD(short, short)
+SUBC_SIMD(ushort, short)
+SUBC_SIMD(uchar, short)
+SUBC_SIMD(float, short)
+SUBC_SIMD(ushort, ushort)
+SUBC_SIMD(uchar, ushort)
+SUBC_SIMD(short, ushort)
+SUBC_SIMD(float, ushort)
+SUBC_SIMD(uchar, float)
+SUBC_SIMD(ushort, float)
+SUBC_SIMD(short, float)
+SUBC_SIMD(float, float)
+
+#undef SUBC_SIMD
+
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

 struct scale_tag {};
@@ -851,10 +874,13 @@ MUL_SIMD(float, float)
 //
 //-------------------------

-CV_ALWAYS_INLINE void addc_pack_store_c3(short* outx, const v_int32& c1,
-                                         const v_int32& c2, const v_int32& c3,
-                                         const v_int32& c4, const v_int32& c5,
-                                         const v_int32& c6)
+struct add_tag {};
+struct sub_tag {};
+
+CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(short* outx, const v_int32& c1,
+                                                   const v_int32& c2, const v_int32& c3,
+                                                   const v_int32& c4, const v_int32& c5,
+                                                   const v_int32& c6)
 {
     constexpr int nlanes = v_int16::nlanes;
     vx_store(outx, v_pack(c1, c2));
@@ -862,10 +888,10 @@ CV_ALWAYS_INLINE void addc_pack_store_c3(short* outx, const v_int32& c1,
     vx_store(&outx[2*nlanes], v_pack(c5, c6));
 }

-CV_ALWAYS_INLINE void addc_pack_store_c3(ushort* outx, const v_int32& c1,
-                                         const v_int32& c2, const v_int32& c3,
-                                         const v_int32& c4, const v_int32& c5,
-                                         const v_int32& c6)
+CV_ALWAYS_INLINE void arithmOpScalar_pack_store_c3(ushort* outx, const v_int32& c1,
+                                                   const v_int32& c2, const v_int32& c3,
+                                                   const v_int32& c4, const v_int32& c5,
+                                                   const v_int32& c6)
 {
     constexpr int nlanes = v_uint16::nlanes;
     vx_store(outx, v_pack_u(c1, c2));
@@ -873,50 +899,64 @@ CV_ALWAYS_INLINE void addc_pack_store_c3(ushort* outx, const v_int32& c1,
     vx_store(&outx[2*nlanes], v_pack_u(c5, c6));
 }

-template<typename SRC, typename DST>
+CV_ALWAYS_INLINE v_float32 oper(add_tag, const v_float32& a, const v_float32& sc)
+{
+    return a + sc;
+}
+
+CV_ALWAYS_INLINE v_float32 oper(sub_tag, const v_float32& a, const v_float32& sc)
+{
+    return a - sc;
+}
+
+template<typename oper_tag, typename SRC, typename DST>
 CV_ALWAYS_INLINE
 typename std::enable_if<(std::is_same<DST, ushort>::value ||
                          std::is_same<DST, short>::value), void>::type
-addc_simd_common_impl(const SRC* inx, DST* outx, const v_float32& sc, const int nlanes)
+arithmOpScalar_simd_common_impl(oper_tag t, const SRC* inx, DST* outx,
+                                const v_float32& sc, const int nlanes)
 {
     v_float32 a1 = vg_load_f32(inx);
     v_float32 a2 = vg_load_f32(&inx[nlanes/2]);

-    v_store_i16(outx, v_round(a1 + sc), v_round(a2 + sc));
+    v_store_i16(outx, v_round(oper(t, a1, sc)), v_round(oper(t, a2, sc)));
 }

 //-------------------------------------------------------------------------------------------------

-template<typename SRC>
-CV_ALWAYS_INLINE void addc_simd_common_impl(const SRC* inx, uchar* outx, const v_float32& sc, const int nlanes)
+template<typename oper_tag, typename SRC>
+CV_ALWAYS_INLINE void arithmOpScalar_simd_common_impl(oper_tag t, const SRC* inx,
+                                                      uchar* outx, const v_float32& sc,
+                                                      const int nlanes)
 {
     v_float32 a1 = vg_load_f32(inx);
     v_float32 a2 = vg_load_f32(&inx[nlanes/4]);
     v_float32 a3 = vg_load_f32(&inx[nlanes/2]);
     v_float32 a4 = vg_load_f32(&inx[3 * nlanes/4]);

-    vx_store(outx, v_pack_u(v_pack(v_round(a1 + sc),
-                                   v_round(a2 + sc)),
-                            v_pack(v_round(a3 + sc),
-                                   v_round(a4 + sc))));
+    vx_store(outx, v_pack_u(v_pack(v_round(oper(t, a1, sc)),
+                                   v_round(oper(t, a2, sc))),
+                            v_pack(v_round(oper(t, a3, sc)),
+                                   v_round(oper(t, a4, sc)))));
 }

 //-------------------------------------------------------------------------------------------------

-template<typename SRC>
-CV_ALWAYS_INLINE void addc_simd_common_impl(const SRC* inx, float* outx, const v_float32& sc, const int)
+template<typename oper_tag, typename SRC>
+CV_ALWAYS_INLINE void arithmOpScalar_simd_common_impl(oper_tag t, const SRC* inx,
+                                                      float* outx, const v_float32& sc, const int)
 {
     v_float32 a1 = vg_load_f32(inx);
-    vx_store(outx, a1 + sc);
+    vx_store(outx, oper(t, a1, sc));
 }

 //-------------------------------------------------------------------------------------------------

-template<typename SRC, typename DST>
+template<typename oper_tag, typename SRC, typename DST>
 CV_ALWAYS_INLINE
 typename std::enable_if<std::is_same<DST, short>::value ||
                         std::is_same<DST, ushort>::value, void>::type
-addc_simd_c3_impl(const SRC* inx, DST* outx, const v_float32& s1, const v_float32& s2,
-                  const v_float32& s3, const int nlanes)
+arithmOpScalar_simd_c3_impl(oper_tag t, const SRC* inx, DST* outx, const v_float32& s1,
+                            const v_float32& s2, const v_float32& s3, const int nlanes)
 {
     v_float32 a1 = vg_load_f32(inx);
@@ -926,60 +966,62 @@ addc_simd_c3_impl(const SRC* inx, DST* outx, const v_float32& s1, const v_float3
     v_float32 a5 = vg_load_f32(&inx[2 * nlanes]);
     v_float32 a6 = vg_load_f32(&inx[5 * nlanes / 2]);

-    addc_pack_store_c3(outx, v_round(a1 + s1),
-                             v_round(a2 + s2),
-                             v_round(a3 + s3),
-                             v_round(a4 + s1),
-                             v_round(a5 + s2),
-                             v_round(a6 + s3));
+    arithmOpScalar_pack_store_c3(outx, v_round(oper(t, a1, s1)),
+                                       v_round(oper(t, a2, s2)),
+                                       v_round(oper(t, a3, s3)),
+                                       v_round(oper(t, a4, s1)),
+                                       v_round(oper(t, a5, s2)),
+                                       v_round(oper(t, a6, s3)));
 }

 //-------------------------------------------------------------------------------------------------

-template<typename SRC>
-CV_ALWAYS_INLINE void addc_simd_c3_impl(const SRC* inx, uchar* outx,
-                                        const v_float32& s1, const v_float32& s2,
-                                        const v_float32& s3, const int nlanes)
+template<typename oper_tag, typename SRC>
+CV_ALWAYS_INLINE void arithmOpScalar_simd_c3_impl(oper_tag t, const SRC* inx, uchar* outx,
+                                                  const v_float32& s1, const v_float32& s2,
+                                                  const v_float32& s3, const int nlanes)
 {
     vx_store(outx,
-             v_pack_u(v_pack(v_round(vg_load_f32(inx) + s1),
-                             v_round(vg_load_f32(&inx[nlanes/4]) + s2)),
-                      v_pack(v_round(vg_load_f32(&inx[nlanes/2]) + s3),
-                             v_round(vg_load_f32(&inx[3*nlanes/4]) + s1))));
+             v_pack_u(v_pack(v_round(oper(t, vg_load_f32(inx), s1)),
+                             v_round(oper(t, vg_load_f32(&inx[nlanes/4]), s2))),
+                      v_pack(v_round(oper(t, vg_load_f32(&inx[nlanes/2]), s3)),
+                             v_round(oper(t, vg_load_f32(&inx[3*nlanes/4]), s1)))));

     vx_store(&outx[nlanes],
-             v_pack_u(v_pack(v_round(vg_load_f32(&inx[nlanes]) + s2),
-                             v_round(vg_load_f32(&inx[5*nlanes/4]) + s3)),
-                      v_pack(v_round(vg_load_f32(&inx[3*nlanes/2]) + s1),
-                             v_round(vg_load_f32(&inx[7*nlanes/4]) + s2))));
+             v_pack_u(v_pack(v_round(oper(t, vg_load_f32(&inx[nlanes]), s2)),
+                             v_round(oper(t, vg_load_f32(&inx[5*nlanes/4]), s3))),
+                      v_pack(v_round(oper(t, vg_load_f32(&inx[3*nlanes/2]), s1)),
+                             v_round(oper(t, vg_load_f32(&inx[7*nlanes/4]), s2)))));

     vx_store(&outx[2 * nlanes],
-             v_pack_u(v_pack(v_round(vg_load_f32(&inx[2*nlanes]) + s3),
-                             v_round(vg_load_f32(&inx[9*nlanes/4]) + s1)),
-                      v_pack(v_round(vg_load_f32(&inx[5*nlanes/2]) + s2),
-                             v_round(vg_load_f32(&inx[11*nlanes/4]) + s3))));
+             v_pack_u(v_pack(v_round(oper(t, vg_load_f32(&inx[2*nlanes]), s3)),
+                             v_round(oper(t, vg_load_f32(&inx[9*nlanes/4]), s1))),
+                      v_pack(v_round(oper(t, vg_load_f32(&inx[5*nlanes/2]), s2)),
+                             v_round(oper(t, vg_load_f32(&inx[11*nlanes/4]), s3)))));
 }

 //-------------------------------------------------------------------------------------------------

-template<typename SRC>
-CV_ALWAYS_INLINE void addc_simd_c3_impl(const SRC* in, float* out,
-                                        const v_float32& s1, const v_float32& s2,
-                                        const v_float32& s3, const int nlanes)
+template<typename oper_tag, typename SRC>
+CV_ALWAYS_INLINE void arithmOpScalar_simd_c3_impl(oper_tag t, const SRC* in, float* out,
+                                                  const v_float32& s1, const v_float32& s2,
+                                                  const v_float32& s3, const int nlanes)
 {
     v_float32 a1 = vg_load_f32(in);
     v_float32 a2 = vg_load_f32(&in[nlanes]);
     v_float32 a3 = vg_load_f32(&in[2*nlanes]);

-    vx_store(out, a1 + s1);
-    vx_store(&out[nlanes], a2 + s2);
-    vx_store(&out[2*nlanes], a3 + s3);
+    vx_store(out, oper(t, a1, s1));
+    vx_store(&out[nlanes], oper(t, a2, s2));
+    vx_store(&out[2*nlanes], oper(t, a3, s3));
 }

 //-------------------------------------------------------------------------------------------------

-template<typename SRC, typename DST>
-CV_ALWAYS_INLINE int addc_simd_c3(const SRC in[], const float scalar[], DST out[], const int length)
+template<typename oper_tag, typename SRC, typename DST>
+CV_ALWAYS_INLINE int arithmOpScalar_simd_c3(oper_tag t, const SRC in[],
+                                            const float scalar[], DST out[],
+                                            const int length)
 {
     constexpr int chan = 3;
     constexpr int nlanes = vector_type_of_t<DST>::nlanes;
@@ -1002,7 +1044,7 @@ CV_ALWAYS_INLINE int addc_simd_c3(const SRC in[], const float scalar[], DST out[
     {
         for (; x <= length - lanes; x += lanes)
         {
-            addc_simd_c3_impl(&in[x], &out[x], s1, s2, s3, nlanes);
+            arithmOpScalar_simd_c3_impl(t, &in[x], &out[x], s1, s2, s3, nlanes);
         }

         if (x < length)
@@ -1015,8 +1057,12 @@ CV_ALWAYS_INLINE int addc_simd_c3(const SRC in[], const float scalar[], DST out[
     return x;
 }

-template<typename SRC, typename DST>
-CV_ALWAYS_INLINE int addc_simd_common(const SRC in[], const float scalar[], DST out[], const int length)
+//-------------------------------------------------------------------------------------------------
+
+template<typename oper_tag, typename SRC, typename DST>
+CV_ALWAYS_INLINE int arithmOpScalar_simd_common(oper_tag t, const SRC in[],
+                                                const float scalar[], DST out[],
+                                                const int length)
 {
     constexpr int nlanes = vector_type_of_t<DST>::nlanes;
@@ -1030,7 +1076,7 @@ CV_ALWAYS_INLINE int addc_simd_common(const SRC in[], const float scalar[], DST
     {
         for (; x <= length - nlanes; x += nlanes)
         {
-            addc_simd_common_impl(&in[x], &out[x], sc, nlanes);
+            arithmOpScalar_simd_common_impl(t, &in[x], &out[x], sc, nlanes);
        }

         if (x < length)
@@ -1043,24 +1089,25 @@ CV_ALWAYS_INLINE int addc_simd_common(const SRC in[], const float scalar[], DST
     return x;
 }

-#define ADDC_SIMD(SRC, DST)                                                     \
-int addc_simd(const SRC in[], const float scalar[], DST out[],                  \
-              const int width, const int chan)                                  \
-{                                                                               \
-    const int length = width * chan;                                            \
-    switch (chan)                                                               \
-    {                                                                           \
-    case 1:                                                                     \
-    case 2:                                                                     \
-    case 4:                                                                     \
-        return addc_simd_common(in, scalar, out, length);                       \
-    case 3:                                                                     \
-        return addc_simd_c3(in, scalar, out, length);                           \
-    default:                                                                    \
-        GAPI_Assert(chan <= 4);                                                 \
-        break;                                                                  \
-    }                                                                           \
-    return 0;                                                                   \
+#define ADDC_SIMD(SRC, DST)                                                     \
+int addc_simd(const SRC in[], const float scalar[], DST out[],                  \
+              const int length, const int chan)                                 \
+{                                                                               \
+    switch (chan)                                                               \
+    {                                                                           \
+    case 1:                                                                     \
+    case 2:                                                                     \
+    case 4:                                                                     \
+        return arithmOpScalar_simd_common(add_tag{}, in, scalar, out, length);  \
+    case 3:                                                                     \
+        return arithmOpScalar_simd_c3(add_tag{}, in, scalar, out, length);      \
+    default:                                                                    \
+        GAPI_Assert(chan <= 4);                                                 \
+        break;                                                                  \
+    }                                                                           \
+    return 0;                                                                   \
 }

 ADDC_SIMD(uchar, uchar)
@@ -1082,6 +1129,44 @@ ADDC_SIMD(float, float)

 #undef ADDC_SIMD

+#define SUBC_SIMD(SRC, DST)                                                     \
+int subc_simd(const SRC in[], const float scalar[], DST out[],                  \
+              const int length, const int chan)                                 \
+{                                                                               \
+    switch (chan)                                                               \
+    {                                                                           \
+    case 1:                                                                     \
+    case 2:                                                                     \
+    case 4:                                                                     \
+        return arithmOpScalar_simd_common(sub_tag{}, in, scalar, out, length);  \
+    case 3:                                                                     \
+        return arithmOpScalar_simd_c3(sub_tag{}, in, scalar, out, length);      \
+    default:                                                                    \
+        GAPI_Assert(chan <= 4);                                                 \
+        break;                                                                  \
+    }                                                                           \
+    return 0;                                                                   \
+}
+
+SUBC_SIMD(uchar, uchar)
+SUBC_SIMD(ushort, uchar)
+SUBC_SIMD(short, uchar)
+SUBC_SIMD(float, uchar)
+SUBC_SIMD(short, short)
+SUBC_SIMD(ushort, short)
+SUBC_SIMD(uchar, short)
+SUBC_SIMD(float, short)
+SUBC_SIMD(ushort, ushort)
+SUBC_SIMD(uchar, ushort)
+SUBC_SIMD(short, ushort)
+SUBC_SIMD(float, ushort)
+SUBC_SIMD(uchar, float)
+SUBC_SIMD(ushort, float)
+SUBC_SIMD(short, float)
+SUBC_SIMD(float, float)
+
+#undef SUBC_SIMD
+
 #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

 CV_CPU_OPTIMIZATION_NAMESPACE_END
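A note on the chan switch in ADDC_SIMD/SUBC_SIMD above: for 1, 2 and 4 channels the lane count of a SIMD register is a multiple of the channel count, so one replicated scalar vector lines up with every block of pixels; with 3 channels the pattern shifts by one channel per vector, which is why a dedicated _c3 path cycles three rotated scalar vectors. A small self-contained demonstration:

    #include <cstdio>

    int main()
    {
        const char* C[4] = {"C1", "C2", "C3", "C4"};
        const int nlanes = 8;                        // example lane count
        for (int chan : {2, 3, 4})
        {
            std::printf("chan=%d: ", chan);
            for (int lane = 0; lane < nlanes; ++lane)
                std::printf("%s ", C[lane % chan]);  // scalar element each lane needs
            std::printf("%s\n", nlanes % chan == 0 ? "(repeats exactly)"
                                                   : "(shifts every vector)");
        }
        return 0;
    }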
