Merge pull request #13221 from elatkin:el/gapi_perf_sepfilter

GAPI (fluid): optimization of Separable filter (#13221)

* GAPI (fluid): Separable filter: performance test

* GAPI (fluid): enable all performance tests

* GAPI: separable filters: alternative code for Sobel

* GAPI (fluid): hide unused old code for Sobel filter

* GAPI (fluid): especial code for Sobel if U8 into S16

* GAPI (fluid): back to old code for Sobel

* GAPI (fluid): run_sepfilter3x3_impl() with CPU dispatcher

* GAPI (fluid): run_sepfilter3x3_impl(): fix compiler warnings

* GAPI (fluid): new engine for separable filters (but Sobel)

* GAPI (fluid): new performance engine for Sobel

* GAPI (fluid): Sepfilters performance: fixed compilation error
pull/13264/head
Evgeny Latkin 6 years ago committed by Alexander Alekhin
parent dd952f6d68
commit f07856eab9
  1. 26
      modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp
  2. 180
      modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp
  3. 274
      modules/gapi/src/backends/fluid/gfluidimgproc.cpp
  4. 48
      modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
  5. 36
      modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
  6. 646
      modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp

@ -52,7 +52,7 @@ PERF_TEST_P_(SepFilterPerfTest, TestPerformance)
TEST_CYCLE()
{
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
c.apply(in_mat1, out_mat_gapi);
}
// Comparison //////////////////////////////////////////////////////////////
@ -100,7 +100,7 @@ PERF_TEST_P_(Filter2DPerfTest, TestPerformance)
TEST_CYCLE()
{
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
c.apply(in_mat1, out_mat_gapi);
}
// Comparison //////////////////////////////////////////////////////////////
@ -145,7 +145,7 @@ PERF_TEST_P_(BoxFilterPerfTest, TestPerformance)
TEST_CYCLE()
{
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
c.apply(in_mat1, out_mat_gapi);
}
// Comparison //////////////////////////////////////////////////////////////
@ -188,7 +188,7 @@ PERF_TEST_P_(BlurPerfTest, TestPerformance)
TEST_CYCLE()
{
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
c.apply(in_mat1, out_mat_gapi);
}
// Comparison //////////////////////////////////////////////////////////////
@ -230,7 +230,7 @@ PERF_TEST_P_(GaussianBlurPerfTest, TestPerformance)
TEST_CYCLE()
{
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
c.apply(in_mat1, out_mat_gapi);
}
// Comparison //////////////////////////////////////////////////////////////
@ -271,7 +271,7 @@ PERF_TEST_P_(MedianBlurPerfTest, TestPerformance)
TEST_CYCLE()
{
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
c.apply(in_mat1, out_mat_gapi);
}
// Comparison //////////////////////////////////////////////////////////////
@ -314,7 +314,7 @@ PERF_TEST_P_(ErodePerfTest, TestPerformance)
TEST_CYCLE()
{
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
c.apply(in_mat1, out_mat_gapi);
}
// Comparison //////////////////////////////////////////////////////////////
@ -357,7 +357,7 @@ PERF_TEST_P_(Erode3x3PerfTest, TestPerformance)
TEST_CYCLE()
{
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
c.apply(in_mat1, out_mat_gapi);
}
// Comparison //////////////////////////////////////////////////////////////
@ -400,7 +400,7 @@ PERF_TEST_P_(DilatePerfTest, TestPerformance)
TEST_CYCLE()
{
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
c.apply(in_mat1, out_mat_gapi);
}
// Comparison //////////////////////////////////////////////////////////////
@ -443,7 +443,7 @@ PERF_TEST_P_(Dilate3x3PerfTest, TestPerformance)
TEST_CYCLE()
{
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
c.apply(in_mat1, out_mat_gapi);
}
// Comparison //////////////////////////////////////////////////////////////
@ -526,7 +526,7 @@ PERF_TEST_P_(CannyPerfTest, TestPerformance)
TEST_CYCLE()
{
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
c.apply(in_mat1, out_mat_gapi);
}
// Comparison //////////////////////////////////////////////////////////////
@ -564,7 +564,7 @@ PERF_TEST_P_(EqHistPerfTest, TestPerformance)
TEST_CYCLE()
{
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
c.apply(in_mat1, out_mat_gapi);
}
// Comparison //////////////////////////////////////////////////////////////
@ -830,7 +830,7 @@ PERF_TEST_P_(LUV2BGRPerfTest, TestPerformance)
TEST_CYCLE()
{
c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
c.apply(in_mat1, out_mat_gapi);
}
// Comparison //////////////////////////////////////////////////////////////

@ -13,9 +13,101 @@
namespace opencv_test
{
INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid, SobelPerfTest,
Combine(Values(AbsExact().to_compare_f()),
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1), // add CV_32FC1 when ready
INSTANTIATE_TEST_CASE_P(SepFilterPerfTestFluid_8U, SepFilterPerfTest,
Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
Values(CV_8UC1, CV_8UC3),
Values(3),
Values(szVGA, sz720p, sz1080p),
Values(-1, CV_16S, CV_32F),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(SepFilterPerfTestFluid_other, SepFilterPerfTest,
Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
Values(CV_16UC1, CV_16SC1, CV_32FC1),
Values(3),
Values(szVGA, sz720p, sz1080p),
Values(-1, CV_32F),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(Filter2DPerfTestFluid, Filter2DPerfTest,
Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
Values(3), // add 4, 5, 7 when kernel is ready
Values(szVGA, sz720p, sz1080p),
Values(cv::BORDER_DEFAULT),
Values(-1, CV_32F),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(BoxFilterPerfTestFluid, BoxFilterPerfTest,
Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
Values(3), // add size=5, when kernel is ready
Values(szVGA, sz720p, sz1080p),
Values(cv::BORDER_DEFAULT),
Values(-1, CV_32F),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(BlurPerfTestFluid, BlurPerfTest,
Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
Values(3), // add size=5, when kernel is ready
Values(szVGA, sz720p, sz1080p),
Values(cv::BORDER_DEFAULT),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(GaussianBlurPerfTestFluid, GaussianBlurPerfTest,
Combine(Values(ToleranceFilter(1e-3f, 0.01).to_compare_f()),
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
Values(3), // add size=5, when kernel is ready
Values(szVGA, sz720p, sz1080p),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(MedianBlurPerfTestFluid, MedianBlurPerfTest,
Combine(Values(AbsExact().to_compare_f()),
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
Values(3), // add size=5, when kernel is ready
Values(szVGA, sz720p, sz1080p),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(ErodePerfTestFluid, ErodePerfTest,
Combine(Values(AbsExact().to_compare_f()),
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
Values(3), // add size=5, when kernel is ready
Values(szVGA, sz720p, sz1080p),
Values(cv::MorphShapes::MORPH_RECT,
cv::MorphShapes::MORPH_CROSS,
cv::MorphShapes::MORPH_ELLIPSE),
Values(cv::compile_args(IMGPROC_FLUID))));
// GAPI/fluid does not support iterations parameter for the Erode kernel
INSTANTIATE_TEST_CASE_P(DISABLED_Erode3x3PerfTestFluid, Erode3x3PerfTest,
Combine(Values(AbsExact().to_compare_f()),
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
Values(szVGA, sz720p, sz1080p),
Values(1, 2, 4),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(DilatePerfTestFluid, DilatePerfTest,
Combine(Values(AbsExact().to_compare_f()),
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
Values(3), // add size=5, when kernel is ready
Values(szVGA, sz720p, sz1080p),
Values(cv::MorphShapes::MORPH_RECT,
cv::MorphShapes::MORPH_CROSS,
cv::MorphShapes::MORPH_ELLIPSE),
Values(cv::compile_args(IMGPROC_FLUID))));
// GAPI/fluid does not support iterations parameter for the Dilate kernel
INSTANTIATE_TEST_CASE_P(DISABLED_Dilate3x3PerfTestFluid, Dilate3x3PerfTest,
Combine(Values(AbsExact().to_compare_f()),
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
Values(szVGA, sz720p, sz1080p),
Values(1, 2, 4),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid, SobelPerfTest,
Combine(Values(AbsExact().to_compare_f()),
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
Values(3), // add 5x5 once supported
Values(szVGA, sz720p, sz1080p),
Values(-1, CV_16S, CV_32F),
@ -23,8 +115,8 @@ namespace opencv_test
Values(1, 2),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid32F, SobelPerfTest,
Combine(Values(ToleranceFilter(1e-3f, 0.0).to_compare_f()),
INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid32F, SobelPerfTest,
Combine(Values(ToleranceFilter(1e-3f, 0.0).to_compare_f()),
Values(CV_32FC1),
Values(3), // add 5x5 once supported
Values(szVGA, sz720p, sz1080p),
@ -33,44 +125,44 @@ namespace opencv_test
Values(1, 2),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(RGB2GrayPerfTestFluid, RGB2GrayPerfTest,
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
Values(szVGA, sz720p, sz1080p),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(BGR2GrayPerfTestFluid, BGR2GrayPerfTest,
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
Values(szVGA, sz720p, sz1080p),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(RGB2YUVPerfTestFluid, RGB2YUVPerfTest,
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
Values(szVGA, sz720p, sz1080p),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(YUV2RGBPerfTestFluid, YUV2RGBPerfTest,
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
Values(szVGA, sz720p, sz1080p),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(BGR2YUVPerfTestFluid, BGR2YUVPerfTest,
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
Values(szVGA, sz720p, sz1080p),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(YUV2BGRPerfTestFluid, YUV2BGRPerfTest,
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
Values(szVGA, sz720p, sz1080p),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(BGR2LUVPerfTestFluid, BGR2LUVPerfTest,
Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
Values(szVGA, sz720p, sz1080p),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(RGB2LabPerfTestFluid, RGB2LabPerfTest,
Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
Values(szVGA, sz720p, sz1080p),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(RGB2GrayPerfTestFluid, RGB2GrayPerfTest,
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
Values(szVGA, sz720p, sz1080p),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(BGR2GrayPerfTestFluid, BGR2GrayPerfTest,
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
Values(szVGA, sz720p, sz1080p),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(RGB2YUVPerfTestFluid, RGB2YUVPerfTest,
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
Values(szVGA, sz720p, sz1080p),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(YUV2RGBPerfTestFluid, YUV2RGBPerfTest,
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
Values(szVGA, sz720p, sz1080p),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(BGR2YUVPerfTestFluid, BGR2YUVPerfTest,
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
Values(szVGA, sz720p, sz1080p),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(YUV2BGRPerfTestFluid, YUV2BGRPerfTest,
Combine(Values(ToleranceColor(1e-3).to_compare_f()),
Values(szVGA, sz720p, sz1080p),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(BGR2LUVPerfTestFluid, BGR2LUVPerfTest,
Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
Values(szVGA, sz720p, sz1080p),
Values(cv::compile_args(IMGPROC_FLUID))));
INSTANTIATE_TEST_CASE_P(RGB2LabPerfTestFluid, RGB2LabPerfTest,
Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
Values(szVGA, sz720p, sz1080p),
Values(cv::compile_args(IMGPROC_FLUID))));
}

@ -344,7 +344,7 @@ static const int maxKernelSize = 9;
template<typename DST, typename SRC>
static void run_boxfilter(Buffer &dst, const View &src, const cv::Size &kernelSize,
const cv::Point& /* anchor */, bool normalize)
const cv::Point& /* anchor */, bool normalize, float *buf[])
{
GAPI_Assert(kernelSize.width <= maxKernelSize);
GAPI_Assert(kernelSize.width == kernelSize.height);
@ -365,36 +365,53 @@ static void run_boxfilter(Buffer &dst, const View &src, const cv::Size &kernelSi
int width = dst.length();
int chan = dst.meta().chan;
GAPI_DbgAssert(chan <= 4);
if (kernelSize.width == 3 && kernelSize.height == 3)
{
int y = dst.y();
int y0 = dst.priv().writeStart();
for (int w=0; w < width; w++)
float kx[3] = {1, 1, 1};
float *ky = kx;
float scale=1, delta=0;
if (normalize)
scale = 1/9.f;
run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
} else
{
float sum[4] = {0, 0, 0, 0};
GAPI_DbgAssert(chan <= 4);
for (int i=0; i < kernel; i++)
for (int w=0; w < width; w++)
{
for (int j=0; j < kernel; j++)
float sum[4] = {0, 0, 0, 0};
for (int i=0; i < kernel; i++)
{
for (int c=0; c < chan; c++)
sum[c] += in[i][(w + j - border)*chan + c];
for (int j=0; j < kernel; j++)
{
for (int c=0; c < chan; c++)
sum[c] += in[i][(w + j - border)*chan + c];
}
}
}
for (int c=0; c < chan; c++)
{
float result = normalize? sum[c]/(kernel * kernel) : sum[c];
for (int c=0; c < chan; c++)
{
float result = normalize? sum[c]/(kernel * kernel) : sum[c];
out[w*chan + c] = saturate<DST>(result, rintf);
out[w*chan + c] = saturate<DST>(result, rintf);
}
}
}
}
GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, false)
GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, true)
{
static const int Window = 3;
static void run(const View &src, const cv::Size& kernelSize, const cv::Point& anchor,
int /* borderType */, const cv::Scalar& /* borderValue */, Buffer &dst)
int /* borderType */, const cv::Scalar& /* borderValue */, Buffer &dst,
Buffer& scratch)
{
// TODO: support sizes 3, 5, 7, 9, ...
GAPI_Assert(kernelSize.width == 3 && kernelSize.height == 3);
@ -404,14 +421,46 @@ GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, false)
static const bool normalize = true;
int width = src.length();
int chan = src.meta().chan;
int length = width * chan;
float *buf[3];
buf[0] = scratch.OutLine<float>();
buf[1] = buf[0] + length;
buf[2] = buf[1] + length;
// DST SRC OP __VA_ARGS__
UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize);
UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize);
UNARY_( short, short, run_boxfilter, dst, src, kernelSize, anchor, normalize);
UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
UNARY_( short, short, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
UNARY_( float, float, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
}
// Allocates the scratch buffer for GFluidBlur: Window (=3) work rows of
// width*chan float32 intermediates, used by the separable 3x3 fast path.
// Only the input width/channels matter here; the remaining kernel
// parameters do not affect the scratch size and are ignored.
static void initScratch(const GMatDesc & in,
const cv::Size & /* ksize */,
const cv::Point & /* anchor */,
int /* borderType */,
const cv::Scalar & /* borderValue */,
Buffer & scratch)
{
int width = in.size.width;
int chan = in.chan;
int buflen = width * chan * Window; // Window work rows, one image row each
cv::gapi::own::Size bufsize(buflen, 1);
// Scratch is a single-row CV_32F buffer; run() slices it into buf[0..2].
GMatDesc bufdesc = {CV_32F, 1, bufsize};
Buffer buffer(bufdesc);
scratch = std::move(buffer);
}
// The scratch rows are fully overwritten on every run, so no state needs
// to be cleared between executions.
static void resetScratch(Buffer& /* scratch */)
{
}
static Border getBorder(const cv::GMatDesc& /* src */,
const cv::Size & /* kernelSize */,
const cv::Point & /* anchor */,
@ -422,18 +471,19 @@ GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, false)
}
};
GAPI_FLUID_KERNEL(GFluidBoxFilter, cv::gapi::imgproc::GBoxFilter, false)
GAPI_FLUID_KERNEL(GFluidBoxFilter, cv::gapi::imgproc::GBoxFilter, true)
{
static const int Window = 3;
static void run(const View & src,
int /* ddepth */,
const cv::Size & kernelSize,
const cv::Point & anchor,
const cv::Point & anchor,
bool normalize,
int /* borderType */,
const cv::Scalar& /* borderValue */,
Buffer& dst)
Buffer& dst,
Buffer& scratch)
{
// TODO: support sizes 3, 5, 7, 9, ...
GAPI_Assert(kernelSize.width == 3 && kernelSize.height == 3);
@ -441,17 +491,51 @@ GAPI_FLUID_KERNEL(GFluidBoxFilter, cv::gapi::imgproc::GBoxFilter, false)
// TODO: support non-trivial anchor
GAPI_Assert(anchor.x == -1 && anchor.y == -1);
int width = src.length();
int chan = src.meta().chan;
int length = width * chan;
float *buf[3];
buf[0] = scratch.OutLine<float>();
buf[1] = buf[0] + length;
buf[2] = buf[1] + length;
// DST SRC OP __VA_ARGS__
UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize);
UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize);
UNARY_( short, short, run_boxfilter, dst, src, kernelSize, anchor, normalize);
UNARY_( float, uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize);
UNARY_( float, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize);
UNARY_( float, short, run_boxfilter, dst, src, kernelSize, anchor, normalize);
UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
UNARY_( float, uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
UNARY_( float, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
UNARY_( short, short, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
UNARY_( float, short, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
UNARY_( float, float, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
}
// Allocates the scratch buffer for GFluidBoxFilter: Window (=3) work rows
// of width*chan float32 intermediates for the separable 3x3 fast path.
// Scratch size depends only on the input geometry; the other kernel
// parameters are ignored.
static void initScratch(const GMatDesc & in,
int /* ddepth */,
const cv::Size & /* kernelSize */,
const cv::Point & /* anchor */,
bool /* normalize */,
int /* borderType */,
const cv::Scalar& /* borderValue */,
Buffer & scratch)
{
int width = in.size.width;
int chan = in.chan;
int buflen = width * chan * Window; // Window work rows, one image row each
cv::gapi::own::Size bufsize(buflen, 1);
// Scratch is a single-row CV_32F buffer; run() slices it into buf[0..2].
GMatDesc bufdesc = {CV_32F, 1, bufsize};
Buffer buffer(bufdesc);
scratch = std::move(buffer);
}
// The scratch rows are fully overwritten on every run, so no state needs
// to be cleared between executions.
static void resetScratch(Buffer& /* scratch */)
{
}
static Border getBorder(const cv::GMatDesc& /* src */,
int /* ddepth */,
const cv::Size & /* kernelSize */,
@ -510,18 +594,21 @@ static void run_sepfilter(Buffer& dst, const View& src,
const float kx[], int kxLen,
const float ky[], int kyLen,
const cv::Point& /* anchor */,
float delta=0)
float scale, float delta,
float *buf[])
{
static const int maxLines = 9;
GAPI_Assert(kyLen <= maxLines);
constexpr int kMax = 11;
GAPI_Assert(kxLen <= kMax && kyLen <= kMax);
const SRC *in[ maxLines ];
const SRC *in[kMax];
DST *out;
int border = (kyLen - 1) / 2;
int xborder = (kxLen - 1) / 2;
int yborder = (kyLen - 1) / 2;
for (int i=0; i < kyLen; i++)
{
in[i] = src.InLine<SRC>(i - border);
in[i] = src.InLine<SRC>(i - yborder);
}
out = dst.OutLine<DST>();
@ -529,28 +616,52 @@ static void run_sepfilter(Buffer& dst, const View& src,
int width = dst.length();
int chan = dst.meta().chan;
for (int w=0; w < width; w++)
// optimized 3x3 vs reference
if (kxLen == 3 && kyLen == 3)
{
// TODO: make this cycle innermost
for (int c=0; c < chan; c++)
int y = dst.y();
int y0 = dst.priv().writeStart();
int border = xborder;
run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
}
else
{
int length = chan * width;
int xshift = chan * xborder;
// horizontal pass
for (int k=0; k < kyLen; k++)
{
float sum=0;
const SRC *inp[kMax] = {nullptr};
for (int i=0; i < kyLen; i++)
for (int j=0; j < kxLen; j++)
{
float sumi=0;
inp[j] = in[k] + (j - xborder)*xshift;
}
for (int l=0; l < length; l++)
{
float sum = 0;
for (int j=0; j < kxLen; j++)
{
sumi += in[i][(w + j - border)*chan + c] * kx[j];
sum += inp[j][l] * kx[j];
}
sum += sumi * ky[i];
buf[k][l] = sum;
}
}
float result = sum + delta;
// vertical pass
out[w*chan + c] = saturate<DST>(result, rintf);
for (int l=0; l < length; l++)
{
float sum = 0;
for (int k=0; k < kyLen; k++)
{
sum += buf[k][l] * ky[k];
}
out[l] = saturate<DST>(sum*scale + delta, rintf);
}
}
}
@ -580,21 +691,37 @@ GAPI_FLUID_KERNEL(GFluidSepFilter, cv::gapi::imgproc::GSepFilter, true)
int kxLen = kernX.rows * kernX.cols;
int kyLen = kernY.rows * kernY.cols;
GAPI_Assert(kyLen == 3);
float *kx = scratch.OutLine<float>();
float *ky = kx + kxLen;
int width = src.meta().size.width;
int chan = src.meta().chan;
int length = width * chan;
float *buf[3];
buf[0] = ky + kyLen;
buf[1] = buf[0] + length;
buf[2] = buf[1] + length;
float scale = 1;
float delta = static_cast<float>(delta_[0]);
// DST SRC OP __VA_ARGS__
UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta);
UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta);
UNARY_( short, short, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta);
UNARY_( float, float, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta);
UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
UNARY_( short, uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
UNARY_( float, uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
UNARY_( float, ushort, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
UNARY_( short, short, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
UNARY_( float, short, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
UNARY_( float, float, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
}
static void initScratch(const GMatDesc& /* in */,
static void initScratch(const GMatDesc& in,
int /* ddepth */,
const Mat & kernX,
const Mat & kernY,
@ -607,7 +734,13 @@ GAPI_FLUID_KERNEL(GFluidSepFilter, cv::gapi::imgproc::GSepFilter, true)
int kxLen = kernX.rows * kernX.cols;
int kyLen = kernY.rows * kernY.cols;
cv::gapi::own::Size bufsize(kxLen + kyLen, 1);
int width = in.size.width;
int chan = in.chan;
int buflen = kxLen + kyLen + // x, y kernels
width * chan * Window; // work buffers
cv::gapi::own::Size bufsize(buflen, 1);
GMatDesc bufdesc = {CV_32F, 1, bufsize};
Buffer buffer(bufdesc);
scratch = std::move(buffer);
@ -664,29 +797,47 @@ GAPI_FLUID_KERNEL(GFluidGaussBlur, cv::gapi::imgproc::GGaussBlur, true)
auto *kx = scratch.OutLine<float>(); // cached kernX data
auto *ky = kx + kxsize; // cached kernY data
int width = src.meta().size.width;
int chan = src.meta().chan;
int length = width * chan;
float *buf[3];
buf[0] = ky + kysize;
buf[1] = buf[0] + length;
buf[2] = buf[1] + length;
auto anchor = cv::Point(-1, -1);
float delta = 0.f;
float scale = 1;
float delta = 0;
// DST SRC OP __VA_ARGS__
UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta);
UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta);
UNARY_( short, short, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta);
UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf);
UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf);
UNARY_( short, short, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf);
UNARY_( float, float, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf);
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
}
static void initScratch(const GMatDesc& /* in */,
static void initScratch(const GMatDesc& in,
const cv::Size & ksize,
double sigmaX,
double sigmaY,
int /* borderType */,
const cv::Scalar & /* borderValue */,
int /* borderType */,
const cv::Scalar & /* borderValue */,
Buffer & scratch)
{
int kxsize = ksize.width;
int kysize = ksize.height;
cv::gapi::own::Size bufsize(kxsize + kysize, 1);
int width = in.size.width;
int chan = in.chan;
int buflen = kxsize + kysize + // x, y kernels
width * chan * Window; // work buffers
cv::gapi::own::Size bufsize(buflen, 1);
GMatDesc bufdesc = {CV_32F, 1, bufsize};
Buffer buffer(bufdesc);
scratch = std::move(buffer);
@ -767,7 +918,7 @@ static void run_sobel(Buffer& dst,
int y0 = dst.priv().writeStart();
// int y1 = dst.priv().writeEnd();
run_sobel_row(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
}
GAPI_FLUID_KERNEL(GFluidSobel, cv::gapi::imgproc::GSobel, true)
@ -1102,6 +1253,7 @@ GAPI_FLUID_KERNEL(GFluidErode, cv::gapi::imgproc::GErode, true)
UNARY_(uchar , uchar , run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
UNARY_(ushort, ushort, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
UNARY_( short, short, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
UNARY_( float, float, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
}
@ -1109,7 +1261,7 @@ GAPI_FLUID_KERNEL(GFluidErode, cv::gapi::imgproc::GErode, true)
static void initScratch(const GMatDesc& /* in */,
const Mat & kernel,
const Point & /* anchor */,
int /* iterations */,
int /* iterations */,
int /* borderType */,
const cv::Scalar & /* borderValue */,
Buffer & scratch)
@ -1179,6 +1331,7 @@ GAPI_FLUID_KERNEL(GFluidDilate, cv::gapi::imgproc::GDilate, true)
UNARY_(uchar , uchar , run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
UNARY_(ushort, ushort, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
UNARY_( short, short, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
UNARY_( float, float, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
}
@ -1290,6 +1443,7 @@ GAPI_FLUID_KERNEL(GFluidMedianBlur, cv::gapi::imgproc::GMedianBlur, false)
UNARY_(uchar , uchar , run_medianblur, dst, src, ksize);
UNARY_(ushort, ushort, run_medianblur, dst, src, ksize);
UNARY_( short, short, run_medianblur, dst, src, ksize);
UNARY_( float, float, run_medianblur, dst, src, ksize);
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
}

@ -57,34 +57,34 @@ void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef
CV_CPU_DISPATCH(run_yuv2rgb_impl, (out, in, width, coef), CV_CPU_DISPATCH_MODES_ALL);
}
//---------------------
//-------------------------
//
// Fluid kernels: Sobel
// Fluid kernels: sepFilter
//
//---------------------
#define RUN_SOBEL_ROW(DST, SRC) \
void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \
const float kx[], const float ky[], int border, \
float scale, float delta, float *buf[], \
int y, int y0) \
{ \
CV_CPU_DISPATCH(run_sobel_row, \
(out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0), \
CV_CPU_DISPATCH_MODES_ALL); \
//-------------------------
#define RUN_SEPFILTER3X3_IMPL(DST, SRC) \
void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \
const float kx[], const float ky[], int border, \
float scale, float delta, \
float *buf[], int y, int y0) \
{ \
CV_CPU_DISPATCH(run_sepfilter3x3_impl, \
(out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0), \
CV_CPU_DISPATCH_MODES_ALL); \
}
RUN_SOBEL_ROW(uchar , uchar )
RUN_SOBEL_ROW(ushort, ushort)
RUN_SOBEL_ROW( short, uchar )
RUN_SOBEL_ROW( short, ushort)
RUN_SOBEL_ROW( short, short)
RUN_SOBEL_ROW( float, uchar )
RUN_SOBEL_ROW( float, ushort)
RUN_SOBEL_ROW( float, short)
RUN_SOBEL_ROW( float, float)
#undef RUN_SOBEL_ROW
RUN_SEPFILTER3X3_IMPL(uchar , uchar )
RUN_SEPFILTER3X3_IMPL( short, uchar )
RUN_SEPFILTER3X3_IMPL( float, uchar )
RUN_SEPFILTER3X3_IMPL(ushort, ushort)
RUN_SEPFILTER3X3_IMPL( short, ushort)
RUN_SEPFILTER3X3_IMPL( float, ushort)
RUN_SEPFILTER3X3_IMPL( short, short)
RUN_SEPFILTER3X3_IMPL( float, short)
RUN_SEPFILTER3X3_IMPL( float, float)
#undef RUN_SEPFILTER3X3_IMPL
} // namespace fluid
} // namespace gapi

@ -33,29 +33,29 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef
void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4]);
//---------------------
//-------------------------
//
// Fluid kernels: Sobel
// Fluid kernels: sepFilter
//
//---------------------
//-------------------------
#define RUN_SOBEL_ROW(DST, SRC) \
void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \
const float kx[], const float ky[], int border, \
float scale, float delta, float *buf[], \
int y, int y0);
#define RUN_SEPFILTER3X3_IMPL(DST, SRC) \
void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \
const float kx[], const float ky[], int border, \
float scale, float delta, \
float *buf[], int y, int y0);
RUN_SOBEL_ROW(uchar , uchar )
RUN_SOBEL_ROW(ushort, ushort)
RUN_SOBEL_ROW( short, uchar )
RUN_SOBEL_ROW( short, ushort)
RUN_SOBEL_ROW( short, short)
RUN_SOBEL_ROW( float, uchar )
RUN_SOBEL_ROW( float, ushort)
RUN_SOBEL_ROW( float, short)
RUN_SOBEL_ROW( float, float)
RUN_SEPFILTER3X3_IMPL(uchar , uchar )
RUN_SEPFILTER3X3_IMPL( short, uchar )
RUN_SEPFILTER3X3_IMPL( float, uchar )
RUN_SEPFILTER3X3_IMPL(ushort, ushort)
RUN_SEPFILTER3X3_IMPL( short, ushort)
RUN_SEPFILTER3X3_IMPL( float, ushort)
RUN_SEPFILTER3X3_IMPL( short, short)
RUN_SEPFILTER3X3_IMPL( float, short)
RUN_SEPFILTER3X3_IMPL( float, float)
#undef RUN_SOBEL_ROW
#undef RUN_SEPFILTER3X3_IMPL
} // namespace fluid
} // namespace gapi

@ -9,6 +9,8 @@
#if !defined(GAPI_STANDALONE)
#include "gfluidimgproc_func.hpp"
#include "opencv2/gapi/own/saturate.hpp"
#include "opencv2/core.hpp"
@ -16,6 +18,8 @@
#include <cstdint>
#include <vector>
#ifdef __GNUC__
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wstrict-overflow"
@ -48,34 +52,66 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef
void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4]);
//---------------------
//-------------------------
//
// Fluid kernels: Sobel
// Fluid kernels: sepFilter
//
//---------------------
#define RUN_SOBEL_ROW(DST, SRC) \
void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \
const float kx[], const float ky[], int border, \
float scale, float delta, float *buf[], \
int y, int y0);
RUN_SOBEL_ROW(uchar , uchar )
RUN_SOBEL_ROW(ushort, ushort)
RUN_SOBEL_ROW( short, uchar )
RUN_SOBEL_ROW( short, ushort)
RUN_SOBEL_ROW( short, short)
RUN_SOBEL_ROW( float, uchar )
RUN_SOBEL_ROW( float, ushort)
RUN_SOBEL_ROW( float, short)
RUN_SOBEL_ROW( float, float)
#undef RUN_SOBEL_ROW
//-------------------------
#define RUN_SEPFILTER3X3_IMPL(DST, SRC) \
void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \
const float kx[], const float ky[], int border, \
float scale, float delta, \
float *buf[], int y, int y0);
RUN_SEPFILTER3X3_IMPL(uchar , uchar )
RUN_SEPFILTER3X3_IMPL( short, uchar )
RUN_SEPFILTER3X3_IMPL( float, uchar )
RUN_SEPFILTER3X3_IMPL(ushort, ushort)
RUN_SEPFILTER3X3_IMPL( short, ushort)
RUN_SEPFILTER3X3_IMPL( float, ushort)
RUN_SEPFILTER3X3_IMPL( short, short)
RUN_SEPFILTER3X3_IMPL( float, short)
RUN_SEPFILTER3X3_IMPL( float, float)
#undef RUN_SEPFILTER3X3_IMPL
//----------------------------------------------------------------------
#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#if CV_SIMD
// Loads one v_float32-register's worth of elements from ptr and widens/
// converts them to float32, for SRC in {uchar, ushort, short, float}.
// The chain of std::is_same checks is resolved per instantiation (the
// dead branches are compiled out); the reinterpret_casts are required so
// that every branch compiles for every SRC type.
template<typename SRC>
static inline v_float32 vx_load_f32(const SRC* ptr)
{
if (std::is_same<SRC,uchar>::value)
{
// uchar: quad-expand 8-bit -> 32-bit, then convert to float
v_uint32 tmp = vx_load_expand_q(reinterpret_cast<const uchar*>(ptr));
return v_cvt_f32(v_reinterpret_as_s32(tmp));
}
if (std::is_same<SRC,ushort>::value)
{
// ushort: expand 16-bit -> 32-bit, then convert to float
v_uint32 tmp = vx_load_expand(reinterpret_cast<const ushort*>(ptr));
return v_cvt_f32(v_reinterpret_as_s32(tmp));
}
if (std::is_same<SRC,short>::value)
{
// short: signed expand 16-bit -> 32-bit, then convert to float
v_int32 tmp = vx_load_expand(reinterpret_cast<const short*>(ptr));
return v_cvt_f32(tmp);
}
if (std::is_same<SRC,float>::value)
{
// float: plain load, no conversion needed
v_float32 tmp = vx_load(reinterpret_cast<const float*>(ptr));
return tmp;
}
CV_Error(cv::Error::StsBadArg, "unsupported type");
}
#endif // CV_SIMD
//----------------------------------
//
// Fluid kernels: RGB2Gray, BGR2Gray
@@ -309,187 +345,359 @@ void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef
}
}
//---------------------
//-------------------------
//
// Fluid kernels: Sobel
// Fluid kernels: sepFilter
//
//---------------------
//-------------------------
// Sobel 3x3: vertical pass
template<bool noscale, typename DST>
static void run_sobel3x3_vert(DST out[], int length, const float ky[],
float scale, float delta, const int r[], float *buf[])
#if CV_SIMD
// this variant not using buf[] appears 15% faster than reference any-2-float code below
template<bool noscale, typename SRC>
static void run_sepfilter3x3_any2float(float out[], const SRC *in[], int width, int chan,
const float kx[], const float ky[], int border,
float scale, float delta)
{
float ky0 = ky[0],
ky1 = ky[1],
ky2 = ky[2];
const int length = width * chan;
const int shift = border * chan;
int r0 = r[0],
r1 = r[1],
r2 = r[2];
const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2];
const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2];
#if CV_SIMD
// for floating-point output,
// manual vectoring may be not better than compiler's optimization
#define EXPLICIT_SIMD_32F 0 // 1=vectorize 32f case explicitly, 0=don't
#if EXPLICIT_SIMD_32F
if (std::is_same<DST, float>::value && length >= v_int16::nlanes)
for (int l=0; l < length; )
{
constexpr static int nlanes = v_float32::nlanes;
static const int nlanes = v_float32::nlanes;
for (int l=0; l < length; )
// main part
for ( ; l <= length - nlanes; l += nlanes)
{
for (; l <= length - nlanes; l += nlanes)
auto xsum = [l, shift, kx0, kx1, kx2](const SRC i[])
{
v_float32 sum = vx_load(&buf[r0][l]) * vx_setall_f32(ky0);
sum = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum);
sum = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum);
v_float32 t0 = vx_load_f32(&i[l - shift]);
v_float32 t1 = vx_load_f32(&i[l ]);
v_float32 t2 = vx_load_f32(&i[l + shift]);
v_float32 t = t0 * vx_setall_f32(kx0);
t = v_fma(t1, vx_setall_f32(kx1), t);
t = v_fma(t2, vx_setall_f32(kx2), t);
return t;
};
v_float32 s0 = xsum(in[0]);
v_float32 s1 = xsum(in[1]);
v_float32 s2 = xsum(in[2]);
v_float32 s = s0 * vx_setall_f32(ky0);
s = v_fma(s1, vx_setall_f32(ky1), s);
s = v_fma(s2, vx_setall_f32(ky2), s);
if (!noscale)
{
s = v_fma(s, vx_setall_f32(scale), vx_setall_f32(delta));
}
v_store(&out[l], s);
}
// tail (if any)
if (l < length)
{
GAPI_DbgAssert(length >= nlanes);
l = length - nlanes;
}
}
}
// this variant with manually vectored rounding to short/ushort appears 10-40x faster
// than reference code below
template<bool noscale, typename DST, typename SRC>
static void run_sepfilter3x3_any2short(DST out[], const SRC *in[], int width, int chan,
const float kx[], const float ky[], int border,
float scale, float delta,
float *buf[], int y, int y0)
{
int r[3];
r[0] = (y - y0 ) % 3; // buf[r[0]]: previous
r[1] = (y - y0 + 1) % 3; // this
r[2] = (y - y0 + 2) % 3; // next row
const int length = width * chan;
const int shift = border * chan;
const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2];
const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2];
// horizontal pass
int k0 = (y == y0)? 0: 2;
for (int k = k0; k < 3; k++)
{
// previous , this , next pixel
const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift};
// rely on compiler vectoring
for (int l=0; l < length; l++)
{
buf[r[k]][l] = s[0][l]*kx0 + s[1][l]*kx1 + s[2][l]*kx2;
}
}
// vertical pass
const int r0=r[0], r1=r[1], r2=r[2];
for (int l=0; l < length;)
{
constexpr int nlanes = v_int16::nlanes;
// main part of row
for (; l <= length - nlanes; l += nlanes)
{
v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0);
sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0);
sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0);
if (!noscale)
{
sum = v_fma(sum, vx_setall_f32(scale), vx_setall_f32(delta));
}
v_float32 sum1 = vx_load(&buf[r0][l + nlanes/2]) * vx_setall_f32(ky0);
sum1 = v_fma(vx_load(&buf[r1][l + nlanes/2]), vx_setall_f32(ky1), sum1);
sum1 = v_fma(vx_load(&buf[r2][l + nlanes/2]), vx_setall_f32(ky2), sum1);
v_store(reinterpret_cast<float*>(&out[l]), sum);
if (!noscale)
{
sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
}
if (l < length)
v_int32 isum0 = v_round(sum0),
isum1 = v_round(sum1);
if (std::is_same<DST, short>::value)
{
// tail: recalculate last pixels
GAPI_DbgAssert(length >= nlanes);
l = length - nlanes;
// signed short
v_int16 res = v_pack(isum0, isum1);
v_store(reinterpret_cast<short*>(&out[l]), res);
} else
{
// unsigned short
v_uint16 res = v_pack_u(isum0, isum1);
v_store(reinterpret_cast<ushort*>(&out[l]), res);
}
}
return;
// tail (if any)
if (l < length)
{
GAPI_DbgAssert(length >= nlanes);
l = length - nlanes;
}
}
#endif
}
if ((std::is_same<DST, short>::value || std::is_same<DST, ushort>::value)
&& length >= v_int16::nlanes)
// this code with manually vectored rounding to uchar is 10-40x faster than reference
template<bool noscale, typename SRC>
static void run_sepfilter3x3_any2char(uchar out[], const SRC *in[], int width, int chan,
const float kx[], const float ky[], int border,
float scale, float delta,
float *buf[], int y, int y0)
{
int r[3];
r[0] = (y - y0 ) % 3; // buf[r[0]]: previous
r[1] = (y - y0 + 1) % 3; // this
r[2] = (y - y0 + 2) % 3; // next row
const int length = width * chan;
const int shift = border * chan;
const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2];
const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2];
// horizontal pass
int k0 = (y == y0)? 0: 2;
for (int k = k0; k < 3; k++)
{
constexpr static int nlanes = v_int16::nlanes;
// previous , this , next pixel
const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift};
for (int l=0; l < length; )
// rely on compiler vectoring
for (int l=0; l < length; l++)
{
for (; l <= length - nlanes; l += nlanes)
{
v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0);
sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0);
sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0);
v_float32 sum1 = vx_load(&buf[r0][l + nlanes/2]) * vx_setall_f32(ky0);
sum1 = v_fma(vx_load(&buf[r1][l + nlanes/2]), vx_setall_f32(ky1), sum1);
sum1 = v_fma(vx_load(&buf[r2][l + nlanes/2]), vx_setall_f32(ky2), sum1);
if (!noscale)
{
sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
}
v_int32 isum0 = v_round(sum0),
isum1 = v_round(sum1);
if (std::is_same<DST, short>::value)
{
// signed short
v_int16 res = v_pack(isum0, isum1);
v_store(reinterpret_cast<short*>(&out[l]), res);
} else
{
// unsigned short
v_uint16 res = v_pack_u(isum0, isum1);
v_store(reinterpret_cast<ushort*>(&out[l]), res);
}
}
buf[r[k]][l] = s[0][l]*kx0 + s[1][l]*kx1 + s[2][l]*kx2;
}
}
if (l < length)
// vertical pass
const int r0=r[0], r1=r[1], r2=r[2];
for (int l=0; l < length;)
{
constexpr int nlanes = v_uint8::nlanes;
// main part of row
for (; l <= length - nlanes; l += nlanes)
{
v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0);
sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0);
sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0);
v_float32 sum1 = vx_load(&buf[r0][l + nlanes/4]) * vx_setall_f32(ky0);
sum1 = v_fma(vx_load(&buf[r1][l + nlanes/4]), vx_setall_f32(ky1), sum1);
sum1 = v_fma(vx_load(&buf[r2][l + nlanes/4]), vx_setall_f32(ky2), sum1);
v_float32 sum2 = vx_load(&buf[r0][l + 2*nlanes/4]) * vx_setall_f32(ky0);
sum2 = v_fma(vx_load(&buf[r1][l + 2*nlanes/4]), vx_setall_f32(ky1), sum2);
sum2 = v_fma(vx_load(&buf[r2][l + 2*nlanes/4]), vx_setall_f32(ky2), sum2);
v_float32 sum3 = vx_load(&buf[r0][l + 3*nlanes/4]) * vx_setall_f32(ky0);
sum3 = v_fma(vx_load(&buf[r1][l + 3*nlanes/4]), vx_setall_f32(ky1), sum3);
sum3 = v_fma(vx_load(&buf[r2][l + 3*nlanes/4]), vx_setall_f32(ky2), sum3);
if (!noscale)
{
// tail: recalculate last pixels
GAPI_DbgAssert(length >= nlanes);
l = length - nlanes;
sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta));
sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta));
}
v_int32 isum0 = v_round(sum0),
isum1 = v_round(sum1),
isum2 = v_round(sum2),
isum3 = v_round(sum3);
v_int16 ires0 = v_pack(isum0, isum1),
ires1 = v_pack(isum2, isum3);
v_uint8 res = v_pack_u(ires0, ires1);
v_store(reinterpret_cast<uchar*>(&out[l]), res);
}
return;
// tail (if any)
if (l < length)
{
GAPI_DbgAssert(length >= nlanes);
l = length - nlanes;
}
}
}
if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
// this code manually vectored for int16 not much faster than generic any-to-short code above
#define USE_SEPFILTER3X3_CHAR2SHORT 1
#if USE_SEPFILTER3X3_CHAR2SHORT
template<bool noscale>
static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int width, int chan,
const float kx[], const float ky[], int border,
float scale, float delta,
float *buf[], int y, int y0)
{
const schar ikx0 = saturate<schar>(kx[0], rintf);
const schar ikx1 = saturate<schar>(kx[1], rintf);
const schar ikx2 = saturate<schar>(kx[2], rintf);
const schar iky0 = saturate<schar>(ky[0], rintf);
const schar iky1 = saturate<schar>(ky[1], rintf);
const schar iky2 = saturate<schar>(ky[2], rintf);
const short iscale = saturate<short>(scale * (1 << 15), rintf);
const short idelta = saturate<short>(delta , rintf);
// check if this code is applicable
if (ikx0 != kx[0] || ikx1 != kx[1] || ikx2 != kx[2] ||
iky0 != ky[0] || iky1 != ky[1] || iky2 != ky[2] ||
idelta != delta ||
std::abs(scale) > 1 || std::abs(scale) < 0.01)
{
constexpr static int nlanes = v_uint8::nlanes;
run_sepfilter3x3_any2short<noscale>(out, in, width, chan, kx, ky, border, scale, delta,
buf, y, y0);
return;
}
short *ibuf[3];
ibuf[0] = reinterpret_cast<short*>(buf[0]);
ibuf[1] = reinterpret_cast<short*>(buf[1]);
ibuf[2] = reinterpret_cast<short*>(buf[2]);
for (int l=0; l < length; )
int r[3];
r[0] = (y - y0 ) % 3; // buf[r[0]]: previous
r[1] = (y - y0 + 1) % 3; // this
r[2] = (y - y0 + 2) % 3; // next row
const int length = width * chan;
const int shift = border * chan;
// horizontal pass
int k0 = (y == y0)? 0: 2;
for (int k = k0; k < 3; k++)
{
for (int l=0; l < length;)
{
constexpr int nlanes = v_int16::nlanes;
// main part of output row
for (; l <= length - nlanes; l += nlanes)
{
v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0);
sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0);
sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0);
v_float32 sum1 = vx_load(&buf[r0][l + nlanes/4]) * vx_setall_f32(ky0);
sum1 = v_fma(vx_load(&buf[r1][l + nlanes/4]), vx_setall_f32(ky1), sum1);
sum1 = v_fma(vx_load(&buf[r2][l + nlanes/4]), vx_setall_f32(ky2), sum1);
v_float32 sum2 = vx_load(&buf[r0][l + 2*nlanes/4]) * vx_setall_f32(ky0);
sum2 = v_fma(vx_load(&buf[r1][l + 2*nlanes/4]), vx_setall_f32(ky1), sum2);
sum2 = v_fma(vx_load(&buf[r2][l + 2*nlanes/4]), vx_setall_f32(ky2), sum2);
v_float32 sum3 = vx_load(&buf[r0][l + 3*nlanes/4]) * vx_setall_f32(ky0);
sum3 = v_fma(vx_load(&buf[r1][l + 3*nlanes/4]), vx_setall_f32(ky1), sum3);
sum3 = v_fma(vx_load(&buf[r2][l + 3*nlanes/4]), vx_setall_f32(ky2), sum3);
if (!noscale)
{
sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta));
sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta));
}
v_int32 isum0 = v_round(sum0),
isum1 = v_round(sum1),
isum2 = v_round(sum2),
isum3 = v_round(sum3);
v_int16 ires0 = v_pack(isum0, isum1),
ires1 = v_pack(isum2, isum3);
v_uint8 res = v_pack_u(ires0, ires1);
v_store(reinterpret_cast<uchar*>(&out[l]), res);
v_uint16 t0 = vx_load_expand(&in[k][l - shift]); // previous
v_uint16 t1 = vx_load_expand(&in[k][l ]); // current
v_uint16 t2 = vx_load_expand(&in[k][l + shift]); // next pixel
v_int16 t = v_reinterpret_as_s16(t0) * vx_setall_s16(ikx0) +
v_reinterpret_as_s16(t1) * vx_setall_s16(ikx1) +
v_reinterpret_as_s16(t2) * vx_setall_s16(ikx2);
v_store(&ibuf[r[k]][l], t);
}
// tail (if any)
if (l < length)
{
// tail: recalculate last pixels
GAPI_DbgAssert(length >= nlanes);
l = length - nlanes;
}
}
return;
}
#endif
// reference code
for (int l=0; l < length; l++)
// vertical pass
for (int l=0; l < length;)
{
float sum = buf[r0][l]*ky0 + buf[r1][l]*ky1 + buf[r2][l]*ky2;
constexpr int nlanes = v_int16::nlanes;
if (!noscale)
// main part of output row
for (; l <= length - nlanes; l += nlanes)
{
sum = sum*scale + delta;
v_int16 s0 = vx_load(&ibuf[r[0]][l]); // previous
v_int16 s1 = vx_load(&ibuf[r[1]][l]); // current
v_int16 s2 = vx_load(&ibuf[r[2]][l]); // next row
v_int16 s = s0 * vx_setall_s16(iky0) +
s1 * vx_setall_s16(iky1) +
s2 * vx_setall_s16(iky2);
if (!noscale)
{
s = v_mul_hi(s << 1, vx_setall_s16(iscale)) + vx_setall_s16(idelta);
}
v_store(&out[l], s);
}
out[l] = cv::gapi::own::saturate<DST>(sum, rintf);
// tail (if any)
if (l < length)
{
GAPI_DbgAssert(length >= nlanes);
l = length - nlanes;
}
}
}
#endif
#endif // CV_SIMD
template<typename DST, typename SRC>
static void run_sobel_impl(DST out[], const SRC *in[], int width, int chan,
const float kx[], const float ky[], int border,
float scale, float delta, float *buf[],
int y, int y0)
template<bool noscale, typename DST, typename SRC>
static void run_sepfilter3x3_reference(DST out[], const SRC *in[], int width, int chan,
const float kx[], const float ky[], int border,
float scale, float delta,
float *buf[], int y, int y0)
{
int r[3];
r[0] = (y - y0) % 3; // buf[r[0]]: previous
@@ -497,19 +705,21 @@ static void run_sobel_impl(DST out[], const SRC *in[], int width, int chan,
r[2] = (y - y0 + 2) % 3; // next row
int length = width * chan;
int shift = border * chan;
// horizontal pass
// full horizontal pass is needed only if very 1st row in ROI;
// for 2nd and further rows, it is enough to convolve only the
// "next" row - as we can reuse buffers from previous calls to
// this kernel (note that Fluid processes rows consequently)
// this kernel (Fluid does rows consequently: y=y0, y0+1, ...)
int k0 = (y == y0)? 0: 2;
for (int k = k0; k < 3; k++)
{
// previous, this , next pixel
const SRC *s[3] = {in[k] - border*chan , in[k], in[k] + border*chan};
// previous , this , next pixel
const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift};
// rely on compiler vectoring
for (int l=0; l < length; l++)
@@ -519,37 +729,121 @@ static void run_sobel_impl(DST out[], const SRC *in[], int width, int chan,
}
// vertical pass
if (scale == 1 && delta == 0)
for (int l=0; l < length; l++)
{
float sum = buf[r[0]][l]*ky[0] + buf[r[1]][l]*ky[1] + buf[r[2]][l]*ky[2];
if (!noscale)
{
sum = sum*scale + delta;
}
out[l] = saturate<DST>(sum, rintf);
}
}
template<bool noscale, typename DST, typename SRC>
static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int chan,
const float kx[], const float ky[], int border,
float scale, float delta,
float *buf[], int y, int y0)
{
#if CV_SIMD
int length = width * chan;
// length variable may be unused if types do not match at 'if' statements below
(void) length;
#if USE_SEPFILTER3X3_CHAR2SHORT
if (std::is_same<DST, short>::value && std::is_same<SRC, uchar>::value &&
length >= v_int16::nlanes)
{
// only slightly faster than more generic any-to-short (see below)
run_sepfilter3x3_char2short<noscale>(reinterpret_cast<short*>(out),
reinterpret_cast<const uchar**>(in),
width, chan, kx, ky, border, scale, delta,
buf, y, y0);
return;
}
#endif
if (std::is_same<DST, float>::value && std::is_same<SRC, float>::value &&
length >= v_float32::nlanes)
{
// appears 15% faster than reference any-to-float code (called below)
run_sepfilter3x3_any2float<noscale>(reinterpret_cast<float*>(out), in,
width, chan, kx, ky, border, scale, delta);
return;
}
if (std::is_same<DST, short>::value && length >= v_int16::nlanes)
{
// appears 10-40x faster than reference due to much faster rounding
run_sepfilter3x3_any2short<noscale>(reinterpret_cast<short*>(out), in,
width, chan, kx, ky, border, scale, delta,
buf, y, y0);
return;
}
if (std::is_same<DST, ushort>::value && length >= v_uint16::nlanes)
{
constexpr static bool noscale = true; // omit scaling
run_sobel3x3_vert<noscale, DST>(out, length, ky, scale, delta, r, buf);
} else
// appears 10-40x faster than reference due to much faster rounding
run_sepfilter3x3_any2short<noscale>(reinterpret_cast<ushort*>(out), in,
width, chan, kx, ky, border, scale, delta,
buf, y, y0);
return;
}
if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
{
constexpr static bool noscale = false; // do scaling
run_sobel3x3_vert<noscale, DST>(out, length, ky, scale, delta, r, buf);
// appears 10-40x faster than reference due to much faster rounding
run_sepfilter3x3_any2char<noscale>(reinterpret_cast<uchar*>(out), in,
width, chan, kx, ky, border, scale, delta,
buf, y, y0);
return;
}
#endif // CV_SIMD
// reference code is quite fast for any-to-float case,
// but not for any-to-integral due to very slow rounding
run_sepfilter3x3_reference<noscale>(out, in, width, chan, kx, ky, border,
scale, delta, buf, y, y0);
}
#define RUN_SOBEL_ROW(DST, SRC) \
void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \
const float kx[], const float ky[], int border, \
float scale, float delta, float *buf[], \
int y, int y0) \
{ \
run_sobel_impl(out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0); \
// Entry point for one DST/SRC pair: resolves the run-time "no scaling"
// condition into a compile-time template flag, so the inner loops can omit
// the scale/delta FMA entirely when scale==1 and delta==0.
#define RUN_SEPFILTER3X3_IMPL(DST, SRC) \
void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \
                           const float kx[], const float ky[], int border, \
                           float scale, float delta, \
                           float *buf[], int y, int y0) \
{ \
    const bool noscale = (scale == 1 && delta == 0); \
    if (noscale) \
        run_sepfilter3x3_code<true >(out, in, width, chan, kx, ky, border, \
                                     scale, delta, buf, y, y0); \
    else \
        run_sepfilter3x3_code<false>(out, in, width, chan, kx, ky, border, \
                                     scale, delta, buf, y, y0); \
}
RUN_SOBEL_ROW(uchar , uchar )
RUN_SOBEL_ROW(ushort, ushort)
RUN_SOBEL_ROW( short, uchar )
RUN_SOBEL_ROW( short, ushort)
RUN_SOBEL_ROW( short, short)
RUN_SOBEL_ROW( float, uchar )
RUN_SOBEL_ROW( float, ushort)
RUN_SOBEL_ROW( float, short)
RUN_SOBEL_ROW( float, float)
#undef RUN_SOBEL_ROW
// Instantiate run_sepfilter3x3_impl() for each supported DST/SRC pair.
// This list must match the declarations in gfluidimgproc_func.hpp.
RUN_SEPFILTER3X3_IMPL(uchar , uchar )
RUN_SEPFILTER3X3_IMPL( short, uchar )
RUN_SEPFILTER3X3_IMPL( float, uchar )
RUN_SEPFILTER3X3_IMPL(ushort, ushort)
RUN_SEPFILTER3X3_IMPL( short, ushort)
RUN_SEPFILTER3X3_IMPL( float, ushort)
RUN_SEPFILTER3X3_IMPL( short, short)
RUN_SEPFILTER3X3_IMPL( float, short)
RUN_SEPFILTER3X3_IMPL( float, float)
#undef RUN_SEPFILTER3X3_IMPL
//------------------------------------------------------------------------------
#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

Loading…
Cancel
Save