From f07856eab936dcae9a490acc4ba8ced0db91ca9e Mon Sep 17 00:00:00 2001 From: Evgeny Latkin Date: Mon, 26 Nov 2018 15:05:35 +0300 Subject: [PATCH] Merge pull request #13221 from elatkin:el/gapi_perf_sepfilter GAPI (fluid): optimization of Separable filter (#13221) * GAPI (fluid): Separable filter: performance test * GAPI (fluid): enable all performance tests * GAPI: separable filters: alternative code for Sobel * GAPI (fluid): hide unused old code for Sobel filter * GAPI (fluid): especial code for Sobel if U8 into S16 * GAPI (fluid): back to old code for Sobel * GAPI (fluid): run_sepfilter3x3_impl() with CPU dispatcher * GAPI (fluid): run_sepfilter3x3_impl(): fix compiler warnings * GAPI (fluid): new engine for separable filters (but Sobel) * GAPI (fluid): new performance engine for Sobel * GAPI (fluid): Sepfilters performance: fixed compilation error --- .../common/gapi_imgproc_perf_tests_inl.hpp | 26 +- .../cpu/gapi_imgproc_perf_tests_fluid.cpp | 180 +++-- .../gapi/src/backends/fluid/gfluidimgproc.cpp | 274 ++++++-- .../fluid/gfluidimgproc_func.dispatch.cpp | 48 +- .../src/backends/fluid/gfluidimgproc_func.hpp | 36 +- .../fluid/gfluidimgproc_func.simd.hpp | 646 +++++++++++++----- 6 files changed, 875 insertions(+), 335 deletions(-) diff --git a/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp b/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp index 5a13cfeebe..89ebf0405d 100644 --- a/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp +++ b/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp @@ -52,7 +52,7 @@ PERF_TEST_P_(SepFilterPerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -100,7 +100,7 @@ PERF_TEST_P_(Filter2DPerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison 
////////////////////////////////////////////////////////////// @@ -145,7 +145,7 @@ PERF_TEST_P_(BoxFilterPerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -188,7 +188,7 @@ PERF_TEST_P_(BlurPerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -230,7 +230,7 @@ PERF_TEST_P_(GaussianBlurPerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -271,7 +271,7 @@ PERF_TEST_P_(MedianBlurPerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -314,7 +314,7 @@ PERF_TEST_P_(ErodePerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -357,7 +357,7 @@ PERF_TEST_P_(Erode3x3PerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -400,7 +400,7 @@ PERF_TEST_P_(DilatePerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -443,7 +443,7 @@ PERF_TEST_P_(Dilate3x3PerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + 
c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -526,7 +526,7 @@ PERF_TEST_P_(CannyPerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -564,7 +564,7 @@ PERF_TEST_P_(EqHistPerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// @@ -830,7 +830,7 @@ PERF_TEST_P_(LUV2BGRPerfTest, TestPerformance) TEST_CYCLE() { - c.apply(in_mat1, out_mat_gapi, std::move(compile_args)); + c.apply(in_mat1, out_mat_gapi); } // Comparison ////////////////////////////////////////////////////////////// diff --git a/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp b/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp index a5d13e661d..12554bf4b1 100644 --- a/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp +++ b/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp @@ -13,9 +13,101 @@ namespace opencv_test { - INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid, SobelPerfTest, - Combine(Values(AbsExact().to_compare_f()), - Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1), // add CV_32FC1 when ready +INSTANTIATE_TEST_CASE_P(SepFilterPerfTestFluid_8U, SepFilterPerfTest, + Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()), + Values(CV_8UC1, CV_8UC3), + Values(3), + Values(szVGA, sz720p, sz1080p), + Values(-1, CV_16S, CV_32F), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(SepFilterPerfTestFluid_other, SepFilterPerfTest, + Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()), + Values(CV_16UC1, CV_16SC1, CV_32FC1), + Values(3), + Values(szVGA, sz720p, sz1080p), + Values(-1, CV_32F), + Values(cv::compile_args(IMGPROC_FLUID)))); + 
+INSTANTIATE_TEST_CASE_P(Filter2DPerfTestFluid, Filter2DPerfTest, + Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(3), // add 4, 5, 7 when kernel is ready + Values(szVGA, sz720p, sz1080p), + Values(cv::BORDER_DEFAULT), + Values(-1, CV_32F), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(BoxFilterPerfTestFluid, BoxFilterPerfTest, + Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(3), // add size=5, when kernel is ready + Values(szVGA, sz720p, sz1080p), + Values(cv::BORDER_DEFAULT), + Values(-1, CV_32F), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(BlurPerfTestFluid, BlurPerfTest, + Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(3), // add size=5, when kernel is ready + Values(szVGA, sz720p, sz1080p), + Values(cv::BORDER_DEFAULT), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(GaussianBlurPerfTestFluid, GaussianBlurPerfTest, + Combine(Values(ToleranceFilter(1e-3f, 0.01).to_compare_f()), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(3), // add size=5, when kernel is ready + Values(szVGA, sz720p, sz1080p), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(MedianBlurPerfTestFluid, MedianBlurPerfTest, + Combine(Values(AbsExact().to_compare_f()), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(3), // add size=5, when kernel is ready + Values(szVGA, sz720p, sz1080p), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(ErodePerfTestFluid, ErodePerfTest, + Combine(Values(AbsExact().to_compare_f()), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(3), // add size=5, when kernel is ready + Values(szVGA, sz720p, sz1080p), + Values(cv::MorphShapes::MORPH_RECT, + 
cv::MorphShapes::MORPH_CROSS, + cv::MorphShapes::MORPH_ELLIPSE), + Values(cv::compile_args(IMGPROC_FLUID)))); + +// GAPI/fluid does not support iterations parameter for the Erode kernel +INSTANTIATE_TEST_CASE_P(DISABLED_Erode3x3PerfTestFluid, Erode3x3PerfTest, + Combine(Values(AbsExact().to_compare_f()), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(szVGA, sz720p, sz1080p), + Values(1, 2, 4), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(DilatePerfTestFluid, DilatePerfTest, + Combine(Values(AbsExact().to_compare_f()), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(3), // add size=5, when kernel is ready + Values(szVGA, sz720p, sz1080p), + Values(cv::MorphShapes::MORPH_RECT, + cv::MorphShapes::MORPH_CROSS, + cv::MorphShapes::MORPH_ELLIPSE), + Values(cv::compile_args(IMGPROC_FLUID)))); + +// GAPI/fluid does not support iterations parameter for the Dilate kernel +INSTANTIATE_TEST_CASE_P(DISABLED_Dilate3x3PerfTestFluid, Dilate3x3PerfTest, + Combine(Values(AbsExact().to_compare_f()), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1), + Values(szVGA, sz720p, sz1080p), + Values(1, 2, 4), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid, SobelPerfTest, + Combine(Values(AbsExact().to_compare_f()), + Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1), Values(3), // add 5x5 once supported Values(szVGA, sz720p, sz1080p), Values(-1, CV_16S, CV_32F), @@ -23,8 +115,8 @@ namespace opencv_test Values(1, 2), Values(cv::compile_args(IMGPROC_FLUID)))); - INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid32F, SobelPerfTest, - Combine(Values(ToleranceFilter(1e-3f, 0.0).to_compare_f()), +INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid32F, SobelPerfTest, + Combine(Values(ToleranceFilter(1e-3f, 0.0).to_compare_f()), Values(CV_32FC1), Values(3), // add 5x5 once supported Values(szVGA, sz720p, sz1080p), @@ -33,44 +125,44 @@ namespace opencv_test Values(1, 2), 
Values(cv::compile_args(IMGPROC_FLUID)))); - INSTANTIATE_TEST_CASE_P(RGB2GrayPerfTestFluid, RGB2GrayPerfTest, - Combine(Values(ToleranceColor(1e-3).to_compare_f()), - Values(szVGA, sz720p, sz1080p), - Values(cv::compile_args(IMGPROC_FLUID)))); - - INSTANTIATE_TEST_CASE_P(BGR2GrayPerfTestFluid, BGR2GrayPerfTest, - Combine(Values(ToleranceColor(1e-3).to_compare_f()), - Values(szVGA, sz720p, sz1080p), - Values(cv::compile_args(IMGPROC_FLUID)))); - - INSTANTIATE_TEST_CASE_P(RGB2YUVPerfTestFluid, RGB2YUVPerfTest, - Combine(Values(ToleranceColor(1e-3).to_compare_f()), - Values(szVGA, sz720p, sz1080p), - Values(cv::compile_args(IMGPROC_FLUID)))); - - INSTANTIATE_TEST_CASE_P(YUV2RGBPerfTestFluid, YUV2RGBPerfTest, - Combine(Values(ToleranceColor(1e-3).to_compare_f()), - Values(szVGA, sz720p, sz1080p), - Values(cv::compile_args(IMGPROC_FLUID)))); - - INSTANTIATE_TEST_CASE_P(BGR2YUVPerfTestFluid, BGR2YUVPerfTest, - Combine(Values(ToleranceColor(1e-3).to_compare_f()), - Values(szVGA, sz720p, sz1080p), - Values(cv::compile_args(IMGPROC_FLUID)))); - - INSTANTIATE_TEST_CASE_P(YUV2BGRPerfTestFluid, YUV2BGRPerfTest, - Combine(Values(ToleranceColor(1e-3).to_compare_f()), - Values(szVGA, sz720p, sz1080p), - Values(cv::compile_args(IMGPROC_FLUID)))); - - INSTANTIATE_TEST_CASE_P(BGR2LUVPerfTestFluid, BGR2LUVPerfTest, - Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()), - Values(szVGA, sz720p, sz1080p), - Values(cv::compile_args(IMGPROC_FLUID)))); - - INSTANTIATE_TEST_CASE_P(RGB2LabPerfTestFluid, RGB2LabPerfTest, - Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()), - Values(szVGA, sz720p, sz1080p), - Values(cv::compile_args(IMGPROC_FLUID)))); +INSTANTIATE_TEST_CASE_P(RGB2GrayPerfTestFluid, RGB2GrayPerfTest, + Combine(Values(ToleranceColor(1e-3).to_compare_f()), + Values(szVGA, sz720p, sz1080p), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(BGR2GrayPerfTestFluid, BGR2GrayPerfTest, + Combine(Values(ToleranceColor(1e-3).to_compare_f()), + 
Values(szVGA, sz720p, sz1080p), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(RGB2YUVPerfTestFluid, RGB2YUVPerfTest, + Combine(Values(ToleranceColor(1e-3).to_compare_f()), + Values(szVGA, sz720p, sz1080p), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(YUV2RGBPerfTestFluid, YUV2RGBPerfTest, + Combine(Values(ToleranceColor(1e-3).to_compare_f()), + Values(szVGA, sz720p, sz1080p), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(BGR2YUVPerfTestFluid, BGR2YUVPerfTest, + Combine(Values(ToleranceColor(1e-3).to_compare_f()), + Values(szVGA, sz720p, sz1080p), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(YUV2BGRPerfTestFluid, YUV2BGRPerfTest, + Combine(Values(ToleranceColor(1e-3).to_compare_f()), + Values(szVGA, sz720p, sz1080p), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(BGR2LUVPerfTestFluid, BGR2LUVPerfTest, + Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()), + Values(szVGA, sz720p, sz1080p), + Values(cv::compile_args(IMGPROC_FLUID)))); + +INSTANTIATE_TEST_CASE_P(RGB2LabPerfTestFluid, RGB2LabPerfTest, + Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()), + Values(szVGA, sz720p, sz1080p), + Values(cv::compile_args(IMGPROC_FLUID)))); } diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp index e2e4c4f754..6b27b13876 100644 --- a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp +++ b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp @@ -344,7 +344,7 @@ static const int maxKernelSize = 9; template static void run_boxfilter(Buffer &dst, const View &src, const cv::Size &kernelSize, - const cv::Point& /* anchor */, bool normalize) + const cv::Point& /* anchor */, bool normalize, float *buf[]) { GAPI_Assert(kernelSize.width <= maxKernelSize); GAPI_Assert(kernelSize.width == kernelSize.height); @@ -365,36 +365,53 @@ static void run_boxfilter(Buffer &dst, const View &src, const 
cv::Size &kernelSi int width = dst.length(); int chan = dst.meta().chan; - GAPI_DbgAssert(chan <= 4); + if (kernelSize.width == 3 && kernelSize.height == 3) + { + int y = dst.y(); + int y0 = dst.priv().writeStart(); - for (int w=0; w < width; w++) + float kx[3] = {1, 1, 1}; + float *ky = kx; + + float scale=1, delta=0; + if (normalize) + scale = 1/9.f; + + run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0); + } else { - float sum[4] = {0, 0, 0, 0}; + GAPI_DbgAssert(chan <= 4); - for (int i=0; i < kernel; i++) + for (int w=0; w < width; w++) { - for (int j=0; j < kernel; j++) + float sum[4] = {0, 0, 0, 0}; + + for (int i=0; i < kernel; i++) { - for (int c=0; c < chan; c++) - sum[c] += in[i][(w + j - border)*chan + c]; + for (int j=0; j < kernel; j++) + { + for (int c=0; c < chan; c++) + sum[c] += in[i][(w + j - border)*chan + c]; + } } - } - for (int c=0; c < chan; c++) - { - float result = normalize? sum[c]/(kernel * kernel) : sum[c]; + for (int c=0; c < chan; c++) + { + float result = normalize? sum[c]/(kernel * kernel) : sum[c]; - out[w*chan + c] = saturate(result, rintf); + out[w*chan + c] = saturate(result, rintf); + } } } } -GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, false) +GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, true) { static const int Window = 3; static void run(const View &src, const cv::Size& kernelSize, const cv::Point& anchor, - int /* borderType */, const cv::Scalar& /* borderValue */, Buffer &dst) + int /* borderType */, const cv::Scalar& /* borderValue */, Buffer &dst, + Buffer& scratch) { // TODO: support sizes 3, 5, 7, 9, ... 
GAPI_Assert(kernelSize.width == 3 && kernelSize.height == 3); @@ -404,14 +421,46 @@ GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, false) static const bool normalize = true; + int width = src.length(); + int chan = src.meta().chan; + int length = width * chan; + + float *buf[3]; + buf[0] = scratch.OutLine(); + buf[1] = buf[0] + length; + buf[2] = buf[1] + length; + // DST SRC OP __VA_ARGS__ - UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize); - UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize); - UNARY_( short, short, run_boxfilter, dst, src, kernelSize, anchor, normalize); + UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize, buf); + UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf); + UNARY_( short, short, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf); + UNARY_( float, float, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf); CV_Error(cv::Error::StsBadArg, "unsupported combination of types"); } + static void initScratch(const GMatDesc & in, + const cv::Size & /* ksize */, + const cv::Point & /* anchor */, + int /* borderType */, + const cv::Scalar & /* borderValue */, + Buffer & scratch) + { + int width = in.size.width; + int chan = in.chan; + + int buflen = width * chan * Window; // work buffers + + cv::gapi::own::Size bufsize(buflen, 1); + GMatDesc bufdesc = {CV_32F, 1, bufsize}; + Buffer buffer(bufdesc); + scratch = std::move(buffer); + } + + static void resetScratch(Buffer& /* scratch */) + { + } + static Border getBorder(const cv::GMatDesc& /* src */, const cv::Size & /* kernelSize */, const cv::Point & /* anchor */, @@ -422,18 +471,19 @@ GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, false) } }; -GAPI_FLUID_KERNEL(GFluidBoxFilter, cv::gapi::imgproc::GBoxFilter, false) +GAPI_FLUID_KERNEL(GFluidBoxFilter, cv::gapi::imgproc::GBoxFilter, true) { static const int Window = 3; static void run(const View 
& src, int /* ddepth */, const cv::Size & kernelSize, - const cv::Point & anchor, + const cv::Point & anchor, bool normalize, int /* borderType */, const cv::Scalar& /* borderValue */, - Buffer& dst) + Buffer& dst, + Buffer& scratch) { // TODO: support sizes 3, 5, 7, 9, ... GAPI_Assert(kernelSize.width == 3 && kernelSize.height == 3); @@ -441,17 +491,51 @@ GAPI_FLUID_KERNEL(GFluidBoxFilter, cv::gapi::imgproc::GBoxFilter, false) // TODO: suport non-trivial anchor GAPI_Assert(anchor.x == -1 && anchor.y == -1); + int width = src.length(); + int chan = src.meta().chan; + int length = width * chan; + + float *buf[3]; + buf[0] = scratch.OutLine(); + buf[1] = buf[0] + length; + buf[2] = buf[1] + length; + // DST SRC OP __VA_ARGS__ - UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize); - UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize); - UNARY_( short, short, run_boxfilter, dst, src, kernelSize, anchor, normalize); - UNARY_( float, uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize); - UNARY_( float, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize); - UNARY_( float, short, run_boxfilter, dst, src, kernelSize, anchor, normalize); + UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize, buf); + UNARY_( float, uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize, buf); + UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf); + UNARY_( float, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf); + UNARY_( short, short, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf); + UNARY_( float, short, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf); + UNARY_( float, float, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf); CV_Error(cv::Error::StsBadArg, "unsupported combination of types"); } + static void initScratch(const GMatDesc & in, + int /* ddepth */, + const cv::Size & /* kernelSize 
*/, + const cv::Point & /* anchor */, + bool /* normalize */, + int /* borderType */, + const cv::Scalar& /* borderValue */, + Buffer & scratch) + { + int width = in.size.width; + int chan = in.chan; + + int buflen = width * chan * Window; // work buffers + + cv::gapi::own::Size bufsize(buflen, 1); + GMatDesc bufdesc = {CV_32F, 1, bufsize}; + Buffer buffer(bufdesc); + scratch = std::move(buffer); + } + + static void resetScratch(Buffer& /* scratch */) + { + } + static Border getBorder(const cv::GMatDesc& /* src */, int /* ddepth */, const cv::Size & /* kernelSize */, @@ -510,18 +594,21 @@ static void run_sepfilter(Buffer& dst, const View& src, const float kx[], int kxLen, const float ky[], int kyLen, const cv::Point& /* anchor */, - float delta=0) + float scale, float delta, + float *buf[]) { - static const int maxLines = 9; - GAPI_Assert(kyLen <= maxLines); + constexpr int kMax = 11; + GAPI_Assert(kxLen <= kMax && kyLen <= kMax); - const SRC *in[ maxLines ]; + const SRC *in[kMax]; DST *out; - int border = (kyLen - 1) / 2; + int xborder = (kxLen - 1) / 2; + int yborder = (kyLen - 1) / 2; + for (int i=0; i < kyLen; i++) { - in[i] = src.InLine(i - border); + in[i] = src.InLine(i - yborder); } out = dst.OutLine(); @@ -529,28 +616,52 @@ static void run_sepfilter(Buffer& dst, const View& src, int width = dst.length(); int chan = dst.meta().chan; - for (int w=0; w < width; w++) + // optimized 3x3 vs reference + if (kxLen == 3 && kyLen == 3) { - // TODO: make this cycle innermost - for (int c=0; c < chan; c++) + int y = dst.y(); + int y0 = dst.priv().writeStart(); + + int border = xborder; + run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0); + } + else + { + int length = chan * width; + int xshift = chan; /* element step between adjacent horizontal taps: tap j sits (j - xborder)*chan elements from the centre */ + + // horizontal pass + + for (int k=0; k < kyLen; k++) { - float sum=0; + const SRC *inp[kMax] = {nullptr}; - for (int i=0; i < kyLen; i++) + for (int j=0; j < kxLen; j++) { - float sumi=0; + inp[j] = in[k] + (j - 
xborder)*xshift; + } + for (int l=0; l < length; l++) + { + float sum = 0; for (int j=0; j < kxLen; j++) { - sumi += in[i][(w + j - border)*chan + c] * kx[j]; + sum += inp[j][l] * kx[j]; } - - sum += sumi * ky[i]; + buf[k][l] = sum; } + } - float result = sum + delta; + // vertical pass - out[w*chan + c] = saturate(result, rintf); + for (int l=0; l < length; l++) + { + float sum = 0; + for (int k=0; k < kyLen; k++) + { + sum += buf[k][l] * ky[k]; + } + out[l] = saturate(sum*scale + delta, rintf); } } } @@ -580,21 +691,37 @@ GAPI_FLUID_KERNEL(GFluidSepFilter, cv::gapi::imgproc::GSepFilter, true) int kxLen = kernX.rows * kernX.cols; int kyLen = kernY.rows * kernY.cols; + GAPI_Assert(kyLen == 3); + float *kx = scratch.OutLine(); float *ky = kx + kxLen; + int width = src.meta().size.width; + int chan = src.meta().chan; + int length = width * chan; + + float *buf[3]; + buf[0] = ky + kyLen; + buf[1] = buf[0] + length; + buf[2] = buf[1] + length; + + float scale = 1; float delta = static_cast(delta_[0]); // DST SRC OP __VA_ARGS__ - UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta); - UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta); - UNARY_( short, short, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta); - UNARY_( float, float, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta); + UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf); + UNARY_( short, uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf); + UNARY_( float, uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf); + UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf); + UNARY_( float, ushort, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf); + UNARY_( short, short, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf); + UNARY_( 
float, short, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf); + UNARY_( float, float, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf); CV_Error(cv::Error::StsBadArg, "unsupported combination of types"); } - static void initScratch(const GMatDesc& /* in */, + static void initScratch(const GMatDesc& in, int /* ddepth */, const Mat & kernX, const Mat & kernY, @@ -607,7 +734,13 @@ GAPI_FLUID_KERNEL(GFluidSepFilter, cv::gapi::imgproc::GSepFilter, true) int kxLen = kernX.rows * kernX.cols; int kyLen = kernY.rows * kernY.cols; - cv::gapi::own::Size bufsize(kxLen + kyLen, 1); + int width = in.size.width; + int chan = in.chan; + + int buflen = kxLen + kyLen + // x, y kernels + width * chan * Window; // work buffers + + cv::gapi::own::Size bufsize(buflen, 1); GMatDesc bufdesc = {CV_32F, 1, bufsize}; Buffer buffer(bufdesc); scratch = std::move(buffer); @@ -664,29 +797,47 @@ GAPI_FLUID_KERNEL(GFluidGaussBlur, cv::gapi::imgproc::GGaussBlur, true) auto *kx = scratch.OutLine(); // cached kernX data auto *ky = kx + kxsize; // cached kernY data + int width = src.meta().size.width; + int chan = src.meta().chan; + int length = width * chan; + + float *buf[3]; + buf[0] = ky + kysize; + buf[1] = buf[0] + length; + buf[2] = buf[1] + length; + auto anchor = cv::Point(-1, -1); - float delta = 0.f; + + float scale = 1; + float delta = 0; // DST SRC OP __VA_ARGS__ - UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta); - UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta); - UNARY_( short, short, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta); + UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf); + UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf); + UNARY_( short, short, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf); + UNARY_( 
float, float, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf); CV_Error(cv::Error::StsBadArg, "unsupported combination of types"); } - static void initScratch(const GMatDesc& /* in */, + static void initScratch(const GMatDesc& in, const cv::Size & ksize, double sigmaX, double sigmaY, - int /* borderType */, - const cv::Scalar & /* borderValue */, + int /* borderType */, + const cv::Scalar & /* borderValue */, Buffer & scratch) { int kxsize = ksize.width; int kysize = ksize.height; - cv::gapi::own::Size bufsize(kxsize + kysize, 1); + int width = in.size.width; + int chan = in.chan; + + int buflen = kxsize + kysize + // x, y kernels + width * chan * Window; // work buffers + + cv::gapi::own::Size bufsize(buflen, 1); GMatDesc bufdesc = {CV_32F, 1, bufsize}; Buffer buffer(bufdesc); scratch = std::move(buffer); @@ -767,7 +918,7 @@ static void run_sobel(Buffer& dst, int y0 = dst.priv().writeStart(); // int y1 = dst.priv().writeEnd(); - run_sobel_row(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0); + run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0); } GAPI_FLUID_KERNEL(GFluidSobel, cv::gapi::imgproc::GSobel, true) @@ -1102,6 +1253,7 @@ GAPI_FLUID_KERNEL(GFluidErode, cv::gapi::imgproc::GErode, true) UNARY_(uchar , uchar , run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE); UNARY_(ushort, ushort, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE); UNARY_( short, short, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE); + UNARY_( float, float, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE); CV_Error(cv::Error::StsBadArg, "unsupported combination of types"); } @@ -1109,7 +1261,7 @@ GAPI_FLUID_KERNEL(GFluidErode, cv::gapi::imgproc::GErode, true) static void initScratch(const GMatDesc& /* in */, const Mat & kernel, const Point & /* anchor */, - int /* iterations */, + int /* iterations */, int /* borderType */, const cv::Scalar & /* borderValue */, 
Buffer & scratch) @@ -1179,6 +1331,7 @@ GAPI_FLUID_KERNEL(GFluidDilate, cv::gapi::imgproc::GDilate, true) UNARY_(uchar , uchar , run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE); UNARY_(ushort, ushort, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE); UNARY_( short, short, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE); + UNARY_( float, float, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE); CV_Error(cv::Error::StsBadArg, "unsupported combination of types"); } @@ -1290,6 +1443,7 @@ GAPI_FLUID_KERNEL(GFluidMedianBlur, cv::gapi::imgproc::GMedianBlur, false) UNARY_(uchar , uchar , run_medianblur, dst, src, ksize); UNARY_(ushort, ushort, run_medianblur, dst, src, ksize); UNARY_( short, short, run_medianblur, dst, src, ksize); + UNARY_( float, float, run_medianblur, dst, src, ksize); CV_Error(cv::Error::StsBadArg, "unsupported combination of types"); } diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp index 9b217903ef..b536bbfbdc 100644 --- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp +++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp @@ -57,34 +57,34 @@ void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef CV_CPU_DISPATCH(run_yuv2rgb_impl, (out, in, width, coef), CV_CPU_DISPATCH_MODES_ALL); } -//--------------------- +//------------------------- // -// Fluid kernels: Sobel +// Fluid kernels: sepFilter // -//--------------------- - -#define RUN_SOBEL_ROW(DST, SRC) \ -void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \ - const float kx[], const float ky[], int border, \ - float scale, float delta, float *buf[], \ - int y, int y0) \ -{ \ - CV_CPU_DISPATCH(run_sobel_row, \ - (out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0), \ - CV_CPU_DISPATCH_MODES_ALL); \ +//------------------------- + +#define RUN_SEPFILTER3X3_IMPL(DST, 
SRC) \ +void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \ + const float kx[], const float ky[], int border, \ + float scale, float delta, \ + float *buf[], int y, int y0) \ +{ \ + CV_CPU_DISPATCH(run_sepfilter3x3_impl, \ + (out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0), \ + CV_CPU_DISPATCH_MODES_ALL); \ } -RUN_SOBEL_ROW(uchar , uchar ) -RUN_SOBEL_ROW(ushort, ushort) -RUN_SOBEL_ROW( short, uchar ) -RUN_SOBEL_ROW( short, ushort) -RUN_SOBEL_ROW( short, short) -RUN_SOBEL_ROW( float, uchar ) -RUN_SOBEL_ROW( float, ushort) -RUN_SOBEL_ROW( float, short) -RUN_SOBEL_ROW( float, float) - -#undef RUN_SOBEL_ROW +RUN_SEPFILTER3X3_IMPL(uchar , uchar ) +RUN_SEPFILTER3X3_IMPL( short, uchar ) +RUN_SEPFILTER3X3_IMPL( float, uchar ) +RUN_SEPFILTER3X3_IMPL(ushort, ushort) +RUN_SEPFILTER3X3_IMPL( short, ushort) +RUN_SEPFILTER3X3_IMPL( float, ushort) +RUN_SEPFILTER3X3_IMPL( short, short) +RUN_SEPFILTER3X3_IMPL( float, short) +RUN_SEPFILTER3X3_IMPL( float, float) + +#undef RUN_SEPFILTER3X3_IMPL } // namespace fliud } // namespace gapi diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp index 1b6f1b8c0d..3b41c52794 100644 --- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp +++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp @@ -33,29 +33,29 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4]); -//--------------------- +//------------------------- // -// Fluid kernels: Sobel +// Fluid kernels: sepFilter // -//--------------------- +//------------------------- -#define RUN_SOBEL_ROW(DST, SRC) \ -void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \ - const float kx[], const float ky[], int border, \ - float scale, float delta, float *buf[], \ - int y, int y0); +#define RUN_SEPFILTER3X3_IMPL(DST, SRC) \ +void 
run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \ + const float kx[], const float ky[], int border, \ + float scale, float delta, \ + float *buf[], int y, int y0); -RUN_SOBEL_ROW(uchar , uchar ) -RUN_SOBEL_ROW(ushort, ushort) -RUN_SOBEL_ROW( short, uchar ) -RUN_SOBEL_ROW( short, ushort) -RUN_SOBEL_ROW( short, short) -RUN_SOBEL_ROW( float, uchar ) -RUN_SOBEL_ROW( float, ushort) -RUN_SOBEL_ROW( float, short) -RUN_SOBEL_ROW( float, float) +RUN_SEPFILTER3X3_IMPL(uchar , uchar ) +RUN_SEPFILTER3X3_IMPL( short, uchar ) +RUN_SEPFILTER3X3_IMPL( float, uchar ) +RUN_SEPFILTER3X3_IMPL(ushort, ushort) +RUN_SEPFILTER3X3_IMPL( short, ushort) +RUN_SEPFILTER3X3_IMPL( float, ushort) +RUN_SEPFILTER3X3_IMPL( short, short) +RUN_SEPFILTER3X3_IMPL( float, short) +RUN_SEPFILTER3X3_IMPL( float, float) -#undef RUN_SOBEL_ROW +#undef RUN_SEPFILTER3X3_IMPL } // namespace fluid } // namespace gapi diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp index c87be085a3..cd52b66189 100644 --- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp +++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp @@ -9,6 +9,8 @@ #if !defined(GAPI_STANDALONE) +#include "gfluidimgproc_func.hpp" + #include "opencv2/gapi/own/saturate.hpp" #include "opencv2/core.hpp" @@ -16,6 +18,8 @@ #include +#include + #ifdef __GNUC__ # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wstrict-overflow" @@ -48,34 +52,66 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4]); -//--------------------- +//------------------------- // -// Fluid kernels: Sobel +// Fluid kernels: sepFilter // -//--------------------- - -#define RUN_SOBEL_ROW(DST, SRC) \ -void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \ - const float kx[], const float ky[], int border, \ - float scale, 
float delta, float *buf[], \ - int y, int y0); - -RUN_SOBEL_ROW(uchar , uchar ) -RUN_SOBEL_ROW(ushort, ushort) -RUN_SOBEL_ROW( short, uchar ) -RUN_SOBEL_ROW( short, ushort) -RUN_SOBEL_ROW( short, short) -RUN_SOBEL_ROW( float, uchar ) -RUN_SOBEL_ROW( float, ushort) -RUN_SOBEL_ROW( float, short) -RUN_SOBEL_ROW( float, float) - -#undef RUN_SOBEL_ROW +//------------------------- + +#define RUN_SEPFILTER3X3_IMPL(DST, SRC) \ +void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \ + const float kx[], const float ky[], int border, \ + float scale, float delta, \ + float *buf[], int y, int y0); + +RUN_SEPFILTER3X3_IMPL(uchar , uchar ) +RUN_SEPFILTER3X3_IMPL( short, uchar ) +RUN_SEPFILTER3X3_IMPL( float, uchar ) +RUN_SEPFILTER3X3_IMPL(ushort, ushort) +RUN_SEPFILTER3X3_IMPL( short, ushort) +RUN_SEPFILTER3X3_IMPL( float, ushort) +RUN_SEPFILTER3X3_IMPL( short, short) +RUN_SEPFILTER3X3_IMPL( float, short) +RUN_SEPFILTER3X3_IMPL( float, float) + +#undef RUN_SEPFILTER3X3_IMPL //---------------------------------------------------------------------- #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY +#if CV_SIMD +template +static inline v_float32 vx_load_f32(const SRC* ptr) +{ + if (std::is_same::value) + { + v_uint32 tmp = vx_load_expand_q(reinterpret_cast(ptr)); + return v_cvt_f32(v_reinterpret_as_s32(tmp)); + } + + if (std::is_same::value) + { + v_uint32 tmp = vx_load_expand(reinterpret_cast(ptr)); + return v_cvt_f32(v_reinterpret_as_s32(tmp)); + } + + if (std::is_same::value) + { + v_int32 tmp = vx_load_expand(reinterpret_cast(ptr)); + return v_cvt_f32(tmp); + } + + if (std::is_same::value) + { + v_float32 tmp = vx_load(reinterpret_cast(ptr)); + return tmp; + } + + CV_Error(cv::Error::StsBadArg, "unsupported type"); +} +#endif // CV_SIMD + //---------------------------------- // // Fluid kernels: RGB2Gray, BGR2Gray @@ -309,187 +345,359 @@ void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef } } -//--------------------- 
+//------------------------- // -// Fluid kernels: Sobel +// Fluid kernels: sepFilter // -//--------------------- +//------------------------- -// Sobel 3x3: vertical pass -template -static void run_sobel3x3_vert(DST out[], int length, const float ky[], - float scale, float delta, const int r[], float *buf[]) +#if CV_SIMD +// this variant not using buf[] appears 15% faster than reference any-2-float code below +template +static void run_sepfilter3x3_any2float(float out[], const SRC *in[], int width, int chan, + const float kx[], const float ky[], int border, + float scale, float delta) { - float ky0 = ky[0], - ky1 = ky[1], - ky2 = ky[2]; + const int length = width * chan; + const int shift = border * chan; - int r0 = r[0], - r1 = r[1], - r2 = r[2]; + const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2]; + const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2]; -#if CV_SIMD - // for floating-point output, - // manual vectoring may be not better than compiler's optimization -#define EXPLICIT_SIMD_32F 0 // 1=vectorize 32f case explicitly, 0=don't -#if EXPLICIT_SIMD_32F - if (std::is_same::value && length >= v_int16::nlanes) + for (int l=0; l < length; ) { - constexpr static int nlanes = v_float32::nlanes; + static const int nlanes = v_float32::nlanes; - for (int l=0; l < length; ) + // main part + for ( ; l <= length - nlanes; l += nlanes) { - for (; l <= length - nlanes; l += nlanes) + auto xsum = [l, shift, kx0, kx1, kx2](const SRC i[]) { - v_float32 sum = vx_load(&buf[r0][l]) * vx_setall_f32(ky0); - sum = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum); - sum = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum); + v_float32 t0 = vx_load_f32(&i[l - shift]); + v_float32 t1 = vx_load_f32(&i[l ]); + v_float32 t2 = vx_load_f32(&i[l + shift]); + v_float32 t = t0 * vx_setall_f32(kx0); + t = v_fma(t1, vx_setall_f32(kx1), t); + t = v_fma(t2, vx_setall_f32(kx2), t); + return t; + }; + + v_float32 s0 = xsum(in[0]); + v_float32 s1 = xsum(in[1]); + v_float32 s2 = xsum(in[2]); + 
v_float32 s = s0 * vx_setall_f32(ky0); + s = v_fma(s1, vx_setall_f32(ky1), s); + s = v_fma(s2, vx_setall_f32(ky2), s); + + if (!noscale) + { + s = v_fma(s, vx_setall_f32(scale), vx_setall_f32(delta)); + } + + v_store(&out[l], s); + } + + // tail (if any) + if (l < length) + { + GAPI_DbgAssert(length >= nlanes); + l = length - nlanes; + } + } +} + +// this variant with manually vectored rounding to short/ushort appears 10-40x faster +// than reference code below +template +static void run_sepfilter3x3_any2short(DST out[], const SRC *in[], int width, int chan, + const float kx[], const float ky[], int border, + float scale, float delta, + float *buf[], int y, int y0) +{ + int r[3]; + r[0] = (y - y0 ) % 3; // buf[r[0]]: previous + r[1] = (y - y0 + 1) % 3; // this + r[2] = (y - y0 + 2) % 3; // next row + + const int length = width * chan; + const int shift = border * chan; + + const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2]; + const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2]; + + // horizontal pass + + int k0 = (y == y0)? 
0: 2; + + for (int k = k0; k < 3; k++) + { + // previous , this , next pixel + const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift}; + + // rely on compiler vectoring + for (int l=0; l < length; l++) + { + buf[r[k]][l] = s[0][l]*kx0 + s[1][l]*kx1 + s[2][l]*kx2; + } + } + + // vertical pass + + const int r0=r[0], r1=r[1], r2=r[2]; + + for (int l=0; l < length;) + { + constexpr int nlanes = v_int16::nlanes; + + // main part of row + for (; l <= length - nlanes; l += nlanes) + { + v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0); + sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0); + sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0); - if (!noscale) - { - sum = v_fma(sum, vx_setall_f32(scale), vx_setall_f32(delta)); - } + v_float32 sum1 = vx_load(&buf[r0][l + nlanes/2]) * vx_setall_f32(ky0); + sum1 = v_fma(vx_load(&buf[r1][l + nlanes/2]), vx_setall_f32(ky1), sum1); + sum1 = v_fma(vx_load(&buf[r2][l + nlanes/2]), vx_setall_f32(ky2), sum1); - v_store(reinterpret_cast(&out[l]), sum); + if (!noscale) + { + sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta)); + sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta)); } - if (l < length) + v_int32 isum0 = v_round(sum0), + isum1 = v_round(sum1); + + if (std::is_same::value) { - // tail: recalculate last pixels - GAPI_DbgAssert(length >= nlanes); - l = length - nlanes; + // signed short + v_int16 res = v_pack(isum0, isum1); + v_store(reinterpret_cast(&out[l]), res); + } else + { + // unsigned short + v_uint16 res = v_pack_u(isum0, isum1); + v_store(reinterpret_cast(&out[l]), res); } } - return; + // tail (if any) + if (l < length) + { + GAPI_DbgAssert(length >= nlanes); + l = length - nlanes; + } } -#endif +} - if ((std::is_same::value || std::is_same::value) - && length >= v_int16::nlanes) +// this code with manually vectored rounding to uchar is 10-40x faster than reference +template +static void run_sepfilter3x3_any2char(uchar out[], const SRC *in[], int width, int 
chan, + const float kx[], const float ky[], int border, + float scale, float delta, + float *buf[], int y, int y0) +{ + int r[3]; + r[0] = (y - y0 ) % 3; // buf[r[0]]: previous + r[1] = (y - y0 + 1) % 3; // this + r[2] = (y - y0 + 2) % 3; // next row + + const int length = width * chan; + const int shift = border * chan; + + const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2]; + const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2]; + + // horizontal pass + + int k0 = (y == y0)? 0: 2; + + for (int k = k0; k < 3; k++) { - constexpr static int nlanes = v_int16::nlanes; + // previous , this , next pixel + const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift}; - for (int l=0; l < length; ) + // rely on compiler vectoring + for (int l=0; l < length; l++) { - for (; l <= length - nlanes; l += nlanes) - { - v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0); - sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0); - sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0); - - v_float32 sum1 = vx_load(&buf[r0][l + nlanes/2]) * vx_setall_f32(ky0); - sum1 = v_fma(vx_load(&buf[r1][l + nlanes/2]), vx_setall_f32(ky1), sum1); - sum1 = v_fma(vx_load(&buf[r2][l + nlanes/2]), vx_setall_f32(ky2), sum1); - - if (!noscale) - { - sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta)); - sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta)); - } - - v_int32 isum0 = v_round(sum0), - isum1 = v_round(sum1); - - if (std::is_same::value) - { - // signed short - v_int16 res = v_pack(isum0, isum1); - v_store(reinterpret_cast(&out[l]), res); - } else - { - // unsigned short - v_uint16 res = v_pack_u(isum0, isum1); - v_store(reinterpret_cast(&out[l]), res); - } - } + buf[r[k]][l] = s[0][l]*kx0 + s[1][l]*kx1 + s[2][l]*kx2; + } + } - if (l < length) + // vertical pass + + const int r0=r[0], r1=r[1], r2=r[2]; + + for (int l=0; l < length;) + { + constexpr int nlanes = v_uint8::nlanes; + + // main part of row + for (; l <= length - nlanes; l += nlanes) + { + 
v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0); + sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0); + sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0); + + v_float32 sum1 = vx_load(&buf[r0][l + nlanes/4]) * vx_setall_f32(ky0); + sum1 = v_fma(vx_load(&buf[r1][l + nlanes/4]), vx_setall_f32(ky1), sum1); + sum1 = v_fma(vx_load(&buf[r2][l + nlanes/4]), vx_setall_f32(ky2), sum1); + + v_float32 sum2 = vx_load(&buf[r0][l + 2*nlanes/4]) * vx_setall_f32(ky0); + sum2 = v_fma(vx_load(&buf[r1][l + 2*nlanes/4]), vx_setall_f32(ky1), sum2); + sum2 = v_fma(vx_load(&buf[r2][l + 2*nlanes/4]), vx_setall_f32(ky2), sum2); + + v_float32 sum3 = vx_load(&buf[r0][l + 3*nlanes/4]) * vx_setall_f32(ky0); + sum3 = v_fma(vx_load(&buf[r1][l + 3*nlanes/4]), vx_setall_f32(ky1), sum3); + sum3 = v_fma(vx_load(&buf[r2][l + 3*nlanes/4]), vx_setall_f32(ky2), sum3); + + if (!noscale) { - // tail: recalculate last pixels - GAPI_DbgAssert(length >= nlanes); - l = length - nlanes; + sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta)); + sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta)); + sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta)); + sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta)); } + + v_int32 isum0 = v_round(sum0), + isum1 = v_round(sum1), + isum2 = v_round(sum2), + isum3 = v_round(sum3); + + v_int16 ires0 = v_pack(isum0, isum1), + ires1 = v_pack(isum2, isum3); + + v_uint8 res = v_pack_u(ires0, ires1); + v_store(reinterpret_cast(&out[l]), res); } - return; + // tail (if any) + if (l < length) + { + GAPI_DbgAssert(length >= nlanes); + l = length - nlanes; + } } +} - if (std::is_same::value && length >= v_uint8::nlanes) +// this code manually vectored for int16 not much faster than generic any-to-short code above +#define USE_SEPFILTER3X3_CHAR2SHORT 1 + +#if USE_SEPFILTER3X3_CHAR2SHORT +template +static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int width, int chan, + const float kx[], const 
float ky[], int border, + float scale, float delta, + float *buf[], int y, int y0) +{ + const schar ikx0 = saturate(kx[0], rintf); + const schar ikx1 = saturate(kx[1], rintf); + const schar ikx2 = saturate(kx[2], rintf); + + const schar iky0 = saturate(ky[0], rintf); + const schar iky1 = saturate(ky[1], rintf); + const schar iky2 = saturate(ky[2], rintf); + + const short iscale = saturate(scale * (1 << 15), rintf); + const short idelta = saturate(delta , rintf); + + // check if this code is applicable + if (ikx0 != kx[0] || ikx1 != kx[1] || ikx2 != kx[2] || + iky0 != ky[0] || iky1 != ky[1] || iky2 != ky[2] || + idelta != delta || + std::abs(scale) > 1 || std::abs(scale) < 0.01) { - constexpr static int nlanes = v_uint8::nlanes; + run_sepfilter3x3_any2short(out, in, width, chan, kx, ky, border, scale, delta, + buf, y, y0); + return; + } + + short *ibuf[3]; + ibuf[0] = reinterpret_cast(buf[0]); + ibuf[1] = reinterpret_cast(buf[1]); + ibuf[2] = reinterpret_cast(buf[2]); - for (int l=0; l < length; ) + int r[3]; + r[0] = (y - y0 ) % 3; // buf[r[0]]: previous + r[1] = (y - y0 + 1) % 3; // this + r[2] = (y - y0 + 2) % 3; // next row + + const int length = width * chan; + const int shift = border * chan; + + // horizontal pass + + int k0 = (y == y0)? 
0: 2; + + for (int k = k0; k < 3; k++) + { + for (int l=0; l < length;) { + constexpr int nlanes = v_int16::nlanes; + + // main part of output row for (; l <= length - nlanes; l += nlanes) { - v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0); - sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0); - sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0); - - v_float32 sum1 = vx_load(&buf[r0][l + nlanes/4]) * vx_setall_f32(ky0); - sum1 = v_fma(vx_load(&buf[r1][l + nlanes/4]), vx_setall_f32(ky1), sum1); - sum1 = v_fma(vx_load(&buf[r2][l + nlanes/4]), vx_setall_f32(ky2), sum1); - - v_float32 sum2 = vx_load(&buf[r0][l + 2*nlanes/4]) * vx_setall_f32(ky0); - sum2 = v_fma(vx_load(&buf[r1][l + 2*nlanes/4]), vx_setall_f32(ky1), sum2); - sum2 = v_fma(vx_load(&buf[r2][l + 2*nlanes/4]), vx_setall_f32(ky2), sum2); - - v_float32 sum3 = vx_load(&buf[r0][l + 3*nlanes/4]) * vx_setall_f32(ky0); - sum3 = v_fma(vx_load(&buf[r1][l + 3*nlanes/4]), vx_setall_f32(ky1), sum3); - sum3 = v_fma(vx_load(&buf[r2][l + 3*nlanes/4]), vx_setall_f32(ky2), sum3); - - if (!noscale) - { - sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta)); - sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta)); - sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta)); - sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta)); - } - - v_int32 isum0 = v_round(sum0), - isum1 = v_round(sum1), - isum2 = v_round(sum2), - isum3 = v_round(sum3); - - v_int16 ires0 = v_pack(isum0, isum1), - ires1 = v_pack(isum2, isum3); - - v_uint8 res = v_pack_u(ires0, ires1); - v_store(reinterpret_cast(&out[l]), res); + v_uint16 t0 = vx_load_expand(&in[k][l - shift]); // previous + v_uint16 t1 = vx_load_expand(&in[k][l ]); // current + v_uint16 t2 = vx_load_expand(&in[k][l + shift]); // next pixel + v_int16 t = v_reinterpret_as_s16(t0) * vx_setall_s16(ikx0) + + v_reinterpret_as_s16(t1) * vx_setall_s16(ikx1) + + v_reinterpret_as_s16(t2) * vx_setall_s16(ikx2); + v_store(&ibuf[r[k]][l], 
t); } + // tail (if any) if (l < length) { - // tail: recalculate last pixels GAPI_DbgAssert(length >= nlanes); l = length - nlanes; } } - - return; } -#endif - // reference code - for (int l=0; l < length; l++) + // vertical pass + + for (int l=0; l < length;) { - float sum = buf[r0][l]*ky0 + buf[r1][l]*ky1 + buf[r2][l]*ky2; + constexpr int nlanes = v_int16::nlanes; - if (!noscale) + // main part of output row + for (; l <= length - nlanes; l += nlanes) { - sum = sum*scale + delta; + v_int16 s0 = vx_load(&ibuf[r[0]][l]); // previous + v_int16 s1 = vx_load(&ibuf[r[1]][l]); // current + v_int16 s2 = vx_load(&ibuf[r[2]][l]); // next row + v_int16 s = s0 * vx_setall_s16(iky0) + + s1 * vx_setall_s16(iky1) + + s2 * vx_setall_s16(iky2); + + if (!noscale) + { + s = v_mul_hi(s << 1, vx_setall_s16(iscale)) + vx_setall_s16(idelta); + } + + v_store(&out[l], s); } - out[l] = cv::gapi::own::saturate(sum, rintf); + // tail (if any) + if (l < length) + { + GAPI_DbgAssert(length >= nlanes); + l = length - nlanes; + } } } +#endif + +#endif // CV_SIMD -template -static void run_sobel_impl(DST out[], const SRC *in[], int width, int chan, - const float kx[], const float ky[], int border, - float scale, float delta, float *buf[], - int y, int y0) +template +static void run_sepfilter3x3_reference(DST out[], const SRC *in[], int width, int chan, + const float kx[], const float ky[], int border, + float scale, float delta, + float *buf[], int y, int y0) { int r[3]; r[0] = (y - y0) % 3; // buf[r[0]]: previous @@ -497,19 +705,21 @@ static void run_sobel_impl(DST out[], const SRC *in[], int width, int chan, r[2] = (y - y0 + 2) % 3; // next row int length = width * chan; + int shift = border * chan; // horizontal pass // full horizontal pass is needed only if very 1st row in ROI; // for 2nd and further rows, it is enough to convolve only the // "next" row - as we can reuse buffers from previous calls to - // this kernel (note that Fluid processes rows consequently) + // this kernel (Fluid 
does rows consequently: y=y0, y0+1, ...) + int k0 = (y == y0)? 0: 2; for (int k = k0; k < 3; k++) { - // previous, this , next pixel - const SRC *s[3] = {in[k] - border*chan , in[k], in[k] + border*chan}; + // previous , this , next pixel + const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift}; // rely on compiler vectoring for (int l=0; l < length; l++) @@ -519,37 +729,121 @@ static void run_sobel_impl(DST out[], const SRC *in[], int width, int chan, } // vertical pass - if (scale == 1 && delta == 0) + + for (int l=0; l < length; l++) + { + float sum = buf[r[0]][l]*ky[0] + buf[r[1]][l]*ky[1] + buf[r[2]][l]*ky[2]; + + if (!noscale) + { + sum = sum*scale + delta; + } + + out[l] = saturate(sum, rintf); + } +} + +template +static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int chan, + const float kx[], const float ky[], int border, + float scale, float delta, + float *buf[], int y, int y0) +{ +#if CV_SIMD + int length = width * chan; + + // length variable may be unused if types do not match at 'if' statements below + (void) length; + +#if USE_SEPFILTER3X3_CHAR2SHORT + if (std::is_same::value && std::is_same::value && + length >= v_int16::nlanes) + { + // only slightly faster than more generic any-to-short (see below) + run_sepfilter3x3_char2short(reinterpret_cast(out), + reinterpret_cast(in), + width, chan, kx, ky, border, scale, delta, + buf, y, y0); + return; + } +#endif + + if (std::is_same::value && std::is_same::value && + length >= v_float32::nlanes) + { + // appears 15% faster than reference any-to-float code (called below) + run_sepfilter3x3_any2float(reinterpret_cast(out), in, + width, chan, kx, ky, border, scale, delta); + return; + } + + if (std::is_same::value && length >= v_int16::nlanes) + { + // appears 10-40x faster than reference due to much faster rounding + run_sepfilter3x3_any2short(reinterpret_cast(out), in, + width, chan, kx, ky, border, scale, delta, + buf, y, y0); + return; + } + + if (std::is_same::value && length >= 
v_uint16::nlanes) { - constexpr static bool noscale = true; // omit scaling - run_sobel3x3_vert(out, length, ky, scale, delta, r, buf); - } else + // appears 10-40x faster than reference due to much faster rounding + run_sepfilter3x3_any2short(reinterpret_cast(out), in, + width, chan, kx, ky, border, scale, delta, + buf, y, y0); + return; + } + + if (std::is_same::value && length >= v_uint8::nlanes) { - constexpr static bool noscale = false; // do scaling - run_sobel3x3_vert(out, length, ky, scale, delta, r, buf); + // appears 10-40x faster than reference due to much faster rounding + run_sepfilter3x3_any2char(reinterpret_cast(out), in, + width, chan, kx, ky, border, scale, delta, + buf, y, y0); + return; } +#endif // CV_SIMD + + // reference code is quite fast for any-to-float case, + // but not for any-to-integral due to very slow rounding + run_sepfilter3x3_reference(out, in, width, chan, kx, ky, border, + scale, delta, buf, y, y0); } -#define RUN_SOBEL_ROW(DST, SRC) \ -void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \ - const float kx[], const float ky[], int border, \ - float scale, float delta, float *buf[], \ - int y, int y0) \ -{ \ - run_sobel_impl(out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0); \ +#define RUN_SEPFILTER3X3_IMPL(DST, SRC) \ +void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \ + const float kx[], const float ky[], int border, \ + float scale, float delta, \ + float *buf[], int y, int y0) \ +{ \ + if (scale == 1 && delta == 0) \ + { \ + constexpr bool noscale = true; \ + run_sepfilter3x3_code(out, in, width, chan, kx, ky, border, \ + scale, delta, buf, y, y0); \ + } \ + else \ + { \ + constexpr bool noscale = false; \ + run_sepfilter3x3_code(out, in, width, chan, kx, ky, border, \ + scale, delta, buf, y, y0); \ + } \ } -RUN_SOBEL_ROW(uchar , uchar ) -RUN_SOBEL_ROW(ushort, ushort) -RUN_SOBEL_ROW( short, uchar ) -RUN_SOBEL_ROW( short, ushort) -RUN_SOBEL_ROW( short, short) 
-RUN_SOBEL_ROW( float, uchar ) -RUN_SOBEL_ROW( float, ushort) -RUN_SOBEL_ROW( float, short) -RUN_SOBEL_ROW( float, float) - -#undef RUN_SOBEL_ROW +RUN_SEPFILTER3X3_IMPL(uchar , uchar ) +RUN_SEPFILTER3X3_IMPL( short, uchar ) +RUN_SEPFILTER3X3_IMPL( float, uchar ) +RUN_SEPFILTER3X3_IMPL(ushort, ushort) +RUN_SEPFILTER3X3_IMPL( short, ushort) +RUN_SEPFILTER3X3_IMPL( float, ushort) +RUN_SEPFILTER3X3_IMPL( short, short) +RUN_SEPFILTER3X3_IMPL( float, short) +RUN_SEPFILTER3X3_IMPL( float, float) + +#undef RUN_SEPFILTER3X3_IMPL + +//------------------------------------------------------------------------------ #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY