From f07856eab936dcae9a490acc4ba8ced0db91ca9e Mon Sep 17 00:00:00 2001
From: Evgeny Latkin <evgeny.latkin@intel.com>
Date: Mon, 26 Nov 2018 15:05:35 +0300
Subject: [PATCH] Merge pull request #13221 from elatkin:el/gapi_perf_sepfilter

GAPI (fluid): optimization of Separable filter (#13221)

* GAPI (fluid): Separable filter: performance test

* GAPI (fluid): enable all performance tests

* GAPI: separable filters: alternative code for Sobel

* GAPI (fluid): hide unused old code for Sobel filter

* GAPI (fluid): special code for Sobel when converting U8 into S16

* GAPI (fluid): back to old code for Sobel

* GAPI (fluid): run_sepfilter3x3_impl() with CPU dispatcher

* GAPI (fluid): run_sepfilter3x3_impl(): fix compiler warnings

* GAPI (fluid): new engine for separable filters (except Sobel)

* GAPI (fluid): new performance engine for Sobel

* GAPI (fluid): Sepfilters performance: fixed compilation error
---
 .../common/gapi_imgproc_perf_tests_inl.hpp    |  26 +-
 .../cpu/gapi_imgproc_perf_tests_fluid.cpp     | 180 +++--
 .../gapi/src/backends/fluid/gfluidimgproc.cpp | 274 ++++++--
 .../fluid/gfluidimgproc_func.dispatch.cpp     |  48 +-
 .../src/backends/fluid/gfluidimgproc_func.hpp |  36 +-
 .../fluid/gfluidimgproc_func.simd.hpp         | 646 +++++++++++++-----
 6 files changed, 875 insertions(+), 335 deletions(-)

diff --git a/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp b/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp
index 5a13cfeebe..89ebf0405d 100644
--- a/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp
+++ b/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp
@@ -52,7 +52,7 @@ PERF_TEST_P_(SepFilterPerfTest, TestPerformance)
 
     TEST_CYCLE()
     {
-      c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+      c.apply(in_mat1, out_mat_gapi);
     }
 
     // Comparison //////////////////////////////////////////////////////////////
@@ -100,7 +100,7 @@ PERF_TEST_P_(Filter2DPerfTest, TestPerformance)
 
     TEST_CYCLE()
     {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
     }
 
     // Comparison //////////////////////////////////////////////////////////////
@@ -145,7 +145,7 @@ PERF_TEST_P_(BoxFilterPerfTest, TestPerformance)
 
     TEST_CYCLE()
     {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
     }
 
     // Comparison //////////////////////////////////////////////////////////////
@@ -188,7 +188,7 @@ PERF_TEST_P_(BlurPerfTest, TestPerformance)
 
     TEST_CYCLE()
     {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
     }
 
     // Comparison //////////////////////////////////////////////////////////////
@@ -230,7 +230,7 @@ PERF_TEST_P_(GaussianBlurPerfTest, TestPerformance)
 
     TEST_CYCLE()
     {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
     }
 
     // Comparison //////////////////////////////////////////////////////////////
@@ -271,7 +271,7 @@ PERF_TEST_P_(MedianBlurPerfTest, TestPerformance)
 
     TEST_CYCLE()
     {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
     }
 
     // Comparison //////////////////////////////////////////////////////////////
@@ -314,7 +314,7 @@ PERF_TEST_P_(ErodePerfTest, TestPerformance)
 
     TEST_CYCLE()
     {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
     }
 
     // Comparison //////////////////////////////////////////////////////////////
@@ -357,7 +357,7 @@ PERF_TEST_P_(Erode3x3PerfTest, TestPerformance)
 
     TEST_CYCLE()
     {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
     }
 
     // Comparison //////////////////////////////////////////////////////////////
@@ -400,7 +400,7 @@ PERF_TEST_P_(DilatePerfTest, TestPerformance)
 
     TEST_CYCLE()
     {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
     }
 
     // Comparison //////////////////////////////////////////////////////////////
@@ -443,7 +443,7 @@ PERF_TEST_P_(Dilate3x3PerfTest, TestPerformance)
 
     TEST_CYCLE()
     {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
     }
 
     // Comparison //////////////////////////////////////////////////////////////
@@ -526,7 +526,7 @@ PERF_TEST_P_(CannyPerfTest, TestPerformance)
 
     TEST_CYCLE()
     {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
     }
 
     // Comparison //////////////////////////////////////////////////////////////
@@ -564,7 +564,7 @@ PERF_TEST_P_(EqHistPerfTest, TestPerformance)
 
     TEST_CYCLE()
     {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
     }
 
     // Comparison //////////////////////////////////////////////////////////////
@@ -830,7 +830,7 @@ PERF_TEST_P_(LUV2BGRPerfTest, TestPerformance)
 
     TEST_CYCLE()
     {
-        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+        c.apply(in_mat1, out_mat_gapi);
     }
 
     // Comparison //////////////////////////////////////////////////////////////
diff --git a/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp b/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp
index a5d13e661d..12554bf4b1 100644
--- a/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp
+++ b/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp
@@ -13,9 +13,101 @@
 namespace opencv_test
 {
 
-    INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid, SobelPerfTest,
-        Combine(Values(AbsExact().to_compare_f()),
-            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),  // add CV_32FC1 when ready
+INSTANTIATE_TEST_CASE_P(SepFilterPerfTestFluid_8U, SepFilterPerfTest,
+    Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+            Values(CV_8UC1, CV_8UC3),
+            Values(3),
+            Values(szVGA, sz720p, sz1080p),
+            Values(-1, CV_16S, CV_32F),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(SepFilterPerfTestFluid_other, SepFilterPerfTest,
+    Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+            Values(CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(3),
+            Values(szVGA, sz720p, sz1080p),
+            Values(-1, CV_32F),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(Filter2DPerfTestFluid, Filter2DPerfTest,
+    Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(3),                                     // add 4, 5, 7 when kernel is ready
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::BORDER_DEFAULT),
+            Values(-1, CV_32F),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(BoxFilterPerfTestFluid, BoxFilterPerfTest,
+    Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(3),                                     // add size=5, when kernel is ready
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::BORDER_DEFAULT),
+            Values(-1, CV_32F),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(BlurPerfTestFluid, BlurPerfTest,
+    Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(3),                                     // add size=5, when kernel is ready
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::BORDER_DEFAULT),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(GaussianBlurPerfTestFluid, GaussianBlurPerfTest,
+    Combine(Values(ToleranceFilter(1e-3f, 0.01).to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(3),                                     // add size=5, when kernel is ready
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(MedianBlurPerfTestFluid, MedianBlurPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(3),                                     // add size=5, when kernel is ready
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(ErodePerfTestFluid, ErodePerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(3),                                     // add size=5, when kernel is ready
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::MorphShapes::MORPH_RECT,
+                   cv::MorphShapes::MORPH_CROSS,
+                   cv::MorphShapes::MORPH_ELLIPSE),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+// GAPI/fluid does not support the iterations parameter for the Erode kernel
+INSTANTIATE_TEST_CASE_P(DISABLED_Erode3x3PerfTestFluid, Erode3x3PerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(szVGA, sz720p, sz1080p),
+            Values(1, 2, 4),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(DilatePerfTestFluid, DilatePerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(3),                                     // add size=5, when kernel is ready
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::MorphShapes::MORPH_RECT,
+                   cv::MorphShapes::MORPH_CROSS,
+                   cv::MorphShapes::MORPH_ELLIPSE),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+// GAPI/fluid does not support the iterations parameter for the Dilate kernel
+INSTANTIATE_TEST_CASE_P(DISABLED_Dilate3x3PerfTestFluid, Dilate3x3PerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+            Values(szVGA, sz720p, sz1080p),
+            Values(1, 2, 4),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid, SobelPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
             Values(3),                                     // add 5x5 once supported
             Values(szVGA, sz720p, sz1080p),
             Values(-1, CV_16S, CV_32F),
@@ -23,8 +115,8 @@ namespace opencv_test
             Values(1, 2),
             Values(cv::compile_args(IMGPROC_FLUID))));
 
-    INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid32F, SobelPerfTest,
-        Combine(Values(ToleranceFilter(1e-3f, 0.0).to_compare_f()),
+INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid32F, SobelPerfTest,
+    Combine(Values(ToleranceFilter(1e-3f, 0.0).to_compare_f()),
             Values(CV_32FC1),
             Values(3),                                     // add 5x5 once supported
             Values(szVGA, sz720p, sz1080p),
@@ -33,44 +125,44 @@ namespace opencv_test
             Values(1, 2),
             Values(cv::compile_args(IMGPROC_FLUID))));
 
-    INSTANTIATE_TEST_CASE_P(RGB2GrayPerfTestFluid, RGB2GrayPerfTest,
-        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
-                Values(szVGA, sz720p, sz1080p),
-                Values(cv::compile_args(IMGPROC_FLUID))));
-
-    INSTANTIATE_TEST_CASE_P(BGR2GrayPerfTestFluid, BGR2GrayPerfTest,
-        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
-                Values(szVGA, sz720p, sz1080p),
-                Values(cv::compile_args(IMGPROC_FLUID))));
-
-    INSTANTIATE_TEST_CASE_P(RGB2YUVPerfTestFluid, RGB2YUVPerfTest,
-        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
-                Values(szVGA, sz720p, sz1080p),
-                Values(cv::compile_args(IMGPROC_FLUID))));
-
-    INSTANTIATE_TEST_CASE_P(YUV2RGBPerfTestFluid, YUV2RGBPerfTest,
-        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
-                Values(szVGA, sz720p, sz1080p),
-                Values(cv::compile_args(IMGPROC_FLUID))));
-
-    INSTANTIATE_TEST_CASE_P(BGR2YUVPerfTestFluid, BGR2YUVPerfTest,
-        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
-                Values(szVGA, sz720p, sz1080p),
-                Values(cv::compile_args(IMGPROC_FLUID))));
-
-    INSTANTIATE_TEST_CASE_P(YUV2BGRPerfTestFluid, YUV2BGRPerfTest,
-        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
-                Values(szVGA, sz720p, sz1080p),
-                Values(cv::compile_args(IMGPROC_FLUID))));
-
-    INSTANTIATE_TEST_CASE_P(BGR2LUVPerfTestFluid, BGR2LUVPerfTest,
-        Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
-                Values(szVGA, sz720p, sz1080p),
-                Values(cv::compile_args(IMGPROC_FLUID))));
-
-    INSTANTIATE_TEST_CASE_P(RGB2LabPerfTestFluid, RGB2LabPerfTest,
-        Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
-                Values(szVGA, sz720p, sz1080p),
-                Values(cv::compile_args(IMGPROC_FLUID))));
+INSTANTIATE_TEST_CASE_P(RGB2GrayPerfTestFluid, RGB2GrayPerfTest,
+    Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(BGR2GrayPerfTestFluid, BGR2GrayPerfTest,
+    Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(RGB2YUVPerfTestFluid, RGB2YUVPerfTest,
+    Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(YUV2RGBPerfTestFluid, YUV2RGBPerfTest,
+    Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(BGR2YUVPerfTestFluid, BGR2YUVPerfTest,
+    Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(YUV2BGRPerfTestFluid, YUV2BGRPerfTest,
+    Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(BGR2LUVPerfTestFluid, BGR2LUVPerfTest,
+    Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(RGB2LabPerfTestFluid, RGB2LabPerfTest,
+    Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
+            Values(szVGA, sz720p, sz1080p),
+            Values(cv::compile_args(IMGPROC_FLUID))));
 
 }
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
index e2e4c4f754..6b27b13876 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
@@ -344,7 +344,7 @@ static const int maxKernelSize = 9;
 
 template<typename DST, typename SRC>
 static void run_boxfilter(Buffer &dst, const View &src, const cv::Size &kernelSize,
-                          const cv::Point& /* anchor */, bool normalize)
+                          const cv::Point& /* anchor */, bool normalize, float *buf[])
 {
     GAPI_Assert(kernelSize.width <= maxKernelSize);
     GAPI_Assert(kernelSize.width == kernelSize.height);
@@ -365,36 +365,53 @@ static void run_boxfilter(Buffer &dst, const View &src, const cv::Size &kernelSi
     int width = dst.length();
     int chan  = dst.meta().chan;
 
-    GAPI_DbgAssert(chan <= 4);
+    if (kernelSize.width == 3 && kernelSize.height == 3)
+    {
+        int y  = dst.y();
+        int y0 = dst.priv().writeStart();
 
-    for (int w=0; w < width; w++)
+        float  kx[3] = {1, 1, 1};
+        float *ky = kx;
+
+        float scale=1, delta=0;
+        if (normalize)
+            scale = 1/9.f;
+
+        run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
+    } else
     {
-        float sum[4] = {0, 0, 0, 0};
+        GAPI_DbgAssert(chan <= 4);
 
-        for (int i=0; i < kernel; i++)
+        for (int w=0; w < width; w++)
         {
-            for (int j=0; j < kernel; j++)
+            float sum[4] = {0, 0, 0, 0};
+
+            for (int i=0; i < kernel; i++)
             {
-                for (int c=0; c < chan; c++)
-                    sum[c] += in[i][(w + j - border)*chan + c];
+                for (int j=0; j < kernel; j++)
+                {
+                    for (int c=0; c < chan; c++)
+                        sum[c] += in[i][(w + j - border)*chan + c];
+                }
             }
-        }
 
-        for (int c=0; c < chan; c++)
-        {
-            float result = normalize? sum[c]/(kernel * kernel) : sum[c];
+            for (int c=0; c < chan; c++)
+            {
+                float result = normalize? sum[c]/(kernel * kernel) : sum[c];
 
-            out[w*chan + c] = saturate<DST>(result, rintf);
+                out[w*chan + c] = saturate<DST>(result, rintf);
+            }
         }
     }
 }
 
-GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, false)
+GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, true)
 {
     static const int Window = 3;
 
     static void run(const View &src, const cv::Size& kernelSize, const cv::Point& anchor,
-                    int /* borderType */, const cv::Scalar& /* borderValue */, Buffer &dst)
+                    int /* borderType */, const cv::Scalar& /* borderValue */, Buffer &dst,
+                    Buffer& scratch)
     {
         // TODO: support sizes 3, 5, 7, 9, ...
         GAPI_Assert(kernelSize.width  == 3 && kernelSize.height == 3);
@@ -404,14 +421,46 @@ GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, false)
 
         static const bool normalize = true;
 
+        int width = src.length();
+        int chan  = src.meta().chan;
+        int length = width * chan;
+
+        float *buf[3];
+        buf[0] = scratch.OutLine<float>();
+        buf[1] = buf[0] + length;
+        buf[2] = buf[1] + length;
+
         //     DST     SRC     OP             __VA_ARGS__
-        UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize);
-        UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize);
-        UNARY_( short,  short, run_boxfilter, dst, src, kernelSize, anchor, normalize);
+        UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+        UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+        UNARY_( short,  short, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+        UNARY_( float,  float, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
 
         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }
 
+    static void initScratch(const GMatDesc   & in,
+                            const cv::Size   & /* ksize */,
+                            const cv::Point  & /* anchor */,
+                                  int          /* borderType */,
+                            const cv::Scalar & /* borderValue */,
+                                  Buffer     & scratch)
+    {
+        int width = in.size.width;
+        int chan  = in.chan;
+
+        int buflen = width * chan * Window;  // work buffers
+
+        cv::gapi::own::Size bufsize(buflen, 1);
+        GMatDesc bufdesc = {CV_32F, 1, bufsize};
+        Buffer buffer(bufdesc);
+        scratch = std::move(buffer);
+    }
+
+    static void resetScratch(Buffer& /* scratch */)
+    {
+    }
+
     static Border getBorder(const cv::GMatDesc& /* src */,
                             const cv::Size    & /* kernelSize */,
                             const cv::Point   & /* anchor */,
@@ -422,18 +471,19 @@ GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, false)
     }
 };
 
-GAPI_FLUID_KERNEL(GFluidBoxFilter, cv::gapi::imgproc::GBoxFilter, false)
+GAPI_FLUID_KERNEL(GFluidBoxFilter, cv::gapi::imgproc::GBoxFilter, true)
 {
     static const int Window = 3;
 
     static void run(const     View  &    src,
                               int     /* ddepth */,
                     const cv::Size  &    kernelSize,
-                    const cv::Point &   anchor,
+                    const cv::Point &    anchor,
                               bool       normalize,
                               int     /* borderType */,
                     const cv::Scalar& /* borderValue */,
-                              Buffer&    dst)
+                              Buffer&    dst,
+                              Buffer&    scratch)
     {
         // TODO: support sizes 3, 5, 7, 9, ...
         GAPI_Assert(kernelSize.width  == 3 && kernelSize.height == 3);
@@ -441,17 +491,51 @@ GAPI_FLUID_KERNEL(GFluidBoxFilter, cv::gapi::imgproc::GBoxFilter, false)
         // TODO: suport non-trivial anchor
         GAPI_Assert(anchor.x == -1 && anchor.y == -1);
 
+        int width = src.length();
+        int chan  = src.meta().chan;
+        int length = width * chan;
+
+        float *buf[3];
+        buf[0] = scratch.OutLine<float>();
+        buf[1] = buf[0] + length;
+        buf[2] = buf[1] + length;
+
         //     DST     SRC     OP             __VA_ARGS__
-        UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize);
-        UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize);
-        UNARY_( short,  short, run_boxfilter, dst, src, kernelSize, anchor, normalize);
-        UNARY_( float, uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize);
-        UNARY_( float, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize);
-        UNARY_( float,  short, run_boxfilter, dst, src, kernelSize, anchor, normalize);
+        UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+        UNARY_( float, uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+        UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+        UNARY_( float, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+        UNARY_( short,  short, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+        UNARY_( float,  short, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
+        UNARY_( float,  float, run_boxfilter, dst, src, kernelSize, anchor, normalize, buf);
 
         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }
 
+    static void initScratch(const GMatDesc  & in,
+                                      int     /* ddepth */,
+                            const cv::Size  & /* kernelSize */,
+                            const cv::Point & /* anchor */,
+                                      bool    /*  normalize */,
+                                      int     /* borderType */,
+                            const cv::Scalar& /* borderValue */,
+                                  Buffer    &  scratch)
+    {
+        int width = in.size.width;
+        int chan  = in.chan;
+
+        int buflen = width * chan * Window;  // work buffers
+
+        cv::gapi::own::Size bufsize(buflen, 1);
+        GMatDesc bufdesc = {CV_32F, 1, bufsize};
+        Buffer buffer(bufdesc);
+        scratch = std::move(buffer);
+    }
+
+    static void resetScratch(Buffer& /* scratch */)
+    {
+    }
+
     static Border getBorder(const cv::GMatDesc& /* src */,
                                       int       /* ddepth */,
                             const cv::Size    & /* kernelSize */,
@@ -510,18 +594,21 @@ static void run_sepfilter(Buffer& dst, const View& src,
                           const float kx[], int kxLen,
                           const float ky[], int kyLen,
                           const cv::Point& /* anchor */,
-                          float delta=0)
+                          float scale, float delta,
+                          float *buf[])
 {
-    static const int maxLines = 9;
-    GAPI_Assert(kyLen <= maxLines);
+    constexpr int kMax = 11;
+    GAPI_Assert(kxLen <= kMax && kyLen <= kMax);
 
-    const SRC *in[ maxLines ];
+    const SRC *in[kMax];
           DST *out;
 
-    int border = (kyLen - 1) / 2;
+    int xborder = (kxLen - 1) / 2;
+    int yborder = (kyLen - 1) / 2;
+
     for (int i=0; i < kyLen; i++)
     {
-        in[i] = src.InLine<SRC>(i - border);
+        in[i] = src.InLine<SRC>(i - yborder);
     }
 
     out = dst.OutLine<DST>();
@@ -529,28 +616,52 @@ static void run_sepfilter(Buffer& dst, const View& src,
     int width = dst.length();
     int chan  = dst.meta().chan;
 
-    for (int w=0; w < width; w++)
+    // optimized 3x3 vs reference
+    if (kxLen == 3 && kyLen == 3)
     {
-        // TODO: make this cycle innermost
-        for (int c=0; c < chan; c++)
+        int y  = dst.y();
+        int y0 = dst.priv().writeStart();
+
+        int border = xborder;
+        run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
+    }
+    else
+    {
+        int length = chan * width;
+        int xshift = chan * xborder;
+
+        // horizontal pass
+
+        for (int k=0; k < kyLen; k++)
         {
-            float sum=0;
+            const SRC *inp[kMax] = {nullptr};
 
-            for (int i=0; i < kyLen; i++)
+            for (int j=0; j < kxLen; j++)
             {
-                float sumi=0;
+                inp[j] = in[k] + (j - xborder)*xshift;
+            }
 
+            for (int l=0; l < length; l++)
+            {
+                float sum = 0;
                 for (int j=0; j < kxLen; j++)
                 {
-                    sumi += in[i][(w + j - border)*chan + c] * kx[j];
+                    sum += inp[j][l] * kx[j];
                 }
-
-                sum += sumi * ky[i];
+                buf[k][l] = sum;
             }
+        }
 
-            float result = sum + delta;
+        // vertical pass
 
-            out[w*chan + c] = saturate<DST>(result, rintf);
+        for (int l=0; l < length; l++)
+        {
+            float sum = 0;
+            for (int k=0; k < kyLen; k++)
+            {
+                sum += buf[k][l] * ky[k];
+            }
+            out[l] = saturate<DST>(sum*scale + delta, rintf);
         }
     }
 }
@@ -580,21 +691,37 @@ GAPI_FLUID_KERNEL(GFluidSepFilter, cv::gapi::imgproc::GSepFilter, true)
         int kxLen = kernX.rows * kernX.cols;
         int kyLen = kernY.rows * kernY.cols;
 
+        GAPI_Assert(kyLen == 3);
+
         float *kx = scratch.OutLine<float>();
         float *ky = kx + kxLen;
 
+        int width = src.meta().size.width;
+        int chan  = src.meta().chan;
+        int length = width * chan;
+
+        float *buf[3];
+        buf[0] = ky + kyLen;
+        buf[1] = buf[0] + length;
+        buf[2] = buf[1] + length;
+
+        float scale = 1;
         float delta = static_cast<float>(delta_[0]);
 
         //     DST     SRC     OP             __VA_ARGS__
-        UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta);
-        UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta);
-        UNARY_( short,  short, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta);
-        UNARY_( float,  float, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta);
+        UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
+        UNARY_( short, uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
+        UNARY_( float, uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
+        UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
+        UNARY_( float, ushort, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
+        UNARY_( short,  short, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
+        UNARY_( float,  short, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
+        UNARY_( float,  float, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, scale, delta, buf);
 
         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }
 
-    static void initScratch(const GMatDesc& /* in */,
+    static void initScratch(const GMatDesc&    in,
                                   int       /* ddepth */,
                             const Mat     &    kernX,
                             const Mat     &    kernY,
@@ -607,7 +734,13 @@ GAPI_FLUID_KERNEL(GFluidSepFilter, cv::gapi::imgproc::GSepFilter, true)
         int kxLen = kernX.rows * kernX.cols;
         int kyLen = kernY.rows * kernY.cols;
 
-        cv::gapi::own::Size bufsize(kxLen + kyLen, 1);
+        int width = in.size.width;
+        int chan  = in.chan;
+
+        int buflen = kxLen + kyLen +         // x, y kernels
+                     width * chan * Window;  // work buffers
+
+        cv::gapi::own::Size bufsize(buflen, 1);
         GMatDesc bufdesc = {CV_32F, 1, bufsize};
         Buffer buffer(bufdesc);
         scratch = std::move(buffer);
@@ -664,29 +797,47 @@ GAPI_FLUID_KERNEL(GFluidGaussBlur, cv::gapi::imgproc::GGaussBlur, true)
         auto *kx = scratch.OutLine<float>(); // cached kernX data
         auto *ky = kx + kxsize;              // cached kernY data
 
+        int width = src.meta().size.width;
+        int chan  = src.meta().chan;
+        int length = width * chan;
+
+        float *buf[3];
+        buf[0] = ky + kysize;
+        buf[1] = buf[0] + length;
+        buf[2] = buf[1] + length;
+
         auto  anchor = cv::Point(-1, -1);
-        float delta = 0.f;
+
+        float scale = 1;
+        float delta = 0;
 
         //     DST     SRC     OP             __VA_ARGS__
-        UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta);
-        UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta);
-        UNARY_( short,  short, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta);
+        UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf);
+        UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf);
+        UNARY_( short,  short, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf);
+        UNARY_( float,  float, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, scale, delta, buf);
 
         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }
 
-    static void initScratch(const GMatDesc& /* in */,
+    static void initScratch(const GMatDesc&    in,
                             const cv::Size &   ksize,
                                   double       sigmaX,
                                   double       sigmaY,
-                                  int       /* borderType */,
-                            const cv::Scalar  & /* borderValue */,
+                                  int          /* borderType */,
+                            const cv::Scalar & /* borderValue */,
                                   Buffer  &    scratch)
     {
         int kxsize = ksize.width;
         int kysize = ksize.height;
 
-        cv::gapi::own::Size bufsize(kxsize + kysize, 1);
+        int width = in.size.width;
+        int chan  = in.chan;
+
+        int buflen = kxsize + kysize +       // x, y kernels
+                     width * chan * Window;  // work buffers
+
+        cv::gapi::own::Size bufsize(buflen, 1);
         GMatDesc bufdesc = {CV_32F, 1, bufsize};
         Buffer buffer(bufdesc);
         scratch = std::move(buffer);
@@ -767,7 +918,7 @@ static void run_sobel(Buffer& dst,
     int y0 = dst.priv().writeStart();
 //  int y1 = dst.priv().writeEnd();
 
-    run_sobel_row(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
+    run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
 }
 
 GAPI_FLUID_KERNEL(GFluidSobel, cv::gapi::imgproc::GSobel, true)
@@ -1102,6 +1253,7 @@ GAPI_FLUID_KERNEL(GFluidErode, cv::gapi::imgproc::GErode, true)
         UNARY_(uchar , uchar , run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
         UNARY_(ushort, ushort, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
         UNARY_( short,  short, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
+        UNARY_( float,  float, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
 
         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }
@@ -1109,7 +1261,7 @@ GAPI_FLUID_KERNEL(GFluidErode, cv::gapi::imgproc::GErode, true)
     static void initScratch(const GMatDesc& /* in */,
                             const Mat     &    kernel,
                             const Point   & /* anchor */,
-                              int           /* iterations */,
+                                  int       /* iterations */,
                                   int       /* borderType */,
                             const cv::Scalar  & /* borderValue */,
                                   Buffer  &    scratch)
@@ -1179,6 +1331,7 @@ GAPI_FLUID_KERNEL(GFluidDilate, cv::gapi::imgproc::GDilate, true)
         UNARY_(uchar , uchar , run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
         UNARY_(ushort, ushort, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
         UNARY_( short,  short, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
+        UNARY_( float,  float, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
 
         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }
@@ -1290,6 +1443,7 @@ GAPI_FLUID_KERNEL(GFluidMedianBlur, cv::gapi::imgproc::GMedianBlur, false)
         UNARY_(uchar , uchar , run_medianblur, dst, src, ksize);
         UNARY_(ushort, ushort, run_medianblur, dst, src, ksize);
         UNARY_( short,  short, run_medianblur, dst, src, ksize);
+        UNARY_( float,  float, run_medianblur, dst, src, ksize);
 
         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
index 9b217903ef..b536bbfbdc 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
@@ -57,34 +57,34 @@ void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef
     CV_CPU_DISPATCH(run_yuv2rgb_impl, (out, in, width, coef), CV_CPU_DISPATCH_MODES_ALL);
 }
 
-//---------------------
+//-------------------------
 //
-// Fluid kernels: Sobel
+// Fluid kernels: sepFilter
 //
-//---------------------
-
-#define RUN_SOBEL_ROW(DST, SRC)                                          \
-void run_sobel_row(DST out[], const SRC *in[], int width, int chan,      \
-                   const float kx[], const float ky[], int border,       \
-                   float scale, float delta, float *buf[],               \
-                   int y, int y0)                                        \
-{                                                                        \
-    CV_CPU_DISPATCH(run_sobel_row,                                       \
-        (out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0), \
-        CV_CPU_DISPATCH_MODES_ALL);                                      \
+//-------------------------
+
+#define RUN_SEPFILTER3X3_IMPL(DST, SRC)                                     \
+void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \
+                           const float kx[], const float ky[], int border,  \
+                           float scale, float delta,                        \
+                           float *buf[], int y, int y0)                     \
+{                                                                           \
+    CV_CPU_DISPATCH(run_sepfilter3x3_impl,                                  \
+        (out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0),    \
+        CV_CPU_DISPATCH_MODES_ALL);                                         \
 }
 
-RUN_SOBEL_ROW(uchar , uchar )
-RUN_SOBEL_ROW(ushort, ushort)
-RUN_SOBEL_ROW( short, uchar )
-RUN_SOBEL_ROW( short, ushort)
-RUN_SOBEL_ROW( short,  short)
-RUN_SOBEL_ROW( float, uchar )
-RUN_SOBEL_ROW( float, ushort)
-RUN_SOBEL_ROW( float,  short)
-RUN_SOBEL_ROW( float,  float)
-
-#undef RUN_SOBEL_ROW
+RUN_SEPFILTER3X3_IMPL(uchar , uchar )
+RUN_SEPFILTER3X3_IMPL( short, uchar )
+RUN_SEPFILTER3X3_IMPL( float, uchar )
+RUN_SEPFILTER3X3_IMPL(ushort, ushort)
+RUN_SEPFILTER3X3_IMPL( short, ushort)
+RUN_SEPFILTER3X3_IMPL( float, ushort)
+RUN_SEPFILTER3X3_IMPL( short,  short)
+RUN_SEPFILTER3X3_IMPL( float,  short)
+RUN_SEPFILTER3X3_IMPL( float,  float)
+
+#undef RUN_SEPFILTER3X3_IMPL
 
 } // namespace fliud
 } // namespace gapi
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
index 1b6f1b8c0d..3b41c52794 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
@@ -33,29 +33,29 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef
 
 void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4]);
 
-//---------------------
+//-------------------------
 //
-// Fluid kernels: Sobel
+// Fluid kernels: sepFilter
 //
-//---------------------
+//-------------------------
 
-#define RUN_SOBEL_ROW(DST, SRC)                                     \
-void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \
-                   const float kx[], const float ky[], int border,  \
-                   float scale, float delta, float *buf[],          \
-                   int y, int y0);
+#define RUN_SEPFILTER3X3_IMPL(DST, SRC)                                     \
+void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \
+                           const float kx[], const float ky[], int border,  \
+                           float scale, float delta,                        \
+                           float *buf[], int y, int y0);
 
-RUN_SOBEL_ROW(uchar , uchar )
-RUN_SOBEL_ROW(ushort, ushort)
-RUN_SOBEL_ROW( short, uchar )
-RUN_SOBEL_ROW( short, ushort)
-RUN_SOBEL_ROW( short,  short)
-RUN_SOBEL_ROW( float, uchar )
-RUN_SOBEL_ROW( float, ushort)
-RUN_SOBEL_ROW( float,  short)
-RUN_SOBEL_ROW( float,  float)
+RUN_SEPFILTER3X3_IMPL(uchar , uchar )
+RUN_SEPFILTER3X3_IMPL( short, uchar )
+RUN_SEPFILTER3X3_IMPL( float, uchar )
+RUN_SEPFILTER3X3_IMPL(ushort, ushort)
+RUN_SEPFILTER3X3_IMPL( short, ushort)
+RUN_SEPFILTER3X3_IMPL( float, ushort)
+RUN_SEPFILTER3X3_IMPL( short,  short)
+RUN_SEPFILTER3X3_IMPL( float,  short)
+RUN_SEPFILTER3X3_IMPL( float,  float)
 
-#undef RUN_SOBEL_ROW
+#undef RUN_SEPFILTER3X3_IMPL
 
 }  // namespace fluid
 }  // namespace gapi
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
index c87be085a3..cd52b66189 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
@@ -9,6 +9,8 @@
 
 #if !defined(GAPI_STANDALONE)
 
+#include "gfluidimgproc_func.hpp"
+
 #include "opencv2/gapi/own/saturate.hpp"
 
 #include "opencv2/core.hpp"
@@ -16,6 +18,8 @@
 
 #include <cstdint>
 
+#include <vector>
+
 #ifdef __GNUC__
 #  pragma GCC diagnostic push
 #  pragma GCC diagnostic ignored "-Wstrict-overflow"
@@ -48,34 +52,66 @@ void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef
 
 void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4]);
 
-//---------------------
+//-------------------------
 //
-// Fluid kernels: Sobel
+// Fluid kernels: sepFilter
 //
-//---------------------
-
-#define RUN_SOBEL_ROW(DST, SRC)                                     \
-void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \
-                  const float kx[], const float ky[], int border,   \
-                  float scale, float delta, float *buf[],           \
-                  int y, int y0);
-
-RUN_SOBEL_ROW(uchar , uchar )
-RUN_SOBEL_ROW(ushort, ushort)
-RUN_SOBEL_ROW( short, uchar )
-RUN_SOBEL_ROW( short, ushort)
-RUN_SOBEL_ROW( short,  short)
-RUN_SOBEL_ROW( float, uchar )
-RUN_SOBEL_ROW( float, ushort)
-RUN_SOBEL_ROW( float,  short)
-RUN_SOBEL_ROW( float,  float)
-
-#undef RUN_SOBEL_ROW
+//-------------------------
+
+#define RUN_SEPFILTER3X3_IMPL(DST, SRC)                                     \
+void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \
+                           const float kx[], const float ky[], int border,  \
+                           float scale, float delta,                        \
+                           float *buf[], int y, int y0);
+
+RUN_SEPFILTER3X3_IMPL(uchar , uchar )
+RUN_SEPFILTER3X3_IMPL( short, uchar )
+RUN_SEPFILTER3X3_IMPL( float, uchar )
+RUN_SEPFILTER3X3_IMPL(ushort, ushort)
+RUN_SEPFILTER3X3_IMPL( short, ushort)
+RUN_SEPFILTER3X3_IMPL( float, ushort)
+RUN_SEPFILTER3X3_IMPL( short,  short)
+RUN_SEPFILTER3X3_IMPL( float,  short)
+RUN_SEPFILTER3X3_IMPL( float,  float)
+
+#undef RUN_SEPFILTER3X3_IMPL
 
 //----------------------------------------------------------------------
 
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
+#if CV_SIMD
+template<typename SRC>
+static inline v_float32 vx_load_f32(const SRC* ptr)
+{
+    if (std::is_same<SRC,uchar>::value)
+    {
+        v_uint32 tmp = vx_load_expand_q(reinterpret_cast<const uchar*>(ptr));
+        return v_cvt_f32(v_reinterpret_as_s32(tmp));
+    }
+
+    if (std::is_same<SRC,ushort>::value)
+    {
+        v_uint32 tmp = vx_load_expand(reinterpret_cast<const ushort*>(ptr));
+        return v_cvt_f32(v_reinterpret_as_s32(tmp));
+    }
+
+    if (std::is_same<SRC,short>::value)
+    {
+        v_int32 tmp = vx_load_expand(reinterpret_cast<const short*>(ptr));
+        return v_cvt_f32(tmp);
+    }
+
+    if (std::is_same<SRC,float>::value)
+    {
+        v_float32 tmp = vx_load(reinterpret_cast<const float*>(ptr));
+        return tmp;
+    }
+
+    CV_Error(cv::Error::StsBadArg, "unsupported type");
+}
+#endif  // CV_SIMD
+
 //----------------------------------
 //
 // Fluid kernels: RGB2Gray, BGR2Gray
@@ -309,187 +345,359 @@ void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef
     }
 }
 
-//---------------------
+//-------------------------
 //
-// Fluid kernels: Sobel
+// Fluid kernels: sepFilter
 //
-//---------------------
+//-------------------------
 
-// Sobel 3x3: vertical pass
-template<bool noscale, typename DST>
-static void run_sobel3x3_vert(DST out[], int length, const float ky[],
-                float scale, float delta, const int r[], float *buf[])
+#if CV_SIMD
+// this variant, which does not use buf[], appears 15% faster than the reference any-to-float code below
+template<bool noscale, typename SRC>
+static void run_sepfilter3x3_any2float(float out[], const SRC *in[], int width, int chan,
+                                       const float kx[], const float ky[], int border,
+                                       float scale, float delta)
 {
-    float ky0 = ky[0],
-          ky1 = ky[1],
-          ky2 = ky[2];
+    const int length = width * chan;
+    const int shift = border * chan;
 
-    int r0 = r[0],
-        r1 = r[1],
-        r2 = r[2];
+    const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2];
+    const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2];
 
-#if CV_SIMD
-    // for floating-point output,
-    // manual vectoring may be not better than compiler's optimization
-#define EXPLICIT_SIMD_32F 0  // 1=vectorize 32f case explicitly, 0=don't
-#if     EXPLICIT_SIMD_32F
-    if (std::is_same<DST, float>::value && length >= v_int16::nlanes)
+    for (int l=0; l < length; )
     {
-        constexpr static int nlanes = v_float32::nlanes;
+        static const int nlanes = v_float32::nlanes;
 
-        for (int l=0; l < length; )
+        // main part
+        for ( ; l <= length - nlanes; l += nlanes)
         {
-            for (; l <= length - nlanes; l += nlanes)
+            auto xsum = [l, shift, kx0, kx1, kx2](const SRC i[])
             {
-                v_float32 sum = vx_load(&buf[r0][l]) * vx_setall_f32(ky0);
-                    sum = v_fma(vx_load(&buf[r1][l]),  vx_setall_f32(ky1), sum);
-                    sum = v_fma(vx_load(&buf[r2][l]),  vx_setall_f32(ky2), sum);
+                v_float32 t0 = vx_load_f32(&i[l - shift]);
+                v_float32 t1 = vx_load_f32(&i[l        ]);
+                v_float32 t2 = vx_load_f32(&i[l + shift]);
+                v_float32 t = t0 * vx_setall_f32(kx0);
+                    t = v_fma(t1,  vx_setall_f32(kx1), t);
+                    t = v_fma(t2,  vx_setall_f32(kx2), t);
+                return t;
+            };
+
+            v_float32 s0 = xsum(in[0]);
+            v_float32 s1 = xsum(in[1]);
+            v_float32 s2 = xsum(in[2]);
+            v_float32 s = s0 * vx_setall_f32(ky0);
+                s = v_fma(s1,  vx_setall_f32(ky1), s);
+                s = v_fma(s2,  vx_setall_f32(ky2), s);
+
+            if (!noscale)
+            {
+                s = v_fma(s, vx_setall_f32(scale), vx_setall_f32(delta));
+            }
+
+            v_store(&out[l], s);
+        }
+
+        // tail (if any)
+        if (l < length)
+        {
+            GAPI_DbgAssert(length >= nlanes);
+            l = length - nlanes;
+        }
+    }
+}
+
+// this variant with manually vectorized rounding to short/ushort appears 10-40x faster
+// than the reference code below
+template<bool noscale, typename DST, typename SRC>
+static void run_sepfilter3x3_any2short(DST out[], const SRC *in[], int width, int chan,
+                                       const float kx[], const float ky[], int border,
+                                       float scale, float delta,
+                                       float *buf[], int y, int y0)
+{
+    int r[3];
+    r[0] = (y - y0    ) % 3;  // buf[r[0]]: previous
+    r[1] = (y - y0 + 1) % 3;  //            this
+    r[2] = (y - y0 + 2) % 3;  //            next row
+
+    const int length = width * chan;
+    const int shift = border * chan;
+
+    const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2];
+    const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2];
+
+    // horizontal pass
+
+    int k0 = (y == y0)? 0: 2;
+
+    for (int k = k0; k < 3; k++)
+    {
+        //                      previous , this , next pixel
+        const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift};
+
+        // rely on compiler vectoring
+        for (int l=0; l < length; l++)
+        {
+            buf[r[k]][l] = s[0][l]*kx0 + s[1][l]*kx1 + s[2][l]*kx2;
+        }
+    }
+
+    // vertical pass
+
+    const int r0=r[0], r1=r[1], r2=r[2];
+
+    for (int l=0; l < length;)
+    {
+        constexpr int nlanes = v_int16::nlanes;
+
+        // main part of row
+        for (; l <= length - nlanes; l += nlanes)
+        {
+            v_float32 sum0 = vx_load(&buf[r0][l])            * vx_setall_f32(ky0);
+                sum0 = v_fma(vx_load(&buf[r1][l]),             vx_setall_f32(ky1), sum0);
+                sum0 = v_fma(vx_load(&buf[r2][l]),             vx_setall_f32(ky2), sum0);
 
-                if (!noscale)
-                {
-                    sum = v_fma(sum, vx_setall_f32(scale), vx_setall_f32(delta));
-                }
+            v_float32 sum1 = vx_load(&buf[r0][l + nlanes/2]) * vx_setall_f32(ky0);
+                sum1 = v_fma(vx_load(&buf[r1][l + nlanes/2]),  vx_setall_f32(ky1), sum1);
+                sum1 = v_fma(vx_load(&buf[r2][l + nlanes/2]),  vx_setall_f32(ky2), sum1);
 
-                v_store(reinterpret_cast<float*>(&out[l]), sum);
+            if (!noscale)
+            {
+                sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
+                sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
             }
 
-            if (l < length)
+            v_int32 isum0 = v_round(sum0),
+                    isum1 = v_round(sum1);
+
+            if (std::is_same<DST, short>::value)
             {
-                // tail: recalculate last pixels
-                GAPI_DbgAssert(length >= nlanes);
-                l = length - nlanes;
+                // signed short
+                v_int16 res = v_pack(isum0, isum1);
+                v_store(reinterpret_cast<short*>(&out[l]), res);
+            } else
+            {
+                // unsigned short
+                v_uint16 res = v_pack_u(isum0, isum1);
+                v_store(reinterpret_cast<ushort*>(&out[l]), res);
             }
         }
 
-        return;
+        // tail (if any)
+        if (l < length)
+        {
+            GAPI_DbgAssert(length >= nlanes);
+            l = length - nlanes;
+        }
     }
-#endif
+}
 
-    if ((std::is_same<DST, short>::value || std::is_same<DST, ushort>::value)
-        && length >= v_int16::nlanes)
+// this code with manually vectorized rounding to uchar is 10-40x faster than the reference code
+template<bool noscale, typename SRC>
+static void run_sepfilter3x3_any2char(uchar out[], const SRC *in[], int width, int chan,
+                                      const float kx[], const float ky[], int border,
+                                      float scale, float delta,
+                                      float *buf[], int y, int y0)
+{
+    int r[3];
+    r[0] = (y - y0    ) % 3;  // buf[r[0]]: previous
+    r[1] = (y - y0 + 1) % 3;  //            this
+    r[2] = (y - y0 + 2) % 3;  //            next row
+
+    const int length = width * chan;
+    const int shift = border * chan;
+
+    const float kx0 = kx[0], kx1 = kx[1], kx2 = kx[2];
+    const float ky0 = ky[0], ky1 = ky[1], ky2 = ky[2];
+
+    // horizontal pass
+
+    int k0 = (y == y0)? 0: 2;
+
+    for (int k = k0; k < 3; k++)
     {
-        constexpr static int nlanes = v_int16::nlanes;
+        //                      previous , this , next pixel
+        const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift};
 
-        for (int l=0; l < length; )
+        // rely on compiler vectoring
+        for (int l=0; l < length; l++)
         {
-            for (; l <= length - nlanes; l += nlanes)
-            {
-                v_float32 sum0 = vx_load(&buf[r0][l])            * vx_setall_f32(ky0);
-                    sum0 = v_fma(vx_load(&buf[r1][l]),             vx_setall_f32(ky1), sum0);
-                    sum0 = v_fma(vx_load(&buf[r2][l]),             vx_setall_f32(ky2), sum0);
-
-                v_float32 sum1 = vx_load(&buf[r0][l + nlanes/2]) * vx_setall_f32(ky0);
-                    sum1 = v_fma(vx_load(&buf[r1][l + nlanes/2]),  vx_setall_f32(ky1), sum1);
-                    sum1 = v_fma(vx_load(&buf[r2][l + nlanes/2]),  vx_setall_f32(ky2), sum1);
-
-                if (!noscale)
-                {
-                    sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
-                    sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
-                }
-
-                v_int32 isum0 = v_round(sum0),
-                        isum1 = v_round(sum1);
-
-                if (std::is_same<DST, short>::value)
-                {
-                    // signed short
-                    v_int16 res = v_pack(isum0, isum1);
-                    v_store(reinterpret_cast<short*>(&out[l]), res);
-                } else
-                {
-                    // unsigned short
-                    v_uint16 res = v_pack_u(isum0, isum1);
-                    v_store(reinterpret_cast<ushort*>(&out[l]), res);
-                }
-            }
+            buf[r[k]][l] = s[0][l]*kx0 + s[1][l]*kx1 + s[2][l]*kx2;
+        }
+    }
 
-            if (l < length)
+    // vertical pass
+
+    const int r0=r[0], r1=r[1], r2=r[2];
+
+    for (int l=0; l < length;)
+    {
+        constexpr int nlanes = v_uint8::nlanes;
+
+        // main part of row
+        for (; l <= length - nlanes; l += nlanes)
+        {
+            v_float32 sum0 = vx_load(&buf[r0][l])              * vx_setall_f32(ky0);
+                sum0 = v_fma(vx_load(&buf[r1][l]),               vx_setall_f32(ky1), sum0);
+                sum0 = v_fma(vx_load(&buf[r2][l]),               vx_setall_f32(ky2), sum0);
+
+            v_float32 sum1 = vx_load(&buf[r0][l +   nlanes/4]) * vx_setall_f32(ky0);
+                sum1 = v_fma(vx_load(&buf[r1][l +   nlanes/4]),  vx_setall_f32(ky1), sum1);
+                sum1 = v_fma(vx_load(&buf[r2][l +   nlanes/4]),  vx_setall_f32(ky2), sum1);
+
+            v_float32 sum2 = vx_load(&buf[r0][l + 2*nlanes/4]) * vx_setall_f32(ky0);
+                sum2 = v_fma(vx_load(&buf[r1][l + 2*nlanes/4]),  vx_setall_f32(ky1), sum2);
+                sum2 = v_fma(vx_load(&buf[r2][l + 2*nlanes/4]),  vx_setall_f32(ky2), sum2);
+
+            v_float32 sum3 = vx_load(&buf[r0][l + 3*nlanes/4]) * vx_setall_f32(ky0);
+                sum3 = v_fma(vx_load(&buf[r1][l + 3*nlanes/4]),  vx_setall_f32(ky1), sum3);
+                sum3 = v_fma(vx_load(&buf[r2][l + 3*nlanes/4]),  vx_setall_f32(ky2), sum3);
+
+            if (!noscale)
             {
-                // tail: recalculate last pixels
-                GAPI_DbgAssert(length >= nlanes);
-                l = length - nlanes;
+                sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
+                sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
+                sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta));
+                sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta));
             }
+
+            v_int32 isum0 = v_round(sum0),
+                    isum1 = v_round(sum1),
+                    isum2 = v_round(sum2),
+                    isum3 = v_round(sum3);
+
+            v_int16 ires0 = v_pack(isum0, isum1),
+                    ires1 = v_pack(isum2, isum3);
+
+            v_uint8 res = v_pack_u(ires0, ires1);
+            v_store(reinterpret_cast<uchar*>(&out[l]), res);
         }
 
-        return;
+        // tail (if any)
+        if (l < length)
+        {
+            GAPI_DbgAssert(length >= nlanes);
+            l = length - nlanes;
+        }
     }
+}
 
-    if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
+// this code, manually vectorized for int16, is not much faster than the generic any-to-short code above
+#define USE_SEPFILTER3X3_CHAR2SHORT 1
+
+#if USE_SEPFILTER3X3_CHAR2SHORT
+template<bool noscale>
+static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int width, int chan,
+                                        const float kx[], const float ky[], int border,
+                                        float scale, float delta,
+                                        float *buf[], int y, int y0)
+{
+    const schar ikx0 = saturate<schar>(kx[0], rintf);
+    const schar ikx1 = saturate<schar>(kx[1], rintf);
+    const schar ikx2 = saturate<schar>(kx[2], rintf);
+
+    const schar iky0 = saturate<schar>(ky[0], rintf);
+    const schar iky1 = saturate<schar>(ky[1], rintf);
+    const schar iky2 = saturate<schar>(ky[2], rintf);
+
+    const short iscale = saturate<short>(scale * (1 << 15), rintf);
+    const short idelta = saturate<short>(delta            , rintf);
+
+    // check if this code is applicable
+    if (ikx0 != kx[0] || ikx1 != kx[1] || ikx2 != kx[2] ||
+        iky0 != ky[0] || iky1 != ky[1] || iky2 != ky[2] ||
+        idelta != delta ||
+        std::abs(scale) > 1 || std::abs(scale) < 0.01)
     {
-        constexpr static int nlanes = v_uint8::nlanes;
+        run_sepfilter3x3_any2short<noscale>(out, in, width, chan, kx, ky, border, scale, delta,
+                                            buf, y, y0);
+        return;
+    }
+
+    short *ibuf[3];
+    ibuf[0] = reinterpret_cast<short*>(buf[0]);
+    ibuf[1] = reinterpret_cast<short*>(buf[1]);
+    ibuf[2] = reinterpret_cast<short*>(buf[2]);
 
-        for (int l=0; l < length; )
+    int r[3];
+    r[0] = (y - y0    ) % 3;  // buf[r[0]]: previous
+    r[1] = (y - y0 + 1) % 3;  //            this
+    r[2] = (y - y0 + 2) % 3;  //            next row
+
+    const int length = width * chan;
+    const int shift = border * chan;
+
+    // horizontal pass
+
+    int k0 = (y == y0)? 0: 2;
+
+    for (int k = k0; k < 3; k++)
+    {
+        for (int l=0; l < length;)
         {
+            constexpr int nlanes = v_int16::nlanes;
+
+            // main part of output row
             for (; l <= length - nlanes; l += nlanes)
             {
-                v_float32 sum0 = vx_load(&buf[r0][l])              * vx_setall_f32(ky0);
-                    sum0 = v_fma(vx_load(&buf[r1][l]),               vx_setall_f32(ky1), sum0);
-                    sum0 = v_fma(vx_load(&buf[r2][l]),               vx_setall_f32(ky2), sum0);
-
-                v_float32 sum1 = vx_load(&buf[r0][l +   nlanes/4]) * vx_setall_f32(ky0);
-                    sum1 = v_fma(vx_load(&buf[r1][l +   nlanes/4]),  vx_setall_f32(ky1), sum1);
-                    sum1 = v_fma(vx_load(&buf[r2][l +   nlanes/4]),  vx_setall_f32(ky2), sum1);
-
-                v_float32 sum2 = vx_load(&buf[r0][l + 2*nlanes/4]) * vx_setall_f32(ky0);
-                    sum2 = v_fma(vx_load(&buf[r1][l + 2*nlanes/4]),  vx_setall_f32(ky1), sum2);
-                    sum2 = v_fma(vx_load(&buf[r2][l + 2*nlanes/4]),  vx_setall_f32(ky2), sum2);
-
-                v_float32 sum3 = vx_load(&buf[r0][l + 3*nlanes/4]) * vx_setall_f32(ky0);
-                    sum3 = v_fma(vx_load(&buf[r1][l + 3*nlanes/4]),  vx_setall_f32(ky1), sum3);
-                    sum3 = v_fma(vx_load(&buf[r2][l + 3*nlanes/4]),  vx_setall_f32(ky2), sum3);
-
-                if (!noscale)
-                {
-                    sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
-                    sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
-                    sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta));
-                    sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta));
-                }
-
-                v_int32 isum0 = v_round(sum0),
-                        isum1 = v_round(sum1),
-                        isum2 = v_round(sum2),
-                        isum3 = v_round(sum3);
-
-                v_int16 ires0 = v_pack(isum0, isum1),
-                        ires1 = v_pack(isum2, isum3);
-
-                v_uint8 res = v_pack_u(ires0, ires1);
-                v_store(reinterpret_cast<uchar*>(&out[l]), res);
+                v_uint16 t0 = vx_load_expand(&in[k][l - shift]);  // previous
+                v_uint16 t1 = vx_load_expand(&in[k][l        ]);  // current
+                v_uint16 t2 = vx_load_expand(&in[k][l + shift]);  // next pixel
+                v_int16 t = v_reinterpret_as_s16(t0) * vx_setall_s16(ikx0) +
+                            v_reinterpret_as_s16(t1) * vx_setall_s16(ikx1) +
+                            v_reinterpret_as_s16(t2) * vx_setall_s16(ikx2);
+                v_store(&ibuf[r[k]][l], t);
             }
 
+            // tail (if any)
             if (l < length)
             {
-                // tail: recalculate last pixels
                 GAPI_DbgAssert(length >= nlanes);
                 l = length - nlanes;
             }
         }
-
-        return;
     }
-#endif
 
-    // reference code
-    for (int l=0; l < length; l++)
+    // vertical pass
+
+    for (int l=0; l < length;)
     {
-        float sum = buf[r0][l]*ky0 + buf[r1][l]*ky1 + buf[r2][l]*ky2;
+        constexpr int nlanes = v_int16::nlanes;
 
-        if (!noscale)
+        // main part of output row
+        for (; l <= length - nlanes; l += nlanes)
         {
-            sum = sum*scale + delta;
+            v_int16 s0 = vx_load(&ibuf[r[0]][l]);  // previous
+            v_int16 s1 = vx_load(&ibuf[r[1]][l]);  // current
+            v_int16 s2 = vx_load(&ibuf[r[2]][l]);  // next row
+            v_int16 s = s0 * vx_setall_s16(iky0) +
+                        s1 * vx_setall_s16(iky1) +
+                        s2 * vx_setall_s16(iky2);
+
+            if (!noscale)
+            {
+                s = v_mul_hi(s << 1, vx_setall_s16(iscale)) + vx_setall_s16(idelta);
+            }
+
+            v_store(&out[l], s);
         }
 
-        out[l] = cv::gapi::own::saturate<DST>(sum, rintf);
+        // tail (if any)
+        if (l < length)
+        {
+            GAPI_DbgAssert(length >= nlanes);
+            l = length - nlanes;
+        }
     }
 }
+#endif
+
+#endif  // CV_SIMD
 
-template<typename DST, typename SRC>
-static void run_sobel_impl(DST out[], const SRC *in[], int width, int chan,
-                           const float kx[], const float ky[], int border,
-                           float scale, float delta, float *buf[],
-                           int y, int y0)
+template<bool noscale, typename DST, typename SRC>
+static void run_sepfilter3x3_reference(DST out[], const SRC *in[], int width, int chan,
+                                       const float kx[], const float ky[], int border,
+                                       float scale, float delta,
+                                       float *buf[], int y, int y0)
 {
     int r[3];
     r[0] = (y - y0)     % 3;  // buf[r[0]]: previous
@@ -497,19 +705,21 @@ static void run_sobel_impl(DST out[], const SRC *in[], int width, int chan,
     r[2] = (y - y0 + 2) % 3;  //            next row
 
     int length = width * chan;
+    int shift = border * chan;
 
     // horizontal pass
 
     // full horizontal pass is needed only if very 1st row in ROI;
     // for 2nd and further rows, it is enough to convolve only the
     // "next" row - as we can reuse buffers from previous calls to
-    // this kernel (note that Fluid processes rows consequently)
+    // this kernel (Fluid processes rows consecutively: y=y0, y0+1, ...)
+
     int k0 = (y == y0)? 0: 2;
 
     for (int k = k0; k < 3; k++)
     {
-        //                             previous, this , next pixel
-        const SRC *s[3] = {in[k] - border*chan , in[k], in[k] + border*chan};
+        //                      previous , this , next pixel
+        const SRC *s[3] = {in[k] - shift , in[k], in[k] + shift};
 
         // rely on compiler vectoring
         for (int l=0; l < length; l++)
@@ -519,37 +729,121 @@ static void run_sobel_impl(DST out[], const SRC *in[], int width, int chan,
     }
 
     // vertical pass
-    if (scale == 1 && delta == 0)
+
+    for (int l=0; l < length; l++)
+    {
+        float sum = buf[r[0]][l]*ky[0] + buf[r[1]][l]*ky[1] + buf[r[2]][l]*ky[2];
+
+        if (!noscale)
+        {
+            sum = sum*scale + delta;
+        }
+
+        out[l] = saturate<DST>(sum, rintf);
+    }
+}
+
+template<bool noscale, typename DST, typename SRC>
+static void run_sepfilter3x3_code(DST out[], const SRC *in[], int width, int chan,
+                                  const float kx[], const float ky[], int border,
+                                  float scale, float delta,
+                                  float *buf[], int y, int y0)
+{
+#if CV_SIMD
+    int length = width * chan;
+
+    // 'length' may be reported as unused if DST/SRC match none of the 'if' statements below
+    (void) length;
+
+#if USE_SEPFILTER3X3_CHAR2SHORT
+    if (std::is_same<DST, short>::value && std::is_same<SRC, uchar>::value &&
+        length >= v_int16::nlanes)
+    {
+        // uchar-to-short special case: only slightly faster than generic any-to-short (see below)
+        run_sepfilter3x3_char2short<noscale>(reinterpret_cast<short*>(out),
+                                             reinterpret_cast<const uchar**>(in),
+                                             width, chan, kx, ky, border, scale, delta,
+                                             buf, y, y0);
+        return;
+    }
+#endif
+
+    if (std::is_same<DST, float>::value && std::is_same<SRC, float>::value &&
+        length >= v_float32::nlanes)
+    {
+        // float-to-float: appears 15% faster than reference any-to-float code (called below)
+        run_sepfilter3x3_any2float<noscale>(reinterpret_cast<float*>(out), in,
+                                            width, chan, kx, ky, border, scale, delta);
+        return;
+    }
+
+    if (std::is_same<DST, short>::value && length >= v_int16::nlanes)
+    {
+        // any-to-short: appears 10-40x faster than reference due to much faster rounding
+        run_sepfilter3x3_any2short<noscale>(reinterpret_cast<short*>(out), in,
+                                            width, chan, kx, ky, border, scale, delta,
+                                            buf, y, y0);
+        return;
+    }
+
+    if (std::is_same<DST, ushort>::value && length >= v_uint16::nlanes)
     {
-        constexpr static bool noscale = true;  // omit scaling
-        run_sobel3x3_vert<noscale, DST>(out, length, ky, scale, delta, r, buf);
-    } else
+        // any-to-ushort (same any2short code): appears 10-40x faster than reference due to much faster rounding
+        run_sepfilter3x3_any2short<noscale>(reinterpret_cast<ushort*>(out), in,
+                                            width, chan, kx, ky, border, scale, delta,
+                                            buf, y, y0);
+        return;
+    }
+
+    if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
     {
-        constexpr static bool noscale = false;  // do scaling
-        run_sobel3x3_vert<noscale, DST>(out, length, ky, scale, delta, r, buf);
+        // any-to-uchar: appears 10-40x faster than reference due to much faster rounding
+        run_sepfilter3x3_any2char<noscale>(reinterpret_cast<uchar*>(out), in,
+                                           width, chan, kx, ky, border, scale, delta,
+                                           buf, y, y0);
+        return;
     }
+#endif  // CV_SIMD
+
+    // fallback when no SIMD path above applies; reference code is quite fast for
+    // any-to-float, but not for any-to-integral due to very slow rounding
+    run_sepfilter3x3_reference<noscale>(out, in, width, chan, kx, ky, border,
+                                        scale, delta, buf, y, y0);
 }
 
-#define RUN_SOBEL_ROW(DST, SRC)                                                    \
-void run_sobel_row(DST out[], const SRC *in[], int width, int chan,                \
-                   const float kx[], const float ky[], int border,                 \
-                   float scale, float delta, float *buf[],                         \
-                   int y, int y0)                                                  \
-{                                                                                  \
-    run_sobel_impl(out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0); \
+#define RUN_SEPFILTER3X3_IMPL(DST, SRC)                                      \
+void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan,  \
+                           const float kx[], const float ky[], int border,   \
+                           float scale, float delta,                         \
+                           float *buf[], int y, int y0)                      \
+{                                                                            \
+    if (scale == 1 && delta == 0)                                            \
+    {                                                                        \
+        constexpr bool noscale = true;                                       \
+        run_sepfilter3x3_code<noscale>(out, in, width, chan, kx, ky, border, \
+                                       scale, delta, buf, y, y0);            \
+    }                                                                        \
+    else                                                                     \
+    {                                                                        \
+        constexpr bool noscale = false;                                      \
+        run_sepfilter3x3_code<noscale>(out, in, width, chan, kx, ky, border, \
+                                       scale, delta, buf, y, y0);            \
+    }                                                                        \
 }
 
-RUN_SOBEL_ROW(uchar , uchar )
-RUN_SOBEL_ROW(ushort, ushort)
-RUN_SOBEL_ROW( short, uchar )
-RUN_SOBEL_ROW( short, ushort)
-RUN_SOBEL_ROW( short,  short)
-RUN_SOBEL_ROW( float, uchar )
-RUN_SOBEL_ROW( float, ushort)
-RUN_SOBEL_ROW( float,  short)
-RUN_SOBEL_ROW( float,  float)
-
-#undef RUN_SOBEL_ROW
+RUN_SEPFILTER3X3_IMPL(uchar , uchar )
+RUN_SEPFILTER3X3_IMPL( short, uchar )
+RUN_SEPFILTER3X3_IMPL( float, uchar )
+RUN_SEPFILTER3X3_IMPL(ushort, ushort)
+RUN_SEPFILTER3X3_IMPL( short, ushort)
+RUN_SEPFILTER3X3_IMPL( float, ushort)
+RUN_SEPFILTER3X3_IMPL( short,  short)
+RUN_SEPFILTER3X3_IMPL( float,  short)
+RUN_SEPFILTER3X3_IMPL( float,  float)
+
+#undef RUN_SEPFILTER3X3_IMPL
+
+//------------------------------------------------------------------------------
 
 #endif  // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY