From beb14c70da8a573d4157348a6800bd6e67d56d5c Mon Sep 17 00:00:00 2001
From: Anna Khakimova
Date: Fri, 8 Nov 2019 11:15:40 +0300
Subject: [PATCH] GAPI Fluid: SIMD optimization for sep filters 5x5 kernel size
 (gaussBlur)

---
 .../gapi/src/backends/fluid/gfluidimgproc.cpp |  14 +-
 .../fluid/gfluidimgproc_func.dispatch.cpp     |  22 +
 .../src/backends/fluid/gfluidimgproc_func.hpp |  19 +
 .../fluid/gfluidimgproc_func.simd.hpp         | 599 +++++++++++++++++-
 4 files changed, 640 insertions(+), 14 deletions(-)

diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
index ba70954476..d1445675fa 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
@@ -599,6 +599,7 @@ static void run_sepfilter(Buffer& dst, const View& src,
 {
     constexpr int kMax = 11;
     GAPI_Assert(kxLen <= kMax && kyLen <= kMax);
+    GAPI_Assert(kxLen == kyLen);
 
     const SRC *in[kMax];
     DST *out;
@@ -625,6 +626,13 @@ static void run_sepfilter(Buffer& dst, const View& src,
         int border = xborder;
         run_sepfilter3x3_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
     }
+    else if (kxLen == 5 && kyLen == 5)
+    {
+        int y  = dst.y();
+        int y0 = dst.priv().writeStart();
+
+        run_sepfilter5x5_impl(out, in, width, chan, kx, ky, xborder, scale, delta, buf, y, y0);
+    }
     else
     {
         int length = chan * width;
@@ -788,7 +796,9 @@ GAPI_FLUID_KERNEL(GFluidGaussBlur, cv::gapi::imgproc::GGaussBlur, true)
                     Buffer& dst,
                     Buffer& scratch)
     {
-        int kxsize = ksize.width;
+        GAPI_Assert(ksize.height == ksize.width);
+        GAPI_Assert((ksize.height == 3) || (ksize.height == 5));
+        const int kxsize = ksize.width;
         int kysize = ksize.height;
 
         auto *kx = scratch.OutLine<float>(); // cached kernX data
@@ -801,7 +811,7 @@ GAPI_FLUID_KERNEL(GFluidGaussBlur, cv::gapi::imgproc::GGaussBlur, true)
         constexpr int buffSize = 5;
         GAPI_Assert(ksize.height <= buffSize);
 
-        float *buf[buffSize]{};
+        float *buf[buffSize] = { nullptr };
 
         buf[0] = ky + kysize;
         for (int i = 1; i < ksize.height; ++i)
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
index 3ea4676dde..7b6dfb11f2 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
@@ -119,6 +119,28 @@ RUN_SEPFILTER3X3_IMPL( float,  float)
 
 #undef RUN_SEPFILTER3X3_IMPL
 
+#define RUN_SEPFILTER5x5_IMPL(DST, SRC)                                               \
+void run_sepfilter5x5_impl(DST out[], const SRC *in[], int width, int chan,           \
+                           const float kx[], const float ky[], int border,            \
+                           float scale, float delta,                                  \
+                           float *buf[], int y, int y0)                               \
+{                                                                                     \
+    CV_CPU_DISPATCH(run_sepfilter5x5_impl,                                            \
+                    (out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0), \
+                    CV_CPU_DISPATCH_MODES_ALL);                                       \
+}
+
+RUN_SEPFILTER5x5_IMPL(uchar, uchar)
+RUN_SEPFILTER5x5_IMPL(short, uchar)
+RUN_SEPFILTER5x5_IMPL(float, uchar)
+RUN_SEPFILTER5x5_IMPL(ushort, ushort)
+RUN_SEPFILTER5x5_IMPL(short, ushort)
+RUN_SEPFILTER5x5_IMPL(float, ushort)
+RUN_SEPFILTER5x5_IMPL(short, short)
+RUN_SEPFILTER5x5_IMPL(float, short)
+RUN_SEPFILTER5x5_IMPL(float, float)
+
+#undef RUN_SEPFILTER5x5_IMPL
 //-------------------------
 //
 // Fluid kernels: Filter 2D
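Note: the GaussBlur kernel above packs the cached kernels and the five
horizontal-pass rows into a single scratch allocation. A minimal standalone
sketch of that layout (the helper name and the `length` row stride are
illustrative assumptions; the patch itself only shows buf[0] = ky + kysize):

    // assumed scratch layout: | kx: kxsize floats | ky: kysize floats | rows 0..4: length floats each |
    static void carve_scratch(float *scratch, int kxsize, int kysize, int length,
                              float *buf[5])
    {
        float *kx = scratch;      // cached kernX data, as in GFluidGaussBlur::run()
        float *ky = kx + kxsize;  // cached kernY data follows kernX
        buf[0] = ky + kysize;     // first horizontal-pass row follows the kernels
        for (int i = 1; i < 5; ++i)
        {
            buf[i] = buf[i - 1] + length;  // assumed stride: one row of width*chan floats
        }
    }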
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
index b89ccd8988..79715d1754 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
@@ -78,6 +78,25 @@ RUN_SEPFILTER3X3_IMPL( float,  float)
 
 #undef RUN_SEPFILTER3X3_IMPL
 
+#define RUN_SEPFILTER5x5_IMPL(DST, SRC)                                     \
+void run_sepfilter5x5_impl(DST out[], const SRC *in[], int width, int chan, \
+                           const float kx[], const float ky[], int border,  \
+                           float scale, float delta,                        \
+                           float *buf[], int y, int y0);
+
+
+RUN_SEPFILTER5x5_IMPL(uchar, uchar)
+RUN_SEPFILTER5x5_IMPL(short, uchar)
+RUN_SEPFILTER5x5_IMPL(float, uchar)
+RUN_SEPFILTER5x5_IMPL(ushort, ushort)
+RUN_SEPFILTER5x5_IMPL(short, ushort)
+RUN_SEPFILTER5x5_IMPL(float, ushort)
+RUN_SEPFILTER5x5_IMPL(short, short)
+RUN_SEPFILTER5x5_IMPL(float, short)
+RUN_SEPFILTER5x5_IMPL(float, float)
+
+#undef RUN_SEPFILTER5x5_IMPL
+
 //-------------------------
 //
 // Fluid kernels: Filter 2D
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
index ec1c8da971..e0d10e4ecd 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
@@ -100,6 +100,23 @@ RUN_SEPFILTER3X3_IMPL( float,  float)
 
 #undef RUN_SEPFILTER3X3_IMPL
 
+#define RUN_SEPFILTER5x5_IMPL(DST, SRC)                                     \
+void run_sepfilter5x5_impl(DST out[], const SRC *in[], int width, int chan, \
+                           const float kx[], const float ky[], int border,  \
+                           float scale, float delta,                        \
+                           float *buf[], int y, int y0);
+
+RUN_SEPFILTER5x5_IMPL(uchar, uchar)
+RUN_SEPFILTER5x5_IMPL(short, uchar)
+RUN_SEPFILTER5x5_IMPL(float, uchar)
+RUN_SEPFILTER5x5_IMPL(ushort, ushort)
+RUN_SEPFILTER5x5_IMPL(short, ushort)
+RUN_SEPFILTER5x5_IMPL(float, ushort)
+RUN_SEPFILTER5x5_IMPL(short, short)
+RUN_SEPFILTER5x5_IMPL(float, short)
+RUN_SEPFILTER5x5_IMPL(float, float)
+
+#undef RUN_SEPFILTER5x5_IMPL
 //-------------------------
 //
 // Fluid kernels: Filter 2D
@@ -978,11 +995,11 @@ void run_rgb2yuv422_impl(uchar out[], const uchar in[], int width)
     }
 }
 
-//-------------------------
+//-----------------------------
 //
-// Fluid kernels: sepFilter
+// Fluid kernels: sepFilter 3x3
 //
-//-------------------------
+//-----------------------------
 
 #if CV_SIMD
 // this variant not using buf[] appears 15% faster than reference any-2-float code below
@@ -1322,7 +1339,7 @@ static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int width
         }
     }
 }
-#endif
+#endif // USE_SEPFILTER3X3_CHAR2SHORT
 
 #endif  // CV_SIMD
 
@@ -1464,18 +1481,576 @@ void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \
     }                                                                       \
 }
 
-RUN_SEPFILTER3X3_IMPL(uchar , uchar )
-RUN_SEPFILTER3X3_IMPL( short, uchar )
-RUN_SEPFILTER3X3_IMPL( float, uchar )
+RUN_SEPFILTER3X3_IMPL(uchar, uchar)
+RUN_SEPFILTER3X3_IMPL(short, uchar)
+RUN_SEPFILTER3X3_IMPL(float, uchar)
 RUN_SEPFILTER3X3_IMPL(ushort, ushort)
-RUN_SEPFILTER3X3_IMPL( short, ushort)
-RUN_SEPFILTER3X3_IMPL( float, ushort)
-RUN_SEPFILTER3X3_IMPL( short,  short)
-RUN_SEPFILTER3X3_IMPL( float,  short)
-RUN_SEPFILTER3X3_IMPL( float,  float)
+RUN_SEPFILTER3X3_IMPL(short, ushort)
+RUN_SEPFILTER3X3_IMPL(float, ushort)
+RUN_SEPFILTER3X3_IMPL(short, short)
+RUN_SEPFILTER3X3_IMPL(float, short)
+RUN_SEPFILTER3X3_IMPL(float, float)
 
 #undef RUN_SEPFILTER3X3_IMPL
 
+//-----------------------------
+//
+// Fluid kernels: sepFilter 5x5
+//
+//-----------------------------
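+
+// All 5x5 variants below share one scheme: a horizontal 1D pass convolves
+// input rows with kx[] into the buf[] ring of five float rows, then a
+// vertical 1D pass folds those rows with ky[] into the output row.
+// The ring index r[n] = (y - y0 + n) % 5 maps kernel row n to a scratch row;
+// for example, y0 = 0 and y = 3 give r = {3, 4, 0, 1, 2}, so the four rows
+// filtered by previous calls are reused and only the newly entered row
+// (k0 = 4) is convolved on this call.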
+
+#if CV_SIMD
+
+// this variant with manually vectored rounding to uchar
+template<bool noscale, typename SRC>
+static void run_sepfilter5x5_any2char(uchar out[], const SRC *in[], int width, int chan,
+                                      const float kx[], const float ky[], int border,
+                                      float scale, float delta,
+                                      float *buf[], int y, int y0)
+{
+    constexpr int kxLen = 5;
+    constexpr int kyLen = kxLen;
+    constexpr int buffSize = 5;
+
+    int r[buffSize];
+    for (int n = 0; n < buffSize; ++n)
+    {
+        r[n] = (y - y0 + n) % 5;  // previous, this, next rows
+    }
+
+    const int length = width * chan;
+    const int shift = chan;
+
+    // horizontal pass
+
+    int k0 = (y == y0) ? 0 : 4;
+
+    for (int k = k0; k < kyLen; ++k)
+    {
+        const SRC *s[kxLen] = { nullptr };
+
+        for (int i = 0; i < kxLen; ++i)
+        {
+            // previous, this, next pixels
+            s[i] = in[k] + (i - border)*shift;
+        }
+
+        // rely on compiler vectorizing
+        for (int l = 0; l < length; ++l)
+        {
+            float sum = 0;
+            for (int j = 0; j < kxLen; ++j)
+            {
+                sum += s[j][l] * kx[j];
+            }
+            buf[r[k]][l] = sum;
+        }
+    }
+
+    // vertical pass
+
+    constexpr int nlanes = v_uint8::nlanes;
+
+    for (int l = 0; l < length;)
+    {
+        // main part of row
+        for (; l <= length - nlanes; l += nlanes)
+        {
+            v_float32 sum0 = vx_load(&buf[r[0]][l])                  * vx_setall_f32(ky[0]);
+            v_float32 sum1 = vx_load(&buf[r[0]][l +     nlanes / 4]) * vx_setall_f32(ky[0]);
+            v_float32 sum2 = vx_load(&buf[r[0]][l + 2 * nlanes / 4]) * vx_setall_f32(ky[0]);
+            v_float32 sum3 = vx_load(&buf[r[0]][l + 3 * nlanes / 4]) * vx_setall_f32(ky[0]);
+
+            for (int n = 1; n < kyLen; ++n)
+            {
+                sum0 = v_fma(vx_load(&buf[r[n]][l]),                  vx_setall_f32(ky[n]), sum0);
+                sum1 = v_fma(vx_load(&buf[r[n]][l +     nlanes / 4]), vx_setall_f32(ky[n]), sum1);
+                sum2 = v_fma(vx_load(&buf[r[n]][l + 2 * nlanes / 4]), vx_setall_f32(ky[n]), sum2);
+                sum3 = v_fma(vx_load(&buf[r[n]][l + 3 * nlanes / 4]), vx_setall_f32(ky[n]), sum3);
+            }
+
+            if (!noscale)
+            {
+                sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
+                sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
+                sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta));
+                sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta));
+            }
+
+            v_int32 isum0 = v_round(sum0),
+                    isum1 = v_round(sum1),
+                    isum2 = v_round(sum2),
+                    isum3 = v_round(sum3);
+
+            v_int16 ires0 = v_pack(isum0, isum1),
+                    ires1 = v_pack(isum2, isum3);
+
+            v_uint8 res = v_pack_u(ires0, ires1);
+            v_store(reinterpret_cast<uchar*>(&out[l]), res);
+        }
+
+        // tail (if any)
+        if (l < length)
+        {
+            GAPI_DbgAssert(length >= nlanes);
+            l = length - nlanes;
+        }
+    }
+    return;
+}
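+
+// Note on the narrowing above: one v_uint8 register holds nlanes uchar lanes,
+// i.e. four v_float32 registers' worth of pixels, hence the four accumulators.
+// The chain v_round (float32->int32), v_pack (int32->int16, saturating) and
+// v_pack_u (int16->uint8, saturating) does the rounding and clamping entirely
+// in registers. The tail is processed by stepping back to l = length - nlanes
+// and redoing one full, overlapping vector; this is safe because the
+// dispatching code below selects this variant only when length >= v_uint8::nlanes.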
+
+// this variant with manually vectored rounding to short/ushort
+template<bool noscale, typename DST, typename SRC>
+static void run_sepfilter5x5_any2short(DST out[], const SRC *in[], int width, int chan,
+                                       const float kx[], const float ky[], int border,
+                                       float scale, float delta,
+                                       float *buf[], int y, int y0)
+{
+    constexpr int kxLen = 5;
+    constexpr int kyLen = kxLen;
+    constexpr int buffSize = 5;
+
+    int r[buffSize];
+    for (int n = 0; n < buffSize; ++n)
+    {
+        r[n] = (y - y0 + n) % 5;  // previous, this, next rows
+    }
+
+    const int length = width * chan;
+    const int shift = chan;
+
+    // horizontal pass
+
+    int k0 = (y == y0) ? 0 : 4;
+
+    for (int k = k0; k < kyLen; ++k)
+    {
+        const SRC *s[kxLen] = { nullptr };
+
+        for (int i = 0; i < kxLen; ++i)
+        {
+            // previous, this, next pixels
+            s[i] = in[k] + (i - border)*shift;
+        }
+
+        // rely on compiler vectorizing
+        for (int l = 0; l < length; ++l)
+        {
+            float sum = 0;
+            for (int j = 0; j < kxLen; ++j)
+            {
+                sum += s[j][l] * kx[j];
+            }
+            buf[r[k]][l] = sum;
+        }
+    }
+
+    // vertical pass
+
+    constexpr int nlanes = v_int16::nlanes;
+
+    for (int l = 0; l < length;)
+    {
+        // main part of row
+        for (; l <= length - nlanes; l += nlanes)
+        {
+            v_float32 sum0 = vx_load(&buf[r[0]][l])              * vx_setall_f32(ky[0]);
+            v_float32 sum1 = vx_load(&buf[r[0]][l + nlanes / 2]) * vx_setall_f32(ky[0]);
+
+            for (int j = 1; j < kyLen; ++j)
+            {
+                sum0 = v_fma(vx_load(&buf[r[j]][l]),              vx_setall_f32(ky[j]), sum0);
+                sum1 = v_fma(vx_load(&buf[r[j]][l + nlanes / 2]), vx_setall_f32(ky[j]), sum1);
+            }
+
+            if (!noscale)
+            {
+                sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
+                sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
+            }
+
+            v_int32 isum0 = v_round(sum0),
+                    isum1 = v_round(sum1);
+
+            if (std::is_same<DST, short>::value)
+            {
+                // signed short
+                v_int16 res = v_pack(isum0, isum1);
+                v_store(reinterpret_cast<short*>(&out[l]), res);
+            }
+            else
+            {
+                // unsigned short
+                v_uint16 res = v_pack_u(isum0, isum1);
+                v_store(reinterpret_cast<ushort*>(&out[l]), res);
+            }
+        }
+
+        // tail (if any)
+        if (l < length)
+        {
+            GAPI_DbgAssert(length >= nlanes);
+            l = length - nlanes;
+        }
+    }
+    return;
+}
+
+// this variant not using buf[]
+template<bool noscale, typename SRC>
+static void run_sepfilter5x5_any2float(float out[], const SRC *in[], int width, int chan,
+                                       const float kx[], const float ky[], int border,
+                                       float scale, float delta)
+{
+    constexpr int kxLen = 5;
+    constexpr int kyLen = kxLen;
+    constexpr int buffSize = 5;
+
+    const int length = width * chan;
+    const int shift = chan;
+
+    static const int nlanes = v_float32::nlanes;
+
+    for (int l = 0; l < length;)
+    {
+        // main part
+        for (; l <= length - nlanes; l += nlanes)
+        {
+            auto xsum = [l, border, shift, kx](const SRC inp[])
+            {
+                v_float32 t[kxLen];
+                for (int i = 0; i < kxLen; ++i)
+                {
+                    t[i] = vx_load_f32(&inp[l + (i - border)*shift]);
+                }
+
+                v_float32 sum = t[0] * vx_setall_f32(kx[0]);
+                for (int j = 1; j < kxLen; ++j)
+                {
+                    sum = v_fma(t[j], vx_setall_f32(kx[j]), sum);
+                }
+
+                return sum;
+            };
+
+            v_float32 s[buffSize];
+            for (int m = 0; m < buffSize; ++m)
+            {
+                s[m] = xsum(in[m]);
+            }
+
+            v_float32 sum = s[0] * vx_setall_f32(ky[0]);
+            for (int n = 1; n < kyLen; ++n)
+            {
+                sum = v_fma(s[n], vx_setall_f32(ky[n]), sum);
+            }
+
+            if (!noscale)
+            {
+                sum = v_fma(sum, vx_setall_f32(scale), vx_setall_f32(delta));
+            }
+
+            v_store(&out[l], sum);
+        }
+
+        // tail (if any)
+        if (l < length)
+        {
+            GAPI_DbgAssert(length >= nlanes);
+            l = length - nlanes;
+        }
+    }
+    return;
+}
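+
+// The char2short variant below trades float math for Q15 fixed-point: kernel
+// taps are rounded to schar and the scale is kept as iscale = scale * 2^15.
+// In its vertical pass, v_mul_hi(sum << 1, iscale) computes
+// (2*sum*iscale) >> 16 == (sum*iscale) >> 15, i.e. approximately sum * scale;
+// e.g. scale = 0.25 gives iscale = 8192, and sum = 1000 maps to
+// (2*1000*8192) >> 16 = 250. The path is taken only when every tap and delta
+// convert to integers exactly and 0.01 <= |scale| <= 1.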
+
+#define USE_SEPFILTER5X5_CHAR2SHORT 1
+
+#if USE_SEPFILTER5X5_CHAR2SHORT
+template<bool noscale>
+static void run_sepfilter5x5_char2short(short out[], const uchar *in[], int width, int chan,
+                                        const float kx[], const float ky[], int border,
+                                        float scale, float delta,
+                                        float *buf[], int y, int y0)
+{
+    constexpr int kxLen = 5;
+    constexpr int kyLen = kxLen;
+    constexpr int buffSize = 5;
+
+    schar ikx[kxLen];
+    schar iky[kyLen];
+
+    for (int i = 0; i < kxLen; ++i)
+    {
+        ikx[i] = saturate<schar>(kx[i], rintf);
+        iky[i] = saturate<schar>(ky[i], rintf);
+    }
+
+    const short iscale = saturate<short>(scale * (1 << 15), rintf);
+    const short idelta = saturate<short>(delta, rintf);
+
+    // check if this code is applicable: the taps and delta must convert to
+    // integers exactly, and the scale must fit into the Q15 range
+    if (ikx[0] != kx[0] || ikx[1] != kx[1] || ikx[2] != kx[2] || ikx[3] != kx[3] || ikx[4] != kx[4] ||
+        iky[0] != ky[0] || iky[1] != ky[1] || iky[2] != ky[2] || iky[3] != ky[3] || iky[4] != ky[4] ||
+        idelta != delta ||
+        std::abs(scale) > 1 || std::abs(scale) < 0.01)
+    {
+        run_sepfilter5x5_any2short<noscale>(out, in, width, chan, kx, ky, border, scale, delta,
+                                            buf, y, y0);
+        return;
+    }
+
+    short *ibuf[buffSize];
+    int r[buffSize];
+
+    for (int n = 0; n < buffSize; ++n)
+    {
+        ibuf[n] = reinterpret_cast<short*>(buf[n]);
+        r[n] = (y - y0 + n) % 5;  // previous, this, next rows
+    }
+
+    const int length = width * chan;
+    const int shift = chan;
+
+    // horizontal pass
+    // full horizontal pass is needed only when the very 1st row in the ROI is
+    // handled; for the 2nd and further rows it is enough to convolve only the
+    // "next" row, as we can reuse buffers from previous calls to this kernel
+    // (Fluid processes rows consecutively: y = y0, y0+1, ...)
+    int k0 = (y == y0) ? 0 : 4;
+
+    constexpr int nlanes = v_int16::nlanes;
+
+    for (int k = k0; k < kyLen; ++k)
+    {
+        for (int l = 0; l < length;)
+        {
+            GAPI_Assert(length >= nlanes);
+
+            // main part of output row
+            for (; l <= length - nlanes; l += nlanes)
+            {
+                v_int16 sum = vx_setzero_s16();
+
+                for (int i = 0; i < kxLen; ++i)
+                {
+                    // previous, this, next pixels
+                    v_uint16 t = vx_load_expand(&in[k][l + (i - border)*shift]);
+
+                    sum += v_reinterpret_as_s16(t) * vx_setall_s16(ikx[i]);
+                }
+
+                v_store(&ibuf[r[k]][l], sum);
+            }
+
+            // tail (if any)
+            if (l < length)
+            {
+                GAPI_DbgAssert(length >= nlanes);
+                l = length - nlanes;
+            }
+        }
+    }
+
+    // vertical pass
+
+    for (int l = 0; l < length;)
+    {
+        // main part of output row
+        for (; l <= length - nlanes; l += nlanes)
+        {
+            v_int16 sum = vx_setzero_s16();
+
+            for (int i = 0; i < kyLen; ++i)
+            {
+                // previous, this, next rows
+                v_int16 s = vx_load(&ibuf[r[i]][l]);
+
+                sum += s * vx_setall_s16(iky[i]);
+            }
+
+            if (!noscale)
+            {
+                sum = v_mul_hi(sum << 1, vx_setall_s16(iscale)) + vx_setall_s16(idelta);
+            }
+
+            v_store(&out[l], sum);
+        }
+
+        // tail (if any)
+        if (l < length)
+        {
+            GAPI_DbgAssert(length >= nlanes);
+            l = length - nlanes;
+        }
+    }
+    return;
+}
+#endif // USE_SEPFILTER5X5_CHAR2SHORT
+
+#endif // CV_SIMD
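+
+// Portable reference implementation: the scalar fallback that
+// run_sepfilter5x5_code() below uses when CV_SIMD is off, when the DST/SRC
+// pair has no specialized variant above, or when the row is shorter than one
+// vector register.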
+
+template<bool noscale, typename DST, typename SRC>
+static void run_sepfilter5x5_reference(DST out[], const SRC *in[], int width, int chan,
+                                       const float kx[], const float ky[], int border,
+                                       float scale, float delta,
+                                       float *buf[], int y, int y0)
+{
+    constexpr int kxLen = 5;  // kernel size
+    constexpr int kyLen = kxLen;
+
+    int r[kyLen];
+    for (int n = 0; n < kyLen; ++n)
+    {
+        r[n] = (y - y0 + n) % 5;  // previous, this, next rows
+    }
+
+    int length = width * chan;
+    int shift = chan;
+
+    // horizontal pass
+
+    // full horizontal pass is needed only for the very 1st row in the ROI;
+    // for the 2nd and further rows it is enough to convolve only the "next"
+    // row, as we can reuse buffers from previous calls to this kernel
+    // (Fluid processes rows consecutively: y = y0, y0+1, ...)
+
+    int k0 = (y == y0) ? 0 : 4;
+
+    for (int k = k0; k < kyLen; ++k)
+    {
+        const SRC *s[kxLen] = { nullptr };
+
+        for (int i = 0; i < kxLen; ++i)
+        {
+            // previous, this, next pixels
+            s[i] = in[k] + (i - border)*shift;
+        }
+
+        // rely on compiler vectorizing
+        for (int l = 0; l < length; ++l)
+        {
+            float sum = 0;
+            for (int i = 0; i < kxLen; ++i)
+            {
+                sum += s[i][l] * kx[i];
+            }
+            buf[r[k]][l] = sum;
+        }
+    }
+
+    // vertical pass
+
+    for (int l = 0; l < length; ++l)
+    {
+        float sum = 0;
+        for (int j = 0; j < kyLen; ++j)
+        {
+            sum += buf[r[j]][l] * ky[j];
+        }
+
+        if (!noscale)
+        {
+            sum = sum * scale + delta;
+        }
+
+        out[l] = saturate<DST>(sum, rintf);
+    }
+    return;
+}
+
+template<bool noscale, typename DST, typename SRC>
+static void run_sepfilter5x5_code(DST out[], const SRC *in[], int width, int chan,
+                                  const float kx[], const float ky[], int border,
+                                  float scale, float delta,
+                                  float *buf[], int y, int y0)
+{
+#if CV_SIMD
+    int length = width * chan;
+
+    // length variable may be unused if types do not match at 'if' statements below
+    (void) length;
+
+    if (std::is_same<DST, short>::value && std::is_same<SRC, uchar>::value &&
+        length >= v_int16::nlanes)
+    {
+        run_sepfilter5x5_char2short<noscale>(reinterpret_cast<short*>(out),
+                                             reinterpret_cast<const uchar**>(in),
+                                             width, chan, kx, ky, border, scale, delta,
+                                             buf, y, y0);
+        return;
+    }
+
+    if (std::is_same<DST, float>::value && std::is_same<SRC, float>::value &&
+        length >= v_float32::nlanes)
+    {
+        run_sepfilter5x5_any2float<noscale>(reinterpret_cast<float*>(out), in, width,
+                                            chan, kx, ky, border, scale, delta);
+        return;
+    }
+
+    if (std::is_same<DST, short>::value && length >= v_int16::nlanes)
+    {
+        run_sepfilter5x5_any2short<noscale>(reinterpret_cast<short*>(out), in, width,
+                                            chan, kx, ky, border, scale, delta,
+                                            buf, y, y0);
+        return;
+    }
+
+    if (std::is_same<DST, ushort>::value && length >= v_uint16::nlanes)
+    {
+        run_sepfilter5x5_any2short<noscale>(reinterpret_cast<ushort*>(out), in, width,
+                                            chan, kx, ky, border, scale, delta,
+                                            buf, y, y0);
+        return;
+    }
+
+    if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
+    {
+        run_sepfilter5x5_any2char<noscale>(reinterpret_cast<uchar*>(out), in, width,
+                                           chan, kx, ky, border, scale, delta,
+                                           buf, y, y0);
+        return;
+    }
+#endif // CV_SIMD
+
+    // reference code is quite fast for the any-to-float case,
+    // but not for any-to-integral due to very slow rounding
+    run_sepfilter5x5_reference<noscale>(out, in, width, chan, kx, ky, border,
+                                        scale, delta, buf, y, y0);
+}
+
+#define RUN_SEPFILTER5x5_IMPL(DST, SRC)                                      \
+void run_sepfilter5x5_impl(DST out[], const SRC *in[], int width, int chan,  \
+                           const float kx[], const float ky[], int border,   \
+                           float scale, float delta,                         \
+                           float *buf[], int y, int y0)                      \
+{                                                                            \
+    if (scale == 1 && delta == 0)                                            \
+    {                                                                        \
+        constexpr bool noscale = true;                                       \
+        run_sepfilter5x5_code<noscale>(out, in, width, chan, kx, ky, border, \
+                                       scale, delta, buf, y, y0);            \
+    }                                                                        \
+    else                                                                     \
+    {                                                                        \
+        constexpr bool noscale = false;                                      \
+        run_sepfilter5x5_code<noscale>(out, in, width, chan, kx, ky, border, \
+                                       scale, delta, buf, y, y0);            \
+    }                                                                        \
+}
+
+RUN_SEPFILTER5x5_IMPL(uchar, uchar)
+RUN_SEPFILTER5x5_IMPL(short, uchar)
+RUN_SEPFILTER5x5_IMPL(float, uchar)
+RUN_SEPFILTER5x5_IMPL(ushort, ushort)
+RUN_SEPFILTER5x5_IMPL(short, ushort)
+RUN_SEPFILTER5x5_IMPL(float, ushort)
+RUN_SEPFILTER5x5_IMPL(short, short)
+RUN_SEPFILTER5x5_IMPL(float, short)
+RUN_SEPFILTER5x5_IMPL(float, float)
+
+#undef RUN_SEPFILTER5x5_IMPL
+
 //-------------------------
 //
 // Fluid kernels: Filter 2D