From cc5190eb911c77fb5a6eb63567021fc223acf705 Mon Sep 17 00:00:00 2001
From: Evgeny Latkin
Date: Tue, 13 Nov 2018 17:48:10 +0300
Subject: [PATCH] Merge pull request #13133 from elatkin:el/gapi_perf_sobel_2

GAPI (fluid): Sobel 3x3 optimization: CV_SIMD dynamic dispatching (#13133)

* GAPI (fluid): Sobel 3x3: remove template for run_sobel_row()

* GAPI (fluid): Sobel 3x3: dynamic dispatching of CV_SIMD code

* GAPI (fluid): Sobel 3x3 optimization: fixed CV_SIMD dynamic dispatcher
---
 modules/gapi/CMakeLists.txt                   |   5 +-
 .../gapi/src/backends/fluid/gfluidimgproc.cpp |   2 +-
 .../fluid/gfluidimgproc_func.dispatch.cpp     |  63 ++++
 .../src/backends/fluid/gfluidimgproc_func.hpp |  24 +-
 .../fluid/gfluidimgproc_func.simd.hpp         | 296 ++++++++++++++++++
 5 files changed, 382 insertions(+), 8 deletions(-)
 create mode 100644 modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
 create mode 100644 modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp

diff --git a/modules/gapi/CMakeLists.txt b/modules/gapi/CMakeLists.txt
index 74b73d1834..ec05b385cd 100644
--- a/modules/gapi/CMakeLists.txt
+++ b/modules/gapi/CMakeLists.txt
@@ -69,7 +69,7 @@ set(gapi_srcs
     src/backends/fluid/gfluidbuffer.cpp
     src/backends/fluid/gfluidbackend.cpp
     src/backends/fluid/gfluidimgproc.cpp
-    src/backends/fluid/gfluidimgproc_func.cpp
+    src/backends/fluid/gfluidimgproc_func.dispatch.cpp
     src/backends/fluid/gfluidcore.cpp
 
     # GPU Backend (currently built-in)
@@ -78,12 +78,13 @@ set(gapi_srcs
     src/backends/gpu/ggpuimgproc.cpp
     src/backends/gpu/ggpucore.cpp
 
-    # Compound
     src/backends/common/gcompoundbackend.cpp
     src/backends/common/gcompoundkernel.cpp
     )
 
+ocv_add_dispatched_file(backends/fluid/gfluidimgproc_func SSE4_1 AVX2)
+
 ocv_list_add_prefix(gapi_srcs "${CMAKE_CURRENT_LIST_DIR}/")
 
 # For IDE users
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
index 3c7a7d0111..71be5e02f8 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
@@ -768,7 +768,7 @@ static void run_sobel(Buffer& dst,
     int y0 = dst.priv().writeStart();
 //  int y1 = dst.priv().writeEnd();
 
-    run_sobel_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
+    run_sobel_row(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
 }
 
 GAPI_FLUID_KERNEL(GFluidSobel, cv::gapi::imgproc::GSobel, true)
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
new file mode 100644
index 0000000000..46cb19c911
--- /dev/null
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
@@ -0,0 +1,63 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+#if !defined(GAPI_STANDALONE)
+
+#include "gfluidimgproc_func.hpp"
+#include "gfluidimgproc_func.simd.hpp"
+#if 1
+    // NB: workaround for CV_SIMD bug (or feature?):
+    // - dynamic dispatcher assumes *.simd.hpp is directly in src dir
+    #include "backends/fluid/gfluidimgproc_func.simd_declarations.hpp"
+#else
+    #include "gfluidimgproc_func.simd_declarations.hpp"
+#endif
+
+#include "gfluidutils.hpp"
+
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/hal/intrin.hpp"
+
+#include
+#include
+
+#ifdef __GNUC__
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wstrict-overflow"
+#endif
+
+namespace cv {
+namespace gapi {
+namespace fluid {
+
+#define RUN_SOBEL_ROW(DST, SRC)                                      \
+void run_sobel_row(DST out[], const SRC *in[], int width, int chan,  \
+                   const float kx[], const float ky[], int border,   \
+                   float scale, float delta, float *buf[],           \
+                   int y, int y0)                                    \
+{                                                                    \
+    CV_CPU_DISPATCH(run_sobel_row,                                   \
+        (out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0), \
+        CV_CPU_DISPATCH_MODES_ALL);                                  \
+}
+
+RUN_SOBEL_ROW(uchar , uchar )
+RUN_SOBEL_ROW(ushort, ushort)
+RUN_SOBEL_ROW( short, uchar )
+RUN_SOBEL_ROW( short, ushort)
+RUN_SOBEL_ROW( short, short)
+RUN_SOBEL_ROW( float, uchar )
+RUN_SOBEL_ROW( float, ushort)
+RUN_SOBEL_ROW( float, short)
+RUN_SOBEL_ROW( float, float)
+
+#undef RUN_SOBEL_ROW
+
+} // namespace fluid
+} // namespace gapi
+} // namespace cv
+
+#endif // !defined(GAPI_STANDALONE)
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
index f4d49c272e..4d94976987 100644
--- a/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
@@ -8,6 +8,8 @@
 
 #if !defined(GAPI_STANDALONE)
 
+#include "opencv2/core.hpp"
+
 namespace cv {
 namespace gapi {
 namespace fluid {
@@ -18,11 +20,23 @@ namespace fluid {
 //
 //---------------------
 
-template<typename DST, typename SRC>
-void run_sobel_impl(DST out[], const SRC *in[], int width, int chan,
-                    const float kx[], const float ky[], int border,
-                    float scale, float delta, float *buf[],
-                    int y, int y0);
+#define RUN_SOBEL_ROW(DST, SRC)                                      \
+void run_sobel_row(DST out[], const SRC *in[], int width, int chan,  \
+                   const float kx[], const float ky[], int border,   \
+                   float scale, float delta, float *buf[],           \
+                   int y, int y0);
+
+RUN_SOBEL_ROW(uchar , uchar )
+RUN_SOBEL_ROW(ushort, ushort)
+RUN_SOBEL_ROW( short, uchar )
+RUN_SOBEL_ROW( short, ushort)
+RUN_SOBEL_ROW( short, short)
+RUN_SOBEL_ROW( float, uchar )
+RUN_SOBEL_ROW( float, ushort)
+RUN_SOBEL_ROW( float, short)
+RUN_SOBEL_ROW( float, float)
+
+#undef RUN_SOBEL_ROW
 
 } // namespace fluid
 } // namespace gapi
diff --git a/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
new file mode 100644
index 0000000000..55098b3284
--- /dev/null
+++ b/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
@@ -0,0 +1,296 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+// NB: allow including this *.hpp several times!
+// #pragma once -- don't: this file is NOT once!
+
+#if !defined(GAPI_STANDALONE)
+
+#include "opencv2/gapi/own/saturate.hpp"
+
+#include "opencv2/core.hpp"
+#include "opencv2/core/hal/intrin.hpp"
+
+#ifdef __GNUC__
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wstrict-overflow"
+#endif
+
+namespace cv {
+namespace gapi {
+namespace fluid {
+
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+
+//----------------------------------------------------------------------
+
+#define RUN_SOBEL_ROW(DST, SRC)                                      \
+void run_sobel_row(DST out[], const SRC *in[], int width, int chan,  \
+                   const float kx[], const float ky[], int border,   \
+                   float scale, float delta, float *buf[],           \
+                   int y, int y0);
+
+RUN_SOBEL_ROW(uchar , uchar )
+RUN_SOBEL_ROW(ushort, ushort)
+RUN_SOBEL_ROW( short, uchar )
+RUN_SOBEL_ROW( short, ushort)
+RUN_SOBEL_ROW( short, short)
+RUN_SOBEL_ROW( float, uchar )
+RUN_SOBEL_ROW( float, ushort)
+RUN_SOBEL_ROW( float, short)
+RUN_SOBEL_ROW( float, float)
+
+#undef RUN_SOBEL_ROW
+
+//----------------------------------------------------------------------
+#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+// Sobel 3x3: vertical pass
+template<bool noscale, typename DST>
+static void run_sobel3x3_vert(DST out[], int length, const float ky[],
+                              float scale, float delta, const int r[], float *buf[])
+{
+    float ky0 = ky[0],
+          ky1 = ky[1],
+          ky2 = ky[2];
+
+    int r0 = r[0],
+        r1 = r[1],
+        r2 = r[2];
+
+#if CV_SIMD
+    // for floating-point output,
+    // manual vectoring may be not better than compiler's optimization
+#define EXPLICIT_SIMD_32F 0  // 1=vectorize 32f case explicitly, 0=don't
+#if     EXPLICIT_SIMD_32F
+    if (std::is_same<DST, float>::value && length >= v_int16::nlanes)
+    {
+        constexpr static int nlanes = v_float32::nlanes;
+
+        for (int l=0; l < length; )
+        {
+            for (; l <= length - nlanes; l += nlanes)
+            {
+                v_float32 sum = vx_load(&buf[r0][l]) * vx_setall_f32(ky0);
+                sum = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum);
+                sum = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum);
+
+                if (!noscale)
+                {
+                    sum = v_fma(sum, vx_setall_f32(scale), vx_setall_f32(delta));
+                }
+
+                v_store(reinterpret_cast<float*>(&out[l]), sum);
+            }
+
+            if (l < length)
+            {
+                // tail: recalculate last pixels
+                GAPI_DbgAssert(length >= nlanes);
+                l = length - nlanes;
+            }
+        }
+
+        return;
+    }
+#endif
+
+    if ((std::is_same<DST, short>::value || std::is_same<DST, ushort>::value)
+        && length >= v_int16::nlanes)
+    {
+        constexpr static int nlanes = v_int16::nlanes;
+
+        for (int l=0; l < length; )
+        {
+            for (; l <= length - nlanes; l += nlanes)
+            {
+                v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0);
+                          sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0);
+                          sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0);
+
+                v_float32 sum1 = vx_load(&buf[r0][l + nlanes/2]) * vx_setall_f32(ky0);
+                          sum1 = v_fma(vx_load(&buf[r1][l + nlanes/2]), vx_setall_f32(ky1), sum1);
+                          sum1 = v_fma(vx_load(&buf[r2][l + nlanes/2]), vx_setall_f32(ky2), sum1);
+
+                if (!noscale)
+                {
+                    sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
+                    sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
+                }
+
+                v_int32 isum0 = v_round(sum0),
+                        isum1 = v_round(sum1);
+
+                if (std::is_same<DST, short>::value)
+                {
+                    // signed short
+                    v_int16 res = v_pack(isum0, isum1);
+                    v_store(reinterpret_cast<short*>(&out[l]), res);
+                } else
+                {
+                    // unsigned short
+                    v_uint16 res = v_pack_u(isum0, isum1);
+                    v_store(reinterpret_cast<ushort*>(&out[l]), res);
+                }
+            }
+
+            if (l < length)
+            {
+                // tail: recalculate last pixels
+                GAPI_DbgAssert(length >= nlanes);
+                l = length - nlanes;
+            }
+        }
+
+        return;
+    }
+
+    if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
+    {
+        constexpr static int nlanes = v_uint8::nlanes;
+
+        for (int l=0; l < length; )
+        {
+            for (; l <= length - nlanes; l += nlanes)
+            {
+                v_float32 sum0 = vx_load(&buf[r0][l]) * vx_setall_f32(ky0);
+                          sum0 = v_fma(vx_load(&buf[r1][l]), vx_setall_f32(ky1), sum0);
+                          sum0 = v_fma(vx_load(&buf[r2][l]), vx_setall_f32(ky2), sum0);
+
+                v_float32 sum1 = vx_load(&buf[r0][l + nlanes/4]) * vx_setall_f32(ky0);
+                          sum1 = v_fma(vx_load(&buf[r1][l + nlanes/4]), vx_setall_f32(ky1), sum1);
+                          sum1 = v_fma(vx_load(&buf[r2][l + nlanes/4]), vx_setall_f32(ky2), sum1);
+
+                v_float32 sum2 = vx_load(&buf[r0][l + 2*nlanes/4]) * vx_setall_f32(ky0);
+                          sum2 = v_fma(vx_load(&buf[r1][l + 2*nlanes/4]), vx_setall_f32(ky1), sum2);
+                          sum2 = v_fma(vx_load(&buf[r2][l + 2*nlanes/4]), vx_setall_f32(ky2), sum2);
+
+                v_float32 sum3 = vx_load(&buf[r0][l + 3*nlanes/4]) * vx_setall_f32(ky0);
+                          sum3 = v_fma(vx_load(&buf[r1][l + 3*nlanes/4]), vx_setall_f32(ky1), sum3);
+                          sum3 = v_fma(vx_load(&buf[r2][l + 3*nlanes/4]), vx_setall_f32(ky2), sum3);
+
+                if (!noscale)
+                {
+                    sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
+                    sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
+                    sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta));
+                    sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta));
+                }
+
+                v_int32 isum0 = v_round(sum0),
+                        isum1 = v_round(sum1),
+                        isum2 = v_round(sum2),
+                        isum3 = v_round(sum3);
+
+                v_int16 ires0 = v_pack(isum0, isum1),
+                        ires1 = v_pack(isum2, isum3);
+
+                v_uint8 res = v_pack_u(ires0, ires1);
+                v_store(reinterpret_cast<uchar*>(&out[l]), res);
+            }
+
+            if (l < length)
+            {
+                // tail: recalculate last pixels
+                GAPI_DbgAssert(length >= nlanes);
+                l = length - nlanes;
+            }
+        }
+
+        return;
+    }
+#endif
+
+    // reference code
+    for (int l=0; l < length; l++)
+    {
+        float sum = buf[r0][l]*ky0 + buf[r1][l]*ky1 + buf[r2][l]*ky2;
+
+        if (!noscale)
+        {
+            sum = sum*scale + delta;
+        }
+
+        out[l] = cv::gapi::own::saturate<DST>(sum, rintf);
+    }
+}
+
+template<typename DST, typename SRC>
+static void run_sobel_impl(DST out[], const SRC *in[], int width, int chan,
+                           const float kx[], const float ky[], int border,
+                           float scale, float delta, float *buf[],
+                           int y, int y0)
+{
+    int r[3];
+    r[0] = (y - y0    ) % 3;  // buf[r[0]]: previous
+    r[1] = (y - y0 + 1) % 3;  //            this
+    r[2] = (y - y0 + 2) % 3;  //            next row
+
+    int length = width * chan;
+
+    // horizontal pass
+
+    // full horizontal pass is needed only if very 1st row in ROI;
+    // for 2nd and further rows, it is enough to convolve only the
+    // "next" row - as we can reuse buffers from previous calls to
+    // this kernel (note that Fluid processes rows consequently)
+    int k0 = (y == y0)? 0: 2;
+
+    for (int k = k0; k < 3; k++)
+    {
+        //                          previous, this , next pixel
+        const SRC *s[3] = {in[k] - border*chan , in[k], in[k] + border*chan};
+
+        // rely on compiler vectoring
+        for (int l=0; l < length; l++)
+        {
+            buf[r[k]][l] = s[0][l]*kx[0] + s[1][l]*kx[1] + s[2][l]*kx[2];
+        }
+    }
+
+    // vertical pass
+    if (scale == 1 && delta == 0)
+    {
+        constexpr static bool noscale = true;  // omit scaling
+        run_sobel3x3_vert<noscale>(out, length, ky, scale, delta, r, buf);
+    } else
+    {
+        constexpr static bool noscale = false;  // do scaling
+        run_sobel3x3_vert<noscale>(out, length, ky, scale, delta, r, buf);
+    }
+}
+
+#define RUN_SOBEL_ROW(DST, SRC)                                      \
+void run_sobel_row(DST out[], const SRC *in[], int width, int chan,  \
+                   const float kx[], const float ky[], int border,   \
+                   float scale, float delta, float *buf[],           \
+                   int y, int y0)                                    \
+{                                                                    \
+    run_sobel_impl(out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0); \
+}
+
+RUN_SOBEL_ROW(uchar , uchar )
+RUN_SOBEL_ROW(ushort, ushort)
+RUN_SOBEL_ROW( short, uchar )
+RUN_SOBEL_ROW( short, ushort)
+RUN_SOBEL_ROW( short, short)
+RUN_SOBEL_ROW( float, uchar )
+RUN_SOBEL_ROW( float, ushort)
+RUN_SOBEL_ROW( float, short)
+RUN_SOBEL_ROW( float, float)
+
+#undef RUN_SOBEL_ROW
+
+#endif  // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+//----------------------------------------------------------------------
+
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+
+} // namespace fluid
+} // namespace gapi
+} // namespace cv
+
+#endif // !defined(GAPI_STANDALONE)
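
Note for reviewers: run_sobel_impl() above keeps a three-row ring buffer, does the horizontal 1x3 convolution once per row, and only re-runs the vertical 1x3 combination over the buffered rows; on all but the first row of the ROI only the "next" row is convolved (k0 = (y == y0) ? 0 : 2). The standalone scalar sketch below mirrors that two-pass structure for a whole single-channel float image with replicated borders; it is illustrative only (conv3_horizontal, sobel3x3 and the ramp test are made-up names and data, not part of this patch).

#include <algorithm>
#include <cstdio>
#include <vector>

// Horizontal pass for one row: 1x3 convolution with kx, border pixels replicated.
static void conv3_horizontal(const float* row, float* out_buf, int width, const float kx[3])
{
    for (int x = 0; x < width; x++)
    {
        int xl = std::max(x - 1, 0);          // replicate left border
        int xr = std::min(x + 1, width - 1);  // replicate right border
        out_buf[x] = row[xl]*kx[0] + row[x]*kx[1] + row[xr]*kx[2];
    }
}

// Separable 3x3 filter with a 3-row ring buffer, processing rows top-to-bottom
// the way the Fluid kernel does (one output row per iteration).
static void sobel3x3(const float* src, float* dst, int width, int height,
                     const float kx[3], const float ky[3])
{
    std::vector<float> ring(3 * width);  // three horizontally filtered rows
    float* buf[3] = { &ring[0], &ring[width], &ring[2*width] };

    for (int y = 0; y < height; y++)
    {
        // source rows y-1, y, y+1 with replicated top/bottom border
        int rows[3] = { std::max(y - 1, 0), y, std::min(y + 1, height - 1) };

        // horizontal pass: fill all three slots on the first row,
        // afterwards only the "next" row is new (slots rotate modulo 3)
        int k0 = (y == 0) ? 0 : 2;
        for (int k = k0; k < 3; k++)
            conv3_horizontal(&src[rows[k] * width], buf[(y + k) % 3], width, kx);

        // vertical pass: combine the three buffered rows with ky
        for (int x = 0; x < width; x++)
            dst[y * width + x] = buf[y % 3][x]*ky[0]
                               + buf[(y + 1) % 3][x]*ky[1]
                               + buf[(y + 2) % 3][x]*ky[2];
    }
}

int main()
{
    const int W = 8, H = 6;
    std::vector<float> src(W * H), dst(W * H);
    for (int i = 0; i < W * H; i++) src[i] = float(i % W);  // horizontal ramp

    const float kx[3] = { -1.f, 0.f, 1.f };  // Sobel derivative kernel (dx)
    const float ky[3] = {  1.f, 2.f, 1.f };  // Sobel smoothing kernel

    sobel3x3(src.data(), dst.data(), W, H, kx, ky);
    std::printf("dst(3,3) = %g\n", dst[3 * W + 3]);  // interior pixel: expect 8
    return 0;
}

In the real kernel the Fluid runtime supplies the three input row pointers and the scratch buffers, so each call of run_sobel_row() produces exactly one output row.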
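To exercise the dispatched code end to end, one can build a minimal G-API graph around cv::gapi::Sobel and request the Fluid backend, so GFluidSobel (and therefore the new run_sobel_row() wrappers, with the CV_SIMD variant picked at run time by CV_CPU_DISPATCH) is what actually executes. The sketch below is a rough usage example, not part of the patch; the header <opencv2/gapi/fluid/imgproc.hpp> and the package function cv::gapi::imgproc::fluid::kernels() are assumptions about where the Fluid imgproc kernel package is exposed.

#include <opencv2/core.hpp>
#include <opencv2/gapi.hpp>
#include <opencv2/gapi/imgproc.hpp>
#include <opencv2/gapi/fluid/imgproc.hpp>  // assumed location of the Fluid imgproc kernels

int main()
{
    // Graph: 8-bit input, 16-bit signed Sobel dx output (the (short, uchar) instantiation)
    cv::GMat in;
    cv::GMat out = cv::gapi::Sobel(in, CV_16S, 1, 0, 3);
    cv::GComputation sobel(cv::GIn(in), cv::GOut(out));

    cv::Mat src(480, 640, CV_8UC1, cv::Scalar(0)), dst;

    // compile_args with the Fluid kernel package forces the Fluid backend
    sobel.apply(src, dst, cv::compile_args(cv::gapi::imgproc::fluid::kernels()));
    return 0;
}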