From 840b1d5c94325eac6158a5a3cd1d57bee2c25aa1 Mon Sep 17 00:00:00 2001
From: zihaomu
Date: Wed, 11 Jan 2023 08:42:51 +0800
Subject: [PATCH] add depthwise add fuse

---
 .../fast_convolution/depthwise_convolution.cpp | 40 +++++++++++++++----
 .../fast_convolution/fast_convolution.cpp      |  4 +-
 .../fast_convolution/fast_convolution.hpp      |  2 +-
 modules/dnn/test/test_onnx_importer.cpp        |  5 +++
 4 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp b/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
index 0c471e8920..4566c880c9 100644
--- a/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
+++ b/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
@@ -24,7 +24,7 @@ static void depthWiseBlockConv2D(const float* wptr,
                                  const float* inptr_,
                                  int height, int width,
                                  float* outptr_,
-                                 int out_d, int outH, int outW)
+                                 int out_d, int outH, int outW, bool fusedAdd)
 {
     const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
                 w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
@@ -57,6 +57,8 @@
             out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
                   imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
                   imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
+            if (fusedAdd)
+                out += outptr[0];
             if (relu)
                 out = out > 0.f ? out : out*relu_coeff;
             outptr[0] = out;
@@ -65,6 +67,10 @@
 #if CV_SIMD128
         const int VEC_NLANES = 4;
+
+        if (fusedAdd)
+            outW1 = max(out_j, outW1 - outW1%VEC_NLANES);
+
         v_float32x4 vw00 = v_setall_f32(w00);
         v_float32x4 vw01 = v_setall_f32(w01);
         v_float32x4 vw02 = v_setall_f32(w02);
@@ -104,6 +110,8 @@
             v_float32x4 vout = v00*vw00 + v01*vw01 + v02*vw02 +
                                v10*vw10 + v11*vw11 + v12*vw12 +
                                v20*vw20 + v21*vw21 + v22*vw22 + vbias;
+            if (fusedAdd)
+                vout = v_load(outptr + out_j) + vout;
             if (relu)
                 vout = v_select(vout > z, vout, vout*vrc);
             v_store(outptr + out_j, vout);
@@ -134,6 +142,8 @@
                                    v10 * vw10 + v11 * vw11 + v12 * vw12 +
                                    v20 * vw20 + v21 * vw21 + v22 * vw22 + vbias;
+                if (fusedAdd)
+                    vout = v_load(outptr + out_j) + vout;
                 if (relu)
                     vout = v_select(vout > z, vout, vout*vrc);
                 v_store(outptr + out_j, vout);
@@ -148,6 +158,8 @@
             out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
                   imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
                   imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
+            if (fusedAdd)
+                out += outptr[out_j];
             if (relu)
                 out = out > 0.f ? out : out*relu_coeff;
             outptr[out_j] = out;
@@ -175,6 +187,8 @@
             out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
                   imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
                   imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
+            if (fusedAdd)
+                out += outptr[out_j];
             if (relu)
                 out = out > 0.f ? out : out*relu_coeff;
             outptr[out_j] = out;
@@ -187,7 +201,7 @@ static void depthWiseBlockConv1D(const float* wptr,
                                  const float* biasptr, const float* relu,
                                  const float* inptr_, int width,
                                  float* outptr_,
-                                 int out_d, int outW)
+                                 int out_d, int outW, bool fusedAdd)
 {
     const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2];
     int outW1 = min(outW, (width - dilation_w * (kernel_w - 1) + pad_l)/stride_w);
@@ -201,7 +215,8 @@
         if (pad_l > 0)
         {
             out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + bias;
-
+            if (fusedAdd)
+                out += outptr[0];
             if (relu)
                 out = out > 0.f ? out : out*relu_coeff;
             outptr[0] = out;
@@ -210,6 +225,8 @@
 #if CV_SIMD128
         const int VEC_NLANES = 4;
+        if (fusedAdd)
+            outW1 = max(out_j, outW1 - outW1%VEC_NLANES);
         v_float32x4 vw00 = v_setall_f32(w00);
         v_float32x4 vw01 = v_setall_f32(w01);
         v_float32x4 vw02 = v_setall_f32(w02);
@@ -235,6 +252,8 @@
             v02 = v_load(imgptr0 + in_j + dilation_w*2);
             v_float32x4 vout = v00*vw00 + v01*vw01 + v02*vw02 + vbias;
+            if (fusedAdd)
+                vout = v_load(outptr + out_j) + vout;
             if (relu)
                 vout = v_select(vout > z, vout, vout*vrc);
             v_store(outptr + out_j, vout);
@@ -258,6 +277,9 @@
                 v_float32x4 vout = v00 * vw00 + v01 * vw01 + v02 * vw02 + vbias;
+                if (fusedAdd)
+                    vout = v_load(outptr + out_j) + vout;
+
                 if (relu)
                     vout = v_select(vout > z, vout, vout*vrc);
                 v_store(outptr + out_j, vout);
@@ -270,6 +292,8 @@
         {
             int in_j = out_j * stride_w - pad_l;
             out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + bias;
+            if (fusedAdd)
+                out += outptr[out_j];
             if (relu)
                 out = out > 0.f ? out : out*relu_coeff;
             outptr[out_j] = out;
@@ -295,6 +319,8 @@
                 s2 = 0.f;
             }
             out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + bias;
+            if (fusedAdd)
+                out += outptr[out_j];
             if (relu)
                 out = out > 0.f ? out : out*relu_coeff;
             outptr[out_j] = out;
@@ -302,7 +328,7 @@
 }

 void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv>& conv, ActivationLayer* activ_,
-                  const std::vector<float>& reluslope)
+                  const std::vector<float>& reluslope, bool fusedAdd)
 {
     Mat input = _input.getMat();
     Mat output = _output.getMat();
@@ -349,7 +375,7 @@

 #if CV_TRY_AVX2 || CV_TRY_AVX || CV_TRY_RVV
     // TODO: remove the following limitation, need change code in layers_common.simd.hpp.
-    bool canRunOpt = Wi >= 16 + dilation_w*(Wk - 1);
+    bool canRunOpt = Wi >= 16 + dilation_w*(Wk - 1) && !fusedAdd;
 #endif
     std::vector<int> ofstab_(3 * ksize, 0);
     int *ofstab = ofstab_.data();
@@ -399,11 +425,11 @@
             else
 #endif
             depthWiseBlockConv2D(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w,
-                                 pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0);
+                                 pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0, fusedAdd);
         }
         else // conv_dim == CONV_1D, spatial branch for depth-wise Conv1D.
         {
-            depthWiseBlockConv1D(weights, Wk, stride_w, dilation_w, pad_left, bias, relu, inptr0, Wi, outptr0, c, W0);
+            depthWiseBlockConv1D(weights, Wk, stride_w, dilation_w, pad_left, bias, relu, inptr0, Wi, outptr0, c, W0, fusedAdd);
         }

         if (activ)
diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
index 1cde7b324f..feda70a598 100644
--- a/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
+++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
@@ -369,8 +369,8 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
     if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE)
     {
         // Depthwise-Convolution layer should not be followed by Add layer.
-        CV_Assert(fusedAddMat.empty() && (conv_dim == CONV_1D || conv_dim == CONV_2D));
-        return runDepthwise(input, output, conv,actLayer.get(), reluslope);
+        CV_Assert((conv_dim == CONV_1D || conv_dim == CONV_2D));
+        return runDepthwise(input, output, conv, actLayer.get(), reluslope, fusedAdd);
     }

     MatShape inputShape = shape(input);
diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp
index 895ad562bb..7794078bb4 100644
--- a/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp
+++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp
@@ -100,7 +100,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
                  const Ptr<ActivationLayer>& actLayer, const std::vector<float>& reluslope, bool fusedAdd);

 void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv>& conv, ActivationLayer* activ,
-                  const std::vector<float>& reluslope);
+                  const std::vector<float>& reluslope, bool fusedAdd);

 int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv>& conv,
                   int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct);
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 12dc3987b9..0ffa252c71 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -1726,6 +1726,11 @@ TEST_P(Test_ONNX_layers, ConvResizePool1d)
     testONNXModels("conv_resize_pool_1d");
 }

+TEST_P(Test_ONNX_layers, DepthWiseAdd)
+{
+    testONNXModels("depthwiseconv_add");
+}
+
 TEST_P(Test_ONNX_layers, SubFromConst)
 {
     testONNXModels("sub_from_const1");
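
What the new flag does: when OpenCV's graph fusion folds an elementwise Add (typically a residual shortcut, Conv -> Add) into the preceding depthwise convolution, the output Mat arrives at runDepthwise already filled with the Add operand, and fusedAdd == true tells the kernels to accumulate the convolution result on top of it instead of overwriting it. A minimal scalar sketch of that contract (hypothetical names, not the patched kernels themselves):

    // One output row of a 1x3 depthwise convolution, stride 1, no padding.
    // With fusedAdd, 'out' already holds the Add operand and is accumulated into.
    static void dwRowConv3(const float* in, const float* w, float bias,
                           int outW, bool fusedAdd, float* out)
    {
        for (int j = 0; j < outW; ++j)
        {
            float s = in[j]*w[0] + in[j + 1]*w[1] + in[j + 2]*w[2] + bias;
            if (fusedAdd)
                s += out[j];   // accumulate into the pre-filled output
            out[j] = s;        // otherwise plain overwrite, as before
        }
    }

This is also why the AVX2/AVX/RVV fast path is gated on !fusedAdd above: per the TODO, the optimized kernels in layers_common.simd.hpp do not yet support the accumulate semantics.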
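
The outW1 clamp added to both SIMD128 blocks, outW1 = max(out_j, outW1 - outW1%VEC_NLANES), deserves a note. The vector loops appear to use the common overlapping-tail trick: when fewer than VEC_NLANES outputs remain, out_j is stepped back so the final v_store stays in bounds, recomputing a few already-stored elements. That is harmless for a plain convolution, but once fusedAdd makes the kernel read the output buffer, the overlapped elements would have the residual added twice. Rounding outW1 down to a whole number of lanes hands that tail to the scalar loop instead. A self-contained sketch of the hazard and the fix (the loop shape is assumed; the 4-lane vector step is modeled with a plain inner loop):

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const int VEC_NLANES = 4, outW = 6;
        float conv[outW] = {1, 1, 1, 1, 1, 1};       // pretend conv results
        float out[outW]  = {10, 10, 10, 10, 10, 10}; // pre-filled residual
        const bool fusedAdd = true;

        int out_j = 0, outW1 = outW;
        if (fusedAdd)  // the fix from the patch
            outW1 = std::max(out_j, outW1 - outW1 % VEC_NLANES);

        for (; out_j < outW1; out_j += VEC_NLANES)
        {
            if (out_j + VEC_NLANES > outW1)
                out_j = outW1 - VEC_NLANES;      // overlap: redoes elements
            for (int k = 0; k < VEC_NLANES; ++k) // stand-in for one v_store
                out[out_j + k] = conv[out_j + k] + (fusedAdd ? out[out_j + k] : 0.f);
        }
        for (; out_j < outW; ++out_j)            // scalar tail handles the rest
            out[out_j] = conv[out_j] + (fusedAdd ? out[out_j] : 0.f);

        for (int j = 0; j < outW; ++j)
            std::printf("%g ", out[j]);  // prints all 11s; without the clamp,
        std::printf("\n");               // the overlapped pair would be 12
        return 0;
    }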
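
On the test side, the new DepthWiseAdd case goes through testONNXModels("depthwiseconv_add"), so a matching depthwiseconv_add ONNX graph and reference blobs are expected in the opencv_extra testdata (under dnn/onnx/). Outside the test harness, the fused path can be exercised roughly like this; the model filename and the 1x3x8x8 input shape are placeholders for illustration:

    #include <opencv2/dnn.hpp>
    #include <opencv2/core.hpp>
    #include <iostream>

    int main()
    {
        // Placeholder: a small graph of the form
        // Input -> depthwise Conv -> Add(residual) -> Output.
        cv::dnn::Net net = cv::dnn::readNetFromONNX("depthwiseconv_add.onnx");

        // Random NCHW blob; the shape must match the model's input.
        cv::Mat input(std::vector<int>{1, 3, 8, 8}, CV_32F);
        cv::randu(input, -1.f, 1.f);

        net.setInput(input);
        cv::Mat out = net.forward();  // CPU backend reaches runDepthwise with
                                      // fusedAdd=true after this patch
        std::cout << "output dims: " << out.dims << std::endl;
        return 0;
    }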