From a809ae4e88c96f07d70a1642ca6d7d2c71cca8ec Mon Sep 17 00:00:00 2001
From: HAN Liutong <liutong2020@iscas.ac.cn>
Date: Mon, 27 Mar 2023 21:30:47 +0800
Subject: [PATCH] Fix HAL compatibility layer and modify use cases.

---
 .../core/include/opencv2/core/hal/intrin.hpp  | 237 +++++++++++++++---
 .../src/layers/cpu_kernels/convolution.cpp    | 123 +++++----
 2 files changed, 266 insertions(+), 94 deletions(-)

diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp
index 207b8cab4e..ee8310b5c5 100644
--- a/modules/core/include/opencv2/core/hal/intrin.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin.hpp
@@ -758,6 +758,36 @@ namespace CV__SIMD_NAMESPACE {
     #if CV_SIMD_64F
     OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
     #endif
+    #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
+    // Also wrap the fixed-width 128-bit types when CV_SIMD128 is enabled but CV_SIMD_WIDTH is wider (256/512-bit SIMD, e.g. AVX2 or AVX512)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2)
+        #endif
+    #endif
+    #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
+    // Also wrap the fixed-width 256-bit types when CV_SIMD256 is enabled but CV_SIMD_WIDTH is wider (512-bit SIMD, e.g. AVX512)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4)
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4)
+        #endif
+    #endif
 
     #define OPENCV_HAL_WRAP_BIN_OP_LOGIC(_Tpvec) \
     inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \
@@ -785,6 +815,26 @@ namespace CV__SIMD_NAMESPACE {
     OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16)
     OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32)
     OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64)
+    #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x16)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x8)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x4)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x2)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x16)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x8)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x4)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x2)
+    #endif
+    #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x32)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x16)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x8)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x4)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x32)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x16)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x8)
+        OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x4)
+    #endif
 
     #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
     inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
@@ -805,17 +855,51 @@ namespace CV__SIMD_NAMESPACE {
     #if CV_SIMD_64F
     OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
     #endif
+    #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2)
+        #endif
+    #endif
+    #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8)
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4)
+        #endif
+    #endif
 
-
-    inline v_float32 v_div(const v_float32& a, const v_float32& b) \
+    #define OPENCV_HAL_WRAP_BIN_OP_DIV(_Tpvec) \
+    inline _Tpvec v_div(const _Tpvec& a, const _Tpvec& b) \
     { \
         return a / b; \
     }
+    OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32)
     #if CV_SIMD_64F
-    inline v_float64 v_div(const v_float64& a, const v_float64& b) \
-    { \
-        return a / b; \
-    }
+    OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64)
+    #endif
+    #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
+        OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x4)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x2)
+        #endif
+    #endif
+    #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
+        OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x8)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x4)
+        #endif
     #endif
 
     #define OPENCV_HAL_WRAP_CMP_OP(_Tpvec, intrin, op) \
@@ -844,44 +928,124 @@ namespace CV__SIMD_NAMESPACE {
     #if CV_SIMD_64F
     OPENCV_HAL_WRAP_CMP(v_float64)
     #endif
+    #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
+        OPENCV_HAL_WRAP_CMP(v_uint8x16)
+        OPENCV_HAL_WRAP_CMP(v_uint16x8)
+        OPENCV_HAL_WRAP_CMP(v_uint32x4)
+        OPENCV_HAL_WRAP_CMP(v_int8x16)
+        OPENCV_HAL_WRAP_CMP(v_int16x8)
+        OPENCV_HAL_WRAP_CMP(v_int32x4)
+        OPENCV_HAL_WRAP_CMP(v_float32x4)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_CMP(v_float64x2)
+        #endif
+    #endif
+    #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
+        OPENCV_HAL_WRAP_CMP(v_uint8x32)
+        OPENCV_HAL_WRAP_CMP(v_uint16x16)
+        OPENCV_HAL_WRAP_CMP(v_uint32x8)
+        OPENCV_HAL_WRAP_CMP(v_int8x32)
+        OPENCV_HAL_WRAP_CMP(v_int16x16)
+        OPENCV_HAL_WRAP_CMP(v_int32x8)
+        OPENCV_HAL_WRAP_CMP(v_float32x8)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_CMP(v_float64x4)
+        #endif
+    #endif
 
     //////////// get0 ////////////
-    #define OPENCV_HAL_WRAP_GRT0_INT(_Tpvec, _Tp) \
-    inline _Tp v_get0(const v_##_Tpvec& v) \
+    #define OPENCV_HAL_WRAP_GRT0(_Tpvec) \
+    inline typename VTraits<_Tpvec>::lane_type v_get0(const _Tpvec& v) \
     { \
         return v.get0(); \
     }
 
-    OPENCV_HAL_WRAP_GRT0_INT(uint8, uchar)
-    OPENCV_HAL_WRAP_GRT0_INT(int8, schar)
-    OPENCV_HAL_WRAP_GRT0_INT(uint16, ushort)
-    OPENCV_HAL_WRAP_GRT0_INT(int16, short)
-    OPENCV_HAL_WRAP_GRT0_INT(uint32, unsigned)
-    OPENCV_HAL_WRAP_GRT0_INT(int32, int)
-    OPENCV_HAL_WRAP_GRT0_INT(uint64, uint64)
-    OPENCV_HAL_WRAP_GRT0_INT(int64, int64)
-    OPENCV_HAL_WRAP_GRT0_INT(float32, float)
+    OPENCV_HAL_WRAP_GRT0(v_uint8)
+    OPENCV_HAL_WRAP_GRT0(v_int8)
+    OPENCV_HAL_WRAP_GRT0(v_uint16)
+    OPENCV_HAL_WRAP_GRT0(v_int16)
+    OPENCV_HAL_WRAP_GRT0(v_uint32)
+    OPENCV_HAL_WRAP_GRT0(v_int32)
+    OPENCV_HAL_WRAP_GRT0(v_uint64)
+    OPENCV_HAL_WRAP_GRT0(v_int64)
+    OPENCV_HAL_WRAP_GRT0(v_float32)
     #if CV_SIMD_64F
-    OPENCV_HAL_WRAP_GRT0_INT(float64, double)
+    OPENCV_HAL_WRAP_GRT0(v_float64)
+    #endif
+    #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
+        OPENCV_HAL_WRAP_GRT0(v_uint8x16)
+        OPENCV_HAL_WRAP_GRT0(v_uint16x8)
+        OPENCV_HAL_WRAP_GRT0(v_uint32x4)
+        OPENCV_HAL_WRAP_GRT0(v_uint64x2)
+        OPENCV_HAL_WRAP_GRT0(v_int8x16)
+        OPENCV_HAL_WRAP_GRT0(v_int16x8)
+        OPENCV_HAL_WRAP_GRT0(v_int32x4)
+        OPENCV_HAL_WRAP_GRT0(v_int64x2)
+        OPENCV_HAL_WRAP_GRT0(v_float32x4)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_GRT0(v_float64x2)
+        #endif
+    #endif
+    #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
+        OPENCV_HAL_WRAP_GRT0(v_uint8x32)
+        OPENCV_HAL_WRAP_GRT0(v_uint16x16)
+        OPENCV_HAL_WRAP_GRT0(v_uint32x8)
+        OPENCV_HAL_WRAP_GRT0(v_uint64x4)
+        OPENCV_HAL_WRAP_GRT0(v_int8x32)
+        OPENCV_HAL_WRAP_GRT0(v_int16x16)
+        OPENCV_HAL_WRAP_GRT0(v_int32x8)
+        OPENCV_HAL_WRAP_GRT0(v_int64x4)
+        OPENCV_HAL_WRAP_GRT0(v_float32x8)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_GRT0(v_float64x4)
+        #endif
     #endif
 
-    #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec, _Tp, vl) \
-    inline _Tp v_extract_highest(const _Tpvec& v) \
+    #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \
+    inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
     { \
-        return v_extract_n<vl-1>(v); \
+        return v_extract_n<VTraits<_Tpvec>::nlanes-1>(v); \
     }
 
-    OPENCV_HAL_WRAP_EXTRACT(v_uint8, uchar, VTraits<v_uint8>::nlanes)
-    OPENCV_HAL_WRAP_EXTRACT(v_int8, schar, VTraits<v_int8>::nlanes)
-    OPENCV_HAL_WRAP_EXTRACT(v_uint16, ushort, VTraits<v_uint16>::nlanes)
-    OPENCV_HAL_WRAP_EXTRACT(v_int16, short, VTraits<v_int16>::nlanes)
-    OPENCV_HAL_WRAP_EXTRACT(v_uint32, unsigned int, VTraits<v_uint32>::nlanes)
-    OPENCV_HAL_WRAP_EXTRACT(v_int32, int, VTraits<v_int32>::nlanes)
-    OPENCV_HAL_WRAP_EXTRACT(v_uint64, uint64, VTraits<v_uint64>::nlanes)
-    OPENCV_HAL_WRAP_EXTRACT(v_int64, int64, VTraits<v_int64>::nlanes)
-    OPENCV_HAL_WRAP_EXTRACT(v_float32, float, VTraits<v_float32>::nlanes)
+    OPENCV_HAL_WRAP_EXTRACT(v_uint8)
+    OPENCV_HAL_WRAP_EXTRACT(v_int8)
+    OPENCV_HAL_WRAP_EXTRACT(v_uint16)
+    OPENCV_HAL_WRAP_EXTRACT(v_int16)
+    OPENCV_HAL_WRAP_EXTRACT(v_uint32)
+    OPENCV_HAL_WRAP_EXTRACT(v_int32)
+    OPENCV_HAL_WRAP_EXTRACT(v_uint64)
+    OPENCV_HAL_WRAP_EXTRACT(v_int64)
+    OPENCV_HAL_WRAP_EXTRACT(v_float32)
     #if CV_SIMD_64F
-    OPENCV_HAL_WRAP_EXTRACT(v_float64, double, VTraits<v_float64>::nlanes)
+    OPENCV_HAL_WRAP_EXTRACT(v_float64)
+    #endif
+    #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
+        OPENCV_HAL_WRAP_EXTRACT(v_uint8x16)
+        OPENCV_HAL_WRAP_EXTRACT(v_uint16x8)
+        OPENCV_HAL_WRAP_EXTRACT(v_uint32x4)
+        OPENCV_HAL_WRAP_EXTRACT(v_uint64x2)
+        OPENCV_HAL_WRAP_EXTRACT(v_int8x16)
+        OPENCV_HAL_WRAP_EXTRACT(v_int16x8)
+        OPENCV_HAL_WRAP_EXTRACT(v_int32x4)
+        OPENCV_HAL_WRAP_EXTRACT(v_int64x2)
+        OPENCV_HAL_WRAP_EXTRACT(v_float32x4)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_EXTRACT(v_float64x2)
+        #endif
+    #endif
+    #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
+        OPENCV_HAL_WRAP_EXTRACT(v_uint8x32)
+        OPENCV_HAL_WRAP_EXTRACT(v_uint16x16)
+        OPENCV_HAL_WRAP_EXTRACT(v_uint32x8)
+        OPENCV_HAL_WRAP_EXTRACT(v_uint64x4)
+        OPENCV_HAL_WRAP_EXTRACT(v_int8x32)
+        OPENCV_HAL_WRAP_EXTRACT(v_int16x16)
+        OPENCV_HAL_WRAP_EXTRACT(v_int32x8)
+        OPENCV_HAL_WRAP_EXTRACT(v_int64x4)
+        OPENCV_HAL_WRAP_EXTRACT(v_float32x8)
+        #if CV_SIMD_64F
+        OPENCV_HAL_WRAP_EXTRACT(v_float64x4)
+        #endif
     #endif
 
     #define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \
@@ -893,7 +1057,16 @@ namespace CV__SIMD_NAMESPACE {
     OPENCV_HAL_WRAP_BROADCAST(v_uint32)
     OPENCV_HAL_WRAP_BROADCAST(v_int32)
     OPENCV_HAL_WRAP_BROADCAST(v_float32)
-
+    #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
+        OPENCV_HAL_WRAP_BROADCAST(v_uint32x4)
+        OPENCV_HAL_WRAP_BROADCAST(v_int32x4)
+        OPENCV_HAL_WRAP_BROADCAST(v_float32x4)
+    #endif
+    #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
+        OPENCV_HAL_WRAP_BROADCAST(v_uint32x8)
+        OPENCV_HAL_WRAP_BROADCAST(v_int32x8)
+        OPENCV_HAL_WRAP_BROADCAST(v_float32x8)
+    #endif
 
 #endif //!CV_SIMD_SCALABLE
 
diff --git a/modules/dnn/src/layers/cpu_kernels/convolution.cpp b/modules/dnn/src/layers/cpu_kernels/convolution.cpp
index 6b0f9c865e..be1f99852b 100644
--- a/modules/dnn/src/layers/cpu_kernels/convolution.cpp
+++ b/modules/dnn/src/layers/cpu_kernels/convolution.cpp
@@ -1028,11 +1028,10 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
                         {
                             for (; j + 7 < out_width; j += 8)
                             {
-                                v_float32x4 v0 = v_load(cptr + j) + vbias;
-                                v_float32x4 v1 = v_load(cptr + j + 4) + vbias;
-
-                                v0 += v_load(pbptr + j);
-                                v1 += v_load(pbptr + j + 4);
+                                v_float32x4 v0 = v_add(v_load(cptr + j), vbias);
+                                v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias);
+                                v0 = v_add(v0, v_load(pbptr + j));
+                                v1 = v_add(v1, v_load(pbptr + j + 4));
 
                                 if (ifMinMaxAct)
                                 {
@@ -1048,8 +1047,8 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
                         {
                             for (; j + 7 < out_width; j += 8)
                             {
-                                v_float32x4 v0 = v_load(cptr + j) + vbias;
-                                v_float32x4 v1 = v_load(cptr + j + 4) + vbias;
+                                v_float32x4 v0 = v_add(v_load(cptr + j), vbias);
+                                v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias);
 
                                 if (ifMinMaxAct)
                                 {
@@ -1154,13 +1153,13 @@ static void convBlockMR1x28(int np, const float* a, const float* b, float *c, co
 
     if (init_c)
     {
-        c0 += v_load(c);
-        c1 += v_load(c + 4);
-        c2 += v_load(c + 8);
-        c3 += v_load(c + 12);
-        c4 += v_load(c + 16);
-        c5 += v_load(c + 20);
-        c6  += v_load(c + 24);
+        c0 = v_add(c0, v_load(c));
+        c1 = v_add(c1, v_load(c + 4));
+        c2 = v_add(c2, v_load(c + 8));
+        c3 = v_add(c3, v_load(c + 12));
+        c4 = v_add(c4, v_load(c + 16));
+        c5 = v_add(c5, v_load(c + 20));
+        c6 = v_add(c6, v_load(c + 24));
     }
 
     if (ifMinMaxAct)
@@ -1207,12 +1206,12 @@ static void convBlockMR1x24(int np, const float* a, const float* b, float *c, co
 
     if (init_c)
     {
-        c0 += v_load(c);
-        c1 += v_load(c + 4);
-        c2 += v_load(c + 8);
-        c3 += v_load(c + 12);
-        c4 += v_load(c + 16);
-        c5 += v_load(c + 20);
+        c0 = v_add(c0, v_load(c));
+        c1 = v_add(c1, v_load(c + 4));
+        c2 = v_add(c2, v_load(c + 8));
+        c3 = v_add(c3, v_load(c + 12));
+        c4 = v_add(c4, v_load(c + 16));
+        c5 = v_add(c5, v_load(c + 20));
     }
 
     if (ifMinMaxAct)
@@ -1251,9 +1250,9 @@ static void convBlockMR1x12(int np, const float* a, const float* b, float *c, co
 
     if (init_c)
     {
-        c0 += v_load(c);
-        c1 += v_load(c + 4);
-        c2 += v_load(c + 8);
+        c0 = v_add(c0, v_load(c));
+        c1 = v_add(c1, v_load(c + 4));
+        c2 = v_add(c2, v_load(c + 8));
     }
 
     if (ifMinMaxAct)
@@ -1343,33 +1342,33 @@ static void convBlock4x24(int np, const float* a, const float* b, float* c, int
 
     if (!init_c)
     {
-        c0 += v_load(c);
-        c1 += v_load(c + 4);
-        c2 += v_load(c + 8);
-        c3 += v_load(c + 12);
-        c4 += v_load(c + 16);
-        c5 += v_load(c + 20);
-
-        c6  += v_load(c + ldc);
-        c7  += v_load(c + ldc + 4);
-        c8  += v_load(c + ldc + 8);
-        c9  += v_load(c + ldc + 12);
-        c10 += v_load(c + ldc + 16);
-        c11 += v_load(c + ldc + 20);
-
-        c12 += v_load(c + ldc*2);
-        c13 += v_load(c + ldc*2 + 4);
-        c14 += v_load(c + ldc*2 + 8);
-        c15 += v_load(c + ldc*2 + 12);
-        c16 += v_load(c + ldc*2 + 16);
-        c17 += v_load(c + ldc*2 + 20);
-
-        c18 += v_load(c + ldc*3);
-        c19 += v_load(c + ldc*3 + 4);
-        c20 += v_load(c + ldc*3 + 8);
-        c21 += v_load(c + ldc*3 + 12);
-        c22 += v_load(c + ldc*3 + 16);
-        c23 += v_load(c + ldc*3 + 20);
+        c0 = v_add(c0, v_load(c));
+        c1 = v_add(c1, v_load(c + 4));
+        c2 = v_add(c2, v_load(c + 8));
+        c3 = v_add(c3, v_load(c + 12));
+        c4 = v_add(c4, v_load(c + 16));
+        c5 = v_add(c5, v_load(c + 20));
+
+        c6  = v_add(c6 , v_load(c + ldc));
+        c7  = v_add(c7 , v_load(c + ldc + 4));
+        c8  = v_add(c8 , v_load(c + ldc + 8));
+        c9  = v_add(c9 , v_load(c + ldc + 12));
+        c10 = v_add(c10, v_load(c + ldc + 16));
+        c11 = v_add(c11, v_load(c + ldc + 20));
+
+        c12 = v_add(c12, v_load(c + ldc*2));
+        c13 = v_add(c13, v_load(c + ldc*2 + 4));
+        c14 = v_add(c14, v_load(c + ldc*2 + 8));
+        c15 = v_add(c15, v_load(c + ldc*2 + 12));
+        c16 = v_add(c16, v_load(c + ldc*2 + 16));
+        c17 = v_add(c17, v_load(c + ldc*2 + 20));
+
+        c18 = v_add(c18, v_load(c + ldc*3));
+        c19 = v_add(c19, v_load(c + ldc*3 + 4));
+        c20 = v_add(c20, v_load(c + ldc*3 + 8));
+        c21 = v_add(c21, v_load(c + ldc*3 + 12));
+        c22 = v_add(c22, v_load(c + ldc*3 + 16));
+        c23 = v_add(c23, v_load(c + ldc*3 + 20));
     }
 
     v_store(c, c0);
@@ -1431,17 +1430,17 @@ static void convBlock4x8(int np, const float* a, const float* b, float* c, int l
 
     if (!init_c)
     {
-        c0 += v_load(c);
-        c1 += v_load(c + 4);
+        c0 = v_add(c0, v_load(c));
+        c1 = v_add(c1, v_load(c + 4));
 
-        c2  += v_load(c + ldc);
-        c3  += v_load(c + ldc + 4);
+        c2 = v_add(c2, v_load(c + ldc));
+        c3 = v_add(c3, v_load(c + ldc + 4));
 
-        c4 += v_load(c + ldc*2);
-        c5 += v_load(c + ldc*2 + 4);
+        c4 = v_add(c4, v_load(c + ldc*2));
+        c5 = v_add(c5, v_load(c + ldc*2 + 4));
 
-        c6 += v_load(c + ldc*3);
-        c7 += v_load(c + ldc*3 + 4);
+        c6 = v_add(c6, v_load(c + ldc*3));
+        c7 = v_add(c7, v_load(c + ldc*3 + 4));
     }
 
     v_store(c, c0);
@@ -1476,10 +1475,10 @@ static void convBlock4x4(int np, const float* a, const float* b, float* c, int l
 
     if (!init_c)
     {
-        c0 += v_load(c);
-        c1 += v_load(c + ldc);
-        c2 += v_load(c + ldc*2);
-        c3 += v_load(c + ldc*3);
+        c0 = v_add(c0, v_load(c));
+        c1 = v_add(c1, v_load(c + ldc));
+        c2 = v_add(c2, v_load(c + ldc*2));
+        c3 = v_add(c3, v_load(c + ldc*3));
     }
 
     v_store(c, c0);