Fix HAL compatibility layer and modify use cases.

pull/23310/head
HAN Liutong 2 years ago
parent 352f92e437
commit a809ae4e88
  1. modules/core/include/opencv2/core/hal/intrin.hpp (237 changed lines)
  2. modules/dnn/src/layers/cpu_kernels/convolution.cpp (123 changed lines)
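
The compatibility wrappers below (v_add/v_sub, the logic, multiply and divide wrappers, the comparison wrappers, v_get0, v_extract_highest and the broadcast wrapper) previously covered only the default-width aliases such as v_uint8 and v_float32. This patch extends them to the explicitly sized types (v_uint8x16, v_float32x8, ...) that remain available when the default SIMD width is 256 or 512 bit, and switches the DNN convolution kernels from operator syntax to the wrapped functions. A minimal usage sketch, not part of the patch, assuming an OpenCV build where CV_SIMD_WIDTH is 32 (e.g. AVX2) so that v_float32x4 is a fixed 128-bit type rather than the default-width alias:

    // Minimal sketch, not part of the patch. Assumes OpenCV built with AVX2
    // (CV_SIMD_WIDTH == 32), so v_float32x4 relies on the wrappers added below
    // instead of the ones for the default-width aliases.
    #include <opencv2/core/hal/intrin.hpp>

    int main()
    {
    #if CV_SIMD128
        cv::v_float32x4 a(1.f, 2.f, 3.f, 4.f);
        cv::v_float32x4 b(10.f, 20.f, 30.f, 40.f);
        cv::v_float32x4 c = cv::v_add(a, b);     // wrapped add for v_float32x4
        float lo = cv::v_get0(c);                // 11.f, via the new v_get0 wrapper
        float hi = cv::v_extract_highest(c);     // 44.f, via the new extract wrapper
        (void)lo; (void)hi;
    #endif
        return 0;
    }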

@@ -758,6 +758,36 @@ namespace CV__SIMD_NAMESPACE {
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
// when we use CV_SIMD128 with 256/512 bit SIMD (e.g. AVX2 or AVX512)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
// when we use CV_SIMD256 with 512 bit SIMD (e.g. AVX512)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_BIN_OP_LOGIC(_Tpvec) \
inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \
@@ -785,6 +815,26 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64)
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x2)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x2)
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x4)
#endif
#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
@@ -805,17 +855,51 @@ namespace CV__SIMD_NAMESPACE {
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4)
#endif
#endif
inline v_float32 v_div(const v_float32& a, const v_float32& b) \
#define OPENCV_HAL_WRAP_BIN_OP_DIV(_Tpvec) \
inline _Tpvec v_div(const _Tpvec& a, const _Tpvec& b) \
{ \
return a / b; \
}
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32)
#if CV_SIMD_64F
inline v_float64 v_div(const v_float64& a, const v_float64& b) \
{ \
return a / b; \
}
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_CMP_OP(_Tpvec, intrin, op) \
@@ -844,44 +928,124 @@ namespace CV__SIMD_NAMESPACE {
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_CMP(v_uint8x16)
OPENCV_HAL_WRAP_CMP(v_uint16x8)
OPENCV_HAL_WRAP_CMP(v_uint32x4)
OPENCV_HAL_WRAP_CMP(v_int8x16)
OPENCV_HAL_WRAP_CMP(v_int16x8)
OPENCV_HAL_WRAP_CMP(v_int32x4)
OPENCV_HAL_WRAP_CMP(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_CMP(v_uint8x32)
OPENCV_HAL_WRAP_CMP(v_uint16x16)
OPENCV_HAL_WRAP_CMP(v_uint32x8)
OPENCV_HAL_WRAP_CMP(v_int8x32)
OPENCV_HAL_WRAP_CMP(v_int16x16)
OPENCV_HAL_WRAP_CMP(v_int32x8)
OPENCV_HAL_WRAP_CMP(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x4)
#endif
#endif
//////////// get0 ////////////
#define OPENCV_HAL_WRAP_GRT0_INT(_Tpvec, _Tp) \
inline _Tp v_get0(const v_##_Tpvec& v) \
#define OPENCV_HAL_WRAP_GRT0(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_get0(const _Tpvec& v) \
{ \
return v.get0(); \
}
OPENCV_HAL_WRAP_GRT0_INT(uint8, uchar)
OPENCV_HAL_WRAP_GRT0_INT(int8, schar)
OPENCV_HAL_WRAP_GRT0_INT(uint16, ushort)
OPENCV_HAL_WRAP_GRT0_INT(int16, short)
OPENCV_HAL_WRAP_GRT0_INT(uint32, unsigned)
OPENCV_HAL_WRAP_GRT0_INT(int32, int)
OPENCV_HAL_WRAP_GRT0_INT(uint64, uint64)
OPENCV_HAL_WRAP_GRT0_INT(int64, int64)
OPENCV_HAL_WRAP_GRT0_INT(float32, float)
OPENCV_HAL_WRAP_GRT0(v_uint8)
OPENCV_HAL_WRAP_GRT0(v_int8)
OPENCV_HAL_WRAP_GRT0(v_uint16)
OPENCV_HAL_WRAP_GRT0(v_int16)
OPENCV_HAL_WRAP_GRT0(v_uint32)
OPENCV_HAL_WRAP_GRT0(v_int32)
OPENCV_HAL_WRAP_GRT0(v_uint64)
OPENCV_HAL_WRAP_GRT0(v_int64)
OPENCV_HAL_WRAP_GRT0(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0_INT(float64, double)
OPENCV_HAL_WRAP_GRT0(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_GRT0(v_uint8x16)
OPENCV_HAL_WRAP_GRT0(v_uint16x8)
OPENCV_HAL_WRAP_GRT0(v_uint32x4)
OPENCV_HAL_WRAP_GRT0(v_uint64x2)
OPENCV_HAL_WRAP_GRT0(v_int8x16)
OPENCV_HAL_WRAP_GRT0(v_int16x8)
OPENCV_HAL_WRAP_GRT0(v_int32x4)
OPENCV_HAL_WRAP_GRT0(v_int64x2)
OPENCV_HAL_WRAP_GRT0(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_GRT0(v_uint8x32)
OPENCV_HAL_WRAP_GRT0(v_uint16x16)
OPENCV_HAL_WRAP_GRT0(v_uint32x8)
OPENCV_HAL_WRAP_GRT0(v_uint64x4)
OPENCV_HAL_WRAP_GRT0(v_int8x32)
OPENCV_HAL_WRAP_GRT0(v_int16x16)
OPENCV_HAL_WRAP_GRT0(v_int32x8)
OPENCV_HAL_WRAP_GRT0(v_int64x4)
OPENCV_HAL_WRAP_GRT0(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec, _Tp, vl) \
inline _Tp v_extract_highest(const _Tpvec& v) \
#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
{ \
return v_extract_n<vl-1>(v); \
return v_extract_n<VTraits<_Tpvec>::nlanes-1>(v); \
}
OPENCV_HAL_WRAP_EXTRACT(v_uint8, uchar, VTraits<v_uint8>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_int8, schar, VTraits<v_int8>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_uint16, ushort, VTraits<v_uint16>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_int16, short, VTraits<v_int16>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_uint32, unsigned int, VTraits<v_uint32>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_int32, int, VTraits<v_int32>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_uint64, uint64, VTraits<v_uint64>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_int64, int64, VTraits<v_int64>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_float32, float, VTraits<v_float32>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_uint8)
OPENCV_HAL_WRAP_EXTRACT(v_int8)
OPENCV_HAL_WRAP_EXTRACT(v_uint16)
OPENCV_HAL_WRAP_EXTRACT(v_int16)
OPENCV_HAL_WRAP_EXTRACT(v_uint32)
OPENCV_HAL_WRAP_EXTRACT(v_int32)
OPENCV_HAL_WRAP_EXTRACT(v_uint64)
OPENCV_HAL_WRAP_EXTRACT(v_int64)
OPENCV_HAL_WRAP_EXTRACT(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64, double, VTraits<v_float64>::nlanes)
OPENCV_HAL_WRAP_EXTRACT(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_EXTRACT(v_uint8x16)
OPENCV_HAL_WRAP_EXTRACT(v_uint16x8)
OPENCV_HAL_WRAP_EXTRACT(v_uint32x4)
OPENCV_HAL_WRAP_EXTRACT(v_uint64x2)
OPENCV_HAL_WRAP_EXTRACT(v_int8x16)
OPENCV_HAL_WRAP_EXTRACT(v_int16x8)
OPENCV_HAL_WRAP_EXTRACT(v_int32x4)
OPENCV_HAL_WRAP_EXTRACT(v_int64x2)
OPENCV_HAL_WRAP_EXTRACT(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_EXTRACT(v_uint8x32)
OPENCV_HAL_WRAP_EXTRACT(v_uint16x16)
OPENCV_HAL_WRAP_EXTRACT(v_uint32x8)
OPENCV_HAL_WRAP_EXTRACT(v_uint64x4)
OPENCV_HAL_WRAP_EXTRACT(v_int8x32)
OPENCV_HAL_WRAP_EXTRACT(v_int16x16)
OPENCV_HAL_WRAP_EXTRACT(v_int32x8)
OPENCV_HAL_WRAP_EXTRACT(v_int64x4)
OPENCV_HAL_WRAP_EXTRACT(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \
@@ -893,7 +1057,16 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_BROADCAST(v_uint32)
OPENCV_HAL_WRAP_BROADCAST(v_int32)
OPENCV_HAL_WRAP_BROADCAST(v_float32)
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BROADCAST(v_uint32x4)
OPENCV_HAL_WRAP_BROADCAST(v_int32x4)
OPENCV_HAL_WRAP_BROADCAST(v_float32x4)
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BROADCAST(v_uint32x8)
OPENCV_HAL_WRAP_BROADCAST(v_int32x8)
OPENCV_HAL_WRAP_BROADCAST(v_float32x8)
#endif
#endif //!CV_SIMD_SCALABLE
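
For reference, here is what one of the reworked wrappers expands to; the expansion is written out by hand for illustration and is not part of the diff. The lane type and lane count now come from VTraits instead of being passed as extra macro arguments, so a single macro serves every vector type, default-width or fixed-size:

    // Hand-written expansion of OPENCV_HAL_WRAP_EXTRACT(v_float32x4).
    inline typename VTraits<v_float32x4>::lane_type        // i.e. float
    v_extract_highest(const v_float32x4& v)
    {
        // nlanes == 4 for a 128-bit float32 vector, so this reads lane 3
        return v_extract_n<VTraits<v_float32x4>::nlanes - 1>(v);
    }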

@@ -1028,11 +1028,10 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
{
for (; j + 7 < out_width; j += 8)
{
v_float32x4 v0 = v_load(cptr + j) + vbias;
v_float32x4 v1 = v_load(cptr + j + 4) + vbias;
v0 += v_load(pbptr + j);
v1 += v_load(pbptr + j + 4);
v_float32x4 v0 = v_add(v_load(cptr + j), vbias);
v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias);
v0 = v_add(v0, v_load(pbptr + j));
v1 = v_add(v1, v_load(pbptr + j + 4));
if (ifMinMaxAct)
{
@@ -1048,8 +1047,8 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
{
for (; j + 7 < out_width; j += 8)
{
v_float32x4 v0 = v_load(cptr + j) + vbias;
v_float32x4 v1 = v_load(cptr + j + 4) + vbias;
v_float32x4 v0 = v_add(v_load(cptr + j), vbias);
v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias);
if (ifMinMaxAct)
{
@@ -1154,13 +1153,13 @@ static void convBlockMR1x28(int np, const float* a, const float* b, float *c, co
if (init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c2 += v_load(c + 8);
c3 += v_load(c + 12);
c4 += v_load(c + 16);
c5 += v_load(c + 20);
c6 += v_load(c + 24);
c0 = v_add(c0, v_load(c));
c1 = v_add(c1, v_load(c + 4));
c2 = v_add(c2, v_load(c + 8));
c3 = v_add(c3, v_load(c + 12));
c4 = v_add(c4, v_load(c + 16));
c5 = v_add(c5, v_load(c + 20));
c6 = v_add(c6, v_load(c + 24));
}
if (ifMinMaxAct)
@@ -1207,12 +1206,12 @@ static void convBlockMR1x24(int np, const float* a, const float* b, float *c, co
if (init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c2 += v_load(c + 8);
c3 += v_load(c + 12);
c4 += v_load(c + 16);
c5 += v_load(c + 20);
c0 = v_add(c0, v_load(c));
c1 = v_add(c1, v_load(c + 4));
c2 = v_add(c2, v_load(c + 8));
c3 = v_add(c3, v_load(c + 12));
c4 = v_add(c4, v_load(c + 16));
c5 = v_add(c5, v_load(c + 20));
}
if (ifMinMaxAct)
@@ -1251,9 +1250,9 @@ static void convBlockMR1x12(int np, const float* a, const float* b, float *c, co
if (init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c2 += v_load(c + 8);
c0 = v_add(c0, v_load(c));
c1 = v_add(c1, v_load(c + 4));
c2 = v_add(c2, v_load(c + 8));
}
if (ifMinMaxAct)
@@ -1343,33 +1342,33 @@ static void convBlock4x24(int np, const float* a, const float* b, float* c, int
if (!init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c2 += v_load(c + 8);
c3 += v_load(c + 12);
c4 += v_load(c + 16);
c5 += v_load(c + 20);
c6 += v_load(c + ldc);
c7 += v_load(c + ldc + 4);
c8 += v_load(c + ldc + 8);
c9 += v_load(c + ldc + 12);
c10 += v_load(c + ldc + 16);
c11 += v_load(c + ldc + 20);
c12 += v_load(c + ldc*2);
c13 += v_load(c + ldc*2 + 4);
c14 += v_load(c + ldc*2 + 8);
c15 += v_load(c + ldc*2 + 12);
c16 += v_load(c + ldc*2 + 16);
c17 += v_load(c + ldc*2 + 20);
c18 += v_load(c + ldc*3);
c19 += v_load(c + ldc*3 + 4);
c20 += v_load(c + ldc*3 + 8);
c21 += v_load(c + ldc*3 + 12);
c22 += v_load(c + ldc*3 + 16);
c23 += v_load(c + ldc*3 + 20);
c0 = v_add(c0, v_load(c));
c1 = v_add(c1, v_load(c + 4));
c2 = v_add(c2, v_load(c + 8));
c3 = v_add(c3, v_load(c + 12));
c4 = v_add(c4, v_load(c + 16));
c5 = v_add(c5, v_load(c + 20));
c6 = v_add(c6 , v_load(c + ldc));
c7 = v_add(c7 , v_load(c + ldc + 4));
c8 = v_add(c8 , v_load(c + ldc + 8));
c9 = v_add(c9 , v_load(c + ldc + 12));
c10 = v_add(c10, v_load(c + ldc + 16));
c11 = v_add(c11, v_load(c + ldc + 20));
c12 = v_add(c12, v_load(c + ldc*2));
c13 = v_add(c13, v_load(c + ldc*2 + 4));
c14 = v_add(c14, v_load(c + ldc*2 + 8));
c15 = v_add(c15, v_load(c + ldc*2 + 12));
c16 = v_add(c16, v_load(c + ldc*2 + 16));
c17 = v_add(c17, v_load(c + ldc*2 + 20));
c18 = v_add(c18, v_load(c + ldc*3));
c19 = v_add(c19, v_load(c + ldc*3 + 4));
c20 = v_add(c20, v_load(c + ldc*3 + 8));
c21 = v_add(c21, v_load(c + ldc*3 + 12));
c22 = v_add(c22, v_load(c + ldc*3 + 16));
c23 = v_add(c23, v_load(c + ldc*3 + 20));
}
v_store(c, c0);
@@ -1431,17 +1430,17 @@ static void convBlock4x8(int np, const float* a, const float* b, float* c, int l
if (!init_c)
{
c0 += v_load(c);
c1 += v_load(c + 4);
c0 = v_add(c0, v_load(c));
c1 = v_add(c1, v_load(c + 4));
c2 += v_load(c + ldc);
c3 += v_load(c + ldc + 4);
c2 = v_add(c2, v_load(c + ldc));
c3 = v_add(c3, v_load(c + ldc + 4));
c4 += v_load(c + ldc*2);
c5 += v_load(c + ldc*2 + 4);
c4 = v_add(c4, v_load(c + ldc*2));
c5 = v_add(c5, v_load(c + ldc*2 + 4));
c6 += v_load(c + ldc*3);
c7 += v_load(c + ldc*3 + 4);
c6 = v_add(c6, v_load(c + ldc*3));
c7 = v_add(c7, v_load(c + ldc*3 + 4));
}
v_store(c, c0);
@@ -1476,10 +1475,10 @@ static void convBlock4x4(int np, const float* a, const float* b, float* c, int l
if (!init_c)
{
c0 += v_load(c);
c1 += v_load(c + ldc);
c2 += v_load(c + ldc*2);
c3 += v_load(c + ldc*3);
c0 = v_add(c0, v_load(c));
c1 = v_add(c1, v_load(c + ldc));
c2 = v_add(c2, v_load(c + ldc*2));
c3 = v_add(c3, v_load(c + ldc*3));
}
v_store(c, c0);
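
The convolution.cpp changes are mechanical: every a += b or a + b on vector values becomes v_add(a, b), matching the function-based form of the new universal-intrinsics API, since operator overloads are not available on all back ends (notably the scalable ones). A condensed sketch of the accumulation pattern the kernels above use; the helper name and the clipping bounds are illustrative, not part of the patch:

    // Sketch of the bias + residual + min/max-activation pattern in runFastConv.
    // vminval/vmaxval are assumed activation bounds (hypothetical names).
    static void add_bias_residual_act(float* cptr, const float* pbptr, int j,
                                      cv::v_float32x4 vbias,
                                      cv::v_float32x4 vminval,
                                      cv::v_float32x4 vmaxval)
    {
        using namespace cv;
        v_float32x4 v0 = v_add(v_load(cptr + j), vbias);   // conv output + bias
        v0 = v_add(v0, v_load(pbptr + j));                 // fused residual add
        v0 = v_min(v_max(v0, vminval), vmaxval);           // clipped activation
        v_store(cptr + j, v0);                             // write back in place
    }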
