From a809ae4e88c96f07d70a1642ca6d7d2c71cca8ec Mon Sep 17 00:00:00 2001 From: HAN Liutong Date: Mon, 27 Mar 2023 21:30:47 +0800 Subject: [PATCH] Fix HAL compatibility layer and modify use cases. --- .../core/include/opencv2/core/hal/intrin.hpp | 237 +++++++++++++++--- .../src/layers/cpu_kernels/convolution.cpp | 123 +++++---- 2 files changed, 266 insertions(+), 94 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 207b8cab4e..ee8310b5c5 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -758,6 +758,36 @@ namespace CV__SIMD_NAMESPACE { #if CV_SIMD_64F OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64) #endif + #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 + // when we use CV_SIMD128 with 256/512 bit SIMD (e.g. AVX2 or AVX512) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2) + #endif + #endif + #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 + // when we use CV_SIMD256 with 512 bit SIMD (e.g. 
AVX512) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4) + #endif + #endif #define OPENCV_HAL_WRAP_BIN_OP_LOGIC(_Tpvec) \ inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \ @@ -785,6 +815,26 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16) OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32) OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64) + #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x16) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x8) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x4) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x2) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x16) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x8) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x4) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x2) + #endif + #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x32) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x16) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x8) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x4) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x32) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x16) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x8) + OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x4) + #endif #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \ inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ @@ -805,17 +855,51 @@ namespace CV__SIMD_NAMESPACE { #if CV_SIMD_64F OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64) #endif + #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16) + 
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2) + #endif + #endif + #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4) + #endif + #endif - - inline v_float32 v_div(const v_float32& a, const v_float32& b) \ + #define OPENCV_HAL_WRAP_BIN_OP_DIV(_Tpvec) \ + inline _Tpvec v_div(const _Tpvec& a, const _Tpvec& b) \ { \ return a / b; \ } + OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32) #if CV_SIMD_64F - inline v_float64 v_div(const v_float64& a, const v_float64& b) \ - { \ - return a / b; \ - } + OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64) + #endif + #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 + OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x4) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x2) + #endif + #endif + #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 + OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x8) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x4) + #endif #endif #define OPENCV_HAL_WRAP_CMP_OP(_Tpvec, intrin, op) \ @@ -844,44 +928,124 @@ namespace CV__SIMD_NAMESPACE { #if CV_SIMD_64F OPENCV_HAL_WRAP_CMP(v_float64) #endif + #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 + OPENCV_HAL_WRAP_CMP(v_uint8x16) + OPENCV_HAL_WRAP_CMP(v_uint16x8) + OPENCV_HAL_WRAP_CMP(v_uint32x4) + OPENCV_HAL_WRAP_CMP(v_int8x16) + OPENCV_HAL_WRAP_CMP(v_int16x8) + OPENCV_HAL_WRAP_CMP(v_int32x4) + OPENCV_HAL_WRAP_CMP(v_float32x4) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_CMP(v_float64x2) + #endif + #endif + #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 + OPENCV_HAL_WRAP_CMP(v_uint8x32) + OPENCV_HAL_WRAP_CMP(v_uint16x16) + 
OPENCV_HAL_WRAP_CMP(v_uint32x8) + OPENCV_HAL_WRAP_CMP(v_int8x32) + OPENCV_HAL_WRAP_CMP(v_int16x16) + OPENCV_HAL_WRAP_CMP(v_int32x8) + OPENCV_HAL_WRAP_CMP(v_float32x8) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_CMP(v_float64x4) + #endif + #endif //////////// get0 //////////// - #define OPENCV_HAL_WRAP_GRT0_INT(_Tpvec, _Tp) \ - inline _Tp v_get0(const v_##_Tpvec& v) \ + #define OPENCV_HAL_WRAP_GRT0(_Tpvec) \ + inline typename VTraits<_Tpvec>::lane_type v_get0(const _Tpvec& v) \ { \ return v.get0(); \ } - OPENCV_HAL_WRAP_GRT0_INT(uint8, uchar) - OPENCV_HAL_WRAP_GRT0_INT(int8, schar) - OPENCV_HAL_WRAP_GRT0_INT(uint16, ushort) - OPENCV_HAL_WRAP_GRT0_INT(int16, short) - OPENCV_HAL_WRAP_GRT0_INT(uint32, unsigned) - OPENCV_HAL_WRAP_GRT0_INT(int32, int) - OPENCV_HAL_WRAP_GRT0_INT(uint64, uint64) - OPENCV_HAL_WRAP_GRT0_INT(int64, int64) - OPENCV_HAL_WRAP_GRT0_INT(float32, float) + OPENCV_HAL_WRAP_GRT0(v_uint8) + OPENCV_HAL_WRAP_GRT0(v_int8) + OPENCV_HAL_WRAP_GRT0(v_uint16) + OPENCV_HAL_WRAP_GRT0(v_int16) + OPENCV_HAL_WRAP_GRT0(v_uint32) + OPENCV_HAL_WRAP_GRT0(v_int32) + OPENCV_HAL_WRAP_GRT0(v_uint64) + OPENCV_HAL_WRAP_GRT0(v_int64) + OPENCV_HAL_WRAP_GRT0(v_float32) #if CV_SIMD_64F - OPENCV_HAL_WRAP_GRT0_INT(float64, double) + OPENCV_HAL_WRAP_GRT0(v_float64) + #endif + #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 + OPENCV_HAL_WRAP_GRT0(v_uint8x16) + OPENCV_HAL_WRAP_GRT0(v_uint16x8) + OPENCV_HAL_WRAP_GRT0(v_uint32x4) + OPENCV_HAL_WRAP_GRT0(v_uint64x2) + OPENCV_HAL_WRAP_GRT0(v_int8x16) + OPENCV_HAL_WRAP_GRT0(v_int16x8) + OPENCV_HAL_WRAP_GRT0(v_int32x4) + OPENCV_HAL_WRAP_GRT0(v_int64x2) + OPENCV_HAL_WRAP_GRT0(v_float32x4) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_GRT0(v_float64x2) + #endif + #endif + #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 + OPENCV_HAL_WRAP_GRT0(v_uint8x32) + OPENCV_HAL_WRAP_GRT0(v_uint16x16) + OPENCV_HAL_WRAP_GRT0(v_uint32x8) + OPENCV_HAL_WRAP_GRT0(v_uint64x4) + OPENCV_HAL_WRAP_GRT0(v_int8x32) + OPENCV_HAL_WRAP_GRT0(v_int16x16) + OPENCV_HAL_WRAP_GRT0(v_int32x8) + 
OPENCV_HAL_WRAP_GRT0(v_int64x4) + OPENCV_HAL_WRAP_GRT0(v_float32x8) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_GRT0(v_float64x4) + #endif + #endif #endif - #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec, _Tp, vl) \ - inline _Tp v_extract_highest(const _Tpvec& v) \ + #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \ + inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \ { \ - return v_extract_n<vl-1>(v); \ + return v_extract_n<VTraits<_Tpvec>::nlanes-1>(v); \ } - OPENCV_HAL_WRAP_EXTRACT(v_uint8, uchar, VTraits<v_uint8>::nlanes) - OPENCV_HAL_WRAP_EXTRACT(v_int8, schar, VTraits<v_int8>::nlanes) - OPENCV_HAL_WRAP_EXTRACT(v_uint16, ushort, VTraits<v_uint16>::nlanes) - OPENCV_HAL_WRAP_EXTRACT(v_int16, short, VTraits<v_int16>::nlanes) - OPENCV_HAL_WRAP_EXTRACT(v_uint32, unsigned int, VTraits<v_uint32>::nlanes) - OPENCV_HAL_WRAP_EXTRACT(v_int32, int, VTraits<v_int32>::nlanes) - OPENCV_HAL_WRAP_EXTRACT(v_uint64, uint64, VTraits<v_uint64>::nlanes) - OPENCV_HAL_WRAP_EXTRACT(v_int64, int64, VTraits<v_int64>::nlanes) - OPENCV_HAL_WRAP_EXTRACT(v_float32, float, VTraits<v_float32>::nlanes) + OPENCV_HAL_WRAP_EXTRACT(v_uint8) + OPENCV_HAL_WRAP_EXTRACT(v_int8) + OPENCV_HAL_WRAP_EXTRACT(v_uint16) + OPENCV_HAL_WRAP_EXTRACT(v_int16) + OPENCV_HAL_WRAP_EXTRACT(v_uint32) + OPENCV_HAL_WRAP_EXTRACT(v_int32) + OPENCV_HAL_WRAP_EXTRACT(v_uint64) + OPENCV_HAL_WRAP_EXTRACT(v_int64) + OPENCV_HAL_WRAP_EXTRACT(v_float32) #if CV_SIMD_64F - OPENCV_HAL_WRAP_EXTRACT(v_float64, double, VTraits<v_float64>::nlanes) + OPENCV_HAL_WRAP_EXTRACT(v_float64) + #endif + #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 + OPENCV_HAL_WRAP_EXTRACT(v_uint8x16) + OPENCV_HAL_WRAP_EXTRACT(v_uint16x8) + OPENCV_HAL_WRAP_EXTRACT(v_uint32x4) + OPENCV_HAL_WRAP_EXTRACT(v_uint64x2) + OPENCV_HAL_WRAP_EXTRACT(v_int8x16) + OPENCV_HAL_WRAP_EXTRACT(v_int16x8) + OPENCV_HAL_WRAP_EXTRACT(v_int32x4) + OPENCV_HAL_WRAP_EXTRACT(v_int64x2) + OPENCV_HAL_WRAP_EXTRACT(v_float32x4) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_EXTRACT(v_float64x2) + #endif + #endif + #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 + OPENCV_HAL_WRAP_EXTRACT(v_uint8x32) +
OPENCV_HAL_WRAP_EXTRACT(v_uint16x16) + OPENCV_HAL_WRAP_EXTRACT(v_uint32x8) + OPENCV_HAL_WRAP_EXTRACT(v_uint64x4) + OPENCV_HAL_WRAP_EXTRACT(v_int8x32) + OPENCV_HAL_WRAP_EXTRACT(v_int16x16) + OPENCV_HAL_WRAP_EXTRACT(v_int32x8) + OPENCV_HAL_WRAP_EXTRACT(v_int64x4) + OPENCV_HAL_WRAP_EXTRACT(v_float32x8) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_EXTRACT(v_float64x4) + #endif #endif #define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \ @@ -893,7 +1057,16 @@ namespace CV__SIMD_NAMESPACE { OPENCV_HAL_WRAP_BROADCAST(v_uint32) OPENCV_HAL_WRAP_BROADCAST(v_int32) OPENCV_HAL_WRAP_BROADCAST(v_float32) - + #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 + OPENCV_HAL_WRAP_BROADCAST(v_uint32x4) + OPENCV_HAL_WRAP_BROADCAST(v_int32x4) + OPENCV_HAL_WRAP_BROADCAST(v_float32x4) + #endif + #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 + OPENCV_HAL_WRAP_BROADCAST(v_uint32x8) + OPENCV_HAL_WRAP_BROADCAST(v_int32x8) + OPENCV_HAL_WRAP_BROADCAST(v_float32x8) + #endif #endif //!CV_SIMD_SCALABLE diff --git a/modules/dnn/src/layers/cpu_kernels/convolution.cpp b/modules/dnn/src/layers/cpu_kernels/convolution.cpp index 6b0f9c865e..be1f99852b 100644 --- a/modules/dnn/src/layers/cpu_kernels/convolution.cpp +++ b/modules/dnn/src/layers/cpu_kernels/convolution.cpp @@ -1028,11 +1028,10 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co { for (; j + 7 < out_width; j += 8) { - v_float32x4 v0 = v_load(cptr + j) + vbias; - v_float32x4 v1 = v_load(cptr + j + 4) + vbias; - - v0 += v_load(pbptr + j); - v1 += v_load(pbptr + j + 4); + v_float32x4 v0 = v_add(v_load(cptr + j), vbias); + v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias); + v0 = v_add(v0, v_load(pbptr + j)); + v1 = v_add(v1, v_load(pbptr + j + 4)); if (ifMinMaxAct) { @@ -1048,8 +1047,8 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co { for (; j + 7 < out_width; j += 8) { - v_float32x4 v0 = v_load(cptr + j) + vbias; - v_float32x4 v1 = v_load(cptr + j + 4) + vbias; + v_float32x4 v0 = v_add(v_load(cptr + j), vbias); +
v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias); if (ifMinMaxAct) { @@ -1154,13 +1153,13 @@ static void convBlockMR1x28(int np, const float* a, const float* b, float *c, co if (init_c) { - c0 += v_load(c); - c1 += v_load(c + 4); - c2 += v_load(c + 8); - c3 += v_load(c + 12); - c4 += v_load(c + 16); - c5 += v_load(c + 20); - c6 += v_load(c + 24); + c0 = v_add(c0, v_load(c)); + c1 = v_add(c1, v_load(c + 4)); + c2 = v_add(c2, v_load(c + 8)); + c3 = v_add(c3, v_load(c + 12)); + c4 = v_add(c4, v_load(c + 16)); + c5 = v_add(c5, v_load(c + 20)); + c6 = v_add(c6, v_load(c + 24)); } if (ifMinMaxAct) @@ -1207,12 +1206,12 @@ static void convBlockMR1x24(int np, const float* a, const float* b, float *c, co if (init_c) { - c0 += v_load(c); - c1 += v_load(c + 4); - c2 += v_load(c + 8); - c3 += v_load(c + 12); - c4 += v_load(c + 16); - c5 += v_load(c + 20); + c0 = v_add(c0, v_load(c)); + c1 = v_add(c1, v_load(c + 4)); + c2 = v_add(c2, v_load(c + 8)); + c3 = v_add(c3, v_load(c + 12)); + c4 = v_add(c4, v_load(c + 16)); + c5 = v_add(c5, v_load(c + 20)); } if (ifMinMaxAct) @@ -1251,9 +1250,9 @@ static void convBlockMR1x12(int np, const float* a, const float* b, float *c, co if (init_c) { - c0 += v_load(c); - c1 += v_load(c + 4); - c2 += v_load(c + 8); + c0 = v_add(c0, v_load(c)); + c1 = v_add(c1, v_load(c + 4)); + c2 = v_add(c2, v_load(c + 8)); } if (ifMinMaxAct) @@ -1343,33 +1342,33 @@ static void convBlock4x24(int np, const float* a, const float* b, float* c, int if (!init_c) { - c0 += v_load(c); - c1 += v_load(c + 4); - c2 += v_load(c + 8); - c3 += v_load(c + 12); - c4 += v_load(c + 16); - c5 += v_load(c + 20); - - c6 += v_load(c + ldc); - c7 += v_load(c + ldc + 4); - c8 += v_load(c + ldc + 8); - c9 += v_load(c + ldc + 12); - c10 += v_load(c + ldc + 16); - c11 += v_load(c + ldc + 20); - - c12 += v_load(c + ldc*2); - c13 += v_load(c + ldc*2 + 4); - c14 += v_load(c + ldc*2 + 8); - c15 += v_load(c + ldc*2 + 12); - c16 += v_load(c + ldc*2 + 16); - c17 += v_load(c + ldc*2 + 20); - - 
c18 += v_load(c + ldc*3); - c19 += v_load(c + ldc*3 + 4); - c20 += v_load(c + ldc*3 + 8); - c21 += v_load(c + ldc*3 + 12); - c22 += v_load(c + ldc*3 + 16); - c23 += v_load(c + ldc*3 + 20); + c0 = v_add(c0, v_load(c)); + c1 = v_add(c1, v_load(c + 4)); + c2 = v_add(c2, v_load(c + 8)); + c3 = v_add(c3, v_load(c + 12)); + c4 = v_add(c4, v_load(c + 16)); + c5 = v_add(c5, v_load(c + 20)); + + c6 = v_add(c6 , v_load(c + ldc)); + c7 = v_add(c7 , v_load(c + ldc + 4)); + c8 = v_add(c8 , v_load(c + ldc + 8)); + c9 = v_add(c9 , v_load(c + ldc + 12)); + c10 = v_add(c10, v_load(c + ldc + 16)); + c11 = v_add(c11, v_load(c + ldc + 20)); + + c12 = v_add(c12, v_load(c + ldc*2)); + c13 = v_add(c13, v_load(c + ldc*2 + 4)); + c14 = v_add(c14, v_load(c + ldc*2 + 8)); + c15 = v_add(c15, v_load(c + ldc*2 + 12)); + c16 = v_add(c16, v_load(c + ldc*2 + 16)); + c17 = v_add(c17, v_load(c + ldc*2 + 20)); + + c18 = v_add(c18, v_load(c + ldc*3)); + c19 = v_add(c19, v_load(c + ldc*3 + 4)); + c20 = v_add(c20, v_load(c + ldc*3 + 8)); + c21 = v_add(c21, v_load(c + ldc*3 + 12)); + c22 = v_add(c22, v_load(c + ldc*3 + 16)); + c23 = v_add(c23, v_load(c + ldc*3 + 20)); } v_store(c, c0); @@ -1431,17 +1430,17 @@ static void convBlock4x8(int np, const float* a, const float* b, float* c, int l if (!init_c) { - c0 += v_load(c); - c1 += v_load(c + 4); + c0 = v_add(c0, v_load(c)); + c1 = v_add(c1, v_load(c + 4)); - c2 += v_load(c + ldc); - c3 += v_load(c + ldc + 4); + c2 = v_add(c2, v_load(c + ldc)); + c3 = v_add(c3, v_load(c + ldc + 4)); - c4 += v_load(c + ldc*2); - c5 += v_load(c + ldc*2 + 4); + c4 = v_add(c4, v_load(c + ldc*2)); + c5 = v_add(c5, v_load(c + ldc*2 + 4)); - c6 += v_load(c + ldc*3); - c7 += v_load(c + ldc*3 + 4); + c6 = v_add(c6, v_load(c + ldc*3)); + c7 = v_add(c7, v_load(c + ldc*3 + 4)); } v_store(c, c0); @@ -1476,10 +1475,10 @@ static void convBlock4x4(int np, const float* a, const float* b, float* c, int l if (!init_c) { - c0 += v_load(c); - c1 += v_load(c + ldc); - c2 += v_load(c + ldc*2); - 
c3 += v_load(c + ldc*3); + c0 = v_add(c0, v_load(c)); + c1 = v_add(c1, v_load(c + ldc)); + c2 = v_add(c2, v_load(c + ldc*2)); + c3 = v_add(c3, v_load(c + ldc*3)); } v_store(c, c0);