Fix HAL compatibility layer and modify use cases.

pull/23310/head
HAN Liutong 2 years ago
parent 352f92e437
commit a809ae4e88
  1. modules/core/include/opencv2/core/hal/intrin.hpp (237 changed lines)
  2. modules/dnn/src/layers/cpu_kernels/convolution.cpp (123 changed lines)

modules/core/include/opencv2/core/hal/intrin.hpp
@@ -758,6 +758,36 @@ namespace CV__SIMD_NAMESPACE {
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
// when we use CV_SIMD128 with 256/512 bit SIMD (e.g. AVX2 or AVX512)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
// when we use CV_SIMD256 with 512 bit SIMD (e.g. AVX512)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_BIN_OP_LOGIC(_Tpvec) \
inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \
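The wrappers above extend the functional-style intrinsics (v_add, v_sub, ...) to the fixed-size vector types when the default SIMD width is wider than 128 bits. A minimal usage sketch, assuming an AVX2/AVX512 build where v_float32 is wider than v_float32x4; the function name is hypothetical and not part of the patch:

// Hypothetical sketch: without OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4) above,
// v_add() would only be declared for the default-width types on a 256/512-bit build.
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

static float add_first_lane(const float* a, const float* b)
{
#if CV_SIMD128
    v_float32x4 va = v_load(a);       // explicit 128-bit vectors
    v_float32x4 vb = v_load(b);
    v_float32x4 vc = v_add(va, vb);   // provided by the wrapper above
    return v_get0(vc);                // wrapped further down in this patch
#else
    return a[0] + b[0];
#endif
}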
@@ -785,6 +815,26 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64)
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x2)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x2)
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_uint64x4)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_LOGIC(v_int64x4)
#endif
#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \
@@ -805,17 +855,51 @@ namespace CV__SIMD_NAMESPACE {
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4)
#endif
#endif
-inline v_float32 v_div(const v_float32& a, const v_float32& b) \
+#define OPENCV_HAL_WRAP_BIN_OP_DIV(_Tpvec) \
+inline _Tpvec v_div(const _Tpvec& a, const _Tpvec& b) \
{ \
return a / b; \
}
+OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32)
#if CV_SIMD_64F
-inline v_float64 v_div(const v_float64& a, const v_float64& b) \
-{ \
-return a / b; \
-}
+OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64)
+#endif
+#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
+OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x4)
+#if CV_SIMD_64F
+OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x2)
+#endif
+#endif
+#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
+OPENCV_HAL_WRAP_BIN_OP_DIV(v_float32x8)
+#if CV_SIMD_64F
+OPENCV_HAL_WRAP_BIN_OP_DIV(v_float64x4)
+#endif
+#endif
#endif
#define OPENCV_HAL_WRAP_CMP_OP(_Tpvec, intrin, op) \
@@ -844,44 +928,124 @@ namespace CV__SIMD_NAMESPACE {
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_CMP(v_uint8x16)
OPENCV_HAL_WRAP_CMP(v_uint16x8)
OPENCV_HAL_WRAP_CMP(v_uint32x4)
OPENCV_HAL_WRAP_CMP(v_int8x16)
OPENCV_HAL_WRAP_CMP(v_int16x8)
OPENCV_HAL_WRAP_CMP(v_int32x4)
OPENCV_HAL_WRAP_CMP(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_CMP(v_uint8x32)
OPENCV_HAL_WRAP_CMP(v_uint16x16)
OPENCV_HAL_WRAP_CMP(v_uint32x8)
OPENCV_HAL_WRAP_CMP(v_int8x32)
OPENCV_HAL_WRAP_CMP(v_int16x16)
OPENCV_HAL_WRAP_CMP(v_int32x8)
OPENCV_HAL_WRAP_CMP(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_CMP(v_float64x4)
#endif
#endif
//////////// get0 ////////////
-#define OPENCV_HAL_WRAP_GRT0_INT(_Tpvec, _Tp) \
-inline _Tp v_get0(const v_##_Tpvec& v) \
+#define OPENCV_HAL_WRAP_GRT0(_Tpvec) \
+inline typename VTraits<_Tpvec>::lane_type v_get0(const _Tpvec& v) \
{ \
return v.get0(); \
}
-OPENCV_HAL_WRAP_GRT0_INT(uint8, uchar)
-OPENCV_HAL_WRAP_GRT0_INT(int8, schar)
-OPENCV_HAL_WRAP_GRT0_INT(uint16, ushort)
-OPENCV_HAL_WRAP_GRT0_INT(int16, short)
-OPENCV_HAL_WRAP_GRT0_INT(uint32, unsigned)
-OPENCV_HAL_WRAP_GRT0_INT(int32, int)
-OPENCV_HAL_WRAP_GRT0_INT(uint64, uint64)
-OPENCV_HAL_WRAP_GRT0_INT(int64, int64)
-OPENCV_HAL_WRAP_GRT0_INT(float32, float)
+OPENCV_HAL_WRAP_GRT0(v_uint8)
+OPENCV_HAL_WRAP_GRT0(v_int8)
+OPENCV_HAL_WRAP_GRT0(v_uint16)
+OPENCV_HAL_WRAP_GRT0(v_int16)
+OPENCV_HAL_WRAP_GRT0(v_uint32)
+OPENCV_HAL_WRAP_GRT0(v_int32)
+OPENCV_HAL_WRAP_GRT0(v_uint64)
+OPENCV_HAL_WRAP_GRT0(v_int64)
+OPENCV_HAL_WRAP_GRT0(v_float32)
#if CV_SIMD_64F
-OPENCV_HAL_WRAP_GRT0_INT(float64, double)
+OPENCV_HAL_WRAP_GRT0(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_GRT0(v_uint8x16)
OPENCV_HAL_WRAP_GRT0(v_uint16x8)
OPENCV_HAL_WRAP_GRT0(v_uint32x4)
OPENCV_HAL_WRAP_GRT0(v_uint64x2)
OPENCV_HAL_WRAP_GRT0(v_int8x16)
OPENCV_HAL_WRAP_GRT0(v_int16x8)
OPENCV_HAL_WRAP_GRT0(v_int32x4)
OPENCV_HAL_WRAP_GRT0(v_int64x2)
OPENCV_HAL_WRAP_GRT0(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_GRT0(v_uint8x32)
OPENCV_HAL_WRAP_GRT0(v_uint16x16)
OPENCV_HAL_WRAP_GRT0(v_uint32x8)
OPENCV_HAL_WRAP_GRT0(v_uint64x4)
OPENCV_HAL_WRAP_GRT0(v_int8x32)
OPENCV_HAL_WRAP_GRT0(v_int16x16)
OPENCV_HAL_WRAP_GRT0(v_int32x8)
OPENCV_HAL_WRAP_GRT0(v_int64x4)
OPENCV_HAL_WRAP_GRT0(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64x4)
#endif
#endif
-#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec, _Tp, vl) \
-inline _Tp v_extract_highest(const _Tpvec& v) \
+#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \
+inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
{ \
-return v_extract_n<vl-1>(v); \
+return v_extract_n<VTraits<_Tpvec>::nlanes-1>(v); \
}
-OPENCV_HAL_WRAP_EXTRACT(v_uint8, uchar, VTraits<v_uint8>::nlanes)
-OPENCV_HAL_WRAP_EXTRACT(v_int8, schar, VTraits<v_int8>::nlanes)
-OPENCV_HAL_WRAP_EXTRACT(v_uint16, ushort, VTraits<v_uint16>::nlanes)
-OPENCV_HAL_WRAP_EXTRACT(v_int16, short, VTraits<v_int16>::nlanes)
-OPENCV_HAL_WRAP_EXTRACT(v_uint32, unsigned int, VTraits<v_uint32>::nlanes)
-OPENCV_HAL_WRAP_EXTRACT(v_int32, int, VTraits<v_int32>::nlanes)
-OPENCV_HAL_WRAP_EXTRACT(v_uint64, uint64, VTraits<v_uint64>::nlanes)
-OPENCV_HAL_WRAP_EXTRACT(v_int64, int64, VTraits<v_int64>::nlanes)
-OPENCV_HAL_WRAP_EXTRACT(v_float32, float, VTraits<v_float32>::nlanes)
+OPENCV_HAL_WRAP_EXTRACT(v_uint8)
+OPENCV_HAL_WRAP_EXTRACT(v_int8)
+OPENCV_HAL_WRAP_EXTRACT(v_uint16)
+OPENCV_HAL_WRAP_EXTRACT(v_int16)
+OPENCV_HAL_WRAP_EXTRACT(v_uint32)
+OPENCV_HAL_WRAP_EXTRACT(v_int32)
+OPENCV_HAL_WRAP_EXTRACT(v_uint64)
+OPENCV_HAL_WRAP_EXTRACT(v_int64)
+OPENCV_HAL_WRAP_EXTRACT(v_float32)
#if CV_SIMD_64F
-OPENCV_HAL_WRAP_EXTRACT(v_float64, double, VTraits<v_float64>::nlanes)
+OPENCV_HAL_WRAP_EXTRACT(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_EXTRACT(v_uint8x16)
OPENCV_HAL_WRAP_EXTRACT(v_uint16x8)
OPENCV_HAL_WRAP_EXTRACT(v_uint32x4)
OPENCV_HAL_WRAP_EXTRACT(v_uint64x2)
OPENCV_HAL_WRAP_EXTRACT(v_int8x16)
OPENCV_HAL_WRAP_EXTRACT(v_int16x8)
OPENCV_HAL_WRAP_EXTRACT(v_int32x4)
OPENCV_HAL_WRAP_EXTRACT(v_int64x2)
OPENCV_HAL_WRAP_EXTRACT(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_EXTRACT(v_uint8x32)
OPENCV_HAL_WRAP_EXTRACT(v_uint16x16)
OPENCV_HAL_WRAP_EXTRACT(v_uint32x8)
OPENCV_HAL_WRAP_EXTRACT(v_uint64x4)
OPENCV_HAL_WRAP_EXTRACT(v_int8x32)
OPENCV_HAL_WRAP_EXTRACT(v_int16x16)
OPENCV_HAL_WRAP_EXTRACT(v_int32x8)
OPENCV_HAL_WRAP_EXTRACT(v_int64x4)
OPENCV_HAL_WRAP_EXTRACT(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64x4)
#endif
#endif
#define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \
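The get0 and extract_highest wrappers now take only the vector type and derive the lane type and lane count from VTraits, so one macro covers both the default-width aliases and the fixed-size types. A small sketch of that trait-based pattern, assuming a fixed-size (non-scalable) vector type; the helper name is hypothetical:

// Hypothetical helper mirroring the VTraits-based macros above.
#include <opencv2/core/hal/intrin.hpp>

template <typename Tvec>
static typename cv::VTraits<Tvec>::lane_type last_lane(const Tvec& v)
{
    // nlanes is a compile-time constant for fixed-size vector types.
    return cv::v_extract_n<cv::VTraits<Tvec>::nlanes - 1>(v);
}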
@@ -893,7 +1057,16 @@ namespace CV__SIMD_NAMESPACE {
OPENCV_HAL_WRAP_BROADCAST(v_uint32)
OPENCV_HAL_WRAP_BROADCAST(v_int32)
OPENCV_HAL_WRAP_BROADCAST(v_float32)
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BROADCAST(v_uint32x4)
OPENCV_HAL_WRAP_BROADCAST(v_int32x4)
OPENCV_HAL_WRAP_BROADCAST(v_float32x4)
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BROADCAST(v_uint32x8)
OPENCV_HAL_WRAP_BROADCAST(v_int32x8)
OPENCV_HAL_WRAP_BROADCAST(v_float32x8)
#endif
#endif //!CV_SIMD_SCALABLE

modules/dnn/src/layers/cpu_kernels/convolution.cpp
@@ -1028,11 +1028,10 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
{
for (; j + 7 < out_width; j += 8)
{
-v_float32x4 v0 = v_load(cptr + j) + vbias;
-v_float32x4 v1 = v_load(cptr + j + 4) + vbias;
-v0 += v_load(pbptr + j);
-v1 += v_load(pbptr + j + 4);
+v_float32x4 v0 = v_add(v_load(cptr + j), vbias);
+v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias);
+v0 = v_add(v0, v_load(pbptr + j));
+v1 = v_add(v1, v_load(pbptr + j + 4));
if (ifMinMaxAct)
{
@@ -1048,8 +1047,8 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr<FastConv>& co
{
for (; j + 7 < out_width; j += 8)
{
-v_float32x4 v0 = v_load(cptr + j) + vbias;
-v_float32x4 v1 = v_load(cptr + j + 4) + vbias;
+v_float32x4 v0 = v_add(v_load(cptr + j), vbias);
+v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias);
if (ifMinMaxAct)
{
@@ -1154,13 +1153,13 @@ static void convBlockMR1x28(int np, const float* a, const float* b, float *c, co
if (init_c)
{
-c0 += v_load(c);
-c1 += v_load(c + 4);
-c2 += v_load(c + 8);
-c3 += v_load(c + 12);
-c4 += v_load(c + 16);
-c5 += v_load(c + 20);
-c6 += v_load(c + 24);
+c0 = v_add(c0, v_load(c));
+c1 = v_add(c1, v_load(c + 4));
+c2 = v_add(c2, v_load(c + 8));
+c3 = v_add(c3, v_load(c + 12));
+c4 = v_add(c4, v_load(c + 16));
+c5 = v_add(c5, v_load(c + 20));
+c6 = v_add(c6, v_load(c + 24));
}
if (ifMinMaxAct)
@@ -1207,12 +1206,12 @@ static void convBlockMR1x24(int np, const float* a, const float* b, float *c, co
if (init_c)
{
-c0 += v_load(c);
-c1 += v_load(c + 4);
-c2 += v_load(c + 8);
-c3 += v_load(c + 12);
-c4 += v_load(c + 16);
-c5 += v_load(c + 20);
+c0 = v_add(c0, v_load(c));
+c1 = v_add(c1, v_load(c + 4));
+c2 = v_add(c2, v_load(c + 8));
+c3 = v_add(c3, v_load(c + 12));
+c4 = v_add(c4, v_load(c + 16));
+c5 = v_add(c5, v_load(c + 20));
}
if (ifMinMaxAct)
@@ -1251,9 +1250,9 @@ static void convBlockMR1x12(int np, const float* a, const float* b, float *c, co
if (init_c)
{
-c0 += v_load(c);
-c1 += v_load(c + 4);
-c2 += v_load(c + 8);
+c0 = v_add(c0, v_load(c));
+c1 = v_add(c1, v_load(c + 4));
+c2 = v_add(c2, v_load(c + 8));
}
if (ifMinMaxAct)
@@ -1343,33 +1342,33 @@ static void convBlock4x24(int np, const float* a, const float* b, float* c, int
if (!init_c)
{
-c0 += v_load(c);
-c1 += v_load(c + 4);
-c2 += v_load(c + 8);
-c3 += v_load(c + 12);
-c4 += v_load(c + 16);
-c5 += v_load(c + 20);
-c6 += v_load(c + ldc);
-c7 += v_load(c + ldc + 4);
-c8 += v_load(c + ldc + 8);
-c9 += v_load(c + ldc + 12);
-c10 += v_load(c + ldc + 16);
-c11 += v_load(c + ldc + 20);
-c12 += v_load(c + ldc*2);
-c13 += v_load(c + ldc*2 + 4);
-c14 += v_load(c + ldc*2 + 8);
-c15 += v_load(c + ldc*2 + 12);
-c16 += v_load(c + ldc*2 + 16);
-c17 += v_load(c + ldc*2 + 20);
-c18 += v_load(c + ldc*3);
-c19 += v_load(c + ldc*3 + 4);
-c20 += v_load(c + ldc*3 + 8);
-c21 += v_load(c + ldc*3 + 12);
-c22 += v_load(c + ldc*3 + 16);
-c23 += v_load(c + ldc*3 + 20);
+c0 = v_add(c0, v_load(c));
+c1 = v_add(c1, v_load(c + 4));
+c2 = v_add(c2, v_load(c + 8));
+c3 = v_add(c3, v_load(c + 12));
+c4 = v_add(c4, v_load(c + 16));
+c5 = v_add(c5, v_load(c + 20));
+c6 = v_add(c6, v_load(c + ldc));
+c7 = v_add(c7, v_load(c + ldc + 4));
+c8 = v_add(c8, v_load(c + ldc + 8));
+c9 = v_add(c9, v_load(c + ldc + 12));
+c10 = v_add(c10, v_load(c + ldc + 16));
+c11 = v_add(c11, v_load(c + ldc + 20));
+c12 = v_add(c12, v_load(c + ldc*2));
+c13 = v_add(c13, v_load(c + ldc*2 + 4));
+c14 = v_add(c14, v_load(c + ldc*2 + 8));
+c15 = v_add(c15, v_load(c + ldc*2 + 12));
+c16 = v_add(c16, v_load(c + ldc*2 + 16));
+c17 = v_add(c17, v_load(c + ldc*2 + 20));
+c18 = v_add(c18, v_load(c + ldc*3));
+c19 = v_add(c19, v_load(c + ldc*3 + 4));
+c20 = v_add(c20, v_load(c + ldc*3 + 8));
+c21 = v_add(c21, v_load(c + ldc*3 + 12));
+c22 = v_add(c22, v_load(c + ldc*3 + 16));
+c23 = v_add(c23, v_load(c + ldc*3 + 20));
}
v_store(c, c0);
@@ -1431,17 +1430,17 @@ static void convBlock4x8(int np, const float* a, const float* b, float* c, int l
if (!init_c)
{
-c0 += v_load(c);
-c1 += v_load(c + 4);
-c2 += v_load(c + ldc);
-c3 += v_load(c + ldc + 4);
-c4 += v_load(c + ldc*2);
-c5 += v_load(c + ldc*2 + 4);
-c6 += v_load(c + ldc*3);
-c7 += v_load(c + ldc*3 + 4);
+c0 = v_add(c0, v_load(c));
+c1 = v_add(c1, v_load(c + 4));
+c2 = v_add(c2, v_load(c + ldc));
+c3 = v_add(c3, v_load(c + ldc + 4));
+c4 = v_add(c4, v_load(c + ldc*2));
+c5 = v_add(c5, v_load(c + ldc*2 + 4));
+c6 = v_add(c6, v_load(c + ldc*3));
+c7 = v_add(c7, v_load(c + ldc*3 + 4));
}
v_store(c, c0);
@@ -1476,10 +1475,10 @@ static void convBlock4x4(int np, const float* a, const float* b, float* c, int l
if (!init_c)
{
-c0 += v_load(c);
-c1 += v_load(c + ldc);
-c2 += v_load(c + ldc*2);
-c3 += v_load(c + ldc*3);
+c0 = v_add(c0, v_load(c));
+c1 = v_add(c1, v_load(c + ldc));
+c2 = v_add(c2, v_load(c + ldc*2));
+c3 = v_add(c3, v_load(c + ldc*3));
}
v_store(c, c0);
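In the convolution kernels the overloaded operators on v_float32x4 (+, +=) are replaced with explicit v_add calls; the functional form is the one used throughout the compatibility layer above, and presumably the one that remains valid on backends without operator overloads. A condensed sketch of the updated accumulation pattern, with a hypothetical helper name, simplified from the hunks above:

// Hypothetical sketch of the pattern; the real code is in runFastConv/convBlock*.
#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

static void accumulate4(float* acc, const float* src)
{
#if CV_SIMD128
    v_float32x4 a = v_load(acc);
    a = v_add(a, v_load(src));   // previously written as `a += v_load(src);`
    v_store(acc, a);
#else
    for (int i = 0; i < 4; ++i) acc[i] += src[i];
#endif
}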
