From eba158fb0c68b98d785746a4538a54253fcbddac Mon Sep 17 00:00:00 2001
From: HAN Liutong
Date: Sun, 31 Mar 2024 21:47:06 +0800
Subject: [PATCH] Merge pull request #25230 from hanliutong/rvv-conv

Optimize int8 layers in DNN modules by using RISC-V Vector intrinsic. #25230

This patch optimizes 3 functions in the int8 layers by using RVV native
intrinsics.

The patch was tested in QEMU with VLEN=128 and VLEN=256 by running
`./bin/opencv_test_dnn --gtest_filter="*Int8*"`. On a real device (k230,
VLEN=128), `EfficientDet_int8` in `opencv_perf_dnn` showed a performance
improvement of 1.46x.

| Name of Test                               | Original | Optimized | Speed-up |
| ------------------------------------------ | -------- | --------- | -------- |
| EfficientDet_int8::DNNTestNetwork::OCV/CPU | 2843.467 | 1947.013  | 1.46     |

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [ ] I agree to contribute to the project under Apache 2 License.
- [ ] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
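All three kernels lean on the same idiom: a strip-mined int8 dot product that
widens through int16 into int32 accumulators and finishes with a vector
reduction. The sketch below shows that idiom in isolation; it is illustrative
only (the function name `dot_i8` is made up and is not part of the patch) and
assumes an RVV 1.0 intrinsics toolchain (`__riscv_v_intrinsic >= 11000`, e.g.
GCC/Clang with `-march=rv64gcv`):

```cpp
#include <riscv_vector.h>
#include <stdint.h>

// Dot product of two int8 arrays with an int32 result, strip-mined so it
// works for any n and any VLEN.
int32_t dot_i8(const int8_t* a, const int8_t* b, int n)
{
    size_t vlmax = __riscv_vsetvlmax_e32m4();
    vint32m4_t acc = __riscv_vmv_v_x_i32m4(0, vlmax);   // zero all lanes once
    for (int k = 0; k < n; )
    {
        size_t vl = __riscv_vsetvl_e8m1(n - k);         // lanes in this strip
        vint8m1_t va = __riscv_vle8_v_i8m1(a + k, vl);
        vint8m1_t vb = __riscv_vle8_v_i8m1(b + k, vl);
        // widen i8*i8 -> i16, then widening add into the i32 accumulators;
        // the tail-undisturbed (_tu) form keeps lanes past vl intact
        vint16m2_t prod = __riscv_vwmul_vv_i16m2(va, vb, vl);
        acc = __riscv_vwadd_wv_i32m4_tu(acc, acc, prod, vl);
        k += (int)vl;
    }
    vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, vlmax);
    return __riscv_vmv_x_s_i32m1_i32(
        __riscv_vredsum_vs_i32m4_i32m1(acc, zero, vlmax));
}
```

The patch's `fastConv` and `fastGEMM1T` apply the same pattern, except that
the widening multiply-accumulate is wrapped in a small helper and many such
accumulators are kept live at once so that each loaded row is reused across
several outputs.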
---
 modules/dnn/CMakeLists.txt                    |   2 +-
 .../dnn/src/int8layers/convolution_layer.cpp  |  17 +-
 .../src/int8layers/fully_connected_layer.cpp  |   9 +-
 .../dnn/src/int8layers/layers_common.simd.hpp | 435 ++++++++++++++++++
 4 files changed, 460 insertions(+), 3 deletions(-)

diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt
index 562b14483c..3b66b460d9 100644
--- a/modules/dnn/CMakeLists.txt
+++ b/modules/dnn/CMakeLists.txt
@@ -5,7 +5,7 @@ endif()
 set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass")
 ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX)
-ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX LASX)
+ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX RVV LASX)
 ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_block" AVX AVX2 NEON NEON_FP16)
 ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_depthwise" AVX AVX2 RVV LASX)
 ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_winograd_f63" AVX AVX2 NEON_FP16)
diff --git a/modules/dnn/src/int8layers/convolution_layer.cpp b/modules/dnn/src/int8layers/convolution_layer.cpp
index 603100ae11..25132542cd 100644
--- a/modules/dnn/src/int8layers/convolution_layer.cpp
+++ b/modules/dnn/src/int8layers/convolution_layer.cpp
@@ -702,13 +702,14 @@ public:
         bool useAVX2;
         bool useAVX512;
         bool useLASX;
+        bool useRVV;
         int blk_size_cn;
         int inpZp, outZp;
         const std::vector<float>* multiplier;

         ParallelConv()
             : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
-              biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false), useLASX(false)
+              biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false), useLASX(false), useRVV(false)
             , blk_size_cn(0), inpZp(0), outZp(0), multiplier(0)
         {}
@@ -765,6 +766,7 @@ public:
         p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D;
         p.useLASX = checkHardwareSupport(CPU_LASX) && isConv2D;
+        p.useRVV = checkHardwareSupport(CPU_RVV) && isConv2D;

         int kernel_d = isConv3D? kernel_size[0] : 1;
         int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
@@ -970,6 +972,13 @@ public:
                             biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
                     else
                 #endif
+                #if CV_TRY_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=11000
+                    if(useRVV)
+                        opt_RVV::fastDepthwiseConv(wptr, kernel_h, kernel_w,
+                            stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
+                            biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
+                    else
+                #endif
                 #if CV_RVP052
                     if(isConv2D)
                         opt_RVP052::fastDepthwiseConv(wptr, kernel_h, kernel_w,
@@ -1356,6 +1365,12 @@ public:
                                   outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
                 else
             #endif
+            #if CV_TRY_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=11000
+                if(useRVV)
+                    opt_RVV::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
+                                      outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
+                else
+            #endif
             #if CV_RVP052
                 if(isConv2D)
                     opt_RVP052::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
diff --git a/modules/dnn/src/int8layers/fully_connected_layer.cpp b/modules/dnn/src/int8layers/fully_connected_layer.cpp
index 6691128990..105b2dbaac 100644
--- a/modules/dnn/src/int8layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/int8layers/fully_connected_layer.cpp
@@ -228,7 +228,7 @@ public:
     {
     public:
         FullyConnected() : srcMat(0), weights(0), biasMat(0), outputMultiplier(0), activationLUT(0), activ(0),
-                           dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false), useLASX(false) {}
+                           dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false), useLASX(false), useRVV(false) {}

         static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat, const Mat& outputMultiplier,
                         const Mat& activationLUT, Mat& dstMat, const ActivationLayerInt8* activ, int nstripes, int outZp)
@@ -253,6 +253,7 @@
             p.useAVX2 = checkHardwareSupport(CPU_AVX2);
             p.useAVX512
= CV_CPU_HAS_SUPPORT_AVX512_SKX; p.useLASX = checkHardwareSupport(CPU_LASX); + p.useRVV = checkHardwareSupport(CPU_RVV); parallel_for_(Range(0, nstripes), p, nstripes); } @@ -303,6 +304,11 @@ public: opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp ); else #endif + #if CV_TRY_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=11000 + if( useRVV) + opt_RVV::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp ); + else + #endif #if CV_RVP052 if( 1 ) opt_RVP052::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp ); @@ -363,6 +369,7 @@ public: bool useAVX2; bool useAVX512; bool useLASX; + bool useRVV; }; void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE diff --git a/modules/dnn/src/int8layers/layers_common.simd.hpp b/modules/dnn/src/int8layers/layers_common.simd.hpp index 1b3ac7a4b8..7f9dca505e 100644 --- a/modules/dnn/src/int8layers/layers_common.simd.hpp +++ b/modules/dnn/src/int8layers/layers_common.simd.hpp @@ -1257,5 +1257,440 @@ void fastGEMM1T( const int8_t* vec, const int8_t* weights, } #endif // CV_LASX +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=11000 + +static const size_t __cv_rvv_e8m1_max = __riscv_vsetvlmax_e8m1(); +static const size_t __cv_rvv_e16m1_max = __riscv_vsetvlmax_e16m1(); +static const size_t __cv_rvv_e32m2_max = __riscv_vsetvlmax_e32m2(); + +inline vint32m2_t __riscv_vwmacc_vv_i32m2(vint32m2_t& dst, const vint8m1_t& a, const vint8m1_t& b, size_t vl) { + vint16m2_t tmp = __riscv_vwmul(a, b, vl); + dst = __riscv_vwadd_wv_i32m2_tu(dst, dst, __riscv_vget_i16m1(tmp, 0), vl); + dst = __riscv_vwadd_wv_i32m2_tu(dst, dst, __riscv_vget_i16m1(tmp, 1), vl > __cv_rvv_e16m1_max ? 
vl - __cv_rvv_e16m1_max : 0); + return dst; +} + +enum { FASCONV_BASE_VECSZ = 4 }; +void fastConv( const int8_t* weights, size_t wstep, const int* bias, + const int8_t* rowbuf, int* output, const int* outShape, + int blockSize, int vecsize, int vecsize_aligned, int outZp, + const float* multiplier, bool initOutput, bool finalOutput ) +{ + const size_t e8m1 = __cv_rvv_e8m1_max; + int outCn = outShape[1]; + size_t outPlaneSize = outShape[2]*outShape[3]; + // now compute dot product of the weights + // and im2row-transformed part of the tensor + for( int i = 0; i < outCn; i += 3 ) + { + int unroll_tail = FASCONV_BASE_VECSZ; + const int8_t* wptr0 = weights + i*wstep; + const int8_t* wptr1 = wptr0 + wstep; + const int8_t* wptr2 = wptr1 + wstep; + int* outptr0 = output + i*outPlaneSize; + int* outptr1 = outptr0 + outPlaneSize; + int* outptr2 = outptr1 + outPlaneSize; + int bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2]; + float mult0 = multiplier[i], mult1 = multiplier[i+1], mult2 = multiplier[i+2]; + + if( i+2 >= outCn ) + { + wptr2 = wptr1; + outptr2 = outptr1; + bias2 = bias1; + mult2 = mult1; + if( i+1 >= outCn ) + { + wptr2 = wptr1 = wptr0; + outptr2 = outptr1 = outptr0; + bias2 = bias1 = bias0; + mult2 = mult1 = mult0; + } + } + + int j = 0; + for( ; j < blockSize; j += FASCONV_BASE_VECSZ ) + { + const int8_t* rptr = rowbuf + j*vecsize_aligned; + const int8_t *rptr1 = rptr + vecsize_aligned*1, + *rptr2 = rptr + vecsize_aligned*2, + *rptr3 = rptr + vecsize_aligned*3; + + if (j + FASCONV_BASE_VECSZ > blockSize) + { + unroll_tail = blockSize - j; + rptr1 = rptr + vecsize_aligned*std::min(1, unroll_tail-1); + rptr2 = rptr + vecsize_aligned*std::min(2, unroll_tail-1); + rptr3 = rptr + vecsize_aligned*std::min(3, unroll_tail-1); + } + + int vl, avl = vecsize; + + vint32m2_t + vs00 = __riscv_vmv_v_x_i32m2(0, e8m1), vs10 = __riscv_vmv_v_x_i32m2(0, e8m1), vs20 = __riscv_vmv_v_x_i32m2(0, e8m1), + vs01 = __riscv_vmv_v_x_i32m2(0, e8m1), vs11 = __riscv_vmv_v_x_i32m2(0, e8m1), vs21 = __riscv_vmv_v_x_i32m2(0, e8m1), + vs02 = __riscv_vmv_v_x_i32m2(0, e8m1), vs12 = __riscv_vmv_v_x_i32m2(0, e8m1), vs22 = __riscv_vmv_v_x_i32m2(0, e8m1), + vs03 = __riscv_vmv_v_x_i32m2(0, e8m1), vs13 = __riscv_vmv_v_x_i32m2(0, e8m1), vs23 = __riscv_vmv_v_x_i32m2(0, e8m1); + for (int k = 0; k < vecsize; k += vl, avl -= vl) + { + vl = __riscv_vsetvl_e8m1(avl); + + vint8m1_t w0 = (__riscv_vle8_v_i8m1(wptr0 + k, vl)); + vint8m1_t w1 = (__riscv_vle8_v_i8m1(wptr1 + k, vl)); + vint8m1_t w2 = (__riscv_vle8_v_i8m1(wptr2 + k, vl)); + vint8m1_t r0 = (__riscv_vle8_v_i8m1(rptr, vl)); + + + vs00 = __riscv_vwmacc_vv_i32m2(vs00, w0, r0, vl); + vs10 = __riscv_vwmacc_vv_i32m2(vs10, w1, r0, vl); + vs20 = __riscv_vwmacc_vv_i32m2(vs20, w2, r0, vl); + + r0 = (__riscv_vle8_v_i8m1(rptr1, vl)); + vs01 = __riscv_vwmacc_vv_i32m2(vs01, w0, r0, vl); + vs11 = __riscv_vwmacc_vv_i32m2(vs11, w1, r0, vl); + vs21 = __riscv_vwmacc_vv_i32m2(vs21, w2, r0, vl); + + r0 = (__riscv_vle8_v_i8m1(rptr2, vl)); + vs02 = __riscv_vwmacc_vv_i32m2(vs02, w0, r0, vl); + vs12 = __riscv_vwmacc_vv_i32m2(vs12, w1, r0, vl); + vs22 = __riscv_vwmacc_vv_i32m2(vs22, w2, r0, vl); + + r0 = (__riscv_vle8_v_i8m1(rptr3, vl)); + vs03 = __riscv_vwmacc_vv_i32m2(vs03, w0, r0, vl); + vs13 = __riscv_vwmacc_vv_i32m2(vs13, w1, r0, vl); + vs23 = __riscv_vwmacc_vv_i32m2(vs23, w2, r0, vl); + + rptr += vl; rptr1 += vl; rptr2 += vl; rptr3 += vl; + } + + // compute sum of each vs + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, e8m1); + int sum0[FASCONV_BASE_VECSZ], sum1[FASCONV_BASE_VECSZ], 
sum2[FASCONV_BASE_VECSZ]; + + sum0[0] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs00, zero, e8m1)); + sum0[1] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs01, zero, e8m1)); + sum0[2] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs02, zero, e8m1)); + sum0[3] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs03, zero, e8m1)); + + sum1[0] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs10, zero, e8m1)); + sum1[1] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs11, zero, e8m1)); + sum1[2] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs12, zero, e8m1)); + sum1[3] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs13, zero, e8m1)); + + sum2[0] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs20, zero, e8m1)); + sum2[1] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs21, zero, e8m1)); + sum2[2] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs22, zero, e8m1)); + sum2[3] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs23, zero, e8m1)); + + vint32m1_t s0, s1, s2; + if( initOutput ) + { + s0 = __riscv_vmv_v_x_i32m1(bias0, unroll_tail); + s1 = __riscv_vmv_v_x_i32m1(bias1, unroll_tail); + s2 = __riscv_vmv_v_x_i32m1(bias2, unroll_tail); + } + else + { + s0 = __riscv_vle32_v_i32m1(outptr0 + j, unroll_tail); + s1 = __riscv_vle32_v_i32m1(outptr1 + j, unroll_tail); + s2 = __riscv_vle32_v_i32m1(outptr2 + j, unroll_tail); + } + s0 = __riscv_vadd(__riscv_vle32_v_i32m1(sum0, unroll_tail), s0, unroll_tail); + s1 = __riscv_vadd(__riscv_vle32_v_i32m1(sum1, unroll_tail), s1, unroll_tail); + s2 = __riscv_vadd(__riscv_vle32_v_i32m1(sum2, unroll_tail), s2, unroll_tail); + + if( finalOutput ) + { + s0 = __riscv_vadd(__riscv_vfcvt_x_f_v_i32m1(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m1(s0, unroll_tail), mult0, unroll_tail), unroll_tail), outZp, unroll_tail); + s1 = __riscv_vadd(__riscv_vfcvt_x_f_v_i32m1(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m1(s1, unroll_tail), mult1, unroll_tail), unroll_tail), outZp, unroll_tail); + s2 = __riscv_vadd(__riscv_vfcvt_x_f_v_i32m1(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m1(s2, unroll_tail), mult2, unroll_tail), unroll_tail), outZp, unroll_tail); + + s0 = __riscv_vmin(__riscv_vmax(s0, -128, unroll_tail), 127, unroll_tail); + s1 = __riscv_vmin(__riscv_vmax(s1, -128, unroll_tail), 127, unroll_tail); + s2 = __riscv_vmin(__riscv_vmax(s2, -128, unroll_tail), 127, unroll_tail); + } + + __riscv_vse32(outptr0 + j, s0, unroll_tail); + __riscv_vse32(outptr1 + j, s1, unroll_tail); + __riscv_vse32(outptr2 + j, s2, unroll_tail); + } + } +} + +void fastDepthwiseConv( const int8_t* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const int* biasptr, const float* multptr, + const int8_t* inptr_, + int height, int width, + int* outptr_, + int out_d, int outH, int outW, + int inpZp, int outZp) +{ + int vl; + const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], + w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], + w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; + int outW1 = std::min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); + float mult = multptr[out_d]; + int bias = biasptr[out_d]; + int biasCopy; + + for (int out_i = 0; out_i < outH; out_i++) + { + int in_i = out_i * stride_h - pad_t, out_j = 0; + const int8_t* imgptr0 = inptr_ + in_i*width; + const int8_t* imgptr1 = imgptr0 + dilation_h*width; + const int8_t* imgptr2 = imgptr0 + (dilation_h*2)*width; + int8_t w00 = w00_, w01 = w01_, w02 = w02_; + int8_t w20 = w20_, w21 = w21_, w22 = w22_; + int out, out1; + biasCopy = bias; + if (in_i < 0) + { + 
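+                // The whole top kernel row falls in the padded border here. Padded
+                // int8 samples carry the input zero point inpZp, so that row's
+                // contribution inpZp*(w00 + w01 + w02) is folded into the bias,
+                // its weights are zeroed, and imgptr0 is aliased to imgptr1.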
biasCopy += inpZp * (w00 + w01 + w02); + w00 = w01 = w02 = 0; + imgptr0 = imgptr1; + } + else if (in_i + dilation_h*(kernel_h-1) >= height) + { + biasCopy += inpZp * (w20 + w21 + w22); + w20 = w21 = w22 = 0; + imgptr2 = imgptr1; + } + int* outptr = outptr_ + out_i*outW; + if (pad_l > 0) + { + out = (int)imgptr0[0]*w01 + (int)imgptr0[dilation_w]*w02 + + (int)imgptr1[0]*w11 + (int)imgptr1[dilation_w]*w12 + + (int)imgptr2[0]*w21 + (int)imgptr2[dilation_w]*w22 + + biasCopy + inpZp*(w00 + w10 + w20); + out1 = outZp + (int)std::round(out*mult); + outptr[0] = std::min(std::max(out1, -128), 127); + out_j = 1; + } + if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) + { + int avl = outW1 - out_j; + if( stride_w == 1 ) + for( ; out_j < outW1; out_j += vl, avl -= vl) + { + vl = __riscv_vsetvl_e8m2(avl); + int in_j = out_j * stride_w - pad_l; + + vint32m8_t vout = __riscv_vmv_v_x_i32m8(biasCopy, vl); + vout = __riscv_vwmacc(vout, w00, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr0 + in_j , vl), vl), vl); + vout = __riscv_vwmacc(vout, w01, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr0 + in_j + dilation_w , vl), vl), vl); + vout = __riscv_vwmacc(vout, w02, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr0 + in_j + dilation_w*2, vl), vl), vl); + vout = __riscv_vwmacc(vout, w10, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr1 + in_j , vl), vl), vl); + vout = __riscv_vwmacc(vout, w11, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr1 + in_j + dilation_w , vl), vl), vl); + vout = __riscv_vwmacc(vout, w12, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr1 + in_j + dilation_w*2, vl), vl), vl); + vout = __riscv_vwmacc(vout, w20, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr2 + in_j , vl), vl), vl); + vout = __riscv_vwmacc(vout, w21, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr2 + in_j + dilation_w , vl), vl), vl); + vout = __riscv_vwmacc(vout, w22, __riscv_vwcvt_x_x_v_i16m4(__riscv_vle8_v_i8m2(imgptr2 + in_j + dilation_w*2, vl), vl), vl); + + vout = __riscv_vfcvt_x(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m8(vout, vl), mult, vl), vl); + vout = __riscv_vadd(vout, outZp, vl); + vout = __riscv_vmin(__riscv_vmax(vout, -128, vl), 127, vl); + + __riscv_vse32_v_i32m8(outptr + out_j, vout, vl); + + } + else //stride_w == 2 && dilation_w == 1; + { + for( ; out_j < outW1; out_j += vl, avl -= vl) + { + vl = __riscv_vsetvl_e8m2(avl); + int in_j = out_j * stride_w - pad_l; + + vint32m8_t vout = __riscv_vmv_v_x_i32m8(biasCopy, vl); + + vout = __riscv_vwmacc(vout, w00, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr0+in_j , 2, vl), vl), vl); + vout = __riscv_vwmacc(vout, w01, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr0+in_j+1, 2, vl), vl), vl); + vout = __riscv_vwmacc(vout, w02, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr0+in_j+2, 2, vl), vl), vl); + vout = __riscv_vwmacc(vout, w10, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr1+in_j , 2, vl), vl), vl); + vout = __riscv_vwmacc(vout, w11, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr1+in_j+1, 2, vl), vl), vl); + vout = __riscv_vwmacc(vout, w12, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr1+in_j+2, 2, vl), vl), vl); + vout = __riscv_vwmacc(vout, w20, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr2+in_j , 2, vl), vl), vl); + vout = __riscv_vwmacc(vout, w21, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr2+in_j+1, 2, vl), vl), vl); + vout = __riscv_vwmacc(vout, w22, __riscv_vwcvt_x_x_v_i16m4(__riscv_vlse8_v_i8m2(imgptr2+in_j+2, 2, vl), vl), vl); + + vout = 
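+                        // requantize: scale the int32 accumulator by the float
+                        // multiplier, round to nearest, add the output zero point,
+                        // then clamp to the int8 range [-128, 127]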
__riscv_vfcvt_x(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m8(vout, vl), mult, vl), vl); + vout = __riscv_vadd(vout, outZp, vl); + vout = __riscv_vmin(__riscv_vmax(vout, -128, vl), 127, vl); + + __riscv_vse32_v_i32m8(outptr + out_j, vout, vl); + } + } + } + + for (; out_j < outW1; out_j++) + { + int in_j = out_j * stride_w - pad_l; + out = (int)imgptr0[in_j]*w00 + (int)imgptr0[in_j + dilation_w]*w01 + (int)imgptr0[in_j + dilation_w*2]*w02 + + (int)imgptr1[in_j]*w10 + (int)imgptr1[in_j + dilation_w]*w11 + (int)imgptr1[in_j + dilation_w*2]*w12 + + (int)imgptr2[in_j]*w20 + (int)imgptr2[in_j + dilation_w]*w21 + (int)imgptr2[in_j + dilation_w*2]*w22 + biasCopy; + outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127); + } + + for (; out_j < outW; out_j++ ) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; + int s0 = 1, s1 = 1, s2 = 1; + if (in_j0 >= width) + { + in_j0 = 0; + s0 = 0; + biasCopy += inpZp*(w00 + w10 + w20); + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0; + biasCopy += inpZp*(w01 + w11 + w21); + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0; + biasCopy += inpZp*(w02 + w12 + w22); + } + out = (int)imgptr0[in_j0]*w00*s0 + (int)imgptr0[in_j1]*w01*s1 + (int)imgptr0[in_j2]*w02*s2 + + (int)imgptr1[in_j0]*w10*s0 + (int)imgptr1[in_j1]*w11*s1 + (int)imgptr1[in_j2]*w12*s2 + + (int)imgptr2[in_j0]*w20*s0 + (int)imgptr2[in_j1]*w21*s1 + (int)imgptr2[in_j2]*w22*s2 + biasCopy; + outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127); + } + } +} + +void fastGEMM1T( const int8_t* vec, const int8_t* weights, + size_t wstep, const int* bias, const float* multiplier, + int* dst, int nvecs, int vecsize, int outZp ) +{ + int i = 0; + for( ; i <= nvecs - 15; i += 15 ) + { + const int8_t* wptr = weights + i*wstep; + vint32m2_t + vs0 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs1 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), + vs2 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs3 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), + vs4 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs5 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), + vs6 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs7 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), + vs8 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs9 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), + vs10 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs11 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), + vs12 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs13 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), + vs14 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max); + int avl = vecsize, vl; + for(int k = 0 ; k < vecsize; k += vl, wptr += vl, avl -= vl) + { + vl = __riscv_vsetvl_e8m1(avl); + vint8m1_t v = __riscv_vle8_v_i8m1(vec + k, vl); + + vs0 = __riscv_vwmacc_vv_i32m2(vs0, __riscv_vle8_v_i8m1(wptr, vl), v, vl); + vs1 = __riscv_vwmacc_vv_i32m2(vs1, __riscv_vle8_v_i8m1(wptr + wstep, vl), v, vl); + vs2 = __riscv_vwmacc_vv_i32m2(vs2, __riscv_vle8_v_i8m1(wptr + wstep*2, vl), v, vl); + vs3 = __riscv_vwmacc_vv_i32m2(vs3, __riscv_vle8_v_i8m1(wptr + wstep*3, vl), v, vl); + vs4 = __riscv_vwmacc_vv_i32m2(vs4, __riscv_vle8_v_i8m1(wptr + wstep*4, vl), v, vl); + vs5 = __riscv_vwmacc_vv_i32m2(vs5, __riscv_vle8_v_i8m1(wptr + wstep*5, vl), v, vl); + vs6 = __riscv_vwmacc_vv_i32m2(vs6, __riscv_vle8_v_i8m1(wptr + wstep*6, vl), v, vl); + vs7 = __riscv_vwmacc_vv_i32m2(vs7, __riscv_vle8_v_i8m1(wptr + wstep*7, vl), v, vl); + vs8 = __riscv_vwmacc_vv_i32m2(vs8, 
__riscv_vle8_v_i8m1(wptr + wstep*8, vl), v, vl); + vs9 = __riscv_vwmacc_vv_i32m2(vs9, __riscv_vle8_v_i8m1(wptr + wstep*9, vl), v, vl); + vs10 = __riscv_vwmacc_vv_i32m2(vs10, __riscv_vle8_v_i8m1(wptr + wstep*10, vl), v, vl); + vs11 = __riscv_vwmacc_vv_i32m2(vs11, __riscv_vle8_v_i8m1(wptr + wstep*11, vl), v, vl); + vs12 = __riscv_vwmacc_vv_i32m2(vs12, __riscv_vle8_v_i8m1(wptr + wstep*12, vl), v, vl); + vs13 = __riscv_vwmacc_vv_i32m2(vs13, __riscv_vle8_v_i8m1(wptr + wstep*13, vl), v, vl); + vs14 = __riscv_vwmacc_vv_i32m2(vs14, __riscv_vle8_v_i8m1(wptr + wstep*14, vl), v, vl); + } + + int sum[15]; + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, __cv_rvv_e32m2_max); + sum[0] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs0, zero, __cv_rvv_e32m2_max)); + sum[1] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs1, zero, __cv_rvv_e32m2_max)); + sum[2] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs2, zero, __cv_rvv_e32m2_max)); + sum[3] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs3, zero, __cv_rvv_e32m2_max)); + sum[4] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs4, zero, __cv_rvv_e32m2_max)); + sum[5] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs5, zero, __cv_rvv_e32m2_max)); + sum[6] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs6, zero, __cv_rvv_e32m2_max)); + sum[7] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs7, zero, __cv_rvv_e32m2_max)); + sum[8] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs8, zero, __cv_rvv_e32m2_max)); + sum[9] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs9, zero, __cv_rvv_e32m2_max)); + sum[10] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs10, zero, __cv_rvv_e32m2_max)); + sum[11] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs11, zero, __cv_rvv_e32m2_max)); + sum[12] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs12, zero, __cv_rvv_e32m2_max)); + sum[13] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs13, zero, __cv_rvv_e32m2_max)); + sum[14] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs14, zero, __cv_rvv_e32m2_max)); + + vint32m4_t s0 = __riscv_vadd(__riscv_vle32_v_i32m4(sum, 15), __riscv_vle32_v_i32m4(bias + i, 15), 15); + + s0 = __riscv_vfcvt_x(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m4(s0, 15), __riscv_vle32_v_f32m4(multiplier + i, 15), 15), 15); + s0 = __riscv_vadd(s0, outZp, 15); + s0 = __riscv_vmin(__riscv_vmax(s0, -128, 15), 127, 15); + __riscv_vse32_v_i32m4(dst + i, s0, 15); + } + int unroll_tail = nvecs - i; + if (unroll_tail > 0) + { + const int8_t* wptr = weights + i*wstep; + vint32m2_t + vs0 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs1 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), + vs2 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs3 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), + vs4 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs5 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), + vs6 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs7 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), + vs8 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs9 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), + vs10 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs11 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), + vs12 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max), vs13 = __riscv_vmv_v_x_i32m2(0, __cv_rvv_e32m2_max); + int avl = vecsize, vl; + for(int k = 0 ; k < vecsize; k += vl, wptr += vl, avl -= vl) + { + vl = __riscv_vsetvl_e8m1(avl); + vint8m1_t v = __riscv_vle8_v_i8m1(vec + k, vl); + + vs0 = __riscv_vwmacc_vv_i32m2(vs0, __riscv_vle8_v_i8m1(wptr, vl), v, vl); + vs1 = __riscv_vwmacc_vv_i32m2(vs1, 
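+            // In the tail, row indices are clamped with std::min(row, unroll_tail-1),
+            // so out-of-range rows just recompute the last valid row; the redundant
+            // sums are harmless because the final store writes only unroll_tail results.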
__riscv_vle8_v_i8m1(wptr + wstep*std::min(1, unroll_tail-1), vl), v, vl);
+            vs2 = __riscv_vwmacc_vv_i32m2(vs2, __riscv_vle8_v_i8m1(wptr + wstep*std::min(2, unroll_tail-1), vl), v, vl);
+            vs3 = __riscv_vwmacc_vv_i32m2(vs3, __riscv_vle8_v_i8m1(wptr + wstep*std::min(3, unroll_tail-1), vl), v, vl);
+            vs4 = __riscv_vwmacc_vv_i32m2(vs4, __riscv_vle8_v_i8m1(wptr + wstep*std::min(4, unroll_tail-1), vl), v, vl);
+            vs5 = __riscv_vwmacc_vv_i32m2(vs5, __riscv_vle8_v_i8m1(wptr + wstep*std::min(5, unroll_tail-1), vl), v, vl);
+            vs6 = __riscv_vwmacc_vv_i32m2(vs6, __riscv_vle8_v_i8m1(wptr + wstep*std::min(6, unroll_tail-1), vl), v, vl);
+            vs7 = __riscv_vwmacc_vv_i32m2(vs7, __riscv_vle8_v_i8m1(wptr + wstep*std::min(7, unroll_tail-1), vl), v, vl);
+            vs8 = __riscv_vwmacc_vv_i32m2(vs8, __riscv_vle8_v_i8m1(wptr + wstep*std::min(8, unroll_tail-1), vl), v, vl);
+            vs9 = __riscv_vwmacc_vv_i32m2(vs9, __riscv_vle8_v_i8m1(wptr + wstep*std::min(9, unroll_tail-1), vl), v, vl);
+            vs10 = __riscv_vwmacc_vv_i32m2(vs10, __riscv_vle8_v_i8m1(wptr + wstep*std::min(10, unroll_tail-1), vl), v, vl);
+            vs11 = __riscv_vwmacc_vv_i32m2(vs11, __riscv_vle8_v_i8m1(wptr + wstep*std::min(11, unroll_tail-1), vl), v, vl);
+            vs12 = __riscv_vwmacc_vv_i32m2(vs12, __riscv_vle8_v_i8m1(wptr + wstep*std::min(12, unroll_tail-1), vl), v, vl);
+            vs13 = __riscv_vwmacc_vv_i32m2(vs13, __riscv_vle8_v_i8m1(wptr + wstep*std::min(13, unroll_tail-1), vl), v, vl);
+        }
+
+        int sum[14];
+        vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, __cv_rvv_e32m2_max);
+        sum[0] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs0, zero, __cv_rvv_e32m2_max));
+        sum[1] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs1, zero, __cv_rvv_e32m2_max));
+        sum[2] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs2, zero, __cv_rvv_e32m2_max));
+        sum[3] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs3, zero, __cv_rvv_e32m2_max));
+        sum[4] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs4, zero, __cv_rvv_e32m2_max));
+        sum[5] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs5, zero, __cv_rvv_e32m2_max));
+        sum[6] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs6, zero, __cv_rvv_e32m2_max));
+        sum[7] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs7, zero, __cv_rvv_e32m2_max));
+        sum[8] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs8, zero, __cv_rvv_e32m2_max));
+        sum[9] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs9, zero, __cv_rvv_e32m2_max));
+        sum[10] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs10, zero, __cv_rvv_e32m2_max));
+        sum[11] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs11, zero, __cv_rvv_e32m2_max));
+        sum[12] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs12, zero, __cv_rvv_e32m2_max));
+        sum[13] = __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(vs13, zero, __cv_rvv_e32m2_max));
+
+        vint32m4_t s0 = __riscv_vadd(__riscv_vle32_v_i32m4(sum, unroll_tail), __riscv_vle32_v_i32m4(bias + i, unroll_tail), unroll_tail);
+
+        s0 = __riscv_vfcvt_x(__riscv_vfmul(__riscv_vfcvt_f_x_v_f32m4(s0, unroll_tail), __riscv_vle32_v_f32m4(multiplier + i, unroll_tail), unroll_tail), unroll_tail);
+        s0 = __riscv_vadd(s0, outZp, unroll_tail);
+        s0 = __riscv_vmin(__riscv_vmax(s0, -128, unroll_tail), 127, unroll_tail);
+        __riscv_vse32_v_i32m4(dst + i, s0, unroll_tail);
+    }
+}
+
+#endif // CV_RVV
+
 CV_CPU_OPTIMIZATION_NAMESPACE_END
 }} // namespace