From 9d64e2959f41f8673720012a99de55c9e4e87537 Mon Sep 17 00:00:00 2001 From: Maksim Shabunin Date: Tue, 17 Sep 2024 13:28:51 +0300 Subject: [PATCH] dnn: use dispatcher for Winograd --- modules/dnn/CMakeLists.txt | 2 +- .../layers/cpu_kernels/conv_winograd_f63.cpp | 608 +--------- .../conv_winograd_f63.dispatch.cpp | 22 + .../cpu_kernels/conv_winograd_f63.neon.cpp | 476 -------- .../cpu_kernels/conv_winograd_f63.simd.hpp | 1028 ++++++++++++++++- .../src/layers/cpu_kernels/convolution.hpp | 34 +- 6 files changed, 1059 insertions(+), 1111 deletions(-) create mode 100644 modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.dispatch.cpp delete mode 100644 modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.neon.cpp diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt index bc22696671..10d8951484 100644 --- a/modules/dnn/CMakeLists.txt +++ b/modules/dnn/CMakeLists.txt @@ -8,7 +8,7 @@ ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX RVV LASX) ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_block" AVX AVX2 NEON NEON_FP16) ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_depthwise" AVX AVX2 RVV LASX) -ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_winograd_f63" AVX AVX2 NEON_FP16) +ocv_add_dispatched_file("layers/cpu_kernels/conv_winograd_f63" AVX AVX2 NEON NEON_FP16) ocv_add_dispatched_file_force_all("layers/cpu_kernels/fast_gemm_kernels" AVX AVX2 NEON LASX) ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js) diff --git a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp index 46e220e69f..d054dfc121 100644 --- a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp +++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp @@ -12,28 +12,21 @@ #include "../../precomp.hpp" #include "convolution.hpp" -#include "conv_winograd_f63.simd.hpp" -#include "layers/cpu_kernels/conv_winograd_f63.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content - namespace cv { namespace dnn { -#if CV_NEON || CV_SIMD128 || CV_TRY_AVX2 enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment. -void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock, - const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32); - -/*Input transform*/ -void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep, - float* outptr, int Cg, const int winoIblock, const int winoAtomF32); - -/*Output transform*/ -void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep, - float bias, float minval, float maxval, bool ifMinMaxAct); - int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr& conv, int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct) { + const cv::dnn::Winofunc func = + conv->useFP16 ? cv::dnn::getWinofunc_F16() + : (conv->useAVX || conv->useAVX2 || conv->useNEON || conv->useRVV || conv->useSIMD128) ? 
cv::dnn::getWinofunc_F32() + : cv::dnn::Winofunc::empty(); + + if (!func.isGood()) + return 0; + Mat input = _input.getMat(); Mat output = _output.getMat(); Mat fusedAddMat = _fusedAddMat.getMat(); @@ -52,42 +45,10 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu int ngroups = conv->ngroups, Cg = C/ngroups, Kg = K/ngroups; const int CONV_WINO_KBLOCK = 4; -#if (CV_NEON && CV_NEON_AARCH64) - const int CONV_WINO_IBLOCK = 6; -#elif CV_TRY_AVX || CV_TRY_AVX2 - const int CONV_WINO_IBLOCK = (conv->useAVX || conv->useAVX2) ? 6 : 3; -#else - const int CONV_WINO_IBLOCK = 3; -#endif - -#if CV_TRY_AVX || CV_TRY_AVX2 - const int CONV_WINO_ATOM_F32 = (conv->useAVX || conv->useAVX2) ? 8 : 4; -#else - const int CONV_WINO_ATOM_F32 = 4; -#endif - const int CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32; // for AVX2, it is 8, otherwise, it's 16. - - int CONV_WINO_ATOM = CONV_WINO_ATOM_F32; - int CONV_WINO_NATOMS = CONV_WINO_NATOMS_F32; - -#ifdef CONV_ARM_FP16 - // FP 16 - const int CONV_WINO_ATOM_F16 = CONV_WINO_ATOM_F32 * 2; - const int CONV_WINO_NATOMS_F16 = CONV_WINO_AREA / CONV_WINO_ATOM_F16; -#endif - - int esz = sizeof(float ); - -#ifdef CONV_ARM_FP16 - const bool useFP16 = conv->useFP16; - if (useFP16) - { - // works at FP 16. - CONV_WINO_ATOM = CONV_WINO_ATOM_F16; - CONV_WINO_NATOMS = CONV_WINO_NATOMS_F16; - esz = sizeof(__fp16); - } -#endif + const int CONV_WINO_IBLOCK = func.iblock; + const int CONV_WINO_ATOM = func.natom; + const int CONV_WINO_NATOMS = CONV_WINO_AREA / CONV_WINO_ATOM; + const int esz = func.esz; int Kg_nblocks = (Kg + CONV_WINO_KBLOCK - 1)/CONV_WINO_KBLOCK; const size_t inp_planesize = (size_t)Hi*Wi; @@ -175,35 +136,7 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu inptr = inpbuf; inpstep = CONV_WINO_SIZE; } - -#if CV_TRY_AVX2 - if (conv->useAVX2) - opt_AVX2::winofunc_BtXB_8x8_F32(inptr, inpstep, (float *)inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM); - else -#endif -#if CV_TRY_AVX - if (conv->useAVX) - opt_AVX::winofunc_BtXB_8x8_F32(inptr, inpstep, (float *)inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM); - else -#endif -#if CV_NEON && CV_NEON_AARCH64 - if (conv->useNEON) - { -#ifdef CONV_ARM_FP16 - if (useFP16) - { - opt_NEON_FP16::winofunc_BtXB_8x8_F16(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, - CONV_WINO_ATOM); - } - else -#endif - opt_NEON::winofunc_BtXB_8x8_F32(inptr, inpstep, (float *)inwptr, Cg, CONV_WINO_IBLOCK, - CONV_WINO_ATOM); - } - else -#endif - winofunc_BtXB_8x8_F32(inptr, inpstep, (float *)inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM); - + func.BtXB_8x8(inptr, inpstep, (uchar*)inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM); } else { @@ -219,18 +152,20 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu // apply inverse Winograd transforms to the sums, // add bias, apply activation function if any and store the results. 
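// [editor's note] Sketch of the cv::dnn::Winofunc dispatch structure that `func` above is an instance of.
// This is inferred from its usage in this patch; the real declaration lives in convolution.hpp (also touched
// by this change). Member order, the comments, and the isGood()/empty() bodies below are assumptions added
// for illustration only, not the author's exact code:
//
//     struct Winofunc
//     {
//         // element-wise multiply-accumulate over Winograd-transformed input and weights
//         void (*accum)(const uchar* inwptr, const uchar* wptr, uchar* outbuf, int Cg, int iblock,
//                       int winoIblock, int winoKblock, int winoAtom, int winoNatom);
//         // input transform of one 8x8 tile: Y = B^T * X * B
//         void (*BtXB_8x8)(const float* inptr, int inpstep, uchar* outptr, int Cg,
//                          int winoIblock, int winoAtom);
//         // output transform of one 8x8 tile: out = A^T * M * A, plus bias, by-pass add and activation
//         void (*AtXA_8x8)(const uchar* inptr, int inpstep, float* bpptr, int bpstep,
//                          float* outptr, int outstep, float bias, float minval, float maxval,
//                          bool ifMinMaxAct);
//         int iblock;   // number of input tiles processed per kernel call
//         int natom;    // SIMD atom size (vector lanes per load/store)
//         int esz;      // element size in bytes (4 for FP32, 2 for FP16)
//
//         bool isGood() const { return accum && BtXB_8x8 && AtXA_8x8; }              // assumed
//         static Winofunc empty() { return { nullptr, nullptr, nullptr, 0, 0, 0 }; } // assumed
//     };
//
// getWinofunc_F32()/getWinofunc_F16() (see conv_winograd_f63.dispatch.cpp below) return such an instance
// for the best implementation available on the current CPU; for example, the AVX/AVX2 branch in
// conv_winograd_f63.simd.hpp returns { &impl_accum_F32, &impl_BtXB_8x8_F32, &impl_AtXA_8x8_F32,
// /*iblock*/ 6, /*natom*/ 8, /*esz*/ 4 }.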
char* wptr0 = nullptr; -#ifdef CONV_ARM_FP16 - if (useFP16) + if (esz == 2) { CV_Assert(!conv->weightsWinoBuf_FP16.empty()); wptr0 = (char *)conv->getWeightsWinoFP16(); } - else -#endif + else if (esz == 4) { CV_Assert(!conv->weightsWinoBuf.empty()); wptr0 = (char *)conv->getWeightsWino(); } + else + { + CV_Error(Error::StsError, "Impossible configuration"); + } parallel_for_(Range(0, ntasks), [&](const Range& r0) { for (int task_id = r0.start; task_id < r0.end; task_id++) @@ -271,36 +206,9 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu char* inwptr = wbuf_all + inwofs * esz; char* wptr = wptr0 + wofs * esz; -#if CV_TRY_AVX2 - if (conv->useAVX2) - opt_AVX2::winofunc_accum_F32((float *)inwptr, (float *)wptr, (float *)out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK, - CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS); - else -#endif -#if CV_TRY_AVX - if (conv->useAVX) - opt_AVX::winofunc_accum_F32((float *)inwptr, (float *)wptr, (float *)out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK, - CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS); - else -#endif -#if CV_NEON && CV_NEON_AARCH64 - if (conv->useNEON) - { -#ifdef CONV_ARM_FP16 - if (useFP16) - { - opt_NEON_FP16::winofunc_accum_F16(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK, - CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS); - } - else -#endif - opt_NEON::winofunc_accum_F32((float *)inwptr, (float *)wptr, (float *)out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK, - CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS); - } - else -#endif - winofunc_accum_F32((float *)inwptr, (float *)wptr, (float *)out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK, - CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS); + func.accum((uchar*)inwptr, (uchar*)wptr, (uchar*)out_wbuf, Cg, + block_id1 - block_id0, CONV_WINO_IBLOCK, + CONV_WINO_KBLOCK, CONV_WINO_ATOM, CONV_WINO_NATOMS); for (int k = k0; k < k1; k++) { @@ -336,37 +244,10 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu dx1*sizeof(pbptr0[0])); } } -#if CV_TRY_AVX2 - if (conv->useAVX2) - opt_AVX2::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE, - bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct); - else -#endif -#if CV_TRY_AVX - if (conv->useAVX) - opt_AVX::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE, - bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct); - else -#endif -#if CV_NEON && CV_NEON_AARCH64 - // NEON optimization is only for ARMv8 device, and for ARMv7 device, we use the Universal intrinsics. 
- if (conv->useNEON) - { -#ifdef CONV_ARM_FP16 - if (useFP16) - { - opt_NEON_FP16::winofunc_AtXA_8x8_F16(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA * esz, CONV_WINO_SIZE, - bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct); - } - else -#endif - opt_NEON::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE, - bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct); - } - else -#endif - winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE, - bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct); + + const int count = ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA; + func.AtXA_8x8((uchar*)out_wbuf + count * esz, CONV_WINO_SIZE, + bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct); if (partial) { @@ -383,441 +264,4 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu return 1; } -/****************************************************************************************\ - SIMD for winograd function -\****************************************************************************************/ - -#if CV_SIMD128 - -void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock, - const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32) -{ -#if 1 - CV_Assert(winoIblock == 3 && winoKblock == 4 && winoAtomF32 == 4); - for (int atom_id = 0; atom_id < winoNatomF32; atom_id++, - outbuf += winoAtomF32) - { - v_float32x4 s00 = v_setzero_f32(), s01 = s00, s02 = s00; - v_float32x4 s10 = v_setzero_f32(), s11 = s00, s12 = s00; - v_float32x4 s20 = v_setzero_f32(), s21 = s00, s22 = s00; - v_float32x4 s30 = v_setzero_f32(), s31 = s00, s32 = s00; - - for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32, - wptr += winoKblock*winoAtomF32) - { - v_float32x4 x0, x1, x2; - x0 = v_load(inwptr); - x1 = v_load(inwptr + 4); - x2 = v_load(inwptr + 8); - - v_float32x4 w0 = v_load(wptr); - s00 = v_fma(w0, x0, s00); - s01 = v_fma(w0, x1, s01); - s02 = v_fma(w0, x2, s02); - - w0 = v_load(wptr + 4); - s10 = v_fma(w0, x0, s10); - s11 = v_fma(w0, x1, s11); - s12 = v_fma(w0, x2, s12); - - w0 = v_load(wptr + 8); - s20 = v_fma(w0, x0, s20); - s21 = v_fma(w0, x1, s21); - s22 = v_fma(w0, x2, s22); - - w0 = v_load(wptr + 12); - s30 = v_fma(w0, x0, s30); - s31 = v_fma(w0, x1, s31); - s32 = v_fma(w0, x2, s32); - } - - v_store(outbuf, s00); - v_store(outbuf + 1*64, s01); - v_store(outbuf + 2*64, s02); - v_store(outbuf + 3*64, s10); - v_store(outbuf + 4*64, s11); - v_store(outbuf + 5*64, s12); - v_store(outbuf + 6*64, s20); - v_store(outbuf + 7*64, s21); - v_store(outbuf + 8*64, s22); - v_store(outbuf + 9*64, s30); - v_store(outbuf + 10*64, s31); - v_store(outbuf + 11*64, s32); - } -#else - // Naive C++ code, the code should never be run here. 
- for (int atom_id = 0; atom_id < winoNatomF32; - atom_id++, outbuf += winoAtomF32) - { - float sumbuf[winoIblock*winoKblock*winoAtomF32]; - memset(sumbuf, 0, sizeof(sumbuf)); - for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32, - wptr += winoKblock*winoAtomF32) - { - for (int i = 0; i < winoKblock; i++) - { - for (int j = 0; j < winoIblock; j++) - { - int i_ = i*winoAtomF32; - int j_ = j*winoAtomF32; - int ij_ = i_*winoIblock + j_; - float s0 = inwptr[j_ + 0]*wptr[i_ + 0]; - float s1 = inwptr[j_ + 1]*wptr[i_ + 1]; - float s2 = inwptr[j_ + 2]*wptr[i_ + 2]; - float s3 = inwptr[j_ + 3]*wptr[i_ + 3]; - sumbuf[ij_ + 0] += s0; - sumbuf[ij_ + 1] += s1; - sumbuf[ij_ + 2] += s2; - sumbuf[ij_ + 3] += s3; - } - } - } - for (int ij = 0; ij < winoKblock*winoIblock; ij++) - { - int ij_ = ij*winoAtomF32; - int ij_out = ij*CONV_WINO_AREA; - outbuf[ij_out + 0] = sumbuf[ij_ + 0]; - outbuf[ij_out + 1] = sumbuf[ij_ + 1]; - outbuf[ij_out + 2] = sumbuf[ij_ + 2]; - outbuf[ij_out + 3] = sumbuf[ij_ + 3]; - } - } -#endif -} - -/*Input transform*/ -void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep, - float* outptr, int Cg, const int winoIblock, const int winoAtomF32) -{ - CV_Assert(winoIblock == 3 && winoAtomF32 == 4); - v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4); - v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4); - v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4); - v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4); - v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4); - v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4); - v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4); - v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4); - - v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71; - - { - /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */ - /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */ - v_float32x4 q5_25 = v_setall_f32(5.25f), t00, t01, t10, t11; - t00 = v_sub(x40, x20); - t01 = v_sub(x41, x21); - t10 = v_sub(x30, x50); - t11 = v_sub(x31, x51); - v_float32x4 y00 = v_fma(t00, q5_25, v_sub(x00, x60)); - v_float32x4 y01 = v_fma(t01, q5_25, v_sub(x01, x61)); - v_float32x4 y70 = v_fma(t10, q5_25, v_sub(x70, x10)); - v_float32x4 y71 = v_fma(t11, q5_25, v_sub(x71, x11)); - - /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */ - /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */ - v_float32x4 qm4_25 = v_setall_f32(-4.25f); - t00 = v_fma(x30, qm4_25, v_add(x10, x50)); - t01 = v_fma(x31, qm4_25, v_add(x11, x51)); - t10 = v_fma(x40, qm4_25, v_add(x20, x60)); - t11 = v_fma(x41, qm4_25, v_add(x21, x61)); - - v_float32x4 y10 = v_add(t00, t10), y11 = v_add(t01, t11); - v_float32x4 y20 = v_sub(t10, t00), y21 = v_sub(t11, t01); - - /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */ - /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */ - v_float32x4 q0_5 = v_setall_f32(0.5f), q0_25 = v_setall_f32(0.25f); - v_float32x4 qm2_5 = v_setall_f32(-2.5f), qm1_25 = v_setall_f32(-1.25f); - t00 = v_fma(x10, q0_5, v_add(x50, x50)); - t01 = v_fma(x11, q0_5, v_add(x51, x51)); - t10 = v_fma(x20, q0_25, x60); - t11 = v_fma(x21, q0_25, x61); - t00 = v_fma(x30, qm2_5, t00); - t01 = v_fma(x31, qm2_5, t01); - t10 = v_fma(x40, qm1_25, t10); - t11 = v_fma(x41, qm1_25, t11); - - v_float32x4 y30 = 
v_add(t00, t10), y31 = v_add(t01, t11); - v_float32x4 y40 = v_sub(t10, t00), y41 = v_sub(t11, t01); - - /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */ - /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */ - v_float32x4 q4 = v_setall_f32(4.f), qm5 = v_setall_f32(-5.f); - t00 = v_fma(x50, q0_5, v_add(x10, x10)); - t01 = v_fma(x51, q0_5, v_add(x11, x11)); - t10 = v_fma(x20, q4 , x60); - t11 = v_fma(x21, q4 , x61); - t00 = v_fma(x30, qm2_5, t00); - t01 = v_fma(x31, qm2_5, t01); - t10 = v_fma(x40, qm5 , t10); - t11 = v_fma(x41, qm5 , t11); - - v_float32x4 y50 = v_add(t00, t10), y51 = v_add(t01, t11); - v_float32x4 y60 = v_sub(t10, t00), y61 = v_sub(t11, t01); - - /* transpose 8x8 matrix with v_transpose4x4 */ - - v_float32x4 y000, y100, y200, y300, y010, y110, y210, y310, y400, y500, y600, y700, y410, y510, y610, y710; - v_transpose4x4(y00, y10, y20, y30, y000, y100, y200, y300); - v_transpose4x4(y01, y11, y21, y31, y010, y110, y210, y310); - v_transpose4x4(y40, y50, y60, y70, y400, y500, y600, y700); - v_transpose4x4(y41, y51, y61, y71, y410, y510, y610, y710); - - /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */ - /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */ - t00 = v_sub(y010, y200); - t01 = v_sub(y410, y600); - t10 = v_sub(y300, y110); - t11 = v_sub(y700, y510); - z00 = v_fma(t00, q5_25, v_sub(y000, y210)); - z01 = v_fma(t01, q5_25, v_sub(y400, y610)); - z70 = v_fma(t10, q5_25, v_sub(y310, y100)); - z71 = v_fma(t11, q5_25, v_sub(y710, y500)); - - /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */ - /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */ - t00 = v_fma(y300, qm4_25, v_add(y100, y110)); - t01 = v_fma(y700, qm4_25, v_add(y500, y510)); - t10 = v_fma(y010, qm4_25, v_add(y200, y210)); - t11 = v_fma(y410, qm4_25, v_add(y600, y610)); - - z10 = v_add(t00, t10); z11 = v_add(t01, t11); - z20 = v_sub(t10, t00); z21 = v_sub(t11, t01); - - /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */ - /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */ - t00 = v_fma(y100, q0_5, v_add(y110, y110)); - t01 = v_fma(y500, q0_5, v_add(y510, y510)); - t10 = v_fma(y200, q0_25, y210); - t11 = v_fma(y600, q0_25, y610); - t00 = v_fma(y300, qm2_5, t00); - t01 = v_fma(y700, qm2_5, t01); - t10 = v_fma(y010, qm1_25, t10); - t11 = v_fma(y410, qm1_25, t11); - - z30 = v_add(t00, t10); z31 = v_add(t01, t11); - z40 = v_sub(t10, t00); z41 = v_sub(t11, t01); - - /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */ - /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */ - t00 = v_fma(y110, q0_5, v_add(y100, y100)); - t01 = v_fma(y510, q0_5, v_add(y500, y500)); - t10 = v_fma(y200, q4, y210); - t11 = v_fma(y600, q4, y610); - t00 = v_fma(y300, qm2_5, t00); - t01 = v_fma(y700, qm2_5, t01); - t10 = v_fma(y010, qm5, t10); - t11 = v_fma(y410, qm5, t11); - - z50 = v_add(t00, t10); z51 = v_add(t01, t11); - z60 = v_sub(t10, t00); z61 = v_sub(t11, t01); - } - - const int outstep = winoIblock*winoAtomF32*Cg; - - v_store(outptr, z00); - v_store(outptr + outstep, z01); - v_store(outptr + outstep*2, z10); - v_store(outptr + outstep*3, z11); - v_store(outptr + outstep*4, z20); - v_store(outptr + outstep*5, z21); - v_store(outptr + outstep*6, z30); - v_store(outptr + outstep*7, z31); - v_store(outptr + outstep*8, z40); - v_store(outptr + outstep*9, z41); - v_store(outptr + outstep*10, z50); - v_store(outptr + outstep*11, z51); - v_store(outptr + outstep*12, z60); - v_store(outptr + outstep*13, z61); - v_store(outptr + outstep*14, 
z70); - v_store(outptr + outstep*15, z71); -} - -/*Output transform*/ -/* Inverse Winograd 8x8 transform: - out = (A'*inp*A)', where - inp is input 8x8 FP32 matrix, - A' is - [1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, - 0.f, 1.f, -1.f, 2.f, -2.f, 0.5f, -0.5f, 0.f, - 0.f, 1.f, 1.f, 4.f, 4.f, 0.25f, 0.25f, 0.f, - 0.f, 1.f, -1.f, 8.f, -8.f, 0.125f, -0.125f, 0.f, - 0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f, - 0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f] - - inp is pre-loaded into xij registers, - out will be stored in zij, where (0<=i<=7 for x, 0<=i<=5 for z), 0<=j<=1. - - After the inverse transform is done, we add bias, - optionally add results from the earlier tensors (by-pass), - optionally apply activation function and then - store the final results. - - That is, after both forward and then inverse transformation, - we get non-transposed result. - Of course, for the correct work of Winograd-based convolution, - the Winograd-transformed weights should also be transposed. - init_conv() (see OpConv.fx) takes care of that. -*/ -void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep, - float* bpptr, int bpstep, float* outptr, int outstep, - float bias, float minval, float maxval, bool ifMinMaxAct) -{ - v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4); - v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4); - v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4); - v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4); - v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4); - v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4); - v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4); - v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4); - v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51; - - { - v_float32x4 s12_0, s12_1, s34_0, s34_1, s56_0, s56_1; - s12_0 = v_add(x10, x20); s12_1 = v_add(x11, x21); - s34_0 = v_add(x30, x40); s34_1 = v_add(x31, x41); - s56_0 = v_add(x50, x60); s56_1 = v_add(x51, x61); - - v_float32x4 y00 = v_add(v_add(v_add(x00, s12_0), s34_0), s56_0); - v_float32x4 y01 = v_add(v_add(v_add(x01, s12_1), s34_1), s56_1); - - v_float32x4 a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f); - v_float32x4 y20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - v_float32x4 y21 = v_fma(s56_1, a0 ,v_fma(s34_1, a1, s12_1) ); - - a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f); - v_float32x4 y40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - v_float32x4 y41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - s12_0 = v_sub(x10, x20); s12_1 = v_sub(x11, x21); - s34_0 = v_sub(x30, x40); s34_1 = v_sub(x31, x41); - s56_0 = v_sub(x50, x60); s56_1 = v_sub(x51, x61); - - a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.f); - v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, v_add(x70, s12_0))); - v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, v_add(x71, s12_1))); - - a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.f); - v_float32x4 y10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - v_float32x4 y11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.f); - v_float32x4 y30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - v_float32x4 y31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - v_float32x4 y60 = v_setall_f32(0.f), y61 = y60, y70 = y60, y71 = y60; - - /* transpose 8x8 matrix with v_transpose4x4 */ - - v_float32x4 y000, y100, 
y200, y300, y010, y110, y210, y310, y400, y500, y600, y700, y410, y510, y610, y710; - v_transpose4x4(y00, y10, y20, y30, y000, y100, y200, y300); - v_transpose4x4(y01, y11, y21, y31, y010, y110, y210, y310); - v_transpose4x4(y40, y50, y60, y70, y400, y500, y600, y700); - v_transpose4x4(y41, y51, y61, y71, y410, y510, y610, y710); - - s12_0 = v_add(y100, y200); s12_1 = v_add(y500, y600); - s34_0 = v_add(y300, y010); s34_1 = v_add(y700, y410); - s56_0 = v_add(y110, y210); s56_1 = v_add(y510, y610); - - z00 = v_add(v_add(v_add(y000, s12_0), s34_0), s56_0); - z01 = v_add(v_add(v_add(y400, s12_1), s34_1), s56_1); - - a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f); - z20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - z21 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f); - z40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - z41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - s12_0 = v_sub(y100, y200); s12_1 = v_sub(y500, y600); - s34_0 = v_sub(y300, y010); s34_1 = v_sub(y700, y410); - s56_0 = v_sub(y110, y210); s56_1 = v_sub(y510, y610); - - a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.0f); - z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, v_add(y310, s12_0))); - z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, v_add(y710, s12_1))); - a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.0f); - z10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - z11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.0f); - z30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - z31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - v_float32x4 vbias = v_setall_f32(bias); - z00 = v_add(z00, vbias); - z01 = v_add(z01, vbias); - z10 = v_add(z10, vbias); - z11 = v_add(z11, vbias); - z20 = v_add(z20, vbias); - z21 = v_add(z21, vbias); - z30 = v_add(z30, vbias); - z31 = v_add(z31, vbias); - z40 = v_add(z40, vbias); - z41 = v_add(z41, vbias); - z50 = v_add(z50, vbias); - z51 = v_add(z51, vbias); - } - - if (bpptr) - { - z00 = v_add(z00, v_load(bpptr)); - z01 = v_add(z01, v_load_low(bpptr + 4)); - z10 = v_add(z10, v_load(bpptr + bpstep)); - z11 = v_add(z11, v_load_low(bpptr + bpstep + 4)); - z20 = v_add(z20, v_load(bpptr + bpstep * 2)); - z21 = v_add(z21, v_load_low(bpptr + bpstep * 2 + 4)); - z30 = v_add(z30, v_load(bpptr + bpstep * 3)); - z31 = v_add(z31, v_load_low(bpptr + bpstep * 3 + 4)); - z40 = v_add(z40, v_load(bpptr + bpstep * 4)); - z41 = v_add(z41, v_load_low(bpptr + bpstep * 4 + 4)); - z50 = v_add(z50, v_load(bpptr + bpstep * 5)); - z51 = v_add(z51, v_load_low(bpptr + bpstep * 5 + 4)); - } - - if (ifMinMaxAct) - { - v_float32x4 vmax = v_setall_f32(maxval); - v_float32x4 vmin = v_setall_f32(minval); - - z00 = v_min(v_max(z00, vmin), vmax); - z01 = v_min(v_max(z01, vmin), vmax); - z10 = v_min(v_max(z10, vmin), vmax); - z11 = v_min(v_max(z11, vmin), vmax); - z20 = v_min(v_max(z20, vmin), vmax); - z21 = v_min(v_max(z21, vmin), vmax); - z30 = v_min(v_max(z30, vmin), vmax); - z31 = v_min(v_max(z31, vmin), vmax); - z40 = v_min(v_max(z40, vmin), vmax); - z41 = v_min(v_max(z41, vmin), vmax); - z50 = v_min(v_max(z50, vmin), vmax); - z51 = v_min(v_max(z51, vmin), vmax); - } - - v_store(outptr, z00); - v_store_low(outptr + 4, z01); - v_store(outptr + outstep, z10); - v_store_low(outptr + outstep + 4, z11); - v_store(outptr + outstep*2, z20); - v_store_low(outptr + outstep*2 + 4, z21); - v_store(outptr + outstep*3, z30); - v_store_low(outptr + outstep*3 + 4, z31); - v_store(outptr + outstep*4, z40); - v_store_low(outptr + outstep*4 + 4, z41); - 
v_store(outptr + outstep*5, z50); - v_store_low(outptr + outstep*5 + 4, z51); -} -#endif - -#else -int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr& conv, - int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct) -{ - return 0; -} -#endif - }} // namespace cv::dnn diff --git a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.dispatch.cpp b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.dispatch.cpp new file mode 100644 index 0000000000..d927a8ab77 --- /dev/null +++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.dispatch.cpp @@ -0,0 +1,22 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "convolution.hpp" +#include "conv_winograd_f63.simd.hpp" +#include "layers/cpu_kernels/conv_winograd_f63.simd_declarations.hpp" + +namespace cv { +namespace dnn { + +cv::dnn::Winofunc getWinofunc_F32() +{ + CV_CPU_DISPATCH(getWinofunc_F32, (), CV_CPU_DISPATCH_MODES_ALL); +} + +cv::dnn::Winofunc getWinofunc_F16() +{ + CV_CPU_DISPATCH(getWinofunc_F16, (), CV_CPU_DISPATCH_MODES_ALL); +} + +}} // namespace cv::dnn:: diff --git a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.neon.cpp b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.neon.cpp deleted file mode 100644 index 70b716f9c7..0000000000 --- a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.neon.cpp +++ /dev/null @@ -1,476 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -#include "../../precomp.hpp" -#include "convolution.hpp" -#include "opencv2/core/hal/intrin.hpp" - -namespace cv { -namespace dnn { - -// NEON code work around. 
-namespace opt_NEON -{ - -#if CV_NEON && CV_NEON_AARCH64 - -/* Accumulate */ -void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock, - const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32) -{ - CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF32 == 4); - if (iblock > 3) - { - for (int atom_id = 0; atom_id < winoNatomF32; atom_id++, - outbuf += winoAtomF32) - { - float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00; - float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00; - float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00; - float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00; - for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32, - wptr += winoKblock*winoAtomF32) { - float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4); - float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12); - float32x4_t x0, x1; - x0 = vld1q_f32(inwptr); - x1 = vld1q_f32(inwptr + 4); - s00 = vfmaq_f32(s00, w0, x0); - s01 = vfmaq_f32(s01, w0, x1); - s10 = vfmaq_f32(s10, w1, x0); - s11 = vfmaq_f32(s11, w1, x1); - s20 = vfmaq_f32(s20, w2, x0); - s21 = vfmaq_f32(s21, w2, x1); - s30 = vfmaq_f32(s30, w3, x0); - s31 = vfmaq_f32(s31, w3, x1); - x0 = vld1q_f32(inwptr + 8); - x1 = vld1q_f32(inwptr + 12); - s02 = vfmaq_f32(s02, w0, x0); - s03 = vfmaq_f32(s03, w0, x1); - s12 = vfmaq_f32(s12, w1, x0); - s13 = vfmaq_f32(s13, w1, x1); - s22 = vfmaq_f32(s22, w2, x0); - s23 = vfmaq_f32(s23, w2, x1); - s32 = vfmaq_f32(s32, w3, x0); - s33 = vfmaq_f32(s33, w3, x1); - x0 = vld1q_f32(inwptr + 16); - x1 = vld1q_f32(inwptr + 20); - s04 = vfmaq_f32(s04, w0, x0); - s05 = vfmaq_f32(s05, w0, x1); - s14 = vfmaq_f32(s14, w1, x0); - s15 = vfmaq_f32(s15, w1, x1); - s24 = vfmaq_f32(s24, w2, x0); - s25 = vfmaq_f32(s25, w2, x1); - s34 = vfmaq_f32(s34, w3, x0); - s35 = vfmaq_f32(s35, w3, x1); - } - - vst1q_f32(outbuf, s00); - vst1q_f32(outbuf + 1*64, s01); - vst1q_f32(outbuf + 2*64, s02); - vst1q_f32(outbuf + 3*64, s03); - vst1q_f32(outbuf + 4*64, s04); - vst1q_f32(outbuf + 5*64, s05); - - vst1q_f32(outbuf + 6*64, s10); - vst1q_f32(outbuf + 7*64, s11); - vst1q_f32(outbuf + 8*64, s12); - vst1q_f32(outbuf + 9*64, s13); - vst1q_f32(outbuf + 10*64, s14); - vst1q_f32(outbuf + 11*64, s15); - - vst1q_f32(outbuf + 12*64, s20); - vst1q_f32(outbuf + 13*64, s21); - vst1q_f32(outbuf + 14*64, s22); - vst1q_f32(outbuf + 15*64, s23); - vst1q_f32(outbuf + 16*64, s24); - vst1q_f32(outbuf + 17*64, s25); - - vst1q_f32(outbuf + 18*64, s30); - vst1q_f32(outbuf + 19*64, s31); - vst1q_f32(outbuf + 20*64, s32); - vst1q_f32(outbuf + 21*64, s33); - vst1q_f32(outbuf + 22*64, s34); - vst1q_f32(outbuf + 23*64, s35); - } - } - else - { - for (int atom_id = 0; atom_id < winoNatomF32; atom_id++, - outbuf += winoAtomF32) - { - float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00; - float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00; - float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00; - float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00; - for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32, - wptr += winoKblock*winoAtomF32) { - float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4); - float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12); - float32x4_t x0, x1, x2; - x0 = vld1q_f32(inwptr); - x1 = vld1q_f32(inwptr + 4); - x2 = vld1q_f32(inwptr + 8); - s00 = vfmaq_f32(s00, w0, x0); 
- s01 = vfmaq_f32(s01, w0, x1); - s02 = vfmaq_f32(s02, w0, x2); - s10 = vfmaq_f32(s10, w1, x0); - s11 = vfmaq_f32(s11, w1, x1); - s12 = vfmaq_f32(s12, w1, x2); - s20 = vfmaq_f32(s20, w2, x0); - s21 = vfmaq_f32(s21, w2, x1); - s22 = vfmaq_f32(s22, w2, x2); - s30 = vfmaq_f32(s30, w3, x0); - s31 = vfmaq_f32(s31, w3, x1); - s32 = vfmaq_f32(s32, w3, x2); - } - - vst1q_f32(outbuf, s00); - vst1q_f32(outbuf + 1*64, s01); - vst1q_f32(outbuf + 2*64, s02); - vst1q_f32(outbuf + 6*64, s10); - vst1q_f32(outbuf + 7*64, s11); - vst1q_f32(outbuf + 8*64, s12); - vst1q_f32(outbuf + 12*64, s20); - vst1q_f32(outbuf + 13*64, s21); - vst1q_f32(outbuf + 14*64, s22); - vst1q_f32(outbuf + 18*64, s30); - vst1q_f32(outbuf + 19*64, s31); - vst1q_f32(outbuf + 20*64, s32); - } - } -} - -#undef T4x4 -#define T4x4(a, b, c, d, tr0, tr1) \ - tr0 = vtrnq_f32(a, b); \ - tr1 = vtrnq_f32(c, d); \ - a = vcombine_f32(vget_low_f32(tr0.val[0]), vget_low_f32(tr1.val[0])); \ - b = vcombine_f32(vget_low_f32(tr0.val[1]), vget_low_f32(tr1.val[1])); \ - c = vcombine_f32(vget_high_f32(tr0.val[0]), vget_high_f32(tr1.val[0])); \ - d = vcombine_f32(vget_high_f32(tr0.val[1]), vget_high_f32(tr1.val[1])) - -/*Input transform*/ -void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep, - float* outptr, int Cg, const int winoIblock, const int winoAtomF32) -{ - float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4); - float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4); - float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4); - float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4); - float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4); - float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4); - float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4); - float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4); - - float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71; - - { - /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */ - /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */ - float32x4_t q5_25 = vdupq_n_f32(5.25f), t00, t01, t10, t11; - t00 = vsubq_f32(x40, x20); - t01 = vsubq_f32(x41, x21); - t10 = vsubq_f32(x30, x50); - t11 = vsubq_f32(x31, x51); - float32x4_t y00 = vfmaq_f32(vsubq_f32(x00, x60), t00, q5_25); - float32x4_t y01 = vfmaq_f32(vsubq_f32(x01, x61), t01, q5_25); - float32x4_t y70 = vfmaq_f32(vsubq_f32(x70, x10), t10, q5_25); - float32x4_t y71 = vfmaq_f32(vsubq_f32(x71, x11), t11, q5_25); - - /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */ - /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */ - float32x4_t qm4_25 = vdupq_n_f32(-4.25f); - t00 = vfmaq_f32(vaddq_f32(x10, x50), x30, qm4_25); - t01 = vfmaq_f32(vaddq_f32(x11, x51), x31, qm4_25); - t10 = vfmaq_f32(vaddq_f32(x20, x60), x40, qm4_25); - t11 = vfmaq_f32(vaddq_f32(x21, x61), x41, qm4_25); - - float32x4_t y10 = vaddq_f32(t00, t10), y11 = vaddq_f32(t01, t11); - float32x4_t y20 = vsubq_f32(t10, t00), y21 = vsubq_f32(t11, t01); - - /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */ - /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */ - float32x4_t q0_5 = vdupq_n_f32(0.5f), q0_25 = vdupq_n_f32(0.25f); - float32x4_t qm2_5 = vdupq_n_f32(-2.5f), qm1_25 = vdupq_n_f32(-1.25f); - t00 = vfmaq_f32(vaddq_f32(x50, x50), x10, q0_5); - t01 = 
vfmaq_f32(vaddq_f32(x51, x51), x11, q0_5); - t10 = vfmaq_f32(x60, x20, q0_25); - t11 = vfmaq_f32(x61, x21, q0_25); - t00 = vfmaq_f32(t00, x30, qm2_5); - t01 = vfmaq_f32(t01, x31, qm2_5); - t10 = vfmaq_f32(t10, x40, qm1_25); - t11 = vfmaq_f32(t11, x41, qm1_25); - - float32x4_t y30 = vaddq_f32(t00, t10), y31 = vaddq_f32(t01, t11); - float32x4_t y40 = vsubq_f32(t10, t00), y41 = vsubq_f32(t11, t01); - - /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */ - /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */ - float32x4_t q4 = vdupq_n_f32(4.f), qm5 = vdupq_n_f32(-5.f); - t00 = vfmaq_f32(vaddq_f32(x10, x10), x50, q0_5); - t01 = vfmaq_f32(vaddq_f32(x11, x11), x51, q0_5); - t10 = vfmaq_f32(x60, x20, q4); - t11 = vfmaq_f32(x61, x21, q4); - t00 = vfmaq_f32(t00, x30, qm2_5); - t01 = vfmaq_f32(t01, x31, qm2_5); - t10 = vfmaq_f32(t10, x40, qm5); - t11 = vfmaq_f32(t11, x41, qm5); - - float32x4_t y50 = vaddq_f32(t00, t10), y51 = vaddq_f32(t01, t11); - float32x4_t y60 = vsubq_f32(t10, t00), y61 = vsubq_f32(t11, t01); - - /* transpose 8x8 matrix in-place with some renumeration of the elements: */ - /* Y: */ - /* y00 y01 */ - /* y10 y11 */ - /* ... */ - /* y70 y71 */ - /* Y': */ - /* y00 y40 */ - /* y10 y50 */ - /* y20 y60 */ - /* y30 y70 */ - /* y01 y41 */ - /* y11 y51 */ - /* y21 y61 */ - /* y31 y71 */ - /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */ - float32x4x2_t tr0, tr1; - - T4x4(y00, y10, y20, y30, tr0, tr1); - T4x4(y01, y11, y21, y31, tr0, tr1); - T4x4(y40, y50, y60, y70, tr0, tr1); - T4x4(y41, y51, y61, y71, tr0, tr1); - - /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */ - /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */ - t00 = vsubq_f32(y01, y20); - t01 = vsubq_f32(y41, y60); - t10 = vsubq_f32(y30, y11); - t11 = vsubq_f32(y70, y51); - z00 = vfmaq_f32(vsubq_f32(y00, y21), t00, q5_25); - z01 = vfmaq_f32(vsubq_f32(y40, y61), t01, q5_25); - z70 = vfmaq_f32(vsubq_f32(y31, y10), t10, q5_25); - z71 = vfmaq_f32(vsubq_f32(y71, y50), t11, q5_25); - - /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */ - /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */ - t00 = vfmaq_f32(vaddq_f32(y10, y11), y30, qm4_25); - t01 = vfmaq_f32(vaddq_f32(y50, y51), y70, qm4_25); - t10 = vfmaq_f32(vaddq_f32(y20, y21), y01, qm4_25); - t11 = vfmaq_f32(vaddq_f32(y60, y61), y41, qm4_25); - - z10 = vaddq_f32(t00, t10); z11 = vaddq_f32(t01, t11); - z20 = vsubq_f32(t10, t00); z21 = vsubq_f32(t11, t01); - - /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */ - /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */ - t00 = vfmaq_f32(vaddq_f32(y11, y11), y10, q0_5); - t01 = vfmaq_f32(vaddq_f32(y51, y51), y50, q0_5); - t10 = vfmaq_f32(y21, y20, q0_25); - t11 = vfmaq_f32(y61, y60, q0_25); - t00 = vfmaq_f32(t00, y30, qm2_5); - t01 = vfmaq_f32(t01, y70, qm2_5); - t10 = vfmaq_f32(t10, y01, qm1_25); - t11 = vfmaq_f32(t11, y41, qm1_25); - - z30 = vaddq_f32(t00, t10); z31 = vaddq_f32(t01, t11); - z40 = vsubq_f32(t10, t00); z41 = vsubq_f32(t11, t01); - - /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */ - /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */ - t00 = vfmaq_f32(vaddq_f32(y10, y10), y11, q0_5); - t01 = vfmaq_f32(vaddq_f32(y50, y50), y51, q0_5); - t10 = vfmaq_f32(y21, y20, q4); - t11 = vfmaq_f32(y61, y60, q4); - t00 = vfmaq_f32(t00, y30, qm2_5); - t01 = vfmaq_f32(t01, y70, qm2_5); - t10 = vfmaq_f32(t10, y01, qm5); - t11 = vfmaq_f32(t11, y41, qm5); - - z50 = vaddq_f32(t00, t10); z51 = vaddq_f32(t01, t11); - 
z60 = vsubq_f32(t10, t00); z61 = vsubq_f32(t11, t01); - } - - const int outstep = winoIblock*winoAtomF32*Cg; - - vst1q_f32(outptr, z00); - vst1q_f32(outptr + outstep, z01); - vst1q_f32(outptr + outstep*2, z10); - vst1q_f32(outptr + outstep*3, z11); - vst1q_f32(outptr + outstep*4, z20); - vst1q_f32(outptr + outstep*5, z21); - vst1q_f32(outptr + outstep*6, z30); - vst1q_f32(outptr + outstep*7, z31); - vst1q_f32(outptr + outstep*8, z40); - vst1q_f32(outptr + outstep*9, z41); - vst1q_f32(outptr + outstep*10, z50); - vst1q_f32(outptr + outstep*11, z51); - vst1q_f32(outptr + outstep*12, z60); - vst1q_f32(outptr + outstep*13, z61); - vst1q_f32(outptr + outstep*14, z70); - vst1q_f32(outptr + outstep*15, z71); -} - -/*Output transform*/ -void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep, - float* bpptr, int bpstep, float* outptr, int outstep, - float bias, float minval, float maxval, bool ifMinMaxAct) -{ - float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4); - float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4); - float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4); - float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4); - float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4); - float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4); - float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4); - float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4); - float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51; - - { - float32x4_t s12_0, s12_1, s34_0, s34_1, s56_0, s56_1; - s12_0 = vaddq_f32(x10, x20); s12_1 = vaddq_f32(x11, x21); - s34_0 = vaddq_f32(x30, x40); s34_1 = vaddq_f32(x31, x41); - s56_0 = vaddq_f32(x50, x60); s56_1 = vaddq_f32(x51, x61); - - float32x4_t y00 = vaddq_f32(vaddq_f32(vaddq_f32(x00, s12_0), s34_0), s56_0); - float32x4_t y01 = vaddq_f32(vaddq_f32(vaddq_f32(x01, s12_1), s34_1), s56_1); - float32x4_t y20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f); - float32x4_t y21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f); - float32x4_t y40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16); - float32x4_t y41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16); - - s12_0 = vsubq_f32(x10, x20); s12_1 = vsubq_f32(x11, x21); - s34_0 = vsubq_f32(x30, x40); s34_1 = vsubq_f32(x31, x41); - s56_0 = vsubq_f32(x50, x60); s56_1 = vsubq_f32(x51, x61); - - float32x4_t y50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x70, s12_0), - s34_0, 32.f), s56_0, 1.f/32); - float32x4_t y51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x71, s12_1), - s34_1, 32.f), s56_1, 1.f/32); - float32x4_t y10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f); - float32x4_t y11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f); - float32x4_t y30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f); - float32x4_t y31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f); - float32x4_t y60 = vdupq_n_f32(0.f), y61 = y60, y70 = y60, y71 = y60; - - /* transpose 8x8 matrix in-place with some renumeration of the elements: */ - /* Y: */ - /* y00 y01 */ - /* y10 y11 */ - /* ... 
*/ - /* y50 y51 */ - /* 0 0 */ - /* 0 0 */ - /* Y': */ - /* y00 y40 */ - /* y10 y50 */ - /* y20 y60 */ - /* y30 y70 */ - /* y01 y41 */ - /* y11 y51 */ - /* y21 y61 */ - /* y31 y71 */ - /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */ - float32x4x2_t tr0, tr1; - - T4x4(y00, y10, y20, y30, tr0, tr1); - T4x4(y01, y11, y21, y31, tr0, tr1); - T4x4(y40, y50, y60, y70, tr0, tr1); - T4x4(y41, y51, y61, y71, tr0, tr1); - - s12_0 = vaddq_f32(y10, y20); s12_1 = vaddq_f32(y50, y60); - s34_0 = vaddq_f32(y30, y01); s34_1 = vaddq_f32(y70, y41); - s56_0 = vaddq_f32(y11, y21); s56_1 = vaddq_f32(y51, y61); - - z00 = vaddq_f32(vaddq_f32(vaddq_f32(y00, s12_0), s34_0), s56_0); - z01 = vaddq_f32(vaddq_f32(vaddq_f32(y40, s12_1), s34_1), s56_1); - z20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f); - z21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f); - z40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16); - z41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16); - - s12_0 = vsubq_f32(y10, y20); s12_1 = vsubq_f32(y50, y60); - s34_0 = vsubq_f32(y30, y01); s34_1 = vsubq_f32(y70, y41); - s56_0 = vsubq_f32(y11, y21); s56_1 = vsubq_f32(y51, y61); - - z50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y31, s12_0), - s34_0, 32.f), s56_0, 1.f/32); - z51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y71, s12_1), - s34_1, 32.f), s56_1, 1.f/32); - z10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f); - z11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f); - z30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f); - z31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f); - float32x4_t vbias = vdupq_n_f32(bias); - - z00 = vaddq_f32(z00, vbias); - z01 = vaddq_f32(z01, vbias); - z10 = vaddq_f32(z10, vbias); - z11 = vaddq_f32(z11, vbias); - z20 = vaddq_f32(z20, vbias); - z21 = vaddq_f32(z21, vbias); - z30 = vaddq_f32(z30, vbias); - z31 = vaddq_f32(z31, vbias); - z40 = vaddq_f32(z40, vbias); - z41 = vaddq_f32(z41, vbias); - z50 = vaddq_f32(z50, vbias); - z51 = vaddq_f32(z51, vbias); - } - - if (bpptr) - { - float32x2_t zhalf = vdup_n_f32(0.f); - z00 = vaddq_f32(z00, vld1q_f32(bpptr)); - z01 = vaddq_f32(z01, vcombine_f32(vld1_f32(bpptr + 4), zhalf)); - z10 = vaddq_f32(z10, vld1q_f32(bpptr + bpstep)); - z11 = vaddq_f32(z11, vcombine_f32(vld1_f32(bpptr + bpstep + 4), zhalf)); - z20 = vaddq_f32(z20, vld1q_f32(bpptr + bpstep*2)); - z21 = vaddq_f32(z21, vcombine_f32(vld1_f32(bpptr + bpstep*2 + 4), zhalf)); - z30 = vaddq_f32(z30, vld1q_f32(bpptr + bpstep*3)); - z31 = vaddq_f32(z31, vcombine_f32(vld1_f32(bpptr + bpstep*3 + 4), zhalf)); - z40 = vaddq_f32(z40, vld1q_f32(bpptr + bpstep*4)); - z41 = vaddq_f32(z41, vcombine_f32(vld1_f32(bpptr + bpstep*4 + 4), zhalf)); - z50 = vaddq_f32(z50, vld1q_f32(bpptr + bpstep*5)); - z51 = vaddq_f32(z51, vcombine_f32(vld1_f32(bpptr + bpstep*5 + 4), zhalf)); - } - - if (ifMinMaxAct) - { - float32x4_t vmax = vdupq_n_f32(maxval); - float32x4_t vmin = vdupq_n_f32(minval); - - z00 = vminq_f32(vmaxq_f32(z00, vmin), vmax); - z01 = vminq_f32(vmaxq_f32(z01, vmin), vmax); - z10 = vminq_f32(vmaxq_f32(z10, vmin), vmax); - z11 = vminq_f32(vmaxq_f32(z11, vmin), vmax); - z20 = vminq_f32(vmaxq_f32(z20, vmin), vmax); - z21 = vminq_f32(vmaxq_f32(z21, vmin), vmax); - z30 = vminq_f32(vmaxq_f32(z30, vmin), vmax); - z31 = vminq_f32(vmaxq_f32(z31, vmin), vmax); - z40 = vminq_f32(vmaxq_f32(z40, vmin), vmax); - z41 = vminq_f32(vmaxq_f32(z41, vmin), vmax); - z50 = vminq_f32(vmaxq_f32(z50, vmin), vmax); - z51 = 
vminq_f32(vmaxq_f32(z51, vmin), vmax); - } - - vst1q_f32(outptr, z00); - vst1_f32(outptr + 4, vget_low_f32(z01)); - vst1q_f32(outptr + outstep, z10); - vst1_f32(outptr + outstep + 4, vget_low_f32(z11)); - vst1q_f32(outptr + outstep*2, z20); - vst1_f32(outptr + outstep*2 + 4, vget_low_f32(z21)); - vst1q_f32(outptr + outstep*3, z30); - vst1_f32(outptr + outstep*3 + 4, vget_low_f32(z31)); - vst1q_f32(outptr + outstep*4, z40); - vst1_f32(outptr + outstep*4 + 4, vget_low_f32(z41)); - vst1q_f32(outptr + outstep*5, z50); - vst1_f32(outptr + outstep*5 + 4, vget_low_f32(z51)); -} - -#endif -} - -}} // namespace diff --git a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp index e44d0f8004..cea726a374 100644 --- a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp +++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp @@ -3,45 +3,44 @@ // of this distribution and at http://opencv.org/license.html. #include "opencv2/core/hal/intrin.hpp" +#include "convolution.hpp" + +// === dispatched calls (implemented here) namespace cv { namespace dnn { CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN -/* Accumulate */ -void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock, - const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32); +cv::dnn::Winofunc getWinofunc_F32(); +cv::dnn::Winofunc getWinofunc_F16(); -/*Input transform*/ -void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep, - float* outptr, int Cg, const int winoIblock, const int winoAtomF32); +CV_CPU_OPTIMIZATION_NAMESPACE_END +}} // cv::dnn:: -/*Output transform*/ -void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep, - float* bpptr, int bpstep, float* outptr, int outstep, - float bias, float minval, float maxval, bool ifMinMaxAct); - -// FP 16 branch, only ARMv8 supports. 
-void winofunc_accum_F16(const char* _inwptr, const char* _wptr, char* _outbuf, int Cg, int iblock, - const int winoIblock, const int winoKblock, const int winoAtomF16, const int winoNatomF16); -void winofunc_BtXB_8x8_F16(const float * inptr, int inpstep, - char * _outptr, int Cg, const int winoIblock, const int winoAtomF16); -void winofunc_AtXA_8x8_F16(const char* inptr, int inpstep, - float * bpptr, int bpstep, float* outptr, int outstep, - float bias, float minval, float maxval, bool ifMinMaxAct); +// === implementation + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +namespace cv { +namespace dnn { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN -#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) -#if CV_AVX +#if defined(CV_CPU_COMPILE_AVX) && CV_CPU_COMPILE_AVX || defined(CV_CPU_COMPILE_AVX2) && CV_CPU_COMPILE_AVX2 + #if !CV_FMA3 // AVX workaround #undef _mm256_fmadd_ps #define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) #endif -void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock, +void impl_accum_F32(const uchar* inwptr_, const uchar* wptr_, uchar* outbuf_, int Cg, int iblock, const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32) { + const float * inwptr = (float*)inwptr_; + const float * wptr = (float*)wptr_; + float * outbuf = (float*)outbuf_; + CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF32 == 8); if (iblock > 3) { @@ -166,6 +165,7 @@ void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, i } _mm256_zeroupper(); } + static inline void transpose8_ps(__m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3, __m256 &row4, __m256 &row5, __m256 &row6, __m256 &row7) { @@ -198,9 +198,11 @@ void transpose8_ps(__m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3, __m25 } /*Input transform*/ -void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep, - float* outptr, int Cg, const int winoIblock, const int winoAtomF32) +void impl_BtXB_8x8_F32(const float* inptr, int inpstep, + uchar* outptr_, int Cg, const int winoIblock, const int winoAtomF32) { + float * outptr = (float*)outptr_; + __m256 x00 = _mm256_loadu_ps(inptr); __m256 x10 = _mm256_loadu_ps(inptr + inpstep); __m256 x20 = _mm256_loadu_ps(inptr + inpstep*2); @@ -322,10 +324,11 @@ void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep, 0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f, 0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f] */ -void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep, +void impl_AtXA_8x8_F32(const uchar* inptr_, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep, float bias, float minval, float maxval, bool ifMinMaxAct) { + const float * inptr = (float*)inptr_; __m256 x00 = _mm256_load_ps(inptr); __m256 x10 = _mm256_load_ps(inptr + inpstep); @@ -417,10 +420,940 @@ void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep, _mm256_zeroupper(); } -#endif // CV_AVX +cv::dnn::Winofunc getWinofunc_F32() +{ + return {&impl_accum_F32, &impl_BtXB_8x8_F32, &impl_AtXA_8x8_F32, 6, 8, 4}; +} + + +// end of AVX/AVX2 +#elif defined(CV_CPU_COMPILE_NEON) && CV_CPU_COMPILE_NEON && defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 + + +/* Accumulate */ +void impl_accum_F32(const uchar* inwptr_, const uchar* wptr_, uchar* outbuf_, int Cg, int iblock, + const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32) +{ + const float * inwptr = (float*)inwptr_; + const float * wptr = (float*)wptr_; + float * outbuf = (float*)outbuf_; + + CV_Assert(winoIblock 
== 6 && winoKblock == 4 && winoAtomF32 == 4); + if (iblock > 3) + { + for (int atom_id = 0; atom_id < winoNatomF32; atom_id++, + outbuf += winoAtomF32) + { + float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00; + float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00; + float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00; + float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00; + for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32, + wptr += winoKblock*winoAtomF32) { + float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4); + float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12); + float32x4_t x0, x1; + x0 = vld1q_f32(inwptr); + x1 = vld1q_f32(inwptr + 4); + s00 = vfmaq_f32(s00, w0, x0); + s01 = vfmaq_f32(s01, w0, x1); + s10 = vfmaq_f32(s10, w1, x0); + s11 = vfmaq_f32(s11, w1, x1); + s20 = vfmaq_f32(s20, w2, x0); + s21 = vfmaq_f32(s21, w2, x1); + s30 = vfmaq_f32(s30, w3, x0); + s31 = vfmaq_f32(s31, w3, x1); + x0 = vld1q_f32(inwptr + 8); + x1 = vld1q_f32(inwptr + 12); + s02 = vfmaq_f32(s02, w0, x0); + s03 = vfmaq_f32(s03, w0, x1); + s12 = vfmaq_f32(s12, w1, x0); + s13 = vfmaq_f32(s13, w1, x1); + s22 = vfmaq_f32(s22, w2, x0); + s23 = vfmaq_f32(s23, w2, x1); + s32 = vfmaq_f32(s32, w3, x0); + s33 = vfmaq_f32(s33, w3, x1); + x0 = vld1q_f32(inwptr + 16); + x1 = vld1q_f32(inwptr + 20); + s04 = vfmaq_f32(s04, w0, x0); + s05 = vfmaq_f32(s05, w0, x1); + s14 = vfmaq_f32(s14, w1, x0); + s15 = vfmaq_f32(s15, w1, x1); + s24 = vfmaq_f32(s24, w2, x0); + s25 = vfmaq_f32(s25, w2, x1); + s34 = vfmaq_f32(s34, w3, x0); + s35 = vfmaq_f32(s35, w3, x1); + } + + vst1q_f32(outbuf, s00); + vst1q_f32(outbuf + 1*64, s01); + vst1q_f32(outbuf + 2*64, s02); + vst1q_f32(outbuf + 3*64, s03); + vst1q_f32(outbuf + 4*64, s04); + vst1q_f32(outbuf + 5*64, s05); + + vst1q_f32(outbuf + 6*64, s10); + vst1q_f32(outbuf + 7*64, s11); + vst1q_f32(outbuf + 8*64, s12); + vst1q_f32(outbuf + 9*64, s13); + vst1q_f32(outbuf + 10*64, s14); + vst1q_f32(outbuf + 11*64, s15); + + vst1q_f32(outbuf + 12*64, s20); + vst1q_f32(outbuf + 13*64, s21); + vst1q_f32(outbuf + 14*64, s22); + vst1q_f32(outbuf + 15*64, s23); + vst1q_f32(outbuf + 16*64, s24); + vst1q_f32(outbuf + 17*64, s25); + + vst1q_f32(outbuf + 18*64, s30); + vst1q_f32(outbuf + 19*64, s31); + vst1q_f32(outbuf + 20*64, s32); + vst1q_f32(outbuf + 21*64, s33); + vst1q_f32(outbuf + 22*64, s34); + vst1q_f32(outbuf + 23*64, s35); + } + } + else + { + for (int atom_id = 0; atom_id < winoNatomF32; atom_id++, + outbuf += winoAtomF32) + { + float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00; + float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00; + float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00; + float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00; + for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32, + wptr += winoKblock*winoAtomF32) { + float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4); + float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12); + float32x4_t x0, x1, x2; + x0 = vld1q_f32(inwptr); + x1 = vld1q_f32(inwptr + 4); + x2 = vld1q_f32(inwptr + 8); + s00 = vfmaq_f32(s00, w0, x0); + s01 = vfmaq_f32(s01, w0, x1); + s02 = vfmaq_f32(s02, w0, x2); + s10 = vfmaq_f32(s10, w1, x0); + s11 = vfmaq_f32(s11, w1, x1); + s12 = vfmaq_f32(s12, w1, x2); + s20 = vfmaq_f32(s20, w2, x0); + s21 = vfmaq_f32(s21, w2, x1); + s22 = vfmaq_f32(s22, w2, x2); + s30 = vfmaq_f32(s30, w3, x0); + s31 = 
vfmaq_f32(s31, w3, x1); + s32 = vfmaq_f32(s32, w3, x2); + } + + vst1q_f32(outbuf, s00); + vst1q_f32(outbuf + 1*64, s01); + vst1q_f32(outbuf + 2*64, s02); + vst1q_f32(outbuf + 6*64, s10); + vst1q_f32(outbuf + 7*64, s11); + vst1q_f32(outbuf + 8*64, s12); + vst1q_f32(outbuf + 12*64, s20); + vst1q_f32(outbuf + 13*64, s21); + vst1q_f32(outbuf + 14*64, s22); + vst1q_f32(outbuf + 18*64, s30); + vst1q_f32(outbuf + 19*64, s31); + vst1q_f32(outbuf + 20*64, s32); + } + } +} + +#undef T4x4 +#define T4x4(a, b, c, d, tr0, tr1) \ + tr0 = vtrnq_f32(a, b); \ + tr1 = vtrnq_f32(c, d); \ + a = vcombine_f32(vget_low_f32(tr0.val[0]), vget_low_f32(tr1.val[0])); \ + b = vcombine_f32(vget_low_f32(tr0.val[1]), vget_low_f32(tr1.val[1])); \ + c = vcombine_f32(vget_high_f32(tr0.val[0]), vget_high_f32(tr1.val[0])); \ + d = vcombine_f32(vget_high_f32(tr0.val[1]), vget_high_f32(tr1.val[1])) + +/*Input transform*/ +void impl_BtXB_8x8_F32(const float* inptr, int inpstep, + uchar* outptr_, int Cg, const int winoIblock, const int winoAtomF32) +{ + float * outptr = (float*)outptr_; + + float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4); + float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4); + float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4); + float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4); + float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4); + float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4); + float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4); + float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4); + + float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71; + + { + /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */ + /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */ + float32x4_t q5_25 = vdupq_n_f32(5.25f), t00, t01, t10, t11; + t00 = vsubq_f32(x40, x20); + t01 = vsubq_f32(x41, x21); + t10 = vsubq_f32(x30, x50); + t11 = vsubq_f32(x31, x51); + float32x4_t y00 = vfmaq_f32(vsubq_f32(x00, x60), t00, q5_25); + float32x4_t y01 = vfmaq_f32(vsubq_f32(x01, x61), t01, q5_25); + float32x4_t y70 = vfmaq_f32(vsubq_f32(x70, x10), t10, q5_25); + float32x4_t y71 = vfmaq_f32(vsubq_f32(x71, x11), t11, q5_25); + + /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */ + /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */ + float32x4_t qm4_25 = vdupq_n_f32(-4.25f); + t00 = vfmaq_f32(vaddq_f32(x10, x50), x30, qm4_25); + t01 = vfmaq_f32(vaddq_f32(x11, x51), x31, qm4_25); + t10 = vfmaq_f32(vaddq_f32(x20, x60), x40, qm4_25); + t11 = vfmaq_f32(vaddq_f32(x21, x61), x41, qm4_25); + + float32x4_t y10 = vaddq_f32(t00, t10), y11 = vaddq_f32(t01, t11); + float32x4_t y20 = vsubq_f32(t10, t00), y21 = vsubq_f32(t11, t01); + + /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */ + /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */ + float32x4_t q0_5 = vdupq_n_f32(0.5f), q0_25 = vdupq_n_f32(0.25f); + float32x4_t qm2_5 = vdupq_n_f32(-2.5f), qm1_25 = vdupq_n_f32(-1.25f); + t00 = vfmaq_f32(vaddq_f32(x50, x50), x10, q0_5); + t01 = vfmaq_f32(vaddq_f32(x51, x51), x11, q0_5); + t10 = vfmaq_f32(x60, x20, q0_25); + t11 = vfmaq_f32(x61, x21, q0_25); + t00 = vfmaq_f32(t00, x30, qm2_5); + t01 = vfmaq_f32(t01, x31, qm2_5); + t10 = vfmaq_f32(t10, x40, qm1_25); + t11 = vfmaq_f32(t11, x41, qm1_25); + + 
float32x4_t y30 = vaddq_f32(t00, t10), y31 = vaddq_f32(t01, t11); + float32x4_t y40 = vsubq_f32(t10, t00), y41 = vsubq_f32(t11, t01); + + /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */ + /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */ + float32x4_t q4 = vdupq_n_f32(4.f), qm5 = vdupq_n_f32(-5.f); + t00 = vfmaq_f32(vaddq_f32(x10, x10), x50, q0_5); + t01 = vfmaq_f32(vaddq_f32(x11, x11), x51, q0_5); + t10 = vfmaq_f32(x60, x20, q4); + t11 = vfmaq_f32(x61, x21, q4); + t00 = vfmaq_f32(t00, x30, qm2_5); + t01 = vfmaq_f32(t01, x31, qm2_5); + t10 = vfmaq_f32(t10, x40, qm5); + t11 = vfmaq_f32(t11, x41, qm5); + + float32x4_t y50 = vaddq_f32(t00, t10), y51 = vaddq_f32(t01, t11); + float32x4_t y60 = vsubq_f32(t10, t00), y61 = vsubq_f32(t11, t01); + + /* transpose 8x8 matrix in-place with some renumeration of the elements: */ + /* Y: */ + /* y00 y01 */ + /* y10 y11 */ + /* ... */ + /* y70 y71 */ + /* Y': */ + /* y00 y40 */ + /* y10 y50 */ + /* y20 y60 */ + /* y30 y70 */ + /* y01 y41 */ + /* y11 y51 */ + /* y21 y61 */ + /* y31 y71 */ + /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */ + float32x4x2_t tr0, tr1; + + T4x4(y00, y10, y20, y30, tr0, tr1); + T4x4(y01, y11, y21, y31, tr0, tr1); + T4x4(y40, y50, y60, y70, tr0, tr1); + T4x4(y41, y51, y61, y71, tr0, tr1); + + /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */ + /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */ + t00 = vsubq_f32(y01, y20); + t01 = vsubq_f32(y41, y60); + t10 = vsubq_f32(y30, y11); + t11 = vsubq_f32(y70, y51); + z00 = vfmaq_f32(vsubq_f32(y00, y21), t00, q5_25); + z01 = vfmaq_f32(vsubq_f32(y40, y61), t01, q5_25); + z70 = vfmaq_f32(vsubq_f32(y31, y10), t10, q5_25); + z71 = vfmaq_f32(vsubq_f32(y71, y50), t11, q5_25); + + /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */ + /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */ + t00 = vfmaq_f32(vaddq_f32(y10, y11), y30, qm4_25); + t01 = vfmaq_f32(vaddq_f32(y50, y51), y70, qm4_25); + t10 = vfmaq_f32(vaddq_f32(y20, y21), y01, qm4_25); + t11 = vfmaq_f32(vaddq_f32(y60, y61), y41, qm4_25); + + z10 = vaddq_f32(t00, t10); z11 = vaddq_f32(t01, t11); + z20 = vsubq_f32(t10, t00); z21 = vsubq_f32(t11, t01); + + /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */ + /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */ + t00 = vfmaq_f32(vaddq_f32(y11, y11), y10, q0_5); + t01 = vfmaq_f32(vaddq_f32(y51, y51), y50, q0_5); + t10 = vfmaq_f32(y21, y20, q0_25); + t11 = vfmaq_f32(y61, y60, q0_25); + t00 = vfmaq_f32(t00, y30, qm2_5); + t01 = vfmaq_f32(t01, y70, qm2_5); + t10 = vfmaq_f32(t10, y01, qm1_25); + t11 = vfmaq_f32(t11, y41, qm1_25); + + z30 = vaddq_f32(t00, t10); z31 = vaddq_f32(t01, t11); + z40 = vsubq_f32(t10, t00); z41 = vsubq_f32(t11, t01); + + /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */ + /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */ + t00 = vfmaq_f32(vaddq_f32(y10, y10), y11, q0_5); + t01 = vfmaq_f32(vaddq_f32(y50, y50), y51, q0_5); + t10 = vfmaq_f32(y21, y20, q4); + t11 = vfmaq_f32(y61, y60, q4); + t00 = vfmaq_f32(t00, y30, qm2_5); + t01 = vfmaq_f32(t01, y70, qm2_5); + t10 = vfmaq_f32(t10, y01, qm5); + t11 = vfmaq_f32(t11, y41, qm5); + + z50 = vaddq_f32(t00, t10); z51 = vaddq_f32(t01, t11); + z60 = vsubq_f32(t10, t00); z61 = vsubq_f32(t11, t01); + } + + const int outstep = winoIblock*winoAtomF32*Cg; + + vst1q_f32(outptr, z00); + vst1q_f32(outptr + outstep, z01); + vst1q_f32(outptr + outstep*2, z10); + vst1q_f32(outptr + outstep*3, z11); + 
vst1q_f32(outptr + outstep*4, z20); + vst1q_f32(outptr + outstep*5, z21); + vst1q_f32(outptr + outstep*6, z30); + vst1q_f32(outptr + outstep*7, z31); + vst1q_f32(outptr + outstep*8, z40); + vst1q_f32(outptr + outstep*9, z41); + vst1q_f32(outptr + outstep*10, z50); + vst1q_f32(outptr + outstep*11, z51); + vst1q_f32(outptr + outstep*12, z60); + vst1q_f32(outptr + outstep*13, z61); + vst1q_f32(outptr + outstep*14, z70); + vst1q_f32(outptr + outstep*15, z71); +} + +/*Output transform*/ +void impl_AtXA_8x8_F32(const uchar* inptr_, int inpstep, + float* bpptr, int bpstep, float* outptr, int outstep, + float bias, float minval, float maxval, bool ifMinMaxAct) +{ + const float * inptr = (float*)inptr_; + + float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4); + float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4); + float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4); + float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4); + float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4); + float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4); + float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4); + float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4); + float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51; + + { + float32x4_t s12_0, s12_1, s34_0, s34_1, s56_0, s56_1; + s12_0 = vaddq_f32(x10, x20); s12_1 = vaddq_f32(x11, x21); + s34_0 = vaddq_f32(x30, x40); s34_1 = vaddq_f32(x31, x41); + s56_0 = vaddq_f32(x50, x60); s56_1 = vaddq_f32(x51, x61); + + float32x4_t y00 = vaddq_f32(vaddq_f32(vaddq_f32(x00, s12_0), s34_0), s56_0); + float32x4_t y01 = vaddq_f32(vaddq_f32(vaddq_f32(x01, s12_1), s34_1), s56_1); + float32x4_t y20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f); + float32x4_t y21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f); + float32x4_t y40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16); + float32x4_t y41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16); + + s12_0 = vsubq_f32(x10, x20); s12_1 = vsubq_f32(x11, x21); + s34_0 = vsubq_f32(x30, x40); s34_1 = vsubq_f32(x31, x41); + s56_0 = vsubq_f32(x50, x60); s56_1 = vsubq_f32(x51, x61); + + float32x4_t y50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x70, s12_0), + s34_0, 32.f), s56_0, 1.f/32); + float32x4_t y51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x71, s12_1), + s34_1, 32.f), s56_1, 1.f/32); + float32x4_t y10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f); + float32x4_t y11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f); + float32x4_t y30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f); + float32x4_t y31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f); + float32x4_t y60 = vdupq_n_f32(0.f), y61 = y60, y70 = y60, y71 = y60; + + /* transpose 8x8 matrix in-place with some renumeration of the elements: */ + /* Y: */ + /* y00 y01 */ + /* y10 y11 */ + /* ... 
*/ + /* y50 y51 */ + /* 0 0 */ + /* 0 0 */ + /* Y': */ + /* y00 y40 */ + /* y10 y50 */ + /* y20 y60 */ + /* y30 y70 */ + /* y01 y41 */ + /* y11 y51 */ + /* y21 y61 */ + /* y31 y71 */ + /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */ + float32x4x2_t tr0, tr1; + + T4x4(y00, y10, y20, y30, tr0, tr1); + T4x4(y01, y11, y21, y31, tr0, tr1); + T4x4(y40, y50, y60, y70, tr0, tr1); + T4x4(y41, y51, y61, y71, tr0, tr1); + + s12_0 = vaddq_f32(y10, y20); s12_1 = vaddq_f32(y50, y60); + s34_0 = vaddq_f32(y30, y01); s34_1 = vaddq_f32(y70, y41); + s56_0 = vaddq_f32(y11, y21); s56_1 = vaddq_f32(y51, y61); + + z00 = vaddq_f32(vaddq_f32(vaddq_f32(y00, s12_0), s34_0), s56_0); + z01 = vaddq_f32(vaddq_f32(vaddq_f32(y40, s12_1), s34_1), s56_1); + z20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f); + z21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f); + z40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16); + z41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16); + + s12_0 = vsubq_f32(y10, y20); s12_1 = vsubq_f32(y50, y60); + s34_0 = vsubq_f32(y30, y01); s34_1 = vsubq_f32(y70, y41); + s56_0 = vsubq_f32(y11, y21); s56_1 = vsubq_f32(y51, y61); + + z50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y31, s12_0), + s34_0, 32.f), s56_0, 1.f/32); + z51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y71, s12_1), + s34_1, 32.f), s56_1, 1.f/32); + z10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f); + z11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f); + z30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f); + z31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f); + float32x4_t vbias = vdupq_n_f32(bias); + + z00 = vaddq_f32(z00, vbias); + z01 = vaddq_f32(z01, vbias); + z10 = vaddq_f32(z10, vbias); + z11 = vaddq_f32(z11, vbias); + z20 = vaddq_f32(z20, vbias); + z21 = vaddq_f32(z21, vbias); + z30 = vaddq_f32(z30, vbias); + z31 = vaddq_f32(z31, vbias); + z40 = vaddq_f32(z40, vbias); + z41 = vaddq_f32(z41, vbias); + z50 = vaddq_f32(z50, vbias); + z51 = vaddq_f32(z51, vbias); + } + + if (bpptr) + { + float32x2_t zhalf = vdup_n_f32(0.f); + z00 = vaddq_f32(z00, vld1q_f32(bpptr)); + z01 = vaddq_f32(z01, vcombine_f32(vld1_f32(bpptr + 4), zhalf)); + z10 = vaddq_f32(z10, vld1q_f32(bpptr + bpstep)); + z11 = vaddq_f32(z11, vcombine_f32(vld1_f32(bpptr + bpstep + 4), zhalf)); + z20 = vaddq_f32(z20, vld1q_f32(bpptr + bpstep*2)); + z21 = vaddq_f32(z21, vcombine_f32(vld1_f32(bpptr + bpstep*2 + 4), zhalf)); + z30 = vaddq_f32(z30, vld1q_f32(bpptr + bpstep*3)); + z31 = vaddq_f32(z31, vcombine_f32(vld1_f32(bpptr + bpstep*3 + 4), zhalf)); + z40 = vaddq_f32(z40, vld1q_f32(bpptr + bpstep*4)); + z41 = vaddq_f32(z41, vcombine_f32(vld1_f32(bpptr + bpstep*4 + 4), zhalf)); + z50 = vaddq_f32(z50, vld1q_f32(bpptr + bpstep*5)); + z51 = vaddq_f32(z51, vcombine_f32(vld1_f32(bpptr + bpstep*5 + 4), zhalf)); + } + + if (ifMinMaxAct) + { + float32x4_t vmax = vdupq_n_f32(maxval); + float32x4_t vmin = vdupq_n_f32(minval); + + z00 = vminq_f32(vmaxq_f32(z00, vmin), vmax); + z01 = vminq_f32(vmaxq_f32(z01, vmin), vmax); + z10 = vminq_f32(vmaxq_f32(z10, vmin), vmax); + z11 = vminq_f32(vmaxq_f32(z11, vmin), vmax); + z20 = vminq_f32(vmaxq_f32(z20, vmin), vmax); + z21 = vminq_f32(vmaxq_f32(z21, vmin), vmax); + z30 = vminq_f32(vmaxq_f32(z30, vmin), vmax); + z31 = vminq_f32(vmaxq_f32(z31, vmin), vmax); + z40 = vminq_f32(vmaxq_f32(z40, vmin), vmax); + z41 = vminq_f32(vmaxq_f32(z41, vmin), vmax); + z50 = vminq_f32(vmaxq_f32(z50, vmin), vmax); + z51 = 
vminq_f32(vmaxq_f32(z51, vmin), vmax); + } + + vst1q_f32(outptr, z00); + vst1_f32(outptr + 4, vget_low_f32(z01)); + vst1q_f32(outptr + outstep, z10); + vst1_f32(outptr + outstep + 4, vget_low_f32(z11)); + vst1q_f32(outptr + outstep*2, z20); + vst1_f32(outptr + outstep*2 + 4, vget_low_f32(z21)); + vst1q_f32(outptr + outstep*3, z30); + vst1_f32(outptr + outstep*3 + 4, vget_low_f32(z31)); + vst1q_f32(outptr + outstep*4, z40); + vst1_f32(outptr + outstep*4 + 4, vget_low_f32(z41)); + vst1q_f32(outptr + outstep*5, z50); + vst1_f32(outptr + outstep*5 + 4, vget_low_f32(z51)); +} + +cv::dnn::Winofunc getWinofunc_F32() +{ + return {&impl_accum_F32, &impl_BtXB_8x8_F32, &impl_AtXA_8x8_F32, 6, 4, 4}; +} + + +// end of NEON/AArch64 +#elif CV_SIMD128 + + +void impl_accum_F32(const uchar* inwptr_, const uchar* wptr_, uchar* outbuf_, int Cg, int iblock, + const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32) +{ + const float * inwptr = (float*)inwptr_; + const float * wptr = (float*)wptr_; + float * outbuf = (float*)outbuf_; +#if 1 + CV_Assert(winoIblock == 3 && winoKblock == 4 && winoAtomF32 == 4); + for (int atom_id = 0; atom_id < winoNatomF32; atom_id++, + outbuf += winoAtomF32) + { + v_float32x4 s00 = v_setzero_f32(), s01 = s00, s02 = s00; + v_float32x4 s10 = v_setzero_f32(), s11 = s00, s12 = s00; + v_float32x4 s20 = v_setzero_f32(), s21 = s00, s22 = s00; + v_float32x4 s30 = v_setzero_f32(), s31 = s00, s32 = s00; + + for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32, + wptr += winoKblock*winoAtomF32) + { + v_float32x4 x0, x1, x2; + x0 = v_load(inwptr); + x1 = v_load(inwptr + 4); + x2 = v_load(inwptr + 8); + + v_float32x4 w0 = v_load(wptr); + s00 = v_fma(w0, x0, s00); + s01 = v_fma(w0, x1, s01); + s02 = v_fma(w0, x2, s02); + + w0 = v_load(wptr + 4); + s10 = v_fma(w0, x0, s10); + s11 = v_fma(w0, x1, s11); + s12 = v_fma(w0, x2, s12); + + w0 = v_load(wptr + 8); + s20 = v_fma(w0, x0, s20); + s21 = v_fma(w0, x1, s21); + s22 = v_fma(w0, x2, s22); + + w0 = v_load(wptr + 12); + s30 = v_fma(w0, x0, s30); + s31 = v_fma(w0, x1, s31); + s32 = v_fma(w0, x2, s32); + } + + v_store(outbuf, s00); + v_store(outbuf + 1*64, s01); + v_store(outbuf + 2*64, s02); + v_store(outbuf + 3*64, s10); + v_store(outbuf + 4*64, s11); + v_store(outbuf + 5*64, s12); + v_store(outbuf + 6*64, s20); + v_store(outbuf + 7*64, s21); + v_store(outbuf + 8*64, s22); + v_store(outbuf + 9*64, s30); + v_store(outbuf + 10*64, s31); + v_store(outbuf + 11*64, s32); + } +#else + // Naive C++ code, the code should never be run here. 
+ for (int atom_id = 0; atom_id < winoNatomF32; + atom_id++, outbuf += winoAtomF32) + { + float sumbuf[winoIblock*winoKblock*winoAtomF32]; + memset(sumbuf, 0, sizeof(sumbuf)); + for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32, + wptr += winoKblock*winoAtomF32) + { + for (int i = 0; i < winoKblock; i++) + { + for (int j = 0; j < winoIblock; j++) + { + int i_ = i*winoAtomF32; + int j_ = j*winoAtomF32; + int ij_ = i_*winoIblock + j_; + float s0 = inwptr[j_ + 0]*wptr[i_ + 0]; + float s1 = inwptr[j_ + 1]*wptr[i_ + 1]; + float s2 = inwptr[j_ + 2]*wptr[i_ + 2]; + float s3 = inwptr[j_ + 3]*wptr[i_ + 3]; + sumbuf[ij_ + 0] += s0; + sumbuf[ij_ + 1] += s1; + sumbuf[ij_ + 2] += s2; + sumbuf[ij_ + 3] += s3; + } + } + } + for (int ij = 0; ij < winoKblock*winoIblock; ij++) + { + int ij_ = ij*winoAtomF32; + int ij_out = ij*CONV_WINO_AREA; + outbuf[ij_out + 0] = sumbuf[ij_ + 0]; + outbuf[ij_out + 1] = sumbuf[ij_ + 1]; + outbuf[ij_out + 2] = sumbuf[ij_ + 2]; + outbuf[ij_out + 3] = sumbuf[ij_ + 3]; + } + } +#endif +} + +/*Input transform*/ +void impl_BtXB_8x8_F32(const float* inptr, int inpstep, uchar* outptr_, int Cg, const int winoIblock, const int winoAtomF32) +{ + float * outptr = (float*)outptr_; + + CV_Assert(winoIblock == 3 && winoAtomF32 == 4); + v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4); + v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4); + v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4); + v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4); + v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4); + v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4); + v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4); + v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4); + + v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71; + + { + /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */ + /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */ + v_float32x4 q5_25 = v_setall_f32(5.25f), t00, t01, t10, t11; + t00 = v_sub(x40, x20); + t01 = v_sub(x41, x21); + t10 = v_sub(x30, x50); + t11 = v_sub(x31, x51); + v_float32x4 y00 = v_fma(t00, q5_25, v_sub(x00, x60)); + v_float32x4 y01 = v_fma(t01, q5_25, v_sub(x01, x61)); + v_float32x4 y70 = v_fma(t10, q5_25, v_sub(x70, x10)); + v_float32x4 y71 = v_fma(t11, q5_25, v_sub(x71, x11)); + + /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */ + /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */ + v_float32x4 qm4_25 = v_setall_f32(-4.25f); + t00 = v_fma(x30, qm4_25, v_add(x10, x50)); + t01 = v_fma(x31, qm4_25, v_add(x11, x51)); + t10 = v_fma(x40, qm4_25, v_add(x20, x60)); + t11 = v_fma(x41, qm4_25, v_add(x21, x61)); + + v_float32x4 y10 = v_add(t00, t10), y11 = v_add(t01, t11); + v_float32x4 y20 = v_sub(t10, t00), y21 = v_sub(t11, t01); + + /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */ + /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */ + v_float32x4 q0_5 = v_setall_f32(0.5f), q0_25 = v_setall_f32(0.25f); + v_float32x4 qm2_5 = v_setall_f32(-2.5f), qm1_25 = v_setall_f32(-1.25f); + t00 = v_fma(x10, q0_5, v_add(x50, x50)); + t01 = v_fma(x11, q0_5, v_add(x51, x51)); + t10 = v_fma(x20, q0_25, x60); + t11 = v_fma(x21, q0_25, x61); + t00 = v_fma(x30, qm2_5, t00); + t01 = v_fma(x31, qm2_5, t01); + t10 = v_fma(x40, qm1_25, t10); + t11 = v_fma(x41, 
qm1_25, t11); + + v_float32x4 y30 = v_add(t00, t10), y31 = v_add(t01, t11); + v_float32x4 y40 = v_sub(t10, t00), y41 = v_sub(t11, t01); + + /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */ + /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */ + v_float32x4 q4 = v_setall_f32(4.f), qm5 = v_setall_f32(-5.f); + t00 = v_fma(x50, q0_5, v_add(x10, x10)); + t01 = v_fma(x51, q0_5, v_add(x11, x11)); + t10 = v_fma(x20, q4 , x60); + t11 = v_fma(x21, q4 , x61); + t00 = v_fma(x30, qm2_5, t00); + t01 = v_fma(x31, qm2_5, t01); + t10 = v_fma(x40, qm5 , t10); + t11 = v_fma(x41, qm5 , t11); + + v_float32x4 y50 = v_add(t00, t10), y51 = v_add(t01, t11); + v_float32x4 y60 = v_sub(t10, t00), y61 = v_sub(t11, t01); + + /* transpose 8x8 matrix with v_transpose4x4 */ + + v_float32x4 y000, y100, y200, y300, y010, y110, y210, y310, y400, y500, y600, y700, y410, y510, y610, y710; + v_transpose4x4(y00, y10, y20, y30, y000, y100, y200, y300); + v_transpose4x4(y01, y11, y21, y31, y010, y110, y210, y310); + v_transpose4x4(y40, y50, y60, y70, y400, y500, y600, y700); + v_transpose4x4(y41, y51, y61, y71, y410, y510, y610, y710); + + /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */ + /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */ + t00 = v_sub(y010, y200); + t01 = v_sub(y410, y600); + t10 = v_sub(y300, y110); + t11 = v_sub(y700, y510); + z00 = v_fma(t00, q5_25, v_sub(y000, y210)); + z01 = v_fma(t01, q5_25, v_sub(y400, y610)); + z70 = v_fma(t10, q5_25, v_sub(y310, y100)); + z71 = v_fma(t11, q5_25, v_sub(y710, y500)); + + /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */ + /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */ + t00 = v_fma(y300, qm4_25, v_add(y100, y110)); + t01 = v_fma(y700, qm4_25, v_add(y500, y510)); + t10 = v_fma(y010, qm4_25, v_add(y200, y210)); + t11 = v_fma(y410, qm4_25, v_add(y600, y610)); + + z10 = v_add(t00, t10); z11 = v_add(t01, t11); + z20 = v_sub(t10, t00); z21 = v_sub(t11, t01); + + /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */ + /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */ + t00 = v_fma(y100, q0_5, v_add(y110, y110)); + t01 = v_fma(y500, q0_5, v_add(y510, y510)); + t10 = v_fma(y200, q0_25, y210); + t11 = v_fma(y600, q0_25, y610); + t00 = v_fma(y300, qm2_5, t00); + t01 = v_fma(y700, qm2_5, t01); + t10 = v_fma(y010, qm1_25, t10); + t11 = v_fma(y410, qm1_25, t11); + + z30 = v_add(t00, t10); z31 = v_add(t01, t11); + z40 = v_sub(t10, t00); z41 = v_sub(t11, t01); + + /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */ + /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */ + t00 = v_fma(y110, q0_5, v_add(y100, y100)); + t01 = v_fma(y510, q0_5, v_add(y500, y500)); + t10 = v_fma(y200, q4, y210); + t11 = v_fma(y600, q4, y610); + t00 = v_fma(y300, qm2_5, t00); + t01 = v_fma(y700, qm2_5, t01); + t10 = v_fma(y010, qm5, t10); + t11 = v_fma(y410, qm5, t11); + + z50 = v_add(t00, t10); z51 = v_add(t01, t11); + z60 = v_sub(t10, t00); z61 = v_sub(t11, t01); + } + + const int outstep = winoIblock*winoAtomF32*Cg; + + v_store(outptr, z00); + v_store(outptr + outstep, z01); + v_store(outptr + outstep*2, z10); + v_store(outptr + outstep*3, z11); + v_store(outptr + outstep*4, z20); + v_store(outptr + outstep*5, z21); + v_store(outptr + outstep*6, z30); + v_store(outptr + outstep*7, z31); + v_store(outptr + outstep*8, z40); + v_store(outptr + outstep*9, z41); + v_store(outptr + outstep*10, z50); + v_store(outptr + outstep*11, z51); + v_store(outptr + outstep*12, z60); + v_store(outptr + outstep*13, 
z61); + v_store(outptr + outstep*14, z70); + v_store(outptr + outstep*15, z71); +} + +/*Output transform*/ +/* Inverse Winograd 8x8 transform: + out = (A'*inp*A)', where + inp is input 8x8 FP32 matrix, + A' is + [1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, + 0.f, 1.f, -1.f, 2.f, -2.f, 0.5f, -0.5f, 0.f, + 0.f, 1.f, 1.f, 4.f, 4.f, 0.25f, 0.25f, 0.f, + 0.f, 1.f, -1.f, 8.f, -8.f, 0.125f, -0.125f, 0.f, + 0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f, + 0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f] + + inp is pre-loaded into xij registers, + out will be stored in zij, where (0<=i<=7 for x, 0<=i<=5 for z), 0<=j<=1. + + After the inverse transform is done, we add bias, + optionally add results from the earlier tensors (by-pass), + optionally apply activation function and then + store the final results. + + That is, after both forward and then inverse transformation, + we get non-transposed result. + Of course, for the correct work of Winograd-based convolution, + the Winograd-transformed weights should also be transposed. + init_conv() (see OpConv.fx) takes care of that. +*/ +void impl_AtXA_8x8_F32(const uchar* inptr_, int inpstep, + float* bpptr, int bpstep, float* outptr, int outstep, + float bias, float minval, float maxval, bool ifMinMaxAct) +{ + const float * inptr = (float*)inptr_; + + v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4); + v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4); + v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4); + v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4); + v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4); + v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4); + v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4); + v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4); + v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51; + + { + v_float32x4 s12_0, s12_1, s34_0, s34_1, s56_0, s56_1; + s12_0 = v_add(x10, x20); s12_1 = v_add(x11, x21); + s34_0 = v_add(x30, x40); s34_1 = v_add(x31, x41); + s56_0 = v_add(x50, x60); s56_1 = v_add(x51, x61); + + v_float32x4 y00 = v_add(v_add(v_add(x00, s12_0), s34_0), s56_0); + v_float32x4 y01 = v_add(v_add(v_add(x01, s12_1), s34_1), s56_1); + + v_float32x4 a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f); + v_float32x4 y20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + v_float32x4 y21 = v_fma(s56_1, a0 ,v_fma(s34_1, a1, s12_1) ); + + a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f); + v_float32x4 y40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + v_float32x4 y41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + s12_0 = v_sub(x10, x20); s12_1 = v_sub(x11, x21); + s34_0 = v_sub(x30, x40); s34_1 = v_sub(x31, x41); + s56_0 = v_sub(x50, x60); s56_1 = v_sub(x51, x61); + + a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.f); + v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, v_add(x70, s12_0))); + v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, v_add(x71, s12_1))); + + a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.f); + v_float32x4 y10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + v_float32x4 y11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.f); + v_float32x4 y30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + v_float32x4 y31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + v_float32x4 y60 = v_setall_f32(0.f), y61 = y60, y70 = y60, y71 = y60; + 
+ /* transpose 8x8 matrix with v_transpose4x4 */ + + v_float32x4 y000, y100, y200, y300, y010, y110, y210, y310, y400, y500, y600, y700, y410, y510, y610, y710; + v_transpose4x4(y00, y10, y20, y30, y000, y100, y200, y300); + v_transpose4x4(y01, y11, y21, y31, y010, y110, y210, y310); + v_transpose4x4(y40, y50, y60, y70, y400, y500, y600, y700); + v_transpose4x4(y41, y51, y61, y71, y410, y510, y610, y710); + + s12_0 = v_add(y100, y200); s12_1 = v_add(y500, y600); + s34_0 = v_add(y300, y010); s34_1 = v_add(y700, y410); + s56_0 = v_add(y110, y210); s56_1 = v_add(y510, y610); + + z00 = v_add(v_add(v_add(y000, s12_0), s34_0), s56_0); + z01 = v_add(v_add(v_add(y400, s12_1), s34_1), s56_1); + + a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f); + z20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + z21 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f); + z40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + z41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + s12_0 = v_sub(y100, y200); s12_1 = v_sub(y500, y600); + s34_0 = v_sub(y300, y010); s34_1 = v_sub(y700, y410); + s56_0 = v_sub(y110, y210); s56_1 = v_sub(y510, y610); + + a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.0f); + z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, v_add(y310, s12_0))); + z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, v_add(y710, s12_1))); + a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.0f); + z10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + z11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.0f); + z30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + z31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + v_float32x4 vbias = v_setall_f32(bias); + z00 = v_add(z00, vbias); + z01 = v_add(z01, vbias); + z10 = v_add(z10, vbias); + z11 = v_add(z11, vbias); + z20 = v_add(z20, vbias); + z21 = v_add(z21, vbias); + z30 = v_add(z30, vbias); + z31 = v_add(z31, vbias); + z40 = v_add(z40, vbias); + z41 = v_add(z41, vbias); + z50 = v_add(z50, vbias); + z51 = v_add(z51, vbias); + } + + if (bpptr) + { + z00 = v_add(z00, v_load(bpptr)); + z01 = v_add(z01, v_load_low(bpptr + 4)); + z10 = v_add(z10, v_load(bpptr + bpstep)); + z11 = v_add(z11, v_load_low(bpptr + bpstep + 4)); + z20 = v_add(z20, v_load(bpptr + bpstep * 2)); + z21 = v_add(z21, v_load_low(bpptr + bpstep * 2 + 4)); + z30 = v_add(z30, v_load(bpptr + bpstep * 3)); + z31 = v_add(z31, v_load_low(bpptr + bpstep * 3 + 4)); + z40 = v_add(z40, v_load(bpptr + bpstep * 4)); + z41 = v_add(z41, v_load_low(bpptr + bpstep * 4 + 4)); + z50 = v_add(z50, v_load(bpptr + bpstep * 5)); + z51 = v_add(z51, v_load_low(bpptr + bpstep * 5 + 4)); + } + + if (ifMinMaxAct) + { + v_float32x4 vmax = v_setall_f32(maxval); + v_float32x4 vmin = v_setall_f32(minval); + + z00 = v_min(v_max(z00, vmin), vmax); + z01 = v_min(v_max(z01, vmin), vmax); + z10 = v_min(v_max(z10, vmin), vmax); + z11 = v_min(v_max(z11, vmin), vmax); + z20 = v_min(v_max(z20, vmin), vmax); + z21 = v_min(v_max(z21, vmin), vmax); + z30 = v_min(v_max(z30, vmin), vmax); + z31 = v_min(v_max(z31, vmin), vmax); + z40 = v_min(v_max(z40, vmin), vmax); + z41 = v_min(v_max(z41, vmin), vmax); + z50 = v_min(v_max(z50, vmin), vmax); + z51 = v_min(v_max(z51, vmin), vmax); + } + + v_store(outptr, z00); + v_store_low(outptr + 4, z01); + v_store(outptr + outstep, z10); + v_store_low(outptr + outstep + 4, z11); + v_store(outptr + outstep*2, z20); + v_store_low(outptr + outstep*2 + 4, z21); + v_store(outptr + outstep*3, z30); + v_store_low(outptr + outstep*3 + 4, z31); + 
v_store(outptr + outstep*4, z40); + v_store_low(outptr + outstep*4 + 4, z41); + v_store(outptr + outstep*5, z50); + v_store_low(outptr + outstep*5 + 4, z51); +} + +cv::dnn::Winofunc getWinofunc_F32() +{ + return {&impl_accum_F32, &impl_BtXB_8x8_F32, &impl_AtXA_8x8_F32, 3, 4, 4}; +} + + +// end of CV_SIMD128 +#else + + +cv::dnn::Winofunc getWinofunc_F32() +{ + return cv::dnn::Winofunc::empty(); +} + + +// end of fallback +#endif + +//============================================================================== // FP16, currently, only ARMv8 may support it -#if defined(CV_NEON_AARCH64) && CV_NEON_AARCH64 && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(CV_CPU_COMPILE_NEON_FP16) && CV_CPU_COMPILE_NEON_FP16 #undef T4x4 #define T4x4(a, b, c, d, tr0, tr1) \ @@ -432,12 +1365,12 @@ void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep, d = vcombine_f32(vget_high_f32(tr0.val[1]), vget_high_f32(tr1.val[1])) /* Accumulate */ -void winofunc_accum_F16(const char* _inwptr, const char* _wptr, char* _outbuf, int Cg, int iblock, +void impl_accum_F16(const uchar* inwptr_, const uchar* wptr_, uchar* outbuf_, int Cg, int iblock, const int winoIblock, const int winoKblock, const int winoAtomF16, const int winoNatomF16) { - const __fp16* inwptr = (const __fp16*)_inwptr; - const __fp16* wptr = (const __fp16*)_wptr; - __fp16* outbuf = (__fp16*)_outbuf; + const __fp16* inwptr = (const __fp16*)inwptr_; + const __fp16* wptr = (const __fp16*)wptr_; + __fp16* outbuf = (__fp16*)outbuf_; CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF16 == 8); @@ -587,10 +1520,11 @@ void winofunc_accum_F16(const char* _inwptr, const char* _wptr, char* _outbuf, i /*Input transform*/ //NOTE: Since we don't have the fully fp16 support. Current work around is that we need packing the data and // convert it to FP16 in input transform stage. And at output transform stage we will convert it back to FP32. 
-void winofunc_BtXB_8x8_F16(const float * inptr, int inpstep, - char * _outptr, int Cg, const int winoIblock, const int winoAtomF16) +void impl_BtXB_8x8_F16(const float * inptr, int inpstep, + uchar * outptr_, int Cg, const int winoIblock, const int winoAtomF16) { - __fp16* outptr = (__fp16*)_outptr; + __fp16* outptr = (__fp16*)outptr_; + float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4); float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4); float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4); @@ -751,11 +1685,11 @@ void winofunc_BtXB_8x8_F16(const float * inptr, int inpstep, } // Output transform -void winofunc_AtXA_8x8_F16(const char* _inptr, int inpstep, +void impl_AtXA_8x8_F16(const uchar* inptr_, int inpstep, float * bpptr, int bpstep, float* outptr, int outstep, float bias, float minval, float maxval, bool ifMinMaxAct) { - const __fp16* inptr = (const __fp16*)_inptr; + const __fp16* inptr = (const __fp16*)inptr_; float32x4_t x00 = vcvt_f32_f16(vld1_f16(inptr)), x01 = vcvt_f32_f16(vld1_f16(inptr + 4)); float32x4_t x10 = vcvt_f32_f16(vld1_f16(inptr + inpstep)), x11 = vcvt_f32_f16(vld1_f16(inptr + inpstep + 4)); @@ -907,8 +1841,26 @@ void winofunc_AtXA_8x8_F16(const char* _inptr, int inpstep, vst1q_f32(outptr + outstep*5, z50); vst1_f32(outptr + outstep*5 + 4, vget_low_f32(z51)); } + +cv::dnn::Winofunc getWinofunc_F16() +{ + return {&impl_accum_F16, &impl_BtXB_8x8_F16, &impl_AtXA_8x8_F16, 6, 8, 2}; +} + +// end of NEON_FP16 +#else + +cv::dnn::Winofunc getWinofunc_F16() +{ + return cv::dnn::Winofunc::empty(); +} + + +// end of fallback #endif -#endif + CV_CPU_OPTIMIZATION_NAMESPACE_END -}} // namespace +}} // cv::dnn:: + +#endif // !CV_CPU_DECLARATIONS_ONLY diff --git a/modules/dnn/src/layers/cpu_kernels/convolution.hpp b/modules/dnn/src/layers/cpu_kernels/convolution.hpp index 5c8055337c..98b33ddc5f 100644 --- a/modules/dnn/src/layers/cpu_kernels/convolution.hpp +++ b/modules/dnn/src/layers/cpu_kernels/convolution.hpp @@ -6,6 +6,7 @@ #define OPENCV_FAST_CONVOLUTION_HPP #include "opencv2/core/hal/intrin.hpp" +#include "opencv2/dnn/all_layers.hpp" #ifndef CONV_PRAM #define CONV_PRAM @@ -119,23 +120,28 @@ void convBlock_F32(int np, const float* a, const float* b, float* c, int ldc, bo void convBlockMR1_F32(int np, const float* a, const float* b, float* c, const float bias, bool init_c, const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR); +#endif // CV_NEON +} // namespace opt_NEON. -#if CV_NEON_AARCH64 -/* Accumulate */ -void winofunc_accum_F32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock, - const int winoIblock, const int winoKblock, const int winoAtom, const int winoNatom); -/*Input transform*/ -void winofunc_BtXB_8x8_F32(const float* inptr, int inpstep, - float* outptr, int Cg, const int winoIblock, const int winoAtom); -/*Output transform*/ -void winofunc_AtXA_8x8_F32(const float* inptr, int inpstep, - float* bpptr, int bpstep, float* outptr, int outstep, - float bias, float minval, float maxval, bool ifMinMaxAct); -#endif // CV_NEON_AARCH64 -#endif // CV_NEON -} // namespace opt_NEON. 
+// === Function tables +struct Winofunc +{ + void (*accum)(const uchar* inwptr, const uchar* wptr, uchar* outbuf, int Cg, int iblock, const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32); + void (*BtXB_8x8)(const float* inptr, int inpstep, uchar* outptr, int Cg, const int winoIblock, const int winoAtomF32); + void (*AtXA_8x8)(const uchar* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep, float bias, float minval, float maxval, bool ifMinMaxAct); + int iblock; + int natom; + int esz; + + bool isGood() const { return accum && BtXB_8x8 && AtXA_8x8 && iblock > 0 && natom > 0 && esz > 0; } + static Winofunc empty() { return {0, 0, 0, 0, 0, 0}; } +}; + +// === wrapper calls (implemented in .dispatch.cpp) +Winofunc getWinofunc_F32(); +Winofunc getWinofunc_F16(); } // namespace dnn
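The Winofunc struct declared at the end of the convolution.hpp hunk is the whole dispatch contract: each optimized branch in conv_winograd_f63.simd.hpp fills one of these tables from getWinofunc_F32() / getWinofunc_F16(), and a branch that is not available for the current build returns Winofunc::empty(), which fails isGood(). A minimal, self-contained sketch of that pattern follows; the scalar stubs and getWinofunc_scalar() are hypothetical stand-ins for illustration only and are not part of the patch.

#include <cstdio>

typedef unsigned char uchar;

// Mirror of the Winofunc dispatch table from the convolution.hpp hunk above.
struct Winofunc
{
    void (*accum)(const uchar* inwptr, const uchar* wptr, uchar* outbuf, int Cg, int iblock,
                  const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32);
    void (*BtXB_8x8)(const float* inptr, int inpstep, uchar* outptr, int Cg,
                     const int winoIblock, const int winoAtomF32);
    void (*AtXA_8x8)(const uchar* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep,
                     float bias, float minval, float maxval, bool ifMinMaxAct);
    int iblock;   // input tiles handled per accumulation micro-kernel (6 for NEON/FP16, 3 for generic SIMD128)
    int natom;    // elements per atom of transformed data (4 for the FP32 branches here, 8 for FP16)
    int esz;      // element size in bytes of the transformed data (sizeof(float) or sizeof(__fp16))

    bool isGood() const { return accum && BtXB_8x8 && AtXA_8x8 && iblock > 0 && natom > 0 && esz > 0; }
    static Winofunc empty() { return {0, 0, 0, 0, 0, 0}; }
};

// Hypothetical scalar stand-ins; a real branch points these at its kernels
// (impl_accum_F32 / impl_BtXB_8x8_F32 / impl_AtXA_8x8_F32 above).
static void stub_accum(const uchar*, const uchar*, uchar*, int, int, int, int, int, int) {}
static void stub_BtXB(const float*, int, uchar*, int, int, int) {}
static void stub_AtXA(const uchar*, int, float*, int, float*, int, float, float, float, bool) {}

// Hypothetical backend: fills the table the same way the CV_SIMD128 branch does
// (iblock = 3, natom = 4, esz = sizeof(float)); an unavailable backend returns Winofunc::empty().
static Winofunc getWinofunc_scalar()
{
    return {&stub_accum, &stub_BtXB, &stub_AtXA, 3, 4, (int)sizeof(float)};
}

int main()
{
    Winofunc func = getWinofunc_scalar();
    if (!func.isGood())   // an all-zero table means "no usable kernel for this build/CPU"
        return 0;
    std::printf("iblock=%d natom=%d esz=%d\n", func.iblock, func.natom, func.esz);
    return 0;
}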
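For readers checking the math, the block comment in impl_AtXA_8x8_F32 (CV_SIMD128 branch) earlier in this file states the inverse transform as out = (A'*inp*A)' and lists the 6x8 matrix A' explicitly. The same computation in plain scalar C++ is sketched below; ref_AtXA_8x8 and At are illustrative helpers only (not part of the patch) and deliberately omit the bias, by-pass and activation steps handled by the SIMD kernel.

// Scalar reference for the documented output transform: out = (A'*inp*A)',
// with A' copied verbatim from the comment in impl_AtXA_8x8_F32.
static const float At[6][8] = {
    { 1.f, 1.f,  1.f,  1.f,   1.f,  1.f,     1.f,     0.f },
    { 0.f, 1.f, -1.f,  2.f,  -2.f,  0.5f,   -0.5f,    0.f },
    { 0.f, 1.f,  1.f,  4.f,   4.f,  0.25f,   0.25f,   0.f },
    { 0.f, 1.f, -1.f,  8.f,  -8.f,  0.125f, -0.125f,  0.f },
    { 0.f, 1.f,  1.f, 16.f,  16.f,  1.f/16,  1.f/16,  0.f },
    { 0.f, 1.f, -1.f, 32.f, -32.f,  1.f/32, -1.f/32,  1.f }
};

void ref_AtXA_8x8(const float inp[8][8], float out[6][6])
{
    float tmp[6][8]; // tmp = A' * inp
    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 8; j++)
        {
            float s = 0.f;
            for (int k = 0; k < 8; k++)
                s += At[i][k] * inp[k][j];
            tmp[i][j] = s;
        }
    // m = tmp * A, then out = m' as stated in the comment
    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
        {
            float s = 0.f;
            for (int k = 0; k < 8; k++)
                s += tmp[i][k] * At[j][k];
            out[j][i] = s;
        }
}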