Merge pull request #23763 from zihaomu:add_runtime_check

DNN: fix bug for X86 Winograd #23763

Address https://github.com/opencv/opencv/issues/23760
The patch aims to add a runtime check for X86 platform without AVX(2).

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
pull/23767/head
Zihao Mu 1 year ago committed by GitHub
parent 5d913f4d72
commit eec8a20c33
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 21
      modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp
  2. 15
      modules/dnn/src/layers/cpu_kernels/convolution.cpp
  3. 23
      modules/dnn/src/layers/cpu_kernels/convolution.hpp

@ -31,7 +31,6 @@ void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep,
float bias, float minval, float maxval, bool ifMinMaxAct);
int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv>& conv,
int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
{
@ -51,6 +50,23 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
int pad_left = conv->pad_left;
int ngroups = conv->ngroups, Cg = C/ngroups, Kg = K/ngroups;
const int CONV_WINO_KBLOCK = 4;
#if (CV_NEON && CV_NEON_AARCH64)
const int CONV_WINO_IBLOCK = 6;
#elif CV_TRY_AVX || CV_TRY_AVX2
const int CONV_WINO_IBLOCK = (conv->useAVX || conv->useAVX2) ? 6 : 3;
#else
const int CONV_WINO_IBLOCK = 3;
#endif
#if CV_TRY_AVX || CV_TRY_AVX2
const int CONV_WINO_ATOM_F32 = (conv->useAVX || conv->useAVX2) ? 8 : 4;
#else
const int CONV_WINO_ATOM_F32 = 4;
#endif
const int CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32; // for AVX2, it is 8, otherwise, it's 16.
int Kg_nblocks = (Kg + CONV_WINO_KBLOCK - 1)/CONV_WINO_KBLOCK;
const size_t inp_planesize = (size_t)Hi*Wi;
const size_t out_planesize = (size_t)H0*W0;
@ -398,7 +414,7 @@ void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, i
void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep,
float* outptr, int Cg, const int winoIblock, const int winoAtomF32)
{
CV_Assert(CONV_WINO_IBLOCK == 3 && CONV_WINO_KBLOCK == 4 && CONV_WINO_ATOM_F32 == 4);
CV_Assert(winoIblock == 3 && winoAtomF32 == 4);
v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
@ -573,7 +589,6 @@ void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep,
float* bpptr, int bpstep, float* outptr, int outstep,
float bias, float minval, float maxval, bool ifMinMaxAct)
{
CV_Assert(CONV_WINO_IBLOCK == 3 && CONV_WINO_KBLOCK == 4 && CONV_WINO_ATOM_F32 == 4);
v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);

@ -181,6 +181,21 @@ Ptr<FastConv> initFastConv(
{0.0f, 0.0f, 1.0f}
};
const int CONV_WINO_KBLOCK = 4;
#if CV_TRY_AVX || CV_TRY_AVX2
const int CONV_WINO_ATOM_F32 = (conv->useAVX || conv->useAVX2) ? 8 : 4;
#else
const int CONV_WINO_ATOM_F32 = 4;
#endif
const int CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32; // for AVX2, it is 8, otherwise, it's 16.
#ifdef CONV_ARM_FP16
// FP 16
const int CONV_WINO_ATOM_F16 = CONV_WINO_ATOM_F32 * 2;
const int CONV_WINO_NATOMS_F16 = CONV_WINO_AREA / CONV_WINO_ATOM_F16;
#endif
// the weights are packed as 6-dim tensor:
// ngroups * ceil((K/ngroups)/KBLOCK) * (W*W/ATOM_SIZE) * (C/ngroups) * KBLOCK * ATOM_SIZE,
// where W is the size of Winograd-transformed kernel (8x8),

@ -33,36 +33,17 @@ typedef __fp16 float16_t; // Fix conflict between float16_t in arm_neon.h and fl
#define CONV_NR_FP32 24
#endif
// Winograd Params
enum {
CONV_WINO_STEP=6,
CONV_WINO_KSIZE=3,
CONV_WINO_SIZE=CONV_WINO_STEP+CONV_WINO_KSIZE-1, // 8
CONV_WINO_SIZE=CONV_WINO_STEP+CONV_WINO_KSIZE - 1, // 8
CONV_WINO_AREA=CONV_WINO_SIZE*CONV_WINO_SIZE,
CONV_WINO_KBLOCK = 4,
#if (CV_NEON && CV_NEON_AARCH64) || CV_TRY_AVX || CV_TRY_AVX2
CONV_WINO_IBLOCK = 6,
#else
CONV_WINO_IBLOCK = 3,
#endif
#if CV_TRY_AVX || CV_TRY_AVX2
CONV_WINO_ATOM_F32 = 8,
#else
CONV_WINO_ATOM_F32 = 4,
#endif
CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16.
// FP 16
CONV_WINO_ATOM_F16 = CONV_WINO_ATOM_F32 * 2,
CONV_WINO_NATOMS_F16 = CONV_WINO_AREA / CONV_WINO_ATOM_F16,
};
// NOTE that: CONV_TYPE_DEPTHWISE is for 3x3 depthwise conv, and others depthwise will be set as CONV_TYPE_DEPTHWISE_REMAIN.
enum { CONV_TYPE_GENERIC=0, CONV_TYPE_DEPTHWISE=1, CONV_TYPE_WINOGRAD3X3=2, CONV_TYPE_DEPTHWISE_REMAIN=3 };
enum { CONV_1D = 0, CONV_2D = 1, CONV_3D = 2 };
#endif
namespace cv {

Loading…
Cancel
Save