fixed a bug in the SIMD128 Winograd path and made the code more robust.

pull/22667/head
Zihao Mu 2 years ago
parent 5d292826b2
commit cee8c86b6e
  1. modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp (14 changes)
  2. modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp (8 changes)
  3. modules/dnn/src/layers/fast_convolution/fast_convolution.cpp (14 changes)
  4. modules/dnn/src/layers/fast_convolution/fast_convolution.hpp (28 changes)
  5. modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp (39 changes)

@@ -20,6 +20,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
{
#if CV_SIMD128
+const int VEC_NLANES = 4;
v_float32x4 vminval = v_setall_f32(minval), vmaxval = v_setall_f32(maxval);
v_float32x4 w0 = v_setall_f32(
@@ -110,7 +111,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
{
if (dy0 == 3)
{
-for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
{
int xi_ = x0 * stride_x - pad_left;
const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -186,7 +187,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
}
else
{
-for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
{
int xi_ = x0 * stride_x - pad_left;
const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -211,7 +212,7 @@ static void depthWiseBlock(const float *inptr, float *outptr, const float *weigh
}
else
{
-for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
{
int xi_ = x0 * stride_x - pad_left, k = 0;
const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -314,7 +315,12 @@ void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>&
int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom;
int pad_left = conv->pad_left, pad_right = conv->pad_right;
-int ksize = Hk * Wk, padded_ksize = ((ksize + FAST_VEC_NLANES - 1) / FAST_VEC_NLANES) * FAST_VEC_NLANES;
+int VEC_NLANES = 4;
+#if CV_TRY_AVX2
+if (conv->useAVX2)
+    VEC_NLANES = 8;
+#endif
+int ksize = Hk * Wk, padded_ksize = ((ksize + VEC_NLANES - 1) / VEC_NLANES) * VEC_NLANES;
const float *inp = input.ptr<float>();
float *out = output.ptr<float>();
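For context: padded_ksize rounds the per-channel kernel footprint Hk*Wk up to a multiple of the vector width, so packed weights can always be loaded a full register at a time; the change makes the padding follow the runtime lane count instead of the compile-time FAST_VEC_NLANES. A minimal standalone sketch of that round-up (paddedKernelSize is an illustrative name, not from the patch):

    #include <cstdio>

    // Same arithmetic as the patch: ((ksize + n - 1) / n) * n.
    static int paddedKernelSize(int ksize, int vecNLanes)
    {
        return ((ksize + vecNLanes - 1) / vecNLanes) * vecNLanes;
    }

    int main()
    {
        // A 3x3 kernel has 9 taps: with 4 lanes (SIMD128) it pads to 12,
        // with 8 lanes (AVX2) to 16, leaving room for padding taps.
        std::printf("%d %d\n", paddedKernelSize(9, 4), paddedKernelSize(9, 8)); // 12 16
    }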

@@ -78,6 +78,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights
int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop,
int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3)
{
+const int VEC_NLANES = 8;
__m256 vminval = _mm256_set1_ps(minval);
__m256 vmaxval = _mm256_set1_ps(maxval);
@@ -174,7 +175,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights
{
if (dy0 == 3)
{
-for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
{
int xi_ = x0 * stride_x - pad_left;
const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -250,7 +251,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights
}
else
{
-for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
{
int xi_ = x0 * stride_x - pad_left;
const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -276,7 +277,7 @@ void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights
}
else
{
-for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES)
+for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
{
int xi_ = x0 * stride_x - pad_left, k = 0;
const float *inptr_xi = inptr + Wi * yi_ + xi_;
@@ -701,7 +702,6 @@ void _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep,
z50 = _mm256_add_ps(vbias, z50);
}
-// TODO make sure the lenght of bpptr is 8.
if (bpptr)
{
z00 = _mm256_add_ps(z00, _mm256_loadu_ps(bpptr));
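The loop-bound change is the same in all three inner loops above: x0 <= x1 - VEC_NLANES guarantees that a full vector of outputs starting at x0 still fits before x1, with the remaining (at most VEC_NLANES - 1) columns handled by the scalar tail in the original code. A standalone sketch of that pattern (scaleRow and the 4-lane choice are illustrative assumptions):

    // Vector main loop plus scalar tail over the half-open range [x0, x1).
    void scaleRow(float* data, int x0, int x1, float s)
    {
        const int VEC_NLANES = 4;               // one 128-bit register of floats
        // Main loop: runs only while a whole vector fits before x1.
        for (; x0 <= x1 - VEC_NLANES; x0 += VEC_NLANES)
            for (int j = 0; j < VEC_NLANES; j++) // stands in for one vector op
                data[x0 + j] *= s;
        // Scalar tail: at most VEC_NLANES - 1 leftover columns.
        for (; x0 < x1; x0++)
            data[x0] *= s;
    }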

@@ -49,6 +49,15 @@ Ptr<FastConv2d> initFastConv2d(
useWinograd && ((conv->useSIMD128 || conv->useAVX2 || conv->useNEON) && Hk == 3 && Wk == 3 &&
dilation_y == 1 && dilation_x == 1 && stride_y == 1 && stride_x == 1) ? _FX_CONV_TYPE_WINOGRAD3X3 :
_FX_CONV_TYPE_GENERIC;
+int VEC_NLANES = 4;
+#if CV_TRY_AVX2
+if (!conv->useAVX2 && conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // convert Winograd to generic conv.
+    conv->conv_type = _FX_CONV_TYPE_GENERIC;
+if (conv->useAVX2)
+    VEC_NLANES = 8;
+#endif
Mat weightsMat = _weightsMat.getMat();
auto wShape = shape(weightsMat);
const size_t wstep = weightsMat.step1();
@@ -61,7 +70,7 @@ Ptr<FastConv2d> initFastConv2d(
int ksize = Hk*Wk;
// this code aims to let memory fit with vector size.
-int padded_ksize = ((ksize + FAST_VEC_NLANES-1) / FAST_VEC_NLANES) * FAST_VEC_NLANES;
+int padded_ksize = ((ksize + VEC_NLANES-1) / VEC_NLANES) * VEC_NLANES;
int nweights = C*padded_ksize;
conv->weightsBuf.reserve(nweights + VEC_ALIGN);
conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN);
@@ -265,7 +274,8 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr<FastConv2d>
else if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3 && inputShape[2] >= 12 && inputShape[3] >= 12) // winograd
{
CV_Assert(conv->weightsWinoBufPtr);
-return runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct);
+if (runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct))
+return;
}
int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3]; // [N, C, H, W]
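The dispatch change above is the robustness half of the commit: runWinograd63 now returns an int, and the caller only returns early on success; a 0 result (no usable SIMD backend, see the stub added at the end of winograd_3x3s1_f63.cpp) falls through to the generic convolution path below. A sketch of the shape of that dispatch, with simplified stand-in signatures (not the real ones):

    int runWinograd63Stub(bool haveSimd) { return haveSimd ? 1 : 0; }
    void runGenericConv() { /* im2col/GEMM-style fallback */ }

    void runConv(bool winogradApplicable, bool haveSimd)
    {
        if (winogradApplicable)
        {
            // As in the patch: early-return only when Winograd succeeded,
            // otherwise fall through to the generic implementation.
            if (runWinograd63Stub(haveSimd))
                return;
        }
        runGenericConv();
    }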

@@ -12,35 +12,25 @@
#if CV_NEON && CV_NEON_AARCH64 // 32 registers.
#define CONV_MR 4
#define CONV_NR 28
-enum { FAST_VEC_NLANES=4 };
#elif CV_NEON // 16 registers.
#define CONV_MR 4
#define CONV_NR 12
-enum { FAST_VEC_NLANES=4 };
#else // SIMD 128, AVX or AVX2
#define CONV_MR 4
#define CONV_NR 24
-#if CV_TRY_AVX2
-enum { FAST_VEC_NLANES=8 }; // AVX2
-#else
-enum { FAST_VEC_NLANES=4 }; // SIMD 128
-#endif
#endif
#endif
// Winograd Params
enum {
_FX_WINO_STEP=6,
_FX_WINO_KSIZE=3,
_FX_WINO_SIZE=_FX_WINO_STEP+_FX_WINO_KSIZE-1,
_FX_WINO_AREA=_FX_WINO_SIZE*_FX_WINO_SIZE,
-#if CV_TRY_AVX2 || (CV_NEON && CV_NEON_AARCH64)
+_FX_WINO_KBLOCK = 4,
+#if (CV_NEON && CV_NEON_AARCH64) || CV_TRY_AVX2
_FX_WINO_IBLOCK = 6,
#else
-_FX_WINO_KBLOCK = 4,
_FX_WINO_IBLOCK = 3,
#endif
@@ -52,8 +42,8 @@ enum {
_FX_WINO_NATOMS_F32 = _FX_WINO_AREA / _FX_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16.
};
enum { _FX_CONV_TYPE_GENERIC=0, _FX_CONV_TYPE_DEPTHWISE=1, _FX_CONV_TYPE_WINOGRAD3X3=2 };
#endif
namespace cv {
namespace dnn {
@@ -77,8 +67,18 @@ struct FastConv2d
#else
bool useSIMD128 = false;
#endif
+#if CV_TRY_AVX2
+bool useAVX2 = checkHardwareSupport(CPU_AVX2);
+#else
+bool useAVX2 = false;
+#endif
+#if CV_NEON
+bool useNEON = checkHardwareSupport(CPU_NEON);
+#else
+bool useNEON = false;
+#endif
};
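The new useAVX2/useNEON members pair a compile-time test (can this build emit the instructions?) with a runtime test (does the current CPU support them?), matching the existing useSIMD128 flag. checkHardwareSupport is the real OpenCV API; CV_TRY_AVX2 is OpenCV's per-module dispatch macro, so the reduced sketch below defines a fallback to stay compilable on its own:

    #include <opencv2/core/utility.hpp>

    #ifndef CV_TRY_AVX2    // per-module macro; fallback so the sketch builds standalone
    #define CV_TRY_AVX2 0
    #endif

    struct ConvDispatch    // reduced stand-in for FastConv2d's feature flags
    {
    #if CV_TRY_AVX2
        // The build can emit AVX2, but the CPU must actually have it too.
        bool useAVX2 = cv::checkHardwareSupport(CPU_AVX2);
    #else
        bool useAVX2 = false;
    #endif
    };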
// return a FastConv2d instance.
@@ -99,7 +99,7 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr<FastConv2d>
void runDepthwise(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv, float minval, float maxval,
ActivationLayer* activ, bool ifMinMaxAct);
-void runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv, int ntasks,
+int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv, int ntasks,
float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct);
} // namespace dnn
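For reference, the parameters in the header encode Winograd F(6x6, 3x3): each output tile of _FX_WINO_STEP = 6 is computed from an input tile of _FX_WINO_SIZE = 6 + 3 - 1 = 8, giving _FX_WINO_AREA = 64 coefficients per transformed tile, which _FX_WINO_ATOM_F32 then splits into register-sized atoms. A quick check of that arithmetic:

    #include <cstdio>

    int main()
    {
        const int step = 6, ksize = 3;     // _FX_WINO_STEP, _FX_WINO_KSIZE
        const int size = step + ksize - 1; // input tile edge: 8
        const int area = size * size;      // 64 coefficients per tile
        // _FX_WINO_NATOMS_F32 = _FX_WINO_AREA / _FX_WINO_ATOM_F32:
        std::printf("SIMD128: %d atoms of 4 floats\n", area / 4); // 16
        std::printf("AVX2:    %d atoms of 8 floats\n", area / 8); // 8
    }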

@@ -13,6 +13,8 @@
#include "fast_convolution.hpp"
namespace cv { namespace dnn {
+#if CV_NEON || CV_SIMD128 || CV_TRY_AVX2
enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment.
static void
@@ -141,7 +143,7 @@ _fx_winograd_accum_f32(const float* inwptr, const float* wptr,
vst1q_f32(outbuf + 20*64, s32);
}
}
-#elif CV_SIMD
+#elif CV_SIMD128
CV_Assert(_FX_WINO_IBLOCK == 3 && _FX_WINO_KBLOCK == 4);
for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++,
outbuf += _FX_WINO_ATOM_F32)
@@ -183,15 +185,15 @@ _fx_winograd_accum_f32(const float* inwptr, const float* wptr,
v_store(outbuf, s00);
v_store(outbuf + 1*64, s01);
v_store(outbuf + 2*64, s02);
-v_store(outbuf + 6*64, s10);
-v_store(outbuf + 7*64, s11);
-v_store(outbuf + 8*64, s12);
-v_store(outbuf + 12*64, s20);
-v_store(outbuf + 13*64, s21);
-v_store(outbuf + 14*64, s22);
-v_store(outbuf + 18*64, s30);
-v_store(outbuf + 19*64, s31);
-v_store(outbuf + 20*64, s32);
+v_store(outbuf + 3*64, s10);
+v_store(outbuf + 4*64, s11);
+v_store(outbuf + 5*64, s12);
+v_store(outbuf + 6*64, s20);
+v_store(outbuf + 7*64, s21);
+v_store(outbuf + 8*64, s22);
+v_store(outbuf + 9*64, s30);
+v_store(outbuf + 10*64, s31);
+v_store(outbuf + 11*64, s32);
}
#else
for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32;
@@ -406,7 +408,7 @@ _fx_winograd_BtXB_8x8_f32(const float* inptr, int inpstep,
vst1q_f32(outptr + outstep*13, z61);
vst1q_f32(outptr + outstep*14, z70);
vst1q_f32(outptr + outstep*15, z71);
-#elif CV_SIMD
+#elif CV_SIMD128
v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
@@ -750,8 +752,7 @@ _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep,
vst1_f32(outptr + outstep*4 + 4, vget_low_f32(z41));
vst1q_f32(outptr + outstep*5, z50);
vst1_f32(outptr + outstep*5 + 4, vget_low_f32(z51));
-//#elif CV_AVX2
-#elif CV_SIMD
+#elif CV_SIMD128
v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4);
v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4);
v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4);
@@ -919,7 +920,7 @@ _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep,
#endif
}
-void runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv,
+int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv,
int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
{
Mat input = _input.getMat();
@@ -1138,5 +1139,15 @@ void runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outp
}
}
}});
+return 1;
}
+#else
+int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr<FastConv2d>& conv,
+int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct)
+{
+return 0;
+}
+#endif
}} // namespace cv::dnn
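Two things happen in winograd_3x3s1_f63.cpp. First, the guards tighten from CV_SIMD to CV_SIMD128 (and the whole file gains a CV_NEON || CV_SIMD128 || CV_TRY_AVX2 guard with a return-0 stub), so the 4-lane kernels build exactly where 128-bit universal intrinsics exist. Second, the v_store offsets in _fx_winograd_accum_f32 are the bug from the commit title: with _FX_WINO_KBLOCK = 4 and _FX_WINO_IBLOCK = 3 on SIMD128, accumulator slot (k, i) must be stored at offset (k*IBLOCK + i)*64, but the old code used the IBLOCK = 6 spacing of the AVX2/AArch64 kernels, so rows s10..s32 landed in slots the SIMD128 consumer never reads. A plain-C++ sketch of the corrected layout (storeAccum is illustrative, not a function from the patch):

    // Store a KBLOCK x IBLOCK block of 4-lane accumulators;
    // consecutive output slots are 64 floats apart in outbuf.
    void storeAccum(float* outbuf, const float s[4][3][4])
    {
        const int KBLOCK = 4, IBLOCK = 3;   // SIMD128 values in this patch
        for (int k = 0; k < KBLOCK; k++)
            for (int i = 0; i < IBLOCK; i++)
            {
                // Fixed indexing: slot (k, i) -> (k*IBLOCK + i) * 64.
                // The old offsets followed k*6 + i, i.e. the IBLOCK == 6 layout.
                float* dst = outbuf + (k * IBLOCK + i) * 64;
                for (int lane = 0; lane < 4; lane++)  // one v_store(dst, s_ki)
                    dst[lane] = s[k][i][lane];
            }
    }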
