diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt index d285e544c0..88f4347bb6 100644 --- a/modules/dnn/CMakeLists.txt +++ b/modules/dnn/CMakeLists.txt @@ -10,6 +10,9 @@ set(the_description "Deep neural network module. It allows to load models from d ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX) ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX LASX) +ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_block" AVX AVX2) +ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_depthwise" AVX AVX2 RVV LASX) +ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_winograd_f63" AVX AVX2) ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js) diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 5567a58a2a..3e62887bd7 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -72,7 +72,7 @@ using namespace cv::dnn::ocl4dnn; using namespace cv::dnn::cuda4dnn; #endif -#include "fast_convolution/fast_convolution.hpp" +#include "cpu_kernels/convolution.hpp" namespace cv { diff --git a/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp b/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp new file mode 100644 index 0000000000..71b17dcc9b --- /dev/null +++ b/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp @@ -0,0 +1,259 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "opencv2/core/hal/intrin.hpp" + +namespace cv { +namespace dnn { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN + +void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR); + +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX + +#if !CV_FMA3 // AVX workaround +#undef _mm256_fmadd_ps +#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) +#endif + +void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) +{ + CV_Assert(convMR == 4 && convNR == 24); + __m256 c00 = _mm256_set1_ps(0.f), c01 = c00, c02 = c00; + __m256 c10 = c00, c11 = c00, c12 = c00; + __m256 c20 = c00, c21 = c00, c22 = c00; + __m256 c30 = c00, c31 = c00, c32 = c00; + + __m256 a0 = _mm256_setzero_ps(), a1 = _mm256_setzero_ps(); + __m256 b0 = _mm256_setzero_ps(), b1 = _mm256_setzero_ps(), b2 = _mm256_setzero_ps(); + + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]); + b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16); + + c00 = _mm256_fmadd_ps(b0, a0, c00); + c01 = _mm256_fmadd_ps(b1, a0, c01); + c02 = _mm256_fmadd_ps(b2, a0, c02); + + c10 = _mm256_fmadd_ps(b0, a1, c10); + c11 = _mm256_fmadd_ps(b1, a1, c11); + c12 = _mm256_fmadd_ps(b2, a1, c12); + + a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]); + + c20 = _mm256_fmadd_ps(b0, a0, c20); + c21 = _mm256_fmadd_ps(b1, a0, c21); + c22 = _mm256_fmadd_ps(b2, a0, c22); + + c30 = _mm256_fmadd_ps(b0, a1, c30); + c31 = _mm256_fmadd_ps(b1, a1, c31); + c32 = _mm256_fmadd_ps(b2, a1, c32); + } + + if (!init_c) + { + c00 = _mm256_add_ps(c00, _mm256_load_ps(c)); + c01 = _mm256_add_ps(c01, _mm256_load_ps(c + 8)); + c02 = _mm256_add_ps(c02, _mm256_load_ps(c + 16)); + + c10 = _mm256_add_ps(c10, 
_mm256_load_ps(c + ldc)); + c11 = _mm256_add_ps(c11, _mm256_load_ps(c + ldc + 8)); + c12 = _mm256_add_ps(c12, _mm256_load_ps(c + ldc + 16)); + + c20 = _mm256_add_ps(c20, _mm256_load_ps(c + ldc*2)); + c21 = _mm256_add_ps(c21, _mm256_load_ps(c + ldc*2 + 8)); + c22 = _mm256_add_ps(c22, _mm256_load_ps(c + ldc*2 + 16)); + + c30 = _mm256_add_ps(c30, _mm256_load_ps(c + ldc*3)); + c31 = _mm256_add_ps(c31, _mm256_load_ps(c + ldc*3 + 8)); + c32 = _mm256_add_ps(c32, _mm256_load_ps(c + ldc*3 + 16)); + } + + _mm256_storeu_ps(c, c00), _mm256_storeu_ps(c+8, c01), _mm256_storeu_ps(c+16, c02); + _mm256_storeu_ps(c + ldc, c10), _mm256_storeu_ps(c + ldc + 8, c11), _mm256_storeu_ps(c + ldc + 16, c12); + _mm256_storeu_ps(c + ldc*2, c20), _mm256_storeu_ps(c + ldc*2 + 8, c21), _mm256_storeu_ps(c + ldc*2 + 16, c22); + _mm256_storeu_ps(c + ldc*3, c30), _mm256_storeu_ps(c + ldc*3 + 8, c31), _mm256_storeu_ps(c + ldc*3 + 16, c32); + _mm256_zeroupper(); +} + +#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +CV_CPU_OPTIMIZATION_NAMESPACE_END + +// NEON code work around. +namespace opt_NEON +{ +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON + +void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) +{ +#if CV_NEON_AARCH64 + if (convMR == 4 && convNR == 28) // AARCH64 + { + float32x4_t c00 = vdupq_n_f32(0.f), c01 = c00, c02 = c00, c03 = c00, c04 = c00, c05 = c00, c06 = c00; + float32x4_t c10 = vdupq_n_f32(0.f), c11 = c10, c12 = c10, c13 = c10, c14 = c10, c15 = c10, c16 = c10; + float32x4_t c20 = vdupq_n_f32(0.f), c21 = c20, c22 = c20, c23 = c20, c24 = c20, c25 = c20, c26 = c20; + float32x4_t c30 = vdupq_n_f32(0.f), c31 = c30, c32 = c30, c33 = c30, c34 = c30, c35 = c30, c36 = c30; + + for( int p = 0; p < np; p++, a += convMR, b += convNR ) + { + float32x4_t a0 = vld1q_f32(a), b0, b1, b2; + b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8); + + c00 = vfmaq_laneq_f32(c00, b0, a0, 0); + c01 = vfmaq_laneq_f32(c01, b1, a0, 0); + c02 = vfmaq_laneq_f32(c02, b2, a0, 0); + c10 = vfmaq_laneq_f32(c10, b0, a0, 1); + c11 = vfmaq_laneq_f32(c11, b1, a0, 1); + c12 = vfmaq_laneq_f32(c12, b2, a0, 1); + c20 = vfmaq_laneq_f32(c20, b0, a0, 2); + c21 = vfmaq_laneq_f32(c21, b1, a0, 2); + c22 = vfmaq_laneq_f32(c22, b2, a0, 2); + c30 = vfmaq_laneq_f32(c30, b0, a0, 3); + c31 = vfmaq_laneq_f32(c31, b1, a0, 3); + c32 = vfmaq_laneq_f32(c32, b2, a0, 3); + + b0 = vld1q_f32(b + 12); b1 = vld1q_f32(b + 16); b2 = vld1q_f32(b + 20); + + c03 = vfmaq_laneq_f32(c03, b0, a0, 0); + c04 = vfmaq_laneq_f32(c04, b1, a0, 0); + c05 = vfmaq_laneq_f32(c05, b2, a0, 0); + c13 = vfmaq_laneq_f32(c13, b0, a0, 1); + c14 = vfmaq_laneq_f32(c14, b1, a0, 1); + c15 = vfmaq_laneq_f32(c15, b2, a0, 1); + c23 = vfmaq_laneq_f32(c23, b0, a0, 2); + c24 = vfmaq_laneq_f32(c24, b1, a0, 2); + c25 = vfmaq_laneq_f32(c25, b2, a0, 2); + c33 = vfmaq_laneq_f32(c33, b0, a0, 3); + c34 = vfmaq_laneq_f32(c34, b1, a0, 3); + c35 = vfmaq_laneq_f32(c35, b2, a0, 3); + + b0 = vld1q_f32(b + 24); + c06 = vfmaq_laneq_f32(c06, b0, a0, 0); + c16 = vfmaq_laneq_f32(c16, b0, a0, 1); + c26 = vfmaq_laneq_f32(c26, b0, a0, 2); + c36 = vfmaq_laneq_f32(c36, b0, a0, 3); + } + + if (!init_c) + { + c00 = vaddq_f32(c00, vld1q_f32(c)); + c01 = vaddq_f32(c01, vld1q_f32(c + 4)); + c02 = vaddq_f32(c02, vld1q_f32(c + 8)); + c03 = vaddq_f32(c03, vld1q_f32(c + 12)); + c04 = vaddq_f32(c04, vld1q_f32(c + 16)); + c05 = vaddq_f32(c05, vld1q_f32(c + 20)); + c06 = vaddq_f32(c06, vld1q_f32(c + 24)); + + c10 = vaddq_f32(c10, vld1q_f32(c + 
ldc)); + c11 = vaddq_f32(c11, vld1q_f32(c + ldc + 4)); + c12 = vaddq_f32(c12, vld1q_f32(c + ldc + 8)); + c13 = vaddq_f32(c13, vld1q_f32(c + ldc + 12)); + c14 = vaddq_f32(c14, vld1q_f32(c + ldc + 16)); + c15 = vaddq_f32(c15, vld1q_f32(c + ldc + 20)); + c16 = vaddq_f32(c16, vld1q_f32(c + ldc + 24)); + + c20 = vaddq_f32(c20, vld1q_f32(c + ldc*2)); + c21 = vaddq_f32(c21, vld1q_f32(c + ldc*2 + 4)); + c22 = vaddq_f32(c22, vld1q_f32(c + ldc*2 + 8)); + c23 = vaddq_f32(c23, vld1q_f32(c + ldc*2 + 12)); + c24 = vaddq_f32(c24, vld1q_f32(c + ldc*2 + 16)); + c25 = vaddq_f32(c25, vld1q_f32(c + ldc*2 + 20)); + c26 = vaddq_f32(c26, vld1q_f32(c + ldc*2 + 24)); + + c30 = vaddq_f32(c30, vld1q_f32(c + ldc*3)); + c31 = vaddq_f32(c31, vld1q_f32(c + ldc*3 + 4)); + c32 = vaddq_f32(c32, vld1q_f32(c + ldc*3 + 8)); + c33 = vaddq_f32(c33, vld1q_f32(c + ldc*3 + 12)); + c34 = vaddq_f32(c34, vld1q_f32(c + ldc*3 + 16)); + c35 = vaddq_f32(c35, vld1q_f32(c + ldc*3 + 20)); + c36 = vaddq_f32(c36, vld1q_f32(c + ldc*3 + 24)); + } + + vst1q_f32(c, c00); vst1q_f32(c+4, c01); + vst1q_f32(c+8, c02); vst1q_f32(c+12, c03); + vst1q_f32(c+16, c04); vst1q_f32(c+20, c05); + vst1q_f32(c+24, c06); + + vst1q_f32(c+ldc, c10); vst1q_f32(c+ldc+4, c11); + vst1q_f32(c+ldc+8, c12); vst1q_f32(c+ldc+12, c13); + vst1q_f32(c+ldc+16, c14); vst1q_f32(c+ldc+20, c15); + vst1q_f32(c+ldc+24, c16); + + vst1q_f32(c+ldc*2, c20); vst1q_f32(c+ldc*2+4, c21); + vst1q_f32(c+ldc*2+8, c22); vst1q_f32(c+ldc*2+12, c23); + vst1q_f32(c+ldc*2+16, c24); vst1q_f32(c+ldc*2+20, c25); + vst1q_f32(c+ldc*2+24, c26); + + vst1q_f32(c+ldc*3, c30); vst1q_f32(c+ldc*3+4, c31); + vst1q_f32(c+ldc*3+8, c32); vst1q_f32(c+ldc*3+12, c33); + vst1q_f32(c+ldc*3+16, c34); vst1q_f32(c+ldc*3+20, c35); + vst1q_f32(c+ldc*3+24, c36); + } + else +#endif + if (convMR == 4 && convNR == 12) // ARMv7 + { + float32x4_t c0 = vdupq_n_f32(0.f), c1 = c0, c2 = c0; + float32x4_t c3 = vdupq_n_f32(0.f), c4 = c3, c5 = c3; + float32x4_t c6 = vdupq_n_f32(0.f), c7 = c6, c8 = c6; + float32x4_t c9 = vdupq_n_f32(0.f), c10 = c9, c11 = c9; + + float32x2_t a0 = vdup_n_f32(0.0f), a1 = a0; + float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f); + + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + a0 = vld1_f32(a), a1 = vld1_f32(a+2); + b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8); + + c0 = vmlaq_lane_f32(c0, b0, a0, 0); + c1 = vmlaq_lane_f32(c1, b1, a0, 0); + c2 = vmlaq_lane_f32(c2, b2, a0, 0); + + c3 = vmlaq_lane_f32(c3, b0, a0, 1); + c4 = vmlaq_lane_f32(c4, b1, a0, 1); + c5 = vmlaq_lane_f32(c5, b2, a0, 1); + + c6 = vmlaq_lane_f32(c6, b0, a1, 0); + c7 = vmlaq_lane_f32(c7, b1, a1, 0); + c8 = vmlaq_lane_f32(c8, b2, a1, 0); + + c9 = vmlaq_lane_f32(c9 , b0, a1, 1); + c10 = vmlaq_lane_f32(c10, b1, a1, 1); + c11 = vmlaq_lane_f32(c11, b2, a1, 1); + } + + if (!init_c) + { + c0 = vaddq_f32(c0, vld1q_f32(c)); + c1 = vaddq_f32(c1, vld1q_f32(c + 4)); + c2 = vaddq_f32(c2, vld1q_f32(c + 8)); + + c3 = vaddq_f32(c3, vld1q_f32(c + ldc)); + c4 = vaddq_f32(c4, vld1q_f32(c + ldc + 4)); + c5 = vaddq_f32(c5, vld1q_f32(c + ldc + 8)); + + c6 = vaddq_f32(c6, vld1q_f32(c + ldc * 2)); + c7 = vaddq_f32(c7, vld1q_f32(c + ldc * 2 + 4)); + c8 = vaddq_f32(c8, vld1q_f32(c + ldc * 2 + 8)); + + c9 = vaddq_f32(c9 , vld1q_f32(c + ldc * 3)); + c10 = vaddq_f32(c10, vld1q_f32(c + ldc * 3 + 4)); + c11 = vaddq_f32(c11, vld1q_f32(c + ldc * 3 + 8)); + } + + vst1q_f32(c, c0), vst1q_f32(c+4, c1), vst1q_f32(c+8, c2); + vst1q_f32(c + ldc, c3), vst1q_f32(c + ldc + 4, c4), vst1q_f32(c + ldc + 8, c5); + 
vst1q_f32(c + ldc*2, c6), vst1q_f32(c + ldc*2 + 4, c7), vst1q_f32(c + ldc*2 + 8, c8); + vst1q_f32(c + ldc*3, c9), vst1q_f32(c + ldc*3 + 4, c10), vst1q_f32(c + ldc*3 + 8, c11); + } + else + CV_Error(Error::StsNotImplemented, "Unsupported convMR and/or convNR in opt_NEON::convBlock"); +} + +#endif +} +}} // namespace cv::dnn diff --git a/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp b/modules/dnn/src/layers/cpu_kernels/conv_depthwise.cpp similarity index 91% rename from modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp rename to modules/dnn/src/layers/cpu_kernels/conv_depthwise.cpp index b690156941..3e969336ad 100644 --- a/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp +++ b/modules/dnn/src/layers/cpu_kernels/conv_depthwise.cpp @@ -2,20 +2,147 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. -// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv.fx). -// Here is the original license: -/* - This file is a part of ficus language project. - See ficus/LICENSE for the licensing terms -*/ - #include "../../precomp.hpp" -#include "fast_convolution.hpp" -#include "../layers_common.hpp" +#include "convolution.hpp" + +#include "conv_depthwise.simd.hpp" +#include "layers/cpu_kernels/conv_depthwise.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content namespace cv { namespace dnn { -static void depthWiseBlockConv2D(const float* wptr, +void depthWiseBlockConv2D(const float* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const float* biasptr, const float* relu, + const float* inptr_, + int height, int width, + float* outptr_, + int out_d, int outH, int outW, bool fusedAdd); + +void depthWiseBlockConv1D(const float* wptr, + int kernel_w, int stride_w, int dilation_w, int pad_l, + const float* biasptr, const float* relu, + const float* inptr_, int width, + float* outptr_, + int out_d, int outW, bool fusedAdd); + +void runDepthwise(InputArray _input, OutputArray _output, const Ptr& conv, ActivationLayer* activ_, + const std::vector& reluslope, bool fusedAdd) +{ + Mat input = _input.getMat(); + Mat output = _output.getMat(); + MatShape inputShape = shape(input); + MatShape outputShape = shape(output); + + CV_Assert(inputShape.size() == 3 || inputShape.size() == 4); + CV_Assert(inputShape.size() == outputShape.size()); + + int conv_dim = conv->conv_dim; + CV_Assert((conv_dim == CONV_2D || conv_dim == CONV_1D) && + "DNN: Currently we do not support depth-wise for Convolution 3D!"); + + ActivationLayer* activ = reluslope.empty() ? activ_ : nullptr; + int N = inputShape[0], C = inputShape[1]; + + int Hi = conv_dim == CONV_1D ? 1 : inputShape[inputShape.size() - 2]; + int Wi = inputShape[inputShape.size() - 1]; + + int K = conv->K, Hk = conv->Hk, Wk = conv->Wk; + + int H0 = conv_dim == CONV_1D ? 
1 : outputShape[outputShape.size() - 2]; + int W0 = outputShape[outputShape.size() - 1]; + int ngroups = conv->ngroups; + + const size_t inp_planesize = (size_t) Hi * Wi; + const size_t out_planesize = (size_t) H0 * W0; + + CV_Assert(ngroups > 1 && ngroups == K && ngroups == C); + + int stride_h = conv->stride_h, stride_w = conv->stride_w; + int dilation_h = conv->dilation_h, dilation_w = conv->dilation_w; + + int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom; + int pad_left = conv->pad_left, pad_right = conv->pad_right; + + int ksize = Hk * Wk; + + const int VEC_NLANES = 32; + int padded_ksize = ((ksize + VEC_NLANES-1) / VEC_NLANES) * VEC_NLANES; + + const float *inp = input.ptr(); + float *out = output.ptr(); + +#if CV_TRY_AVX2 || CV_TRY_AVX || CV_TRY_RVV + // TODO: remove the following limitation, need change code in conv_depthwise.simd.hpp. + bool canRunOpt = Wi >= 16 + dilation_w*(Wk - 1) && !fusedAdd; +#endif + std::vector ofstab_(3 * ksize, 0); + int *ofstab = ofstab_.data(); + int *yxtab = ofstab + ksize; + + for (int k = 0; k < ksize; k++) + { + int y = k < ksize ? k / Wk : 0; + int x = k < ksize ? k % Wk : 0; + int dy = y * dilation_h, dx = x * dilation_w; + yxtab[k * 2] = dy; + yxtab[k * 2 + 1] = dx; + ofstab[k] = dy * Wi + dx; + } + + const float *weights0 = conv->weightsBufPtr, *bias = conv->biasBuf.data(); + const float* relu = reluslope.data(); + CV_Assert(ksize > 1 || (pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0)); + + parallel_for_(Range(0, N * C), [&](const Range &r0) { + for (int nc = r0.start; nc < r0.end; nc++) + { + int c = nc % C; + const float *inptr0 = inp + inp_planesize * nc; + float *outptr0 = out + out_planesize * nc; + + const float *weights = weights0 + c * padded_ksize; + + if (conv_dim == CONV_2D) + { +#if CV_TRY_AVX2 + if(canRunOpt && conv->useAVX2) + opt_AVX2::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, + pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0); + else +#endif +#if CV_TRY_AVX + if(canRunOpt && conv->useAVX) + opt_AVX::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, + pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0); + else +#endif +#if CV_TRY_RVV + if(canRunOpt && conv->useRVV) + opt_RVV::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, + pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0); + else +#endif + depthWiseBlockConv2D(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, + pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0, fusedAdd); + } + else // conv_dim == CONV_1D, spatial branch for depth-wise Conv1D. 
+ { + depthWiseBlockConv1D(weights, Wk, stride_w, dilation_w, pad_left, bias, relu, inptr0, Wi, outptr0, c, W0, fusedAdd); + } + + if (activ) + activ->forwardSlice(outptr0, outptr0, (int) out_planesize, out_planesize, c, c+1); + }}); +} + +/****************************************************************************************\ + SIMD and no-SIMD code for depthWiseBlockConv +\****************************************************************************************/ + +void depthWiseBlockConv2D(const float* wptr, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w, @@ -199,7 +326,7 @@ static void depthWiseBlockConv2D(const float* wptr, } } -static void depthWiseBlockConv1D(const float* wptr, +void depthWiseBlockConv1D(const float* wptr, int kernel_w, int stride_w, int dilation_w, int pad_l, const float* biasptr, const float* relu, const float* inptr_, int width, @@ -332,114 +459,5 @@ static void depthWiseBlockConv1D(const float* wptr, } } -void runDepthwise(InputArray _input, OutputArray _output, const Ptr& conv, ActivationLayer* activ_, - const std::vector& reluslope, bool fusedAdd) -{ - Mat input = _input.getMat(); - Mat output = _output.getMat(); - MatShape inputShape = shape(input); - MatShape outputShape = shape(output); - - CV_Assert(inputShape.size() == 3 || inputShape.size() == 4); - CV_Assert(inputShape.size() == outputShape.size()); - - int conv_dim = conv->conv_dim; - CV_Assert((conv_dim == CONV_2D || conv_dim == CONV_1D) && - "DNN: Currently we do not support depth-wise for Convolution 3D!"); - - ActivationLayer* activ = reluslope.empty() ? activ_ : nullptr; - int N = inputShape[0], C = inputShape[1]; - - int Hi = conv_dim == CONV_1D ? 1 : inputShape[inputShape.size() - 2]; - int Wi = inputShape[inputShape.size() - 1]; - - int K = conv->K, Hk = conv->Hk, Wk = conv->Wk; - - int H0 = conv_dim == CONV_1D ? 1 : outputShape[outputShape.size() - 2]; - int W0 = outputShape[outputShape.size() - 1]; - int ngroups = conv->ngroups; - - const size_t inp_planesize = (size_t) Hi * Wi; - const size_t out_planesize = (size_t) H0 * W0; - - CV_Assert(ngroups > 1 && ngroups == K && ngroups == C); - - int stride_h = conv->stride_h, stride_w = conv->stride_w; - int dilation_h = conv->dilation_h, dilation_w = conv->dilation_w; - - int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom; - int pad_left = conv->pad_left, pad_right = conv->pad_right; - - int ksize = Hk * Wk; - - const int VEC_NLANES = 32; - int padded_ksize = ((ksize + VEC_NLANES-1) / VEC_NLANES) * VEC_NLANES; - - const float *inp = input.ptr(); - float *out = output.ptr(); - -#if CV_TRY_AVX2 || CV_TRY_AVX || CV_TRY_RVV - // TODO: remove the following limitation, need change code in layers_common.simd.hpp. - bool canRunOpt = Wi >= 16 + dilation_w*(Wk - 1) && !fusedAdd; -#endif - std::vector ofstab_(3 * ksize, 0); - int *ofstab = ofstab_.data(); - int *yxtab = ofstab + ksize; - - for (int k = 0; k < ksize; k++) - { - int y = k < ksize ? k / Wk : 0; - int x = k < ksize ? 
k % Wk : 0; - int dy = y * dilation_h, dx = x * dilation_w; - yxtab[k * 2] = dy; - yxtab[k * 2 + 1] = dx; - ofstab[k] = dy * Wi + dx; - } - - const float *weights0 = conv->weightsBufPtr, *bias = conv->biasBuf.data(); - const float* relu = reluslope.data(); - CV_Assert(ksize > 1 || (pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0)); - - parallel_for_(Range(0, N * C), [&](const Range &r0) { - for (int nc = r0.start; nc < r0.end; nc++) - { - int c = nc % C; - const float *inptr0 = inp + inp_planesize * nc; - float *outptr0 = out + out_planesize * nc; - - const float *weights = weights0 + c * padded_ksize; - - if (conv_dim == CONV_2D) - { -#if CV_TRY_AVX2 - if(canRunOpt && conv->useAVX2) - opt_AVX2::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, - pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0); - else -#endif -#if CV_TRY_AVX - if(canRunOpt && conv->useAVX) - opt_AVX::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, - pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0); - else -#endif -#if CV_TRY_RVV - if(canRunOpt && conv->useRVV) - opt_RVV::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, - pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0); - else -#endif - depthWiseBlockConv2D(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, - pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0, fusedAdd); - } - else // conv_dim == CONV_1D, spatial branch for depth-wise Conv1D. - { - depthWiseBlockConv1D(weights, Wk, stride_w, dilation_w, pad_left, bias, relu, inptr0, Wi, outptr0, c, W0, fusedAdd); - } - - if (activ) - activ->forwardSlice(outptr0, outptr0, (int) out_planesize, out_planesize, c, c+1); - }}); -} }} // namespace cv::dnn diff --git a/modules/dnn/src/layers/cpu_kernels/conv_depthwise.simd.hpp b/modules/dnn/src/layers/cpu_kernels/conv_depthwise.simd.hpp new file mode 100644 index 0000000000..1d561e9864 --- /dev/null +++ b/modules/dnn/src/layers/cpu_kernels/conv_depthwise.simd.hpp @@ -0,0 +1,591 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
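// Editor's note (illustrative sketch, not part of the patch): this *.simd.hpp is compiled
// once per ISA listed in the dnn CMakeLists (AVX, AVX2, RVV, LASX), each time inside its
// own opt_* namespace, and conv_depthwise.cpp picks an implementation at run time via the
// conv->useAVX2 / useAVX / useRVV flags. Every ISA branch of fastDepthwiseConv() computes
// the same 3x3 depth-wise convolution of a single plane; a scalar reference of that
// contract is sketched below (hypothetical helper, ad-hoc names, zero padding assumed).
static inline void depthwise3x3_ref(const float w[9], float bias, const float* relu, int out_d,
                                    const float* img, int H, int W,
                                    float* out, int outH, int outW,
                                    int stride_h, int stride_w, int dilation_h, int dilation_w,
                                    int pad_t, int pad_l)
{
    const float slope = relu ? relu[out_d] : 1.f;  // per-channel leaky-ReLU slope
    for (int oy = 0; oy < outH; oy++)
        for (int ox = 0; ox < outW; ox++)
        {
            float s = bias;
            for (int ky = 0; ky < 3; ky++)
                for (int kx = 0; kx < 3; kx++)
                {
                    int iy = oy*stride_h - pad_t + ky*dilation_h;
                    int ix = ox*stride_w - pad_l + kx*dilation_w;
                    if ((unsigned)iy < (unsigned)H && (unsigned)ix < (unsigned)W)
                        s += img[iy*W + ix] * w[ky*3 + kx];  // contributions outside the plane are zero
                }
            if (relu)
                s = s > 0.f ? s : s*slope;
            out[oy*outW + ox] = s;
        }
}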
+ +#include "opencv2/core/hal/intrin.hpp" + +namespace cv { +namespace dnn { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN + +void fastDepthwiseConv(const float* weights, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const float* bias, const float* relu, + const float* inptr, + int height, int width, + float* outptr, + int out_d, int outH, int outW); + +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX + +#if !CV_FMA3 // AVX workaround +#undef _mm256_fmadd_ps +#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) +#endif + +static inline void _mm256_load_deinterleave(const float* ptr, __m256& a, __m256& b) +{ + __m256 t0 = _mm256_loadu_ps(ptr); + __m256 t1 = _mm256_loadu_ps(ptr + 8); + + __m256 lo = _mm256_permute2f128_ps(t0, t1, 0+2*16); + __m256 hi = _mm256_permute2f128_ps(t0, t1, 1+3*16); + a = _mm256_shuffle_ps(lo, hi, 0x88); + b = _mm256_shuffle_ps(lo, hi, 0xdd); +} + +void fastDepthwiseConv( const float* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const float* biasptr, const float* relu, + const float* inptr_, + int height, int width, + float* outptr_, + int out_d, int outH, int outW ) +{ + const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], + w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], + w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; + int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); + float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d]; + + for (int out_i = 0; out_i < outH; out_i++) + { + int in_i = out_i * stride_h - pad_t, out_j = 0; + const float* imgptr0 = inptr_ + in_i*width; + const float* imgptr1 = imgptr0 + dilation_h*width; + const float* imgptr2 = imgptr0 + (dilation_h*2)*width; + float out, w00 = w00_, w01 = w01_, w02 = w02_; + float w20 = w20_, w21 = w21_, w22 = w22_; + if (in_i < 0) + { + w00 = w01 = w02 = 0.f; + imgptr0 = imgptr1; + } + else if (in_i + dilation_h*(kernel_h-1) >= height) + { + w20 = w21 = w22 = 0.f; + imgptr2 = imgptr1; + } + float* outptr = outptr_ + out_i*outW; + if (pad_l > 0) + { + out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + + imgptr1[0]*w11 + imgptr1[dilation_w]*w12 + + imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias; + if (relu) + out = out > 0.f ? 
out : out*relu_coeff; + outptr[0] = out; + out_j = 1; + } + + if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) + { + const int VECSZ = 8; + __m256 vw00 = _mm256_set1_ps(w00), vw01 = _mm256_set1_ps(w01), vw02 = _mm256_set1_ps(w02), + vw10 = _mm256_set1_ps(w10), vw11 = _mm256_set1_ps(w11), vw12 = _mm256_set1_ps(w12), + vw20 = _mm256_set1_ps(w20), vw21 = _mm256_set1_ps(w21), vw22 = _mm256_set1_ps(w22); + __m256 z = _mm256_setzero_ps(), vbias = _mm256_set1_ps(bias), vrc = _mm256_set1_ps(relu_coeff); + + if( stride_w == 1 ) + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1 && out_j > pad_l) + out_j = outW1 - VECSZ; + int in_j = out_j * stride_w - pad_l; + __m256 v00 = _mm256_loadu_ps(imgptr0 + in_j), + v01 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w), + v02 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w*2), + v10 = _mm256_loadu_ps(imgptr1 + in_j), + v11 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w), + v12 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w*2), + v20 = _mm256_loadu_ps(imgptr2 + in_j), + v21 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w), + v22 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w*2); + + __m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias); + __m256 vout1 = _mm256_mul_ps(v01, vw01); + __m256 vout2 = _mm256_mul_ps(v02, vw02); + + vout0 = _mm256_fmadd_ps(v10, vw10, vout0); + vout1 = _mm256_fmadd_ps(v11, vw11, vout1); + vout2 = _mm256_fmadd_ps(v12, vw12, vout2); + + vout0 = _mm256_fmadd_ps(v20, vw20, vout0); + vout1 = _mm256_fmadd_ps(v21, vw21, vout1); + vout2 = _mm256_fmadd_ps(v22, vw22, vout2); + + vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2); + if (relu) + { + __m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ); + vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m); + } + _mm256_storeu_ps(outptr + out_j, vout0); + } + else + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1 && out_j > pad_l) + out_j = outW1 - VECSZ; + int in_j = out_j * stride_w - pad_l; + __m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; + _mm256_load_deinterleave(imgptr0 + in_j, v00, v01); + _mm256_load_deinterleave(imgptr0 + in_j + 2, v02, unused); + _mm256_load_deinterleave(imgptr1 + in_j, v10, v11); + _mm256_load_deinterleave(imgptr1 + in_j + 2, v12, unused); + _mm256_load_deinterleave(imgptr2 + in_j, v20, v21); + _mm256_load_deinterleave(imgptr2 + in_j + 2, v22, unused); + + __m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias); + __m256 vout1 = _mm256_mul_ps(v01, vw01); + __m256 vout2 = _mm256_mul_ps(v02, vw02); + + vout0 = _mm256_fmadd_ps(v10, vw10, vout0); + vout1 = _mm256_fmadd_ps(v11, vw11, vout1); + vout2 = _mm256_fmadd_ps(v12, vw12, vout2); + + vout0 = _mm256_fmadd_ps(v20, vw20, vout0); + vout1 = _mm256_fmadd_ps(v21, vw21, vout1); + vout2 = _mm256_fmadd_ps(v22, vw22, vout2); + + vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2); + if (relu) + { + __m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ); + vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m); + } + _mm256_storeu_ps(outptr + out_j, vout0); + } + } + + for (; out_j < outW1; out_j++) + { + int in_j = out_j * stride_w - pad_l; + out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + + imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 + + imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias; + if (relu) + out = out > 0.f ? 
out : out*relu_coeff; + outptr[out_j] = out; + } + + for (; out_j < outW; out_j++ ) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; + float s0 = 1.f, s1 = 1.f, s2 = 1.f; + if (in_j0 >= width) + { + in_j0 = 0; + s0 = 0.f; + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0.f; + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0.f; + } + out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + + imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 + + imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias; + if (relu) + out = out > 0.f ? out : out*relu_coeff; + outptr[out_j] = out; + } + } + _mm256_zeroupper(); +} + +#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_RVV + +/* +Example for load_deinterleave: + input: ptr[16] = {1,2,3, ... ,14,15,16} + output: a = {1, 3, 5, 7, 9, 11, 13, 15} + output: b = {2, 4, 6, 8,10, 12, 14, 16} +*/ +static inline void vfloat32m2_load_deinterleave(const float* ptr, vfloat32m2_t& a, vfloat32m2_t& b, int vl) +{ + vuint64m4_t mask = vmv_v_x_u64m4(1,vl*2); + vuint32m4_t mask_re = vreinterpret_v_u64m4_u32m4(mask); + vbool8_t mask0 = vmseq_vx_u32m4_b8 (mask_re, 1, vl*2); + vbool8_t mask1 = vmseq_vx_u32m4_b8 (mask_re, 0, vl*2); + vfloat32m4_t tempa = vundefined_f32m4(), tempb = vundefined_f32m4(); + vfloat32m4_t vw = vle32_v_f32m4(ptr, vl*2); + tempa = vcompress_vm_f32m4(mask0, tempa, vw, vl*2); + tempb = vcompress_vm_f32m4(mask1, tempb, vw, vl*2); + /* The following instructions have not to be supported by the GNU toolchain. + So we temporarily use store and load instead. + // a = vlmul_trunc_v_f32m4_f32m2(tempa); + // b = vlmul_trunc_v_f32m4_f32m2(tempb); + */ + cv::AutoBuffer cvBuffer(sizeof(float)*vl*2); + float* buffer = (float*)cvBuffer.data(); + vse32_v_f32m4(buffer, tempa, vl); + a = vle32_v_f32m2(buffer, vl); + vse32_v_f32m4(buffer, tempb, vl); + b = vle32_v_f32m2(buffer, vl); +} + +void fastDepthwiseConv( const float* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const float* biasptr, const float* relu, + const float* inptr_, + int height, int width, + float* outptr_, + int out_d, int outH, int outW ) +{ + int vl; + const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], + w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], + w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; + int outW1 = std::min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); + float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d]; + + for (int out_i = 0; out_i < outH; out_i++) + { + int in_i = out_i * stride_h - pad_t, out_j = 0; + const float* imgptr0 = inptr_ + in_i*width; + const float* imgptr1 = imgptr0 + dilation_h*width; + const float* imgptr2 = imgptr0 + (dilation_h*2)*width; + float out, w00 = w00_, w01 = w01_, w02 = w02_; + float w20 = w20_, w21 = w21_, w22 = w22_; + if (in_i < 0) + { + w00 = w01 = w02 = 0.f; + imgptr0 = imgptr1; + } + else if (in_i + dilation_h*(kernel_h-1) >= height) + { + w20 = w21 = w22 = 0.f; + imgptr2 = imgptr1; + } + float* outptr = outptr_ + out_i*outW; + if (pad_l > 0) + { + out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + + imgptr1[0]*w11 + imgptr1[dilation_w]*w12 + + imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias; + if (relu) + out = out > 0.f ? 
out : out*relu_coeff; + outptr[0] = out; + out_j = 1; + } + + if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) + { + int avl = outW1 - out_j; + if( stride_w == 1 ) + for( ; out_j < outW1; out_j += vl, avl -= vl) + { + vl = vsetvl_e32m2(avl); + int in_j = out_j * stride_w - pad_l; + vfloat32m2_t v00 = vle32_v_f32m2(imgptr0 + in_j, vl), + v01 = vle32_v_f32m2(imgptr0 + in_j + dilation_w, vl), + v02 = vle32_v_f32m2(imgptr0 + in_j + dilation_w*2, vl), + v10 = vle32_v_f32m2(imgptr1 + in_j, vl), + v11 = vle32_v_f32m2(imgptr1 + in_j + dilation_w, vl), + v12 = vle32_v_f32m2(imgptr1 + in_j + dilation_w*2, vl), + v20 = vle32_v_f32m2(imgptr2 + in_j, vl), + v21 = vle32_v_f32m2(imgptr2 + in_j + dilation_w, vl), + v22 = vle32_v_f32m2(imgptr2 + in_j + dilation_w*2, vl); + + vfloat32m2_t vout0 = vfmul_vf_f32m2(v00, w00, vl); + vfloat32m2_t vout1 = vfmul_vf_f32m2(v01, w01, vl); + vfloat32m2_t vout2 = vfmul_vf_f32m2(v02, w02, vl); + vout0 = vfadd_vf_f32m2(vout0, bias, vl); + + vout0 = vfmacc_vf_f32m2(vout0, w10, v10, vl); + vout1 = vfmacc_vf_f32m2(vout1, w11, v11, vl); + vout2 = vfmacc_vf_f32m2(vout2, w12, v12, vl); + + vout0 = vfmacc_vf_f32m2(vout0, w20, v20, vl); + vout1 = vfmacc_vf_f32m2(vout1, w21, v21, vl); + vout2 = vfmacc_vf_f32m2(vout2, w22, v22, vl); + + vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl); + if (relu) + { + vbool16_t m = vmfgt_vf_f32m2_b16(vout0, 0, vl); + vout0 = vmerge_vvm_f32m2(m, vfmul_vf_f32m2(vout0, relu_coeff, vl), vout0, vl); + } + vse32_v_f32m2(outptr + out_j, vout0, vl); + } + else //stride_w == 2 && dilation_w == 1 + for( ; out_j < outW1; out_j += vl, avl -= vl) + { + vl = vsetvl_e32m2(avl); + int in_j = out_j * stride_w - pad_l; + vfloat32m2_t v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; + vfloat32m2_load_deinterleave(imgptr0 + in_j, v00, v01, vl); + vfloat32m2_load_deinterleave(imgptr0 + in_j + 2, v02, unused, vl); + vfloat32m2_load_deinterleave(imgptr1 + in_j, v10, v11, vl); + vfloat32m2_load_deinterleave(imgptr1 + in_j + 2, v12, unused, vl); + vfloat32m2_load_deinterleave(imgptr2 + in_j, v20, v21, vl); + vfloat32m2_load_deinterleave(imgptr2 + in_j + 2, v22, unused, vl); + + vfloat32m2_t vout0 = vfmul_vf_f32m2(v00, w00, vl); + vfloat32m2_t vout1 = vfmul_vf_f32m2(v01, w01, vl); + vfloat32m2_t vout2 = vfmul_vf_f32m2(v02, w02, vl); + vout0 = vfadd_vf_f32m2(vout0, bias, vl); + + vout0 = vfmacc_vf_f32m2(vout0, w10, v10, vl); + vout1 = vfmacc_vf_f32m2(vout1, w11, v11, vl); + vout2 = vfmacc_vf_f32m2(vout2, w12, v12, vl); + + vout0 = vfmacc_vf_f32m2(vout0, w20, v20, vl); + vout1 = vfmacc_vf_f32m2(vout1, w21, v21, vl); + vout2 = vfmacc_vf_f32m2(vout2, w22, v22, vl); + + vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl); + if (relu) + { + vbool16_t m = vmfgt_vf_f32m2_b16(vout0, 0, vl); + vout0 = vmerge_vvm_f32m2(m, vfmul_vf_f32m2(vout0, relu_coeff, vl), vout0, vl); + } + vse32_v_f32m2(outptr + out_j, vout0, vl); + } + } + + for (; out_j < outW1; out_j++) + { + int in_j = out_j * stride_w - pad_l; + out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + + imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 + + imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias; + if (relu) + out = out > 0.f ? 
out : out*relu_coeff; + outptr[out_j] = out; + } + + for (; out_j < outW; out_j++ ) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; + float s0 = 1.f, s1 = 1.f, s2 = 1.f; + if (in_j0 >= width) + { + in_j0 = 0; + s0 = 0.f; + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0.f; + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0.f; + } + out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + + imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 + + imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias; + if (relu) + out = out > 0.f ? out : out*relu_coeff; + outptr[out_j] = out; + } + } +} + +#endif // CV_RVV + +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX + +static inline void _v256_load_deinterleave(const float* ptr, __m256& a, __m256& b) +{ + __m256 t0 = (__m256)__lasx_xvld(ptr, 0); + __m256 t1 = (__m256)__lasx_xvld(ptr, 8*4); + + __m256 lo = (__m256)__lasx_xvpermi_q(t0, t1, 2+0*16); + __m256 hi = (__m256)__lasx_xvpermi_q(t0, t1, 3+1*16); + + a = (__m256)__lasx_xvpermi_w(hi, lo, 0x88); + b = (__m256)__lasx_xvpermi_w(hi, lo, 0xdd); +} + +void fastDepthwiseConv( const float* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const float* biasptr, const float* relu, + const float* inptr_, + int height, int width, + float* outptr_, + int out_d, int outH, int outW ) +{ + const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], + w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], + w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; + int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); + float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d]; + + for (int out_i = 0; out_i < outH; out_i++) + { + int in_i = out_i * stride_h - pad_t, out_j = 0; + const float* imgptr0 = inptr_ + in_i*width; + const float* imgptr1 = imgptr0 + dilation_h*width; + const float* imgptr2 = imgptr0 + (dilation_h*2)*width; + float out, w00 = w00_, w01 = w01_, w02 = w02_; + float w20 = w20_, w21 = w21_, w22 = w22_; + if (in_i < 0) + { + w00 = w01 = w02 = 0.f; + imgptr0 = imgptr1; + } + else if (in_i + dilation_h*(kernel_h-1) >= height) + { + w20 = w21 = w22 = 0.f; + imgptr2 = imgptr1; + } + float* outptr = outptr_ + out_i*outW; + if (pad_l > 0) + { + out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + + imgptr1[0]*w11 + imgptr1[dilation_w]*w12 + + imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias; + if (relu) + out = out > 0.f ? 
out : out*relu_coeff; + outptr[0] = out; + out_j = 1; + } + + if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) + { + const int VECSZ = 8; + __m256 vw00 = _v256_setall_ps(w00), vw01 = _v256_setall_ps(w01), vw02 = _v256_setall_ps(w02), + vw10 = _v256_setall_ps(w10), vw11 = _v256_setall_ps(w11), vw12 = _v256_setall_ps(w12), + vw20 = _v256_setall_ps(w20), vw21 = _v256_setall_ps(w21), vw22 = _v256_setall_ps(w22); + __m256 z = (__m256)__lasx_xvxor_v((__m256i)vw00, (__m256i)vw00), + vbias = _v256_setall_ps(bias), vrc = _v256_setall_ps(relu_coeff); + + if( stride_w == 1 ) + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1 && out_j > pad_l) + out_j = outW1 - VECSZ; + int in_j = out_j * stride_w - pad_l; + __m256 v00 = (__m256)__lasx_xvld(imgptr0 + in_j, 0), + v01 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w, 0), + v02 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w*2, 0), + v10 = (__m256)__lasx_xvld(imgptr1 + in_j, 0), + v11 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w, 0), + v12 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w*2, 0), + v20 = (__m256)__lasx_xvld(imgptr2 + in_j, 0), + v21 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w, 0), + v22 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w*2, 0); + + __m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias); + __m256 vout1 = __lasx_xvfmul_s(v01, vw01); + __m256 vout2 = __lasx_xvfmul_s(v02, vw02); + + vout0 = __lasx_xvfmadd_s(v10, vw10, vout0); + vout1 = __lasx_xvfmadd_s(v11, vw11, vout1); + vout2 = __lasx_xvfmadd_s(v12, vw12, vout2); + + vout0 = __lasx_xvfmadd_s(v20, vw20, vout0); + vout1 = __lasx_xvfmadd_s(v21, vw21, vout1); + vout2 = __lasx_xvfmadd_s(v22, vw22, vout2); + + vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2); + if (relu) + { + __m256i m = __lasx_xvfcmp_clt_s(z, vout0); + vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m); + } + __lasx_xvst(vout0, outptr + out_j, 0); + } + else + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1 && out_j > pad_l) + out_j = outW1 - VECSZ; + int in_j = out_j * stride_w - pad_l; + __m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; + _v256_load_deinterleave(imgptr0 + in_j, v00, v01); + _v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused); + _v256_load_deinterleave(imgptr1 + in_j, v10, v11); + _v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused); + _v256_load_deinterleave(imgptr2 + in_j, v20, v21); + _v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused); + + __m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias); + __m256 vout1 = __lasx_xvfmul_s(v01, vw01); + __m256 vout2 = __lasx_xvfmul_s(v02, vw02); + + vout0 = __lasx_xvfmadd_s(v10, vw10, vout0); + vout1 = __lasx_xvfmadd_s(v11, vw11, vout1); + vout2 = __lasx_xvfmadd_s(v12, vw12, vout2); + + vout0 = __lasx_xvfmadd_s(v20, vw20, vout0); + vout1 = __lasx_xvfmadd_s(v21, vw21, vout1); + vout2 = __lasx_xvfmadd_s(v22, vw22, vout2); + + vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2); + if (relu) + { + __m256i m = __lasx_xvfcmp_clt_s(z, vout0); + vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m); + } + __lasx_xvst(vout0, outptr + out_j, 0); + } + } + + for (; out_j < outW1; out_j++) + { + int in_j = out_j * stride_w - pad_l; + out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + + imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 + + imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + 
dilation_w*2]*w22 + bias; + if (relu) + out = out > 0.f ? out : out*relu_coeff; + outptr[out_j] = out; + } + + for (; out_j < outW; out_j++ ) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; + float s0 = 1.f, s1 = 1.f, s2 = 1.f; + if (in_j0 >= width) + { + in_j0 = 0; + s0 = 0.f; + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0.f; + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0.f; + } + out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + + imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 + + imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias; + if (relu) + out = out > 0.f ? out : out*relu_coeff; + outptr[out_j] = out; + } + } +} + +#endif // CV_LASX + +CV_CPU_OPTIMIZATION_NAMESPACE_END +}} // namespace diff --git a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp new file mode 100644 index 0000000000..27998e4bcc --- /dev/null +++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp @@ -0,0 +1,764 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv_Winograd.fx). +// Here is the original license: +/* + This file is a part of ficus language project. + See ficus/LICENSE for the licensing terms +*/ + +#include "../../precomp.hpp" +#include "convolution.hpp" + +#include "conv_winograd_f63.simd.hpp" +#include "layers/cpu_kernels/conv_winograd_f63.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + +namespace cv { namespace dnn { + +#if CV_NEON || CV_SIMD128 || CV_TRY_AVX2 +enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment. 
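// Editor's note (illustrative sketch, not part of the patch): this file implements Winograd
// F(6x6, 3x3): every 8x8 input tile is forward-transformed (B^T X B), multiplied element-wise
// with transformed 3x3 kernels, and inverse-transformed (A^T M A) into a 6x6 output tile.
// The CONV_WINO_* constants are defined in convolution.hpp; the tile geometry assumed below
// (tile 8, step 6) follows from that scheme.
enum { kWinoTileSketch = 8 /* assumed CONV_WINO_SIZE */, kWinoStepSketch = 6 /* assumed CONV_WINO_STEP */ };

// Number of 6x6 tiles needed to cover an H0 x W0 output plane; matches the
// blocks_per_row * blocks_per_plane computation in runWinograd63() below.
// For example, a 14x14 output plane needs 3*3 = 9 tiles.
static inline int winoTilesSketch(int H0, int W0)
{
    return ((H0 + kWinoStepSketch - 1) / kWinoStepSketch)
         * ((W0 + kWinoStepSketch - 1) / kWinoStepSketch);
}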
+ +void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock, + const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32); + +/*Input transform*/ +void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, + float* outptr, int Cg, const int winoIblock, const int winoAtomF32); + +/*Output transform*/ +void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep, + float bias, float minval, float maxval, bool ifMinMaxAct); + + +int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr& conv, + int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct) +{ + Mat input = _input.getMat(); + Mat output = _output.getMat(); + Mat fusedAddMat = _fusedAddMat.getMat(); + + MatShape inputShape = shape(input); + MatShape outputShape = shape(output); + CV_Assert(inputShape.size() == 4 && outputShape.size() == 4); + + int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3]; // [N, C, H, W] + int K = conv->K; + int H0 = outputShape[2], W0 = outputShape[3]; + + int pad_top = conv->pad_top; + int pad_left = conv->pad_left; + + int ngroups = conv->ngroups, Cg = C/ngroups, Kg = K/ngroups; + int Kg_nblocks = (Kg + CONV_WINO_KBLOCK - 1)/CONV_WINO_KBLOCK; + const size_t inp_planesize = (size_t)Hi*Wi; + const size_t out_planesize = (size_t)H0*W0; + + int blocks_per_row = (W0+CONV_WINO_STEP-1)/CONV_WINO_STEP; + int blocks_per_plane = ((H0+CONV_WINO_STEP-1)/CONV_WINO_STEP)*blocks_per_row; + int blocks_per_plane_aligned = ((blocks_per_plane + + CONV_WINO_IBLOCK-1)/CONV_WINO_IBLOCK)*CONV_WINO_IBLOCK; + + size_t totalbufsize = (size_t)N*C*blocks_per_plane_aligned*CONV_WINO_AREA; + + AutoBuffer _buf; + _buf.allocate(totalbufsize + VEC_ALIGN); + float* wbuf_all = alignPtr(_buf.data(), VEC_ALIGN); + + float* inp = input.ptr(); + float* out = output.ptr(); + + float* fusedAddPtr = fusedAddMat.empty() ? nullptr : fusedAddMat.ptr(); + + // Phase 1. compute forward Winograd transforms for all input blocks, + // all input planes, all samples in the batch. + // [TODO]: maybe, if there are too many input channels, it makes sense to + // transform only part of input channels at once and then compute the partial + // accumulated sums (i.e. update the output buffers several times, + // rather than compute them in one pass). 
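// Editor's sketch (not part of the patch): wbuf_all holds the Winograd-transformed inputs.
// The `inwofs` expression below implies an (assumed) interleaved layout: within each group
// of CONV_WINO_IBLOCK tiles, each four-float atom of the 8x8 transform is stored so that all
// Cg channels and all tiles of the group lie next to each other, which lets Phase 2 stream
// CONV_WINO_IBLOCK x CONV_WINO_ATOM_F32 values per channel. A hypothetical lambda that
// reproduces the same element offset (with explicit widening):
auto winoInputOffset = [&](int n, int g, int c, int block_id, int db) -> size_t
{
    return ((size_t)(n*ngroups + g)*blocks_per_plane_aligned + block_id)*Cg*CONV_WINO_AREA
            + ((size_t)c*CONV_WINO_IBLOCK + db)*CONV_WINO_ATOM_F32;
};
CV_UNUSED(winoInputOffset);  // illustration only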
+ parallel_for_(Range(0, ntasks), [&](const Range& r0) { + for (int task_id = r0.start; task_id < r0.end; task_id++) + { + int nc0 = (N*C)*task_id/ntasks; + int nc1 = (N*C)*(task_id+1)/ntasks; + for(; nc0 < nc1; nc0++) + { + int n = nc0 / C; + int c = nc0 - n*C; + int g = c / Cg; + c -= g*Cg; + for (int block_id = 0; block_id < blocks_per_plane; block_id += CONV_WINO_IBLOCK) + { + for (int db = 0; db < CONV_WINO_IBLOCK; db++) + { + size_t inwofs = ((n*ngroups + g)*blocks_per_plane_aligned + + block_id)*Cg*CONV_WINO_AREA + + (c*CONV_WINO_IBLOCK + db)*CONV_WINO_ATOM_F32; + float* inwptr = (float*)wbuf_all + inwofs; + + if (block_id + db < blocks_per_plane) + { + int y0 = (block_id + db) / blocks_per_row; + int x0 = (block_id + db) - y0 * blocks_per_row; + y0 = y0*CONV_WINO_STEP - pad_top; + x0 = x0*CONV_WINO_STEP - pad_left; + bool partial = y0 < 0 || y0 + CONV_WINO_SIZE > Hi || + x0 < 0 || x0 + CONV_WINO_SIZE > Wi; + int dx1 = 0, dx2 = CONV_WINO_SIZE, dy1 = 0, dy2 = CONV_WINO_SIZE; + int inpstep = Wi; + + float inpbuf[CONV_WINO_AREA]; + float* inptr0 = (float*)inp + nc0*inp_planesize + y0*Wi + x0; + float* inptr = inptr0; + + if (partial) + { + memset(inpbuf, 0, sizeof(inpbuf)); + dy1 = -y0 > 0 ? -y0 : 0; + dy2 = Hi - y0 < CONV_WINO_SIZE ? Hi - y0 : CONV_WINO_SIZE; + + if (dy2 < dy1) {dy2 = dy1 = 0;} + dx1 = -x0 > 0 ? -x0 : 0; + dx2 = Wi - x0 < CONV_WINO_SIZE ? Wi - x0 : CONV_WINO_SIZE; + + if (dx2 < dx1) {dx2 = dx1 = 0;} + inptr0 -= y0*Wi + x0; + + if (dx1 < dx2 && dy1 < dy2) + { + for(int dy = dy1; dy < dy2; dy++) + memcpy(&inpbuf[dy*CONV_WINO_SIZE + dx1], + inptr0 + (y0+dy)*Wi + (x0+dx1), + (dx2-dx1)*sizeof(inpbuf[0])); + } + + inptr = inpbuf; + inpstep = CONV_WINO_SIZE; + } +#if CV_TRY_AVX2 + if (conv->useAVX2) + opt_AVX2::winofunc_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM_F32); + else +#endif +#if CV_TRY_AVX + if (conv->useAVX) + opt_AVX::winofunc_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM_F32); + else +#endif +#if CV_NEON && CV_NEON_AARCH64 + if (conv->useNEON) + opt_NEON::winofunc_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM_F32); + else +#endif + winofunc_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM_F32); + } + else + { + for (int i = 0; i < CONV_WINO_NATOMS_F32; i++, inwptr += CONV_WINO_IBLOCK*CONV_WINO_ATOM_F32) + memset(inwptr, 0, CONV_WINO_ATOM_F32*sizeof(inwptr[0])); + } + } + } + } + }}); + + // Phase 2. compute elemwise-weighted sums of transformed blocks, + // apply inverse Winograd transforms to the sums, + // add bias, apply activation function if any and store the results. 
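// Editor's sketch (not part of the patch): for one output channel k and one 8x8 tile,
// Phase 2 evaluates, in the transformed domain,
//     M[u][v] = sum_{c=0..Cg-1} Uw[k][c][u][v] * Ux[c][u][v],   u, v = 0..7,
// and the spatial 6x6 tile is then A^T * M * A (winofunc_AtXA_8x8_f32), followed by bias,
// the optional by-pass add and the activation. winofunc_accum_f32() computes the first line
// for CONV_WINO_KBLOCK output channels and CONV_WINO_IBLOCK tiles at once. A hypothetical
// scalar reference of that accumulation (ad-hoc flat [Cg][64] layout):
auto winoAccumScalarRef = [](const float* Uw, const float* Ux, int Cg_, float* M)
{
    for (int i = 0; i < CONV_WINO_AREA; i++) M[i] = 0.f;
    for (int c = 0; c < Cg_; c++)
        for (int i = 0; i < CONV_WINO_AREA; i++)
            M[i] += Uw[c*CONV_WINO_AREA + i] * Ux[c*CONV_WINO_AREA + i];
};
CV_UNUSED(winoAccumScalarRef);  // illustration only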
+ parallel_for_(Range(0, ntasks), [&](const Range& r0) { + for (int task_id = r0.start; task_id < r0.end; task_id++) + { + size_t out_wbuf_size = CONV_WINO_AREA*CONV_WINO_KBLOCK*CONV_WINO_IBLOCK; + size_t outbuf_size = CONV_WINO_AREA; + AutoBuffer out_wbuf_, outbuf_; + out_wbuf_.allocate(out_wbuf_size + VEC_ALIGN); + float* out_wbuf = alignPtr(out_wbuf_.data(), VEC_ALIGN); + outbuf_.allocate(outbuf_size + VEC_ALIGN); + float* outbuf = alignPtr(outbuf_.data(), VEC_ALIGN); + + memset(out_wbuf, 0, out_wbuf_size * sizeof(float)); + memset(outbuf, 0, outbuf_size * sizeof(float)); + + int ngk0 = (int)(((int64_t)N*Kg_nblocks*ngroups)*task_id/ntasks); + int ngk1 = (int)(((int64_t)N*Kg_nblocks*ngroups)*(task_id+1)/ntasks); + + for(; ngk0 < ngk1; ngk0++) + { + int n = ngk0 / (Kg_nblocks*ngroups); + int gk0 = ngk0 % (Kg_nblocks*ngroups); + int g = gk0 / Kg_nblocks; + int k0 = (gk0 % Kg_nblocks)*CONV_WINO_KBLOCK; + int k1 = k0 + CONV_WINO_KBLOCK <= Kg ? k0 + CONV_WINO_KBLOCK : Kg; + + for (int block_id0 = 0; block_id0 < blocks_per_plane; block_id0 += CONV_WINO_IBLOCK) + { + int block_id1 = block_id0 + CONV_WINO_IBLOCK; + block_id1 = block_id1 < blocks_per_plane ? block_id1 : blocks_per_plane; + size_t inwofs = ((n*ngroups + g)*blocks_per_plane_aligned + block_id0)*Cg*CONV_WINO_AREA; + size_t wofs = (g*Kg_nblocks*CONV_WINO_KBLOCK + k0)*Cg*CONV_WINO_AREA; + + float* inwptr = wbuf_all + inwofs; + const float* wptr = conv->weightsWinoBufPtr + wofs; + +#if CV_TRY_AVX2 + if (conv->useAVX2) + opt_AVX2::winofunc_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK, + CONV_WINO_KBLOCK, CONV_WINO_ATOM_F32, CONV_WINO_NATOMS_F32); + else +#endif +#if CV_TRY_AVX + if (conv->useAVX) + opt_AVX::winofunc_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK, + CONV_WINO_KBLOCK, CONV_WINO_ATOM_F32, CONV_WINO_NATOMS_F32); + else +#endif +#if CV_NEON && CV_NEON_AARCH64 + if (conv->useNEON) + opt_NEON::winofunc_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK, + CONV_WINO_KBLOCK, CONV_WINO_ATOM_F32, CONV_WINO_NATOMS_F32); + else +#endif + + winofunc_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK, + CONV_WINO_KBLOCK, CONV_WINO_ATOM_F32, CONV_WINO_NATOMS_F32); + for (int k = k0; k < k1; k++) + { + float biasv = conv->biasBuf[g*Kg + k]; + for (int block_id = block_id0; block_id < block_id1; block_id++) + { + int y0 = block_id / blocks_per_row; + int x0 = block_id - y0 * blocks_per_row; + y0 = y0*CONV_WINO_STEP; + x0 = x0*CONV_WINO_STEP; + int dy1 = H0 - y0; + if (dy1 > CONV_WINO_STEP) dy1 = CONV_WINO_STEP; + int dx1 = W0 - x0; + if (dx1 > CONV_WINO_STEP) dx1 = CONV_WINO_STEP; + assert(dx1 > 0 && dy1 > 0); + bool partial = activ || dy1 < CONV_WINO_STEP || dx1 < CONV_WINO_STEP; + size_t outofs = (n*K + g*Kg + k)*out_planesize + y0*W0 + x0; + int outstep = W0; + + float* outptr0 = (float*)out + outofs; + float* pbptr0 = fusedAddPtr ? 
fusedAddPtr + outofs : nullptr; + float *outptr = outptr0, *bpptr = pbptr0; + + if (partial) + { + outptr = outbuf; + outstep = CONV_WINO_SIZE; + if (pbptr0) + { + bpptr = outbuf; + for (int y = 0; y < dy1; y++) + memcpy(outbuf + y*CONV_WINO_SIZE, pbptr0 + y*W0, + dx1*sizeof(pbptr0[0])); + } + } +#if CV_TRY_AVX2 + if (conv->useAVX2) + opt_AVX::winofunc_AtXA_8x8_f32(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE, + bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct); + else +#endif +#if CV_TRY_AVX + if (conv->useAVX) + opt_AVX::winofunc_AtXA_8x8_f32(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE, + bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct); + else +#endif +#if CV_NEON && CV_NEON_AARCH64 + if (conv->useNEON) + // NEON optimization is only for ARMv8 device, and for ARMv7 device, we use the Universal intrinsics. + opt_NEON::winofunc_AtXA_8x8_f32(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE, + bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct); + else +#endif + winofunc_AtXA_8x8_f32(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE, + bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct); + if (partial) + { + if (activ) + activ->forwardSlice(outptr, outptr, CONV_WINO_SIZE*CONV_WINO_STEP, 0, g*Kg + k, g*Kg + k + 1); + for (int y = 0; y < dy1; y++) + memcpy(outptr0 + y*W0, outptr + y*CONV_WINO_SIZE,dx1*sizeof(outptr0[0])); + } + } + } + } + } + }}); + return 1; +} + +/****************************************************************************************\ + SIMD for winograd function +\****************************************************************************************/ + +#if CV_SIMD128 + +void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock, + const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32) +{ +#if 1 + CV_Assert(winoIblock == 3 && winoKblock == 4 && winoAtomF32 == 4); + for (int atom_id = 0; atom_id < winoNatomF32; atom_id++, + outbuf += winoAtomF32) + { + v_float32x4 s00 = v_setzero_f32(), s01 = s00, s02 = s00; + v_float32x4 s10 = v_setzero_f32(), s11 = s00, s12 = s00; + v_float32x4 s20 = v_setzero_f32(), s21 = s00, s22 = s00; + v_float32x4 s30 = v_setzero_f32(), s31 = s00, s32 = s00; + + for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32, + wptr += winoKblock*winoAtomF32) + { + v_float32x4 x0, x1, x2; + x0 = v_load(inwptr); + x1 = v_load(inwptr + 4); + x2 = v_load(inwptr + 8); + + v_float32x4 w0 = v_load(wptr); + s00 = v_fma(w0, x0, s00); + s01 = v_fma(w0, x1, s01); + s02 = v_fma(w0, x2, s02); + + w0 = v_load(wptr + 4); + s10 = v_fma(w0, x0, s10); + s11 = v_fma(w0, x1, s11); + s12 = v_fma(w0, x2, s12); + + w0 = v_load(wptr + 8); + s20 = v_fma(w0, x0, s20); + s21 = v_fma(w0, x1, s21); + s22 = v_fma(w0, x2, s22); + + w0 = v_load(wptr + 12); + s30 = v_fma(w0, x0, s30); + s31 = v_fma(w0, x1, s31); + s32 = v_fma(w0, x2, s32); + } + + v_store(outbuf, s00); + v_store(outbuf + 1*64, s01); + v_store(outbuf + 2*64, s02); + v_store(outbuf + 3*64, s10); + v_store(outbuf + 4*64, s11); + v_store(outbuf + 5*64, s12); + v_store(outbuf + 6*64, s20); + v_store(outbuf + 7*64, s21); + v_store(outbuf + 8*64, s22); + v_store(outbuf + 9*64, s30); + v_store(outbuf + 10*64, s31); + v_store(outbuf + 11*64, s32); + } +#else + // Naive C++ code, the code 
should never be run here. + for (int atom_id = 0; atom_id < winoNatomF32; + atom_id++, outbuf += winoAtomF32) + { + float sumbuf[winoIblock*winoKblock*winoAtomF32]; + memset(sumbuf, 0, sizeof(sumbuf)); + for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32, + wptr += winoKblock*winoAtomF32) + { + for (int i = 0; i < winoKblock; i++) + { + for (int j = 0; j < winoIblock; j++) + { + int i_ = i*winoAtomF32; + int j_ = j*winoAtomF32; + int ij_ = i_*winoIblock + j_; + float s0 = inwptr[j_ + 0]*wptr[i_ + 0]; + float s1 = inwptr[j_ + 1]*wptr[i_ + 1]; + float s2 = inwptr[j_ + 2]*wptr[i_ + 2]; + float s3 = inwptr[j_ + 3]*wptr[i_ + 3]; + sumbuf[ij_ + 0] += s0; + sumbuf[ij_ + 1] += s1; + sumbuf[ij_ + 2] += s2; + sumbuf[ij_ + 3] += s3; + } + } + } + for (int ij = 0; ij < winoKblock*winoIblock; ij++) + { + int ij_ = ij*winoAtomF32; + int ij_out = ij*CONV_WINO_AREA; + outbuf[ij_out + 0] = sumbuf[ij_ + 0]; + outbuf[ij_out + 1] = sumbuf[ij_ + 1]; + outbuf[ij_out + 2] = sumbuf[ij_ + 2]; + outbuf[ij_out + 3] = sumbuf[ij_ + 3]; + } + } +#endif +} + +/*Input transform*/ +void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, + float* outptr, int Cg, const int winoIblock, const int winoAtomF32) +{ + CV_Assert(CONV_WINO_IBLOCK == 3 && CONV_WINO_KBLOCK == 4 && CONV_WINO_ATOM_F32 == 4); + v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4); + v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4); + v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4); + v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4); + v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4); + v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4); + v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4); + v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4); + + v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71; + + { + /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */ + /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */ + v_float32x4 q5_25 = v_setall_f32(5.25f), t00, t01, t10, t11; + t00 = x40 - x20; + t01 = x41 - x21; + t10 = x30 - x50; + t11 = x31 - x51; + v_float32x4 y00 = v_fma(t00, q5_25, x00 - x60); + v_float32x4 y01 = v_fma(t01, q5_25, x01 - x61); + v_float32x4 y70 = v_fma(t10, q5_25, x70 - x10); + v_float32x4 y71 = v_fma(t11, q5_25, x71 - x11); + + /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */ + /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */ + v_float32x4 qm4_25 = v_setall_f32(-4.25f); + t00 = v_fma(x30, qm4_25, x10 + x50); + t01 = v_fma(x31, qm4_25, x11 + x51); + t10 = v_fma(x40, qm4_25, x20 + x60); + t11 = v_fma(x41, qm4_25, x21 + x61); + + v_float32x4 y10 = t00 + t10, y11 = t01 + t11; + v_float32x4 y20 = t10 - t00, y21 = t11 - t01; + + /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */ + /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */ + v_float32x4 q0_5 = v_setall_f32(0.5f), q0_25 = v_setall_f32(0.25f); + v_float32x4 qm2_5 = v_setall_f32(-2.5f), qm1_25 = v_setall_f32(-1.25f); + t00 = v_fma(x10, q0_5, x50 + x50); + t01 = v_fma(x11, q0_5, x51 + x51); + t10 = v_fma(x20, q0_25, x60); + t11 = v_fma(x21, q0_25, x61); + t00 = v_fma(x30, qm2_5, t00); + t01 = v_fma(x31, qm2_5, t01); + t10 = v_fma(x40, qm1_25, t10); + t11 = v_fma(x41, qm1_25, t11); + + v_float32x4 y30 = t00 + t10, y31 = t01 + t11; + v_float32x4 
y40 = t10 - t00, y41 = t11 - t01; + + /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */ + /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */ + v_float32x4 q4 = v_setall_f32(4.f), qm5 = v_setall_f32(-5.f); + t00 = v_fma(x50, q0_5, x10 + x10); + t01 = v_fma(x51, q0_5, x11 + x11); + t10 = v_fma(x20, q4 , x60); + t11 = v_fma(x21, q4 , x61); + t00 = v_fma(x30, qm2_5, t00); + t01 = v_fma(x31, qm2_5, t01); + t10 = v_fma(x40, qm5 , t10); + t11 = v_fma(x41, qm5 , t11); + + v_float32x4 y50 = t00 + t10, y51 = t01 + t11; + v_float32x4 y60 = t10 - t00, y61 = t11 - t01; + + /* transpose 8x8 matrix in-place with some renumeration of the elements: */ + /* Y: */ + /* y00 y01 */ + /* y10 y11 */ + /* ... */ + /* y70 y71 */ + /* Y': */ + /* y00 y40 */ + /* y10 y50 */ + /* y20 y60 */ + /* y30 y70 */ + /* y01 y41 */ + /* y11 y51 */ + /* y21 y61 */ + /* y31 y71 */ + /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */ + + v_transpose4x4(y00, y10, y20, y30, y00, y10, y20, y30); + v_transpose4x4(y01, y11, y21, y31, y01, y11, y21, y31); + v_transpose4x4(y40, y50, y60, y70, y40, y50, y60, y70); + v_transpose4x4(y41, y51, y61, y71, y41, y51, y61, y71); + + /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */ + /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */ + t00 = y01 - y20; + t01 = y41 - y60; + t10 = y30 - y11; + t11 = y70 - y51; + z00 = v_fma(t00, q5_25, y00 - y21); + z01 = v_fma(t01, q5_25, y40 - y61); + z70 = v_fma(t10, q5_25, y31 - y10); + z71 = v_fma(t11, q5_25, y71 - y50); + + /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */ + /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */ + t00 = v_fma(y30, qm4_25, y10 + y11); + t01 = v_fma(y70, qm4_25, y50 + y51); + t10 = v_fma(y01, qm4_25, y20 + y21); + t11 = v_fma(y41, qm4_25, y60 + y61); + + z10 = t00 + t10; z11 = t01 + t11; + z20 = t10 - t00; z21 = t11 - t01; + + /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */ + /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */ + t00 = v_fma(y10, q0_5, y11 + y11); + t01 = v_fma(y50, q0_5, y51 + y51); + t10 = v_fma(y20, q0_25, y21); + t11 = v_fma(y60, q0_25, y61); + t00 = v_fma(y30, qm2_5, t00); + t01 = v_fma(y70, qm2_5, t01); + t10 = v_fma(y01, qm1_25, t10); + t11 = v_fma(y41, qm1_25, t11); + + z30 = t00 + t10; z31 = t01 + t11; + z40 = t10 - t00; z41 = t11 - t01; + + /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */ + /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */ + t00 = v_fma(y11, q0_5, y10 + y10); + t01 = v_fma(y51, q0_5, y50 + y50); + t10 = v_fma(y20, q4, y21); + t11 = v_fma(y60, q4, y61); + t00 = v_fma(y30, qm2_5, t00); + t01 = v_fma(y70, qm2_5, t01); + t10 = v_fma(y01, qm5, t10); + t11 = v_fma(y41, qm5, t11); + + z50 = t00 + t10; z51 = t01 + t11; + z60 = t10 - t00; z61 = t11 - t01; + } + + const int outstep = winoIblock*winoAtomF32*Cg; + + v_store(outptr, z00); + v_store(outptr + outstep, z01); + v_store(outptr + outstep*2, z10); + v_store(outptr + outstep*3, z11); + v_store(outptr + outstep*4, z20); + v_store(outptr + outstep*5, z21); + v_store(outptr + outstep*6, z30); + v_store(outptr + outstep*7, z31); + v_store(outptr + outstep*8, z40); + v_store(outptr + outstep*9, z41); + v_store(outptr + outstep*10, z50); + v_store(outptr + outstep*11, z51); + v_store(outptr + outstep*12, z60); + v_store(outptr + outstep*13, z61); + v_store(outptr + outstep*14, z70); + v_store(outptr + outstep*15, z71); +} + +/*Output transform*/ +/* Inverse Winograd 8x8 transform: + out = (A'*inp*A)', where + 
inp is input 8x8 FP32 matrix, + A' is + [1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, + 0.f, 1.f, -1.f, 2.f, -2.f, 0.5f, -0.5f, 0.f, + 0.f, 1.f, 1.f, 4.f, 4.f, 0.25f, 0.25f, 0.f, + 0.f, 1.f, -1.f, 8.f, -8.f, 0.125f, -0.125f, 0.f, + 0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f, + 0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f] + + inp is pre-loaded into xij registers, + out will be stored in zij, where (0<=i<=7 for x, 0<=i<=5 for z), 0<=j<=1. + + After the inverse transform is done, we add bias, + optionally add results from the earlier tensors (by-pass), + optionally apply activation function and then + store the final results. + + That is, after both forward and then inverse transformation, + we get non-transposed result. + Of course, for the correct work of Winograd-based convolution, + the Winograd-transformed weights should also be transposed. + init_conv() (see OpConv.fx) takes care of that. +*/ +void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, + float* bpptr, int bpstep, float* outptr, int outstep, + float bias, float minval, float maxval, bool ifMinMaxAct) +{ + CV_Assert(CONV_WINO_IBLOCK == 3 && CONV_WINO_KBLOCK == 4 && CONV_WINO_ATOM_F32 == 4); + v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4); + v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4); + v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4); + v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4); + v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4); + v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4); + v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4); + v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4); + v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51; + + { + v_float32x4 s12_0, s12_1, s34_0, s34_1, s56_0, s56_1; + s12_0 = x10 + x20; s12_1 = x11 + x21; + s34_0 = x30 + x40; s34_1 = x31 + x41; + s56_0 = x50 + x60; s56_1 = x51 + x61; + + v_float32x4 y00 = x00 + s12_0 + s34_0 + s56_0; + v_float32x4 y01 = x01 + s12_1 + s34_1 + s56_1; + + v_float32x4 a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f); + v_float32x4 y20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + v_float32x4 y21 = v_fma(s56_1, a0 ,v_fma(s34_1, a1, s12_1) ); + + a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f); + v_float32x4 y40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + v_float32x4 y41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + s12_0 = x10 - x20; s12_1 = x11 - x21; + s34_0 = x30 - x40; s34_1 = x31 - x41; + s56_0 = x50 - x60; s56_1 = x51 - x61; + + a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.f); + v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, x70 + s12_0)); + v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, x71 + s12_1)); + + a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.f); + v_float32x4 y10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + v_float32x4 y11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.f); + v_float32x4 y30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + v_float32x4 y31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + v_float32x4 y60 = v_setall_f32(0.f), y61 = y60, y70 = y60, y71 = y60; + + /* transpose 8x8 matrix in-place with some renumeration of the elements: */ + /* Y: */ + /* y00 y01 */ + /* y10 y11 */ + /* ... 
*/ + /* y50 y51 */ + /* 0 0 */ + /* 0 0 */ + /* Y': */ + /* y00 y40 */ + /* y10 y50 */ + /* y20 y60 */ + /* y30 y70 */ + /* y01 y41 */ + /* y11 y51 */ + /* y21 y61 */ + /* y31 y71 */ + /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */ + + v_transpose4x4(y00, y10, y20, y30, y00, y10, y20, y30); + v_transpose4x4(y01, y11, y21, y31, y01, y11, y21, y31); + v_transpose4x4(y40, y50, y60, y70, y40, y50, y60, y70); + v_transpose4x4(y41, y51, y61, y71, y41, y51, y61, y71); + + s12_0 = y10 + y20; s12_1 = y50 + y60; + s34_0 = y30 + y01; s34_1 = y70 + y41; + s56_0 = y11 + y21; s56_1 = y51 + y61; + + z00 = y00 + s12_0 + s34_0 + s56_0; + z01 = y40 + s12_1 + s34_1 + s56_1; + + a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f); + z20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + z21 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f); + z40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + z41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + s12_0 = y10 - y20; s12_1 = y50 - y60; + s34_0 = y30 - y01; s34_1 = y70 - y41; + s56_0 = y11 - y21; s56_1 = y51 - y61; + + a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.0f); + z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, y31 + s12_0)); + z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, y71 + s12_1)); + + a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.0f); + z10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + z11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.0f); + z30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + z31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + v_float32x4 vbias = v_setall_f32(bias); + z00 += vbias; + z01 += vbias; + z10 += vbias; + z11 += vbias; + z20 += vbias; + z21 += vbias; + z30 += vbias; + z31 += vbias; + z40 += vbias; + z41 += vbias; + z50 += vbias; + z51 += vbias; + } + + if (bpptr) + { + z00 += v_load(bpptr); + z01 += v_load_low(bpptr + 4); + z10 += v_load(bpptr + bpstep); + z11 += v_load_low(bpptr + bpstep + 4); + z20 += v_load(bpptr + bpstep*2); + z21 += v_load_low(bpptr + bpstep*2 + 4); + z30 += v_load(bpptr + bpstep*3); + z31 += v_load_low(bpptr + bpstep*3 + 4); + z40 += v_load(bpptr + bpstep*4); + z41 += v_load_low(bpptr + bpstep*4 + 4); + z50 += v_load(bpptr + bpstep*5); + z51 += v_load_low(bpptr + bpstep*5 + 4); + } + + if (ifMinMaxAct) + { + v_float32x4 vmax = v_setall_f32(maxval); + v_float32x4 vmin = v_setall_f32(minval); + + z00 = v_min(v_max(z00, vmin), vmax); + z01 = v_min(v_max(z01, vmin), vmax); + z10 = v_min(v_max(z10, vmin), vmax); + z11 = v_min(v_max(z11, vmin), vmax); + z20 = v_min(v_max(z20, vmin), vmax); + z21 = v_min(v_max(z21, vmin), vmax); + z30 = v_min(v_max(z30, vmin), vmax); + z31 = v_min(v_max(z31, vmin), vmax); + z40 = v_min(v_max(z40, vmin), vmax); + z41 = v_min(v_max(z41, vmin), vmax); + z50 = v_min(v_max(z50, vmin), vmax); + z51 = v_min(v_max(z51, vmin), vmax); + } + + v_store(outptr, z00); + v_store_low(outptr + 4, z01); + v_store(outptr + outstep, z10); + v_store_low(outptr + outstep + 4, z11); + v_store(outptr + outstep*2, z20); + v_store_low(outptr + outstep*2 + 4, z21); + v_store(outptr + outstep*3, z30); + v_store_low(outptr + outstep*3 + 4, z31); + v_store(outptr + outstep*4, z40); + v_store_low(outptr + outstep*4 + 4, z41); + v_store(outptr + outstep*5, z50); + v_store_low(outptr + outstep*5 + 4, z51); +} +#endif + +#else +int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr& conv, + int ntasks, float minval, float maxval, ActivationLayer* activ, bool 
ifMinMaxAct) +{ + return 0; +} +#endif + +}} // namespace cv::dnn diff --git a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp new file mode 100644 index 0000000000..2688c75785 --- /dev/null +++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp @@ -0,0 +1,886 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "opencv2/core/hal/intrin.hpp" + +namespace cv { +namespace dnn { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN + +/* Accumulate */ +void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock, + const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32); + +/*Input transform*/ +void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, + float* outptr, int Cg, const int winoIblock, const int winoAtomF32); + +/*Output transform*/ +void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, + float* bpptr, int bpstep, float* outptr, int outstep, + float bias, float minval, float maxval, bool ifMinMaxAct); + +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX + +#if !CV_FMA3 // AVX workaround +#undef _mm256_fmadd_ps +#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) +#endif + +void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock, + const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32) +{ + CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF32 == 8); + if (iblock > 3) + { + for (int atom_id = 0; atom_id < winoNatomF32; atom_id++, + outbuf += winoAtomF32) + { + __m256 s00 = _mm256_set1_ps(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00; + __m256 s10 = _mm256_set1_ps(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00; + __m256 s20 = _mm256_set1_ps(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00; + __m256 s30 = _mm256_set1_ps(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00; + for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32, + wptr += winoKblock*winoAtomF32) + { + __m256 w0 = _mm256_load_ps(wptr), w1 = _mm256_load_ps(wptr + 8); + __m256 w2 = _mm256_load_ps(wptr + 16), w3 = _mm256_load_ps(wptr + 24); + __m256 x0, x1; + x0 = _mm256_load_ps(inwptr); + x1 = _mm256_load_ps(inwptr + 8); + s00 = _mm256_fmadd_ps(w0, x0, s00); + s01 = _mm256_fmadd_ps(w0, x1, s01); + s10 = _mm256_fmadd_ps(w1, x0, s10); + s11 = _mm256_fmadd_ps(w1, x1, s11); + s20 = _mm256_fmadd_ps(w2, x0, s20); + s21 = _mm256_fmadd_ps(w2, x1, s21); + s30 = _mm256_fmadd_ps(w3, x0, s30); + s31 = _mm256_fmadd_ps(w3, x1, s31); + x0 = _mm256_load_ps(inwptr + 16); + x1 = _mm256_load_ps(inwptr + 24); + s02 = _mm256_fmadd_ps(w0, x0, s02); + s03 = _mm256_fmadd_ps(w0, x1, s03); + s12 = _mm256_fmadd_ps(w1, x0, s12); + s13 = _mm256_fmadd_ps(w1, x1, s13); + s22 = _mm256_fmadd_ps(w2, x0, s22); + s23 = _mm256_fmadd_ps(w2, x1, s23); + s32 = _mm256_fmadd_ps(w3, x0, s32); + s33 = _mm256_fmadd_ps(w3, x1, s33); + x0 = _mm256_load_ps(inwptr + 32); + x1 = _mm256_load_ps(inwptr + 40); + s04 = _mm256_fmadd_ps(w0, x0, s04); + s05 = _mm256_fmadd_ps(w0, x1, s05); + s14 = _mm256_fmadd_ps(w1, x0, s14); + s15 = _mm256_fmadd_ps(w1, x1, s15); + s24 = _mm256_fmadd_ps(w2, x0, s24); + s25 = _mm256_fmadd_ps(w2, x1, s25); + s34 = _mm256_fmadd_ps(w3, x0, s34); + s35 = _mm256_fmadd_ps(w3, 
x1, s35); + } + + _mm256_store_ps(outbuf, s00); + _mm256_store_ps(outbuf + 1*64, s01); + _mm256_store_ps(outbuf + 2*64, s02); + _mm256_store_ps(outbuf + 3*64, s03); + _mm256_store_ps(outbuf + 4*64, s04); + _mm256_store_ps(outbuf + 5*64, s05); + + _mm256_store_ps(outbuf + 6*64, s10); + _mm256_store_ps(outbuf + 7*64, s11); + _mm256_store_ps(outbuf + 8*64, s12); + _mm256_store_ps(outbuf + 9*64, s13); + _mm256_store_ps(outbuf + 10*64, s14); + _mm256_store_ps(outbuf + 11*64, s15); + + _mm256_store_ps(outbuf + 12*64, s20); + _mm256_store_ps(outbuf + 13*64, s21); + _mm256_store_ps(outbuf + 14*64, s22); + _mm256_store_ps(outbuf + 15*64, s23); + _mm256_store_ps(outbuf + 16*64, s24); + _mm256_store_ps(outbuf + 17*64, s25); + + _mm256_store_ps(outbuf + 18*64, s30); + _mm256_store_ps(outbuf + 19*64, s31); + _mm256_store_ps(outbuf + 20*64, s32); + _mm256_store_ps(outbuf + 21*64, s33); + _mm256_store_ps(outbuf + 22*64, s34); + _mm256_store_ps(outbuf + 23*64, s35); + } + } + else + { + for (int atom_id = 0; atom_id < winoNatomF32; atom_id++, + outbuf += winoAtomF32) + { + __m256 s00 = _mm256_set1_ps(0.f), s01 = s00, s02 = s00; + __m256 s10 = _mm256_set1_ps(0.f), s11 = s00, s12 = s00; + __m256 s20 = _mm256_set1_ps(0.f), s21 = s00, s22 = s00; + __m256 s30 = _mm256_set1_ps(0.f), s31 = s00, s32 = s00; + for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32, + wptr += winoKblock*winoAtomF32) { + __m256 w0 = _mm256_load_ps(wptr), w1 = _mm256_load_ps(wptr + 8); + __m256 w2 = _mm256_load_ps(wptr + 16), w3 = _mm256_load_ps(wptr + 24); + __m256 x0, x1, x2; + x0 = _mm256_load_ps(inwptr); + x1 = _mm256_load_ps(inwptr + 8); + x2 = _mm256_load_ps(inwptr + 16); + s00 = _mm256_fmadd_ps(w0, x0, s00); + s01 = _mm256_fmadd_ps(w0, x1, s01); + s02 = _mm256_fmadd_ps(w0, x2, s02); + s10 = _mm256_fmadd_ps(w1, x0, s10); + s11 = _mm256_fmadd_ps(w1, x1, s11); + s12 = _mm256_fmadd_ps(w1, x2, s12); + s20 = _mm256_fmadd_ps(w2, x0, s20); + s21 = _mm256_fmadd_ps(w2, x1, s21); + s22 = _mm256_fmadd_ps(w2, x2, s22); + s30 = _mm256_fmadd_ps(w3, x0, s30); + s31 = _mm256_fmadd_ps(w3, x1, s31); + s32 = _mm256_fmadd_ps(w3, x2, s32); + } + + _mm256_store_ps(outbuf, s00); + _mm256_store_ps(outbuf + 1*64, s01); + _mm256_store_ps(outbuf + 2*64, s02); + _mm256_store_ps(outbuf + 6*64, s10); + _mm256_store_ps(outbuf + 7*64, s11); + _mm256_store_ps(outbuf + 8*64, s12); + _mm256_store_ps(outbuf + 12*64, s20); + _mm256_store_ps(outbuf + 13*64, s21); + _mm256_store_ps(outbuf + 14*64, s22); + _mm256_store_ps(outbuf + 18*64, s30); + _mm256_store_ps(outbuf + 19*64, s31); + _mm256_store_ps(outbuf + 20*64, s32); + } + } + _mm256_zeroupper(); +} +static inline +void transpose8_ps(__m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3, __m256 &row4, __m256 &row5, __m256 &row6, __m256 &row7) +{ + __m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7; + __m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7; + __t0 = _mm256_unpacklo_ps(row0, row1); + __t1 = _mm256_unpackhi_ps(row0, row1); + __t2 = _mm256_unpacklo_ps(row2, row3); + __t3 = _mm256_unpackhi_ps(row2, row3); + __t4 = _mm256_unpacklo_ps(row4, row5); + __t5 = _mm256_unpackhi_ps(row4, row5); + __t6 = _mm256_unpacklo_ps(row6, row7); + __t7 = _mm256_unpackhi_ps(row6, row7); + __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0)); + __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2)); + __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0)); + __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2)); + __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0)); + __tt5 = 
_mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2)); + __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0)); + __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2)); + row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); + row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); + row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); + row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); + row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); + row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); + row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); + row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); +} + +/*Input transform*/ +void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, + float* outptr, int Cg, const int winoIblock, const int winoAtomF32) +{ + __m256 x00 = _mm256_loadu_ps(inptr); + __m256 x10 = _mm256_loadu_ps(inptr + inpstep); + __m256 x20 = _mm256_loadu_ps(inptr + inpstep*2); + __m256 x30 = _mm256_loadu_ps(inptr + inpstep*3); + __m256 x40 = _mm256_loadu_ps(inptr + inpstep*4); + __m256 x50 = _mm256_loadu_ps(inptr + inpstep*5); + __m256 x60 = _mm256_loadu_ps(inptr + inpstep*6); + __m256 x70 = _mm256_loadu_ps(inptr + inpstep*7); + + __m256 z00, z10, z20, z30, z40, z50, z60, z70; + + { + /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */ + /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */ + __m256 q5_25 = _mm256_set1_ps(5.25f), t00, t10; + t00 = _mm256_sub_ps(x40, x20); + t10 = _mm256_sub_ps(x30, x50); + + __m256 y00 = _mm256_fmadd_ps(t00, q5_25, _mm256_sub_ps(x00, x60)); + __m256 y70 = _mm256_fmadd_ps(t10, q5_25, _mm256_sub_ps(x70, x10)); + + /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */ + /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */ + __m256 qm4_25 = _mm256_set1_ps(-4.25f); + t00 = _mm256_fmadd_ps(x30, qm4_25, _mm256_add_ps(x10, x50)); + t10 = _mm256_fmadd_ps(x40, qm4_25, _mm256_add_ps(x20, x60)); + + __m256 y10 = _mm256_add_ps(t00, t10); + __m256 y20 = _mm256_sub_ps(t10, t00); + + /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */ + /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */ + __m256 q0_5 = _mm256_set1_ps(0.5f), q0_25 = _mm256_set1_ps(0.25f); + __m256 qm2_5 = _mm256_set1_ps(-2.5f), qm1_25 = _mm256_set1_ps(-1.25f); + t00 = _mm256_fmadd_ps(x10, q0_5, _mm256_add_ps(x50, x50)); + t10 = _mm256_fmadd_ps(x20, q0_25, x60); + t00 = _mm256_fmadd_ps(x30, qm2_5, t00); + t10 = _mm256_fmadd_ps(x40, qm1_25, t10); + + __m256 y30 = _mm256_add_ps(t00, t10); + __m256 y40 = _mm256_sub_ps(t10, t00); + + /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */ + /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */ + __m256 q4 = _mm256_set1_ps(4.f), qm5 = _mm256_set1_ps(-5.f); + t00 = _mm256_fmadd_ps(x50, q0_5, _mm256_add_ps(x10, x10)); + t10 = _mm256_fmadd_ps(x20, q4 , x60); + t00 = _mm256_fmadd_ps(x30, qm2_5, t00); + t10 = _mm256_fmadd_ps(x40, qm5 , t10); + + __m256 y50 = _mm256_add_ps(t00, t10); + __m256 y60 = _mm256_sub_ps(t10, t00); + + /* transpose 8x8 matrix in-place with some renumeration of the elements: */ + transpose8_ps(y00, y10, y20, y30, y40, y50, y60, y70); + + /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */ + /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */ + t00 = _mm256_sub_ps(y40, y20); + t10 = _mm256_sub_ps(y30, y50); + z00 = _mm256_fmadd_ps(t00, q5_25, _mm256_sub_ps(y00, y60)); + z70 = _mm256_fmadd_ps(t10, q5_25, _mm256_sub_ps(y70, y10)); + + /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */ + /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 
0.f]*Y */ + t00 = _mm256_fmadd_ps(y30, qm4_25, _mm256_add_ps(y10, y50)); + t10 = _mm256_fmadd_ps(y40, qm4_25, _mm256_add_ps(y20, y60)); + z10 = _mm256_add_ps(t00, t10); + z20 = _mm256_sub_ps(t10, t00); + + /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */ + /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */ + t00 = _mm256_fmadd_ps(y10, q0_5, _mm256_add_ps(y50, y50)); + t10 = _mm256_fmadd_ps(y20, q0_25, y60); + t00 = _mm256_fmadd_ps(y30, qm2_5, t00); + t10 = _mm256_fmadd_ps(y40, qm1_25, t10); + + z30 = _mm256_add_ps(t00, t10); + z40 = _mm256_sub_ps(t10, t00); + + /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */ + /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */ + t00 = _mm256_fmadd_ps(y50, q0_5, _mm256_add_ps(y10, y10)); + t10 = _mm256_fmadd_ps(y20, q4, y60); + t00 = _mm256_fmadd_ps(y30, qm2_5, t00); + t10 = _mm256_fmadd_ps(y40, qm5, t10); + + z50 = _mm256_add_ps(t00, t10); + z60 = _mm256_sub_ps(t10, t00); + } + + const int outstep = winoIblock*winoAtomF32*Cg; + + _mm256_storeu_ps(outptr, z00); + _mm256_storeu_ps(outptr + outstep, z10); + _mm256_storeu_ps(outptr + outstep*2, z20); + _mm256_storeu_ps(outptr + outstep*3, z30); + _mm256_storeu_ps(outptr + outstep*4, z40); + _mm256_storeu_ps(outptr + outstep*5, z50); + _mm256_storeu_ps(outptr + outstep*6, z60); + _mm256_storeu_ps(outptr + outstep*7, z70); + _mm256_zeroupper(); +} + +#define STORE6_ELE_FROM_16(ptr, z00, lowM, highM) \ + lowM = _mm256_castps256_ps128(z00); \ + highM = _mm256_extractf128_ps(z00, 1); \ + _mm_storeu_ps(ptr, lowM); \ + _mm_storel_epi64((__m128i*)(ptr + 4), _mm_castps_si128(highM)) + +/* Inverse Winograd 8x8 transform: + out = (A'*inp*A)', where + inp is input 8x8 FP32 matrix, + A' is + [1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, + 0.f, 1.f, -1.f, 2.f, -2.f, 0.5f, -0.5f, 0.f, + 0.f, 1.f, 1.f, 4.f, 4.f, 0.25f, 0.25f, 0.f, + 0.f, 1.f, -1.f, 8.f, -8.f, 0.125f, -0.125f, 0.f, + 0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f, + 0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f] +*/ +void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, + float* bpptr, int bpstep, float* outptr, int outstep, + float bias, float minval, float maxval, bool ifMinMaxAct) +{ + + __m256 x00 = _mm256_load_ps(inptr); + __m256 x10 = _mm256_load_ps(inptr + inpstep); + __m256 x20 = _mm256_load_ps(inptr + inpstep*2); + __m256 x30 = _mm256_load_ps(inptr + inpstep*3); + __m256 x40 = _mm256_load_ps(inptr + inpstep*4); + __m256 x50 = _mm256_load_ps(inptr + inpstep*5); + __m256 x60 = _mm256_load_ps(inptr + inpstep*6); + __m256 x70 = _mm256_load_ps(inptr + inpstep*7); + __m256 z00, z10, z20, z30, z40, z50; + + { + __m256 s12_0, s34_0, s56_0; + s12_0 = _mm256_add_ps(x10, x20); + s34_0 = _mm256_add_ps(x30, x40); + s56_0 = _mm256_add_ps(x50, x60); + + __m256 y00 = _mm256_add_ps(x00, _mm256_add_ps(s12_0, _mm256_add_ps(s34_0, s56_0))); + __m256 y20 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.25f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(4.0f), s12_0)); + __m256 y40 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/16), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(16.0f), s12_0)); + + s12_0 = _mm256_sub_ps(x10, x20); + s34_0 = _mm256_sub_ps(x30, x40); + s56_0 = _mm256_sub_ps(x50, x60); + __m256 y50 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/32), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(32.f), _mm256_add_ps(x70, s12_0))); + __m256 y10 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.5f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(2.f), s12_0)); + __m256 y30 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.125f), _mm256_fmadd_ps(s34_0, 
_mm256_set1_ps(8.f), s12_0)); + __m256 y60 = _mm256_set1_ps(0.f), y70 = y60; + + /* transpose 8x8 matrix in-place with some renumeration of the elements: */ + + transpose8_ps(y00, y10, y20, y30, y40, y50, y60, y70); + + s12_0 = _mm256_add_ps(y10, y20); + s34_0 = _mm256_add_ps(y30, y40); + s56_0 = _mm256_add_ps(y50, y60); + + z00 = _mm256_add_ps(y00, _mm256_add_ps(s12_0, _mm256_add_ps(s34_0, s56_0))); + z20 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.25f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(4.0f), s12_0)); + z40 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/16), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(16.0f), s12_0)); + + s12_0 = _mm256_sub_ps(y10, y20); + s34_0 = _mm256_sub_ps(y30, y40); + s56_0 = _mm256_sub_ps(y50, y60); + + z50 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/32), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(32.0f), _mm256_add_ps(y70, s12_0))); + z10 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.5f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(2.0f), s12_0)); + z30 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.125f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(8.0f), s12_0)); + + __m256 vbias = _mm256_set1_ps(bias); + z00 = _mm256_add_ps(vbias, z00); + z10 = _mm256_add_ps(vbias, z10); + z20 = _mm256_add_ps(vbias, z20); + z30 = _mm256_add_ps(vbias, z30); + z40 = _mm256_add_ps(vbias, z40); + z50 = _mm256_add_ps(vbias, z50); + } + + if (bpptr) + { + z00 = _mm256_add_ps(z00, _mm256_loadu_ps(bpptr)); + z10 = _mm256_add_ps(z10, _mm256_loadu_ps(bpptr + bpstep)); + z20 = _mm256_add_ps(z20, _mm256_loadu_ps(bpptr + bpstep*2)); + z30 = _mm256_add_ps(z30, _mm256_loadu_ps(bpptr + bpstep*3)); + z40 = _mm256_add_ps(z40, _mm256_loadu_ps(bpptr + bpstep*4)); + z50 = _mm256_add_ps(z50, _mm256_loadu_ps(bpptr + bpstep*5)); + } + + if (ifMinMaxAct) + { + __m256 vmax = _mm256_set1_ps(maxval); + __m256 vmin = _mm256_set1_ps(minval); + + z00 = _mm256_min_ps(_mm256_max_ps(z00, vmin), vmax); + z10 = _mm256_min_ps(_mm256_max_ps(z10, vmin), vmax); + z20 = _mm256_min_ps(_mm256_max_ps(z20, vmin), vmax); + z30 = _mm256_min_ps(_mm256_max_ps(z30, vmin), vmax); + z40 = _mm256_min_ps(_mm256_max_ps(z40, vmin), vmax); + z50 = _mm256_min_ps(_mm256_max_ps(z50, vmin), vmax); + } + + __m128 lowM, highM; + STORE6_ELE_FROM_16(outptr, z00, lowM, highM); + STORE6_ELE_FROM_16(outptr + outstep, z10, lowM, highM); + STORE6_ELE_FROM_16(outptr + outstep * 2, z20, lowM, highM); + STORE6_ELE_FROM_16(outptr + outstep * 3, z30, lowM, highM); + STORE6_ELE_FROM_16(outptr + outstep * 4, z40, lowM, highM); + STORE6_ELE_FROM_16(outptr + outstep * 5, z50, lowM, highM); + _mm256_zeroupper(); +} +#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +CV_CPU_OPTIMIZATION_NAMESPACE_END + +// NEON code work around. 
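// The AVX kernel above and the NEON kernel below implement the same inverse transform
// out = (A'*inp*A)', with the 6x8 matrix A' quoted in the comments. As a sanity check it can
// be compared against a plain scalar reference such as the sketch below; ref_AtXA_8x8 is a
// hypothetical helper (not part of this patch) and computes only the bare 8x8 -> 6x6
// transform, leaving out the bias, by-pass add and min/max activation that the real kernels
// fuse in.
static void ref_AtXA_8x8(const float X[8][8], float out[6][6])
{
    // A' (6x8) exactly as listed above winofunc_AtXA_8x8_f32.
    static const float At[6][8] = {
        {1.f, 1.f,  1.f,  1.f,   1.f,  1.f,      1.f,     0.f},
        {0.f, 1.f, -1.f,  2.f,  -2.f,  0.5f,    -0.5f,    0.f},
        {0.f, 1.f,  1.f,  4.f,   4.f,  0.25f,    0.25f,   0.f},
        {0.f, 1.f, -1.f,  8.f,  -8.f,  0.125f,  -0.125f,  0.f},
        {0.f, 1.f,  1.f, 16.f,  16.f,  1.f/16,   1.f/16,  0.f},
        {0.f, 1.f, -1.f, 32.f, -32.f,  1.f/32,  -1.f/32,  1.f}
    };
    float M[6][8];                        // M = A'*X
    for (int i = 0; i < 6; i++)
        for (int k = 0; k < 8; k++)
        {
            float s = 0.f;
            for (int j = 0; j < 8; j++)
                s += At[i][j] * X[j][k];
            M[i][k] = s;
        }
    for (int i = 0; i < 6; i++)           // out = (M*A)', where A = (A')^T
        for (int k = 0; k < 6; k++)
        {
            float s = 0.f;
            for (int j = 0; j < 8; j++)
                s += M[i][j] * At[k][j];  // A[j][k] == At[k][j]
            out[k][i] = s;                // store transposed, per the convention above
        }
}
// Feeding the same 8x8 tile to winofunc_AtXA_8x8_f32 with zero bias, bpptr == nullptr and
// ifMinMaxAct == false should reproduce these 6x6 values up to FMA rounding differences.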
+namespace opt_NEON +{ + +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON && CV_NEON_AARCH64 +/* Accumulate */ +void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock, + const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32); + +/*Input transform*/ +void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, + float* outptr, int Cg, const int winoIblock, const int winoAtomF32); + +/*Output transform*/ +void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, + float* bpptr, int bpstep, float* outptr, int outstep, + float bias, float minval, float maxval, bool ifMinMaxAct); + +void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock, + const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32) +{ + CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF32 == 4); + if (iblock > 3) + { + for (int atom_id = 0; atom_id < winoNatomF32; atom_id++, + outbuf += winoAtomF32) + { + float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00; + float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00; + float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00; + float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00; + for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32, + wptr += winoKblock*winoAtomF32) { + float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4); + float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12); + float32x4_t x0, x1; + x0 = vld1q_f32(inwptr); + x1 = vld1q_f32(inwptr + 4); + s00 = vfmaq_f32(s00, w0, x0); + s01 = vfmaq_f32(s01, w0, x1); + s10 = vfmaq_f32(s10, w1, x0); + s11 = vfmaq_f32(s11, w1, x1); + s20 = vfmaq_f32(s20, w2, x0); + s21 = vfmaq_f32(s21, w2, x1); + s30 = vfmaq_f32(s30, w3, x0); + s31 = vfmaq_f32(s31, w3, x1); + x0 = vld1q_f32(inwptr + 8); + x1 = vld1q_f32(inwptr + 12); + s02 = vfmaq_f32(s02, w0, x0); + s03 = vfmaq_f32(s03, w0, x1); + s12 = vfmaq_f32(s12, w1, x0); + s13 = vfmaq_f32(s13, w1, x1); + s22 = vfmaq_f32(s22, w2, x0); + s23 = vfmaq_f32(s23, w2, x1); + s32 = vfmaq_f32(s32, w3, x0); + s33 = vfmaq_f32(s33, w3, x1); + x0 = vld1q_f32(inwptr + 16); + x1 = vld1q_f32(inwptr + 20); + s04 = vfmaq_f32(s04, w0, x0); + s05 = vfmaq_f32(s05, w0, x1); + s14 = vfmaq_f32(s14, w1, x0); + s15 = vfmaq_f32(s15, w1, x1); + s24 = vfmaq_f32(s24, w2, x0); + s25 = vfmaq_f32(s25, w2, x1); + s34 = vfmaq_f32(s34, w3, x0); + s35 = vfmaq_f32(s35, w3, x1); + } + + vst1q_f32(outbuf, s00); + vst1q_f32(outbuf + 1*64, s01); + vst1q_f32(outbuf + 2*64, s02); + vst1q_f32(outbuf + 3*64, s03); + vst1q_f32(outbuf + 4*64, s04); + vst1q_f32(outbuf + 5*64, s05); + + vst1q_f32(outbuf + 6*64, s10); + vst1q_f32(outbuf + 7*64, s11); + vst1q_f32(outbuf + 8*64, s12); + vst1q_f32(outbuf + 9*64, s13); + vst1q_f32(outbuf + 10*64, s14); + vst1q_f32(outbuf + 11*64, s15); + + vst1q_f32(outbuf + 12*64, s20); + vst1q_f32(outbuf + 13*64, s21); + vst1q_f32(outbuf + 14*64, s22); + vst1q_f32(outbuf + 15*64, s23); + vst1q_f32(outbuf + 16*64, s24); + vst1q_f32(outbuf + 17*64, s25); + + vst1q_f32(outbuf + 18*64, s30); + vst1q_f32(outbuf + 19*64, s31); + vst1q_f32(outbuf + 20*64, s32); + vst1q_f32(outbuf + 21*64, s33); + vst1q_f32(outbuf + 22*64, s34); + vst1q_f32(outbuf + 23*64, s35); + } + } + else + { + for (int atom_id = 0; atom_id < winoNatomF32; atom_id++, + outbuf += winoAtomF32) + { + 
float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00; + float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00; + float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00; + float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00; + for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32, + wptr += winoKblock*winoAtomF32) { + float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4); + float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12); + float32x4_t x0, x1, x2; + x0 = vld1q_f32(inwptr); + x1 = vld1q_f32(inwptr + 4); + x2 = vld1q_f32(inwptr + 8); + s00 = vfmaq_f32(s00, w0, x0); + s01 = vfmaq_f32(s01, w0, x1); + s02 = vfmaq_f32(s02, w0, x2); + s10 = vfmaq_f32(s10, w1, x0); + s11 = vfmaq_f32(s11, w1, x1); + s12 = vfmaq_f32(s12, w1, x2); + s20 = vfmaq_f32(s20, w2, x0); + s21 = vfmaq_f32(s21, w2, x1); + s22 = vfmaq_f32(s22, w2, x2); + s30 = vfmaq_f32(s30, w3, x0); + s31 = vfmaq_f32(s31, w3, x1); + s32 = vfmaq_f32(s32, w3, x2); + } + + vst1q_f32(outbuf, s00); + vst1q_f32(outbuf + 1*64, s01); + vst1q_f32(outbuf + 2*64, s02); + vst1q_f32(outbuf + 6*64, s10); + vst1q_f32(outbuf + 7*64, s11); + vst1q_f32(outbuf + 8*64, s12); + vst1q_f32(outbuf + 12*64, s20); + vst1q_f32(outbuf + 13*64, s21); + vst1q_f32(outbuf + 14*64, s22); + vst1q_f32(outbuf + 18*64, s30); + vst1q_f32(outbuf + 19*64, s31); + vst1q_f32(outbuf + 20*64, s32); + } + } +} + +#define T4x4(a, b, c, d, tr0, tr1) \ + tr0 = vtrnq_f32(a, b); \ + tr1 = vtrnq_f32(c, d); \ + a = vcombine_f32(vget_low_f32(tr0.val[0]), vget_low_f32(tr1.val[0])); \ + b = vcombine_f32(vget_low_f32(tr0.val[1]), vget_low_f32(tr1.val[1])); \ + c = vcombine_f32(vget_high_f32(tr0.val[0]), vget_high_f32(tr1.val[0])); \ + d = vcombine_f32(vget_high_f32(tr0.val[1]), vget_high_f32(tr1.val[1])) + +/*Input transform*/ +void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, + float* outptr, int Cg, const int winoIblock, const int winoAtomF32) +{ + float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4); + float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4); + float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4); + float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4); + float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4); + float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4); + float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4); + float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4); + + float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71; + + { + /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */ + /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */ + float32x4_t q5_25 = vdupq_n_f32(5.25f), t00, t01, t10, t11; + t00 = vsubq_f32(x40, x20); + t01 = vsubq_f32(x41, x21); + t10 = vsubq_f32(x30, x50); + t11 = vsubq_f32(x31, x51); + float32x4_t y00 = vfmaq_f32(vsubq_f32(x00, x60), t00, q5_25); + float32x4_t y01 = vfmaq_f32(vsubq_f32(x01, x61), t01, q5_25); + float32x4_t y70 = vfmaq_f32(vsubq_f32(x70, x10), t10, q5_25); + float32x4_t y71 = vfmaq_f32(vsubq_f32(x71, x11), t11, q5_25); + + /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */ + /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */ + float32x4_t qm4_25 = vdupq_n_f32(-4.25f); + t00 = vfmaq_f32(vaddq_f32(x10, x50), x30, qm4_25); + t01 = vfmaq_f32(vaddq_f32(x11, x51), x31, 
qm4_25); + t10 = vfmaq_f32(vaddq_f32(x20, x60), x40, qm4_25); + t11 = vfmaq_f32(vaddq_f32(x21, x61), x41, qm4_25); + + float32x4_t y10 = vaddq_f32(t00, t10), y11 = vaddq_f32(t01, t11); + float32x4_t y20 = vsubq_f32(t10, t00), y21 = vsubq_f32(t11, t01); + + /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */ + /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */ + float32x4_t q0_5 = vdupq_n_f32(0.5f), q0_25 = vdupq_n_f32(0.25f); + float32x4_t qm2_5 = vdupq_n_f32(-2.5f), qm1_25 = vdupq_n_f32(-1.25f); + t00 = vfmaq_f32(vaddq_f32(x50, x50), x10, q0_5); + t01 = vfmaq_f32(vaddq_f32(x51, x51), x11, q0_5); + t10 = vfmaq_f32(x60, x20, q0_25); + t11 = vfmaq_f32(x61, x21, q0_25); + t00 = vfmaq_f32(t00, x30, qm2_5); + t01 = vfmaq_f32(t01, x31, qm2_5); + t10 = vfmaq_f32(t10, x40, qm1_25); + t11 = vfmaq_f32(t11, x41, qm1_25); + + float32x4_t y30 = vaddq_f32(t00, t10), y31 = vaddq_f32(t01, t11); + float32x4_t y40 = vsubq_f32(t10, t00), y41 = vsubq_f32(t11, t01); + + /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */ + /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */ + float32x4_t q4 = vdupq_n_f32(4.f), qm5 = vdupq_n_f32(-5.f); + t00 = vfmaq_f32(vaddq_f32(x10, x10), x50, q0_5); + t01 = vfmaq_f32(vaddq_f32(x11, x11), x51, q0_5); + t10 = vfmaq_f32(x60, x20, q4); + t11 = vfmaq_f32(x61, x21, q4); + t00 = vfmaq_f32(t00, x30, qm2_5); + t01 = vfmaq_f32(t01, x31, qm2_5); + t10 = vfmaq_f32(t10, x40, qm5); + t11 = vfmaq_f32(t11, x41, qm5); + + float32x4_t y50 = vaddq_f32(t00, t10), y51 = vaddq_f32(t01, t11); + float32x4_t y60 = vsubq_f32(t10, t00), y61 = vsubq_f32(t11, t01); + + /* transpose 8x8 matrix in-place with some renumeration of the elements: */ + /* Y: */ + /* y00 y01 */ + /* y10 y11 */ + /* ... */ + /* y70 y71 */ + /* Y': */ + /* y00 y40 */ + /* y10 y50 */ + /* y20 y60 */ + /* y30 y70 */ + /* y01 y41 */ + /* y11 y51 */ + /* y21 y61 */ + /* y31 y71 */ + /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */ + float32x4x2_t tr0, tr1; + + T4x4(y00, y10, y20, y30, tr0, tr1); + T4x4(y01, y11, y21, y31, tr0, tr1); + T4x4(y40, y50, y60, y70, tr0, tr1); + T4x4(y41, y51, y61, y71, tr0, tr1); + + /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */ + /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */ + t00 = vsubq_f32(y01, y20); + t01 = vsubq_f32(y41, y60); + t10 = vsubq_f32(y30, y11); + t11 = vsubq_f32(y70, y51); + z00 = vfmaq_f32(vsubq_f32(y00, y21), t00, q5_25); + z01 = vfmaq_f32(vsubq_f32(y40, y61), t01, q5_25); + z70 = vfmaq_f32(vsubq_f32(y31, y10), t10, q5_25); + z71 = vfmaq_f32(vsubq_f32(y71, y50), t11, q5_25); + + /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */ + /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */ + t00 = vfmaq_f32(vaddq_f32(y10, y11), y30, qm4_25); + t01 = vfmaq_f32(vaddq_f32(y50, y51), y70, qm4_25); + t10 = vfmaq_f32(vaddq_f32(y20, y21), y01, qm4_25); + t11 = vfmaq_f32(vaddq_f32(y60, y61), y41, qm4_25); + + z10 = vaddq_f32(t00, t10); z11 = vaddq_f32(t01, t11); + z20 = vsubq_f32(t10, t00); z21 = vsubq_f32(t11, t01); + + /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */ + /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */ + t00 = vfmaq_f32(vaddq_f32(y11, y11), y10, q0_5); + t01 = vfmaq_f32(vaddq_f32(y51, y51), y50, q0_5); + t10 = vfmaq_f32(y21, y20, q0_25); + t11 = vfmaq_f32(y61, y60, q0_25); + t00 = vfmaq_f32(t00, y30, qm2_5); + t01 = vfmaq_f32(t01, y70, qm2_5); + t10 = vfmaq_f32(t10, y01, qm1_25); + t11 = vfmaq_f32(t11, y41, qm1_25); + + z30 = 
vaddq_f32(t00, t10); z31 = vaddq_f32(t01, t11); + z40 = vsubq_f32(t10, t00); z41 = vsubq_f32(t11, t01); + + /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */ + /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */ + t00 = vfmaq_f32(vaddq_f32(y10, y10), y11, q0_5); + t01 = vfmaq_f32(vaddq_f32(y50, y50), y51, q0_5); + t10 = vfmaq_f32(y21, y20, q4); + t11 = vfmaq_f32(y61, y60, q4); + t00 = vfmaq_f32(t00, y30, qm2_5); + t01 = vfmaq_f32(t01, y70, qm2_5); + t10 = vfmaq_f32(t10, y01, qm5); + t11 = vfmaq_f32(t11, y41, qm5); + + z50 = vaddq_f32(t00, t10); z51 = vaddq_f32(t01, t11); + z60 = vsubq_f32(t10, t00); z61 = vsubq_f32(t11, t01); + } + + const int outstep = winoIblock*winoAtomF32*Cg; + + vst1q_f32(outptr, z00); + vst1q_f32(outptr + outstep, z01); + vst1q_f32(outptr + outstep*2, z10); + vst1q_f32(outptr + outstep*3, z11); + vst1q_f32(outptr + outstep*4, z20); + vst1q_f32(outptr + outstep*5, z21); + vst1q_f32(outptr + outstep*6, z30); + vst1q_f32(outptr + outstep*7, z31); + vst1q_f32(outptr + outstep*8, z40); + vst1q_f32(outptr + outstep*9, z41); + vst1q_f32(outptr + outstep*10, z50); + vst1q_f32(outptr + outstep*11, z51); + vst1q_f32(outptr + outstep*12, z60); + vst1q_f32(outptr + outstep*13, z61); + vst1q_f32(outptr + outstep*14, z70); + vst1q_f32(outptr + outstep*15, z71); +} + +/*Output transform*/ +void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, + float* bpptr, int bpstep, float* outptr, int outstep, + float bias, float minval, float maxval, bool ifMinMaxAct) +{ + float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4); + float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4); + float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4); + float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4); + float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4); + float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4); + float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4); + float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4); + float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51; + + { + float32x4_t s12_0, s12_1, s34_0, s34_1, s56_0, s56_1; + s12_0 = vaddq_f32(x10, x20); s12_1 = vaddq_f32(x11, x21); + s34_0 = vaddq_f32(x30, x40); s34_1 = vaddq_f32(x31, x41); + s56_0 = vaddq_f32(x50, x60); s56_1 = vaddq_f32(x51, x61); + + float32x4_t y00 = vaddq_f32(vaddq_f32(vaddq_f32(x00, s12_0), s34_0), s56_0); + float32x4_t y01 = vaddq_f32(vaddq_f32(vaddq_f32(x01, s12_1), s34_1), s56_1); + float32x4_t y20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f); + float32x4_t y21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f); + float32x4_t y40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16); + float32x4_t y41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16); + + s12_0 = vsubq_f32(x10, x20); s12_1 = vsubq_f32(x11, x21); + s34_0 = vsubq_f32(x30, x40); s34_1 = vsubq_f32(x31, x41); + s56_0 = vsubq_f32(x50, x60); s56_1 = vsubq_f32(x51, x61); + + float32x4_t y50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x70, s12_0), + s34_0, 32.f), s56_0, 1.f/32); + float32x4_t y51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x71, s12_1), + s34_1, 32.f), s56_1, 1.f/32); + float32x4_t y10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f); + float32x4_t y11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f); + 
float32x4_t y30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f); + float32x4_t y31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f); + float32x4_t y60 = vdupq_n_f32(0.f), y61 = y60, y70 = y60, y71 = y60; + + /* transpose 8x8 matrix in-place with some renumeration of the elements: */ + /* Y: */ + /* y00 y01 */ + /* y10 y11 */ + /* ... */ + /* y50 y51 */ + /* 0 0 */ + /* 0 0 */ + /* Y': */ + /* y00 y40 */ + /* y10 y50 */ + /* y20 y60 */ + /* y30 y70 */ + /* y01 y41 */ + /* y11 y51 */ + /* y21 y61 */ + /* y31 y71 */ + /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */ + float32x4x2_t tr0, tr1; + + T4x4(y00, y10, y20, y30, tr0, tr1); + T4x4(y01, y11, y21, y31, tr0, tr1); + T4x4(y40, y50, y60, y70, tr0, tr1); + T4x4(y41, y51, y61, y71, tr0, tr1); + + s12_0 = vaddq_f32(y10, y20); s12_1 = vaddq_f32(y50, y60); + s34_0 = vaddq_f32(y30, y01); s34_1 = vaddq_f32(y70, y41); + s56_0 = vaddq_f32(y11, y21); s56_1 = vaddq_f32(y51, y61); + + z00 = vaddq_f32(vaddq_f32(vaddq_f32(y00, s12_0), s34_0), s56_0); + z01 = vaddq_f32(vaddq_f32(vaddq_f32(y40, s12_1), s34_1), s56_1); + z20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f); + z21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f); + z40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16); + z41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16); + + s12_0 = vsubq_f32(y10, y20); s12_1 = vsubq_f32(y50, y60); + s34_0 = vsubq_f32(y30, y01); s34_1 = vsubq_f32(y70, y41); + s56_0 = vsubq_f32(y11, y21); s56_1 = vsubq_f32(y51, y61); + + z50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y31, s12_0), + s34_0, 32.f), s56_0, 1.f/32); + z51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y71, s12_1), + s34_1, 32.f), s56_1, 1.f/32); + z10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f); + z11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f); + z30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f); + z31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f); + float32x4_t vbias = vdupq_n_f32(bias); + + z00 = vaddq_f32(z00, vbias); + z01 = vaddq_f32(z01, vbias); + z10 = vaddq_f32(z10, vbias); + z11 = vaddq_f32(z11, vbias); + z20 = vaddq_f32(z20, vbias); + z21 = vaddq_f32(z21, vbias); + z30 = vaddq_f32(z30, vbias); + z31 = vaddq_f32(z31, vbias); + z40 = vaddq_f32(z40, vbias); + z41 = vaddq_f32(z41, vbias); + z50 = vaddq_f32(z50, vbias); + z51 = vaddq_f32(z51, vbias); + } + + if (bpptr) + { + float32x2_t zhalf = vdup_n_f32(0.f); + z00 = vaddq_f32(z00, vld1q_f32(bpptr)); + z01 = vaddq_f32(z01, vcombine_f32(vld1_f32(bpptr + 4), zhalf)); + z10 = vaddq_f32(z10, vld1q_f32(bpptr + bpstep)); + z11 = vaddq_f32(z11, vcombine_f32(vld1_f32(bpptr + bpstep + 4), zhalf)); + z20 = vaddq_f32(z20, vld1q_f32(bpptr + bpstep*2)); + z21 = vaddq_f32(z21, vcombine_f32(vld1_f32(bpptr + bpstep*2 + 4), zhalf)); + z30 = vaddq_f32(z30, vld1q_f32(bpptr + bpstep*3)); + z31 = vaddq_f32(z31, vcombine_f32(vld1_f32(bpptr + bpstep*3 + 4), zhalf)); + z40 = vaddq_f32(z40, vld1q_f32(bpptr + bpstep*4)); + z41 = vaddq_f32(z41, vcombine_f32(vld1_f32(bpptr + bpstep*4 + 4), zhalf)); + z50 = vaddq_f32(z50, vld1q_f32(bpptr + bpstep*5)); + z51 = vaddq_f32(z51, vcombine_f32(vld1_f32(bpptr + bpstep*5 + 4), zhalf)); + } + + if (ifMinMaxAct) + { + float32x4_t vmax = vdupq_n_f32(maxval); + float32x4_t vmin = vdupq_n_f32(minval); + + z00 = vminq_f32(vmaxq_f32(z00, vmin), vmax); + z01 = vminq_f32(vmaxq_f32(z01, vmin), vmax); + z10 = vminq_f32(vmaxq_f32(z10, vmin), vmax); + z11 = vminq_f32(vmaxq_f32(z11, vmin), 
vmax); + z20 = vminq_f32(vmaxq_f32(z20, vmin), vmax); + z21 = vminq_f32(vmaxq_f32(z21, vmin), vmax); + z30 = vminq_f32(vmaxq_f32(z30, vmin), vmax); + z31 = vminq_f32(vmaxq_f32(z31, vmin), vmax); + z40 = vminq_f32(vmaxq_f32(z40, vmin), vmax); + z41 = vminq_f32(vmaxq_f32(z41, vmin), vmax); + z50 = vminq_f32(vmaxq_f32(z50, vmin), vmax); + z51 = vminq_f32(vmaxq_f32(z51, vmin), vmax); + } + + vst1q_f32(outptr, z00); + vst1_f32(outptr + 4, vget_low_f32(z01)); + vst1q_f32(outptr + outstep, z10); + vst1_f32(outptr + outstep + 4, vget_low_f32(z11)); + vst1q_f32(outptr + outstep*2, z20); + vst1_f32(outptr + outstep*2 + 4, vget_low_f32(z21)); + vst1q_f32(outptr + outstep*3, z30); + vst1_f32(outptr + outstep*3 + 4, vget_low_f32(z31)); + vst1q_f32(outptr + outstep*4, z40); + vst1_f32(outptr + outstep*4 + 4, vget_low_f32(z41)); + vst1q_f32(outptr + outstep*5, z50); + vst1_f32(outptr + outstep*5 + 4, vget_low_f32(z51)); +} + +#endif +} + +}} // namespace diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp b/modules/dnn/src/layers/cpu_kernels/convolution.cpp similarity index 73% rename from modules/dnn/src/layers/fast_convolution/fast_convolution.cpp rename to modules/dnn/src/layers/cpu_kernels/convolution.cpp index 51abf8facc..0f0da11ec7 100644 --- a/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp +++ b/modules/dnn/src/layers/cpu_kernels/convolution.cpp @@ -10,11 +10,19 @@ */ #include "../../precomp.hpp" -#include "fast_convolution.hpp" -#include "fast_convolution.simd.hpp" +#include "convolution.hpp" + +#include "conv_block.simd.hpp" +#include "layers/cpu_kernels/conv_block.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content namespace cv { namespace dnn { enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment. + +void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen, + const int convMR, const int convNR); +void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR); + Ptr initFastConv( InputArray _weightsMat, float* srcBias, @@ -94,21 +102,15 @@ Ptr initFastConv( } } - conv->conv_type = ifRunDepthWise && conv_dim != CONV_3D ? _FX_CONV_TYPE_DEPTHWISE : + conv->conv_type = ifRunDepthWise && conv_dim != CONV_3D ? CONV_TYPE_DEPTHWISE : useWinograd && (conv_dim == CONV_2D && (conv->useSIMD128 || conv->useAVX2 || conv->useNEON) && Hk == 3 && Wk == 3 && dilation_h == 1 && dilation_w == 1 && stride_h == 1 && stride_w == 1) ? - _FX_CONV_TYPE_WINOGRAD3X3 : - (ifRunDepthWiseRemain ? _FX_CONV_TYPE_DEPTHWISE_REMAIN : _FX_CONV_TYPE_GENERIC); + CONV_TYPE_WINOGRAD3X3 : + (ifRunDepthWiseRemain ? CONV_TYPE_DEPTHWISE_REMAIN : CONV_TYPE_GENERIC); #if !(CV_NEON || CV_SIMD128 || CV_TRY_AVX2) - if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // Disabel Winograd when CV_NEON, CV_SIMD128 and CV_TRY_AVX2 are not available. - conv->conv_type = _FX_CONV_TYPE_GENERIC; -#endif - -#if CV_TRY_AVX2 - // Disabel Winograd when CV_TRY_AVX2 is true, but conv->useAVX2 is false. - if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3 && !conv->useAVX2) - conv->conv_type = _FX_CONV_TYPE_GENERIC; + if (conv->conv_type == CONV_TYPE_WINOGRAD3X3) // Disabel Winograd when CV_NEON, CV_SIMD128 and CV_TRY_AVX2 are not available. 
+ conv->conv_type = CONV_TYPE_GENERIC; #endif Mat weightsMat = _weightsMat.getMat(); @@ -116,7 +118,7 @@ Ptr initFastConv( const size_t wstep = weightsMat.step1(); float *srcWeights = (float *)weightsMat.data; - if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE || conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN) + if (conv->conv_type == CONV_TYPE_DEPTHWISE || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN) { // Handle the Conv1D, Conv2D and Conv3D depth-wise. // for depth-wise convolutions on NCHW data we just preserve the weights in KCHW layout, @@ -138,7 +140,7 @@ Ptr initFastConv( weightsBufPtr[c*padded_ksize + k] = srcWeights[c*wstep + k]; }}); } - else if(conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // winograd + else if(conv->conv_type == CONV_TYPE_WINOGRAD3X3) // winograd { static const float ktm[8][3] = { {1.0f, 0.0f, 0.0f}, @@ -156,24 +158,24 @@ Ptr initFastConv( // where W is the size of Winograd-transformed kernel (8x8), // ATOM_SIZE is number of lanes in SIMD register (4 for NEON and FP32), // KBLOCK is some platform-dependent constant dependent on the number of SIMD registers. - int ksize = _FX_WINO_KSIZE * _FX_WINO_KSIZE; + int ksize = CONV_WINO_KSIZE * CONV_WINO_KSIZE; int Cg = C/ngroups; int Kg = K/ngroups; - int Kg_nblocks = (Kg + _FX_WINO_KBLOCK - 1)/_FX_WINO_KBLOCK; - size_t nweights = ngroups*Kg_nblocks*Cg*_FX_WINO_KBLOCK*_FX_WINO_AREA; + int Kg_nblocks = (Kg + CONV_WINO_KBLOCK - 1)/CONV_WINO_KBLOCK; + size_t nweights = ngroups*Kg_nblocks*Cg*CONV_WINO_KBLOCK*CONV_WINO_AREA; conv->weightsWinoBuf.reserve(nweights + VEC_ALIGN); conv->weightsWinoBufPtr = alignPtr(conv->weightsWinoBuf.data(), VEC_ALIGN); float* wptrWino = conv->weightsWinoBufPtr; memset(wptrWino, 0, nweights * sizeof(wptrWino[0])); parallel_for_(Range(0, K), [&](const Range& r0){ - float kernelTm[_FX_WINO_AREA]; + float kernelTm[CONV_WINO_AREA]; for (int k = r0.start; k < r0.end; k++) { int g = k / Kg; int k_ = k - g*Kg; - int ki = k_ / _FX_WINO_KBLOCK; - int dk = k_ - ki*_FX_WINO_KBLOCK; + int ki = k_ / CONV_WINO_KBLOCK; + int dk = k_ - ki*CONV_WINO_KBLOCK; for (int c = 0; c < Cg; c++) { @@ -204,18 +206,18 @@ Ptr initFastConv( } // repack the data. - float* wptr = wptrWino + (g*Kg_nblocks + ki) * Cg *_FX_WINO_KBLOCK*_FX_WINO_AREA + - (c*_FX_WINO_KBLOCK + dk)*_FX_WINO_ATOM_F32; - for (int i = 0; i < _FX_WINO_NATOMS_F32; i++, - wptr += Cg * _FX_WINO_KBLOCK * _FX_WINO_ATOM_F32) + float* wptr = wptrWino + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA + + (c*CONV_WINO_KBLOCK + dk)*CONV_WINO_ATOM_F32; + for (int i = 0; i < CONV_WINO_NATOMS_F32; i++, + wptr += Cg * CONV_WINO_KBLOCK * CONV_WINO_ATOM_F32) { - CV_Assert(conv->weightsWinoBufPtr <= wptr && wptr + _FX_WINO_ATOM_F32 <= conv->weightsWinoBufPtr + nweights); - memcpy(wptr, kernelTm + i * _FX_WINO_ATOM_F32, _FX_WINO_ATOM_F32*sizeof (wptr[0])); + CV_Assert(conv->weightsWinoBufPtr <= wptr && wptr + CONV_WINO_ATOM_F32 <= conv->weightsWinoBufPtr + nweights); + memcpy(wptr, kernelTm + i * CONV_WINO_ATOM_F32, CONV_WINO_ATOM_F32*sizeof (wptr[0])); } } }}); } - else if (conv->conv_type == _FX_CONV_TYPE_GENERIC) + else if (conv->conv_type == CONV_TYPE_GENERIC) { // The weights are packed as // ngroups x (ceil((K/ngroups)/CONV_MR)*CONV_MR) x (Cg*Hk*Wk*Dk) x CONV_MR tensor @@ -372,7 +374,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co fusedAddMat = _output.getMat(); } - if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE) + if (conv->conv_type == CONV_TYPE_DEPTHWISE) { // Depthwise-Convolution layer should not be followed by Add layer. 
CV_Assert((conv_dim == CONV_1D || conv_dim == CONV_2D)); @@ -420,7 +422,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co else activ = nullptr; - if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // winograd + if (conv->conv_type == CONV_TYPE_WINOGRAD3X3) // winograd { CV_Assert(conv->weightsWinoBufPtr && input.dims == 4 && conv_dim == CONV_2D); if (runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct)) @@ -454,8 +456,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co int dilation_d = conv->dilation_d, dilation_h = conv->dilation_h, dilation_w = conv->dilation_w; int ksize = Dk*Hk*Wk; - bool fast_1x1 = ksize == 1 && stride_d == 1 && stride_w == 1 && stride_h == 1 && - pad_front == 0 && pad_top == 0 && pad_left == 0; + bool fast_1x1 = ksize == 1 && stride_d == 1 && stride_w == 1 && stride_h == 1; int DkHkWkCg = Dk*Hk*Wk*Cg; std::vector ofstab_(Hk*Wk*Dk*4, 0); @@ -504,14 +505,14 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co int MAX_STRIPES = (56 + CONV_NR - 1)/CONV_NR; // Friendly to L1 cache - const int K_BLOCK_SIZE = conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN ? 1 : 32; + const int K_BLOCK_SIZE = conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? 1 : 32; const int C_BLOCK_SIZE = 256; int Kg_nblocks = (Kg + CONV_MR-1)/CONV_MR, Kg_aligned = Kg_nblocks * CONV_MR; int stripes_per_sample = ((int)out_planesize + CONV_NR - 1) / CONV_NR; - if (stripes_per_sample < ntasks * 4 && conv->conv_type != _FX_CONV_TYPE_DEPTHWISE_REMAIN) + if (stripes_per_sample < ntasks * 4 && conv->conv_type != CONV_TYPE_DEPTHWISE_REMAIN) { MAX_STRIPES = 1; stripes_per_sample = 1; @@ -555,7 +556,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co int k0, k1; int zyx0, zyx_limit, zyx_block_limit = 0; - if (stripes_per_sample == 1 && conv->conv_type != _FX_CONV_TYPE_DEPTHWISE_REMAIN) + if (stripes_per_sample == 1 && conv->conv_type != CONV_TYPE_DEPTHWISE_REMAIN) { k0 = kzyx0 * CONV_MR; k1 = kzyx1 * CONV_MR; @@ -618,7 +619,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co } } } - else if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN) + else if (conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN) { CV_Assert(Cg == 1); const int HW0 = H0 * W0; @@ -928,7 +929,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co // spacial branch for depth-wise convolution implemented using generic convolution. // In this case, CONV_MR is 1, and CONV_NR is the same. 
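// With CONV_MR == 1 that micro-kernel collapses to convBlockMR1 (added near the end of this
// file): for one output channel it computes, for each j < outLen,
//     c[j] = bias + sum_{p < np} a[p] * b[p*CONV_NR + j],
// optionally accumulating into the existing c[j] when the preceding Add is fused, and
// optionally clamping the result to [minval, maxval].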
- if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN) + if (conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN) { size_t outofs = (n * ngroups + g) * out_planesize + zyx0; float *cptr0 = cbuf_task; @@ -947,12 +948,8 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co memcpy(cptr0, cptr, outLen * sizeof(cptr[0])); cptr = cptr0; } -#if CV_TRY_AVX2 - if (conv->useAVX2 && outLen > CONV_NR/3) - opt_AVX2::convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct); - else -#endif - convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen); + + convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR); if (ifBuffer) { @@ -980,7 +977,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co { const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR); -#if CV_TRY_AVX2 || CV_TRY_NEON +#if CV_TRY_AVX || CV_TRY_AVX2 || CV_NEON // The possible CONV_NR is 28, 24, 12, so the possible CONV_NR/3 is 9, 8, 4. bool runOpt = outLen > std::min(8, CONV_NR/3); #endif @@ -992,16 +989,21 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co { #if CV_TRY_AVX2 if (conv->useAVX2 && runOpt) - opt_AVX2::convBlock_AVX2(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0); + opt_AVX2::convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, CONV_MR, CONV_NR); else #endif -#if CV_TRY_NEON +#if CV_TRY_AVX + if (conv->useAVX && runOpt) + opt_AVX::convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, CONV_MR, CONV_NR); + else +#endif +#if CV_NEON if (conv->useNEON && runOpt) - opt_NEON::convBlock_NEON(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0); + opt_NEON::convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, CONV_MR, CONV_NR); else #endif // The possible outLen range is 24 or 8~1. 
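// Whichever branch is taken, the micro-kernel computes the same register-tiled product:
//     c[i*ldc + j] = (c0 == 0 ? 0 : c[i*ldc + j]) + sum_{p < c1-c0} wptr[p*CONV_MR + i] * inptr[p*CONV_NR + j]
// for i < CONV_MR and j < outLen; c0 == 0 marks the first K-block, so the previous contents
// of c are overwritten there, while later K-blocks accumulate into them.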
- convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, outLen); + convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR); } } } @@ -1087,4 +1089,466 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co } }); } + + +/****************************************************************************************\ + SIMD and no-SIMD code for convBlock +\****************************************************************************************/ + +static void convBlockMR1NoSIMD(int np, const float* a, const float* b, float *c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR) +{ + std::vector cbuffer(outLen, 0); + float* cbuf = cbuffer.data(); + for( int p = 0; p < np; p++ ) + { + float ai = a[p]; + for( int j = 0; j < outLen; j++ ) + cbuf[j] += b[convNR*p + j] * ai; + } + + if (init_c) + { + for(int j = 0; j < outLen; j++) + { + c[j] += cbuf[j] + bias; + if (ifMinMaxAct) + c[j] = std::min(std::max(c[j], minval), maxval); + } + } + else + { + for(int j = 0; j < outLen; j++) + { + c[j] = cbuf[j] + bias; + if (ifMinMaxAct) + c[j] = std::min(std::max(c[j], minval), maxval); + } + } +} + +#if CV_SIMD128 +static void convBlockMR1x28(int np, const float* a, const float* b, float *c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR) +{ + CV_Assert(convNR == 28); + v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0; + v_float32x4 c3 = c0, c4 = c0, c5 = c0; + v_float32x4 c6 = c0; + + for (int p = 0; p < np; p++, a++, b += convNR) + { + v_float32x4 a0 = v_setall_f32(a[0]); + v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8); + v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20); + v_float32x4 b6 = v_load(b + 24); + + c0 = v_fma(b0, a0, c0); + c1 = v_fma(b1, a0, c1); + c2 = v_fma(b2, a0, c2); + c3 = v_fma(b3, a0, c3); + c4 = v_fma(b4, a0, c4); + c5 = v_fma(b5, a0, c5); + c6 = v_fma(b6, a0, c6); + } + + if (init_c) + { + c0 += v_load(c); + c1 += v_load(c + 4); + c2 += v_load(c + 8); + c3 += v_load(c + 12); + c4 += v_load(c + 16); + c5 += v_load(c + 20); + c6 += v_load(c + 24); + } + + if (ifMinMaxAct) + { + v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval); + c0 = v_min(v_max(c0, vmin), vmax); + c1 = v_min(v_max(c1, vmin), vmax); + c2 = v_min(v_max(c2, vmin), vmax); + c3 = v_min(v_max(c3, vmin), vmax); + c4 = v_min(v_max(c4, vmin), vmax); + c5 = v_min(v_max(c5, vmin), vmax); + c6 = v_min(v_max(c6, vmin), vmax); + } + + v_store(c, c0); + v_store(c + 4, c1); + v_store(c + 8, c2); + v_store(c + 12, c3); + v_store(c + 16, c4); + v_store(c + 20, c5); + v_store(c + 24, c6); +} + +static void convBlockMR1x24(int np, const float* a, const float* b, float *c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR) +{ + CV_Assert(convNR == 24); + v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0; + v_float32x4 c3 = c0, c4 = c0, c5 = c0; + + for (int p = 0; p < np; p++, a++, b += convNR) + { + v_float32x4 a0 = v_setall_f32(a[0]); + v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8); + v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20); + + c0 = v_fma(b0, a0, c0); + c1 = v_fma(b1, a0, c1); + c2 = v_fma(b2, a0, c2); + c3 = v_fma(b3, a0, c3); + c4 = v_fma(b4, a0, c4); + c5 = v_fma(b5, a0, c5); + } + + if (init_c) + { + c0 += v_load(c); + c1 += v_load(c + 4); + c2 
+= v_load(c + 8); + c3 += v_load(c + 12); + c4 += v_load(c + 16); + c5 += v_load(c + 20); + } + + if (ifMinMaxAct) + { + v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval); + c0 = v_min(v_max(c0, vmin), vmax); + c1 = v_min(v_max(c1, vmin), vmax); + c2 = v_min(v_max(c2, vmin), vmax); + c3 = v_min(v_max(c3, vmin), vmax); + c4 = v_min(v_max(c4, vmin), vmax); + c5 = v_min(v_max(c5, vmin), vmax); + } + + v_store(c, c0); + v_store(c + 4, c1); + v_store(c + 8, c2); + v_store(c + 12, c3); + v_store(c + 16, c4); + v_store(c + 20, c5); +} + +static void convBlockMR1x12(int np, const float* a, const float* b, float *c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR) +{ + CV_Assert(convNR == 12); + v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0; + for (int p = 0; p < np; p++, a++, b += convNR) + { + v_float32x4 a0 = v_setall_f32(a[0]); + v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8); + + c0 = v_fma(b0, a0, c0); + c1 = v_fma(b1, a0, c1); + c2 = v_fma(b2, a0, c2); + } + + if (init_c) + { + c0 += v_load(c); + c1 += v_load(c + 4); + c2 += v_load(c + 8); + } + + if (ifMinMaxAct) + { + v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval); + c0 = v_min(v_max(c0, vmin), vmax); + c1 = v_min(v_max(c1, vmin), vmax); + c2 = v_min(v_max(c2, vmin), vmax); + } + + v_store(c, c0); + v_store(c + 4, c1); + v_store(c + 8, c2); +} +#endif + +void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR) +{ +#if CV_SIMD128 + // The outLen represents the valid output value in CONV_NR length. + // When outLen is very small, we use the no-SIMD branch. 
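+    // For example, with convNR == 24 the SIMD kernels below are taken only when outLen > 24/3 == 8;
+    // shorter tails fall back to convBlockMR1NoSIMD, which handles any outLen.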
+ const int convNRby3 = convNR/3; + if (outLen > convNRby3) + { + if (convNR == 28) + convBlockMR1x28(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR); + else if (convNR == 24) + convBlockMR1x24(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR); + else if (convNR == 12) + convBlockMR1x12(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR); + else + convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR); + } + else + convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR); +#else + convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR); +#endif +} + +#if CV_SIMD128 +static void convBlock4x24(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) +{ + v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0; + v_float32x4 c6 = v_setzero_f32(), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6; + v_float32x4 c12 = v_setzero_f32(), c13 = c12, c14 = c12, c15 = c12, c16 = c12, c17 = c12; + v_float32x4 c18 = v_setzero_f32(), c19 = c18, c20 = c18, c21 = c18, c22 = c18, c23 = c18; + + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + v_float32x4 a0 = v_setall_f32(a[0]); + v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8); + v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20); + + c0 = v_fma(b0, a0, c0); + c1 = v_fma(b1, a0, c1); + c2 = v_fma(b2, a0, c2); + c3 = v_fma(b3, a0, c3); + c4 = v_fma(b4, a0, c4); + c5 = v_fma(b5, a0, c5); + + a0 = v_setall_f32(a[1]); + c6 = v_fma(b0, a0, c6); + c7 = v_fma(b1, a0, c7); + c8 = v_fma(b2, a0, c8); + c9 = v_fma(b3, a0, c9); + c10 = v_fma(b4, a0, c10); + c11 = v_fma(b5, a0, c11); + + a0 = v_setall_f32(a[2]); + c12 = v_fma(b0, a0, c12); + c13 = v_fma(b1, a0, c13); + c14 = v_fma(b2, a0, c14); + c15 = v_fma(b3, a0, c15); + c16 = v_fma(b4, a0, c16); + c17 = v_fma(b5, a0, c17); + + a0 = v_setall_f32(a[3]); + c18 = v_fma(b0, a0, c18); + c19 = v_fma(b1, a0, c19); + c20 = v_fma(b2, a0, c20); + c21 = v_fma(b3, a0, c21); + c22 = v_fma(b4, a0, c22); + c23 = v_fma(b5, a0, c23); + } + + if (!init_c) + { + c0 += v_load(c); + c1 += v_load(c + 4); + c2 += v_load(c + 8); + c3 += v_load(c + 12); + c4 += v_load(c + 16); + c5 += v_load(c + 20); + + c6 += v_load(c + ldc); + c7 += v_load(c + ldc + 4); + c8 += v_load(c + ldc + 8); + c9 += v_load(c + ldc + 12); + c10 += v_load(c + ldc + 16); + c11 += v_load(c + ldc + 20); + + c12 += v_load(c + ldc*2); + c13 += v_load(c + ldc*2 + 4); + c14 += v_load(c + ldc*2 + 8); + c15 += v_load(c + ldc*2 + 12); + c16 += v_load(c + ldc*2 + 16); + c17 += v_load(c + ldc*2 + 20); + + c18 += v_load(c + ldc*3); + c19 += v_load(c + ldc*3 + 4); + c20 += v_load(c + ldc*3 + 8); + c21 += v_load(c + ldc*3 + 12); + c22 += v_load(c + ldc*3 + 16); + c23 += v_load(c + ldc*3 + 20); + } + + v_store(c, c0); + v_store(c + 4, c1); + v_store(c + 8, c2); + v_store(c + 12, c3); + v_store(c + 16, c4); + v_store(c + 20, c5); + + v_store(c + ldc, c6); + v_store(c + ldc + 4, c7); + v_store(c + ldc + 8, c8); + v_store(c + ldc + 12, c9); + v_store(c + ldc + 16, c10); + v_store(c + ldc + 20, c11); + + v_store(c + ldc * 2, c12); + v_store(c + ldc * 2 + 4, c13); + v_store(c + ldc * 2 + 8, c14); + v_store(c + ldc * 2 + 12, c15); + v_store(c + ldc * 2 + 16, c16); + v_store(c + ldc * 2 + 20, c17); + + v_store(c + ldc * 3, c18); + v_store(c + ldc * 3 + 4, c19); + v_store(c + ldc * 3 + 8, c20); + 
v_store(c + ldc * 3 + 12, c21); + v_store(c + ldc * 3 + 16, c22); + v_store(c + ldc * 3 + 20, c23); +} + +static void convBlock4x8(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) +{ + CV_Assert(convNR >= 4); + v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0; + v_float32x4 c4 = c0, c5 = c0, c6 = c0, c7 = c0; + + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + v_float32x4 a0 = v_setall_f32(a[0]); + v_float32x4 a1 = v_setall_f32(a[1]); + v_float32x4 a2 = v_setall_f32(a[2]); + v_float32x4 a3 = v_setall_f32(a[3]); + + v_float32x4 b0 = v_load(b), b1 = v_load(b + 4); + + c0 = v_fma(b0, a0, c0); + c1 = v_fma(b1, a0, c1); + + c2 = v_fma(b0, a1, c2); + c3 = v_fma(b1, a1, c3); + + c4 = v_fma(b0, a2, c4); + c5 = v_fma(b1, a2, c5); + + c6 = v_fma(b0, a3, c6); + c7 = v_fma(b1, a3, c7); + } + + if (!init_c) + { + c0 += v_load(c); + c1 += v_load(c + 4); + + c2 += v_load(c + ldc); + c3 += v_load(c + ldc + 4); + + c4 += v_load(c + ldc*2); + c5 += v_load(c + ldc*2 + 4); + + c6 += v_load(c + ldc*3); + c7 += v_load(c + ldc*3 + 4); + } + + v_store(c, c0); + v_store(c + 4, c1); + v_store(c + ldc, c2); + v_store(c + ldc + 4, c3); + v_store(c + ldc * 2, c4); + v_store(c + ldc * 2 + 4, c5); + v_store(c + ldc * 3, c6); + v_store(c + ldc * 3 + 4, c7); +} + +static void convBlock4x4(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) +{ + CV_Assert(convNR >= 4); + v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0; + + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + v_float32x4 a0 = v_setall_f32(a[0]); + v_float32x4 a1 = v_setall_f32(a[1]); + v_float32x4 a2 = v_setall_f32(a[2]); + v_float32x4 a3 = v_setall_f32(a[3]); + + v_float32x4 b0 = v_load(b); + + c0 = v_fma(b0, a0, c0); + c1 = v_fma(b0, a1, c1); + c2 = v_fma(b0, a2, c2); + c3 = v_fma(b0, a3, c3); + } + + if (!init_c) + { + c0 += v_load(c); + c1 += v_load(c + ldc); + c2 += v_load(c + ldc*2); + c3 += v_load(c + ldc*3); + } + + v_store(c, c0); + v_store(c + ldc, c1); + v_store(c + ldc * 2, c2); + v_store(c + ldc * 3, c3); +} +#endif + +static void convBlockNoSIMD(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen, + const int convMR, const int convNR) +{ + std::vector cbuffer(convMR * outLen, 0); + float* cbuf = cbuffer.data(); + for( int p = 0; p < np; p++ ) + { + for( int i = 0; i < convMR; i++ ) + { + float ai = a[convMR*p + i]; + for( int j = 0; j < outLen; j++ ) + cbuf[i * outLen+j] += b[convNR*p + j] * ai; + } + } + + if (!init_c) + { + for(int i = 0; i < convMR; i++) + { + for(int j = 0; j < outLen; j++) + c[i*ldc + j] += cbuf[i*outLen + j]; + } + } + else + { + for(int i = 0; i < convMR; i++) + { + for(int j = 0; j < outLen; j++) + c[i*ldc + j] = cbuf[i*outLen + j]; + } + } +} + +void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen, + const int convMR, const int convNR) +{ + // The possible outLen range is [24, 8~1]. 
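+    // Dispatch (with CV_SIMD128): outLen > 8 together with convNR == 24 uses convBlock4x24,
+    // outLen in (4, 8] uses convBlock4x8, outLen in (1, 4] uses convBlock4x4,
+    // and any other case (including outLen == 1) falls back to the scalar convBlockNoSIMD.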
+#if CV_SIMD128 + CV_Assert(convMR == 4); + if (outLen > 8 && convNR == 24) + { + convBlock4x24(np, a, b, c, ldc, init_c, convMR, convNR); + return; + } + + if (outLen <= 8 && outLen > 4) + { + convBlock4x8(np, a, b, c, ldc, init_c, convMR, convNR); + return; + } + + if (outLen <= 4 && outLen > 1) + { + convBlock4x4(np, a, b, c, ldc, init_c, convMR, convNR); + return; + } + convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen, convMR, convNR); +#else + convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen, convMR, convNR); +#endif +} + }} // namespace cv::dnn diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp b/modules/dnn/src/layers/cpu_kernels/convolution.hpp similarity index 69% rename from modules/dnn/src/layers/fast_convolution/fast_convolution.hpp rename to modules/dnn/src/layers/cpu_kernels/convolution.hpp index 7794078bb4..0a077bf800 100644 --- a/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp +++ b/modules/dnn/src/layers/cpu_kernels/convolution.hpp @@ -22,27 +22,29 @@ // Winograd Params enum { - _FX_WINO_STEP=6, - _FX_WINO_KSIZE=3, - _FX_WINO_SIZE=_FX_WINO_STEP+_FX_WINO_KSIZE-1, - _FX_WINO_AREA=_FX_WINO_SIZE*_FX_WINO_SIZE, + CONV_WINO_STEP=6, + CONV_WINO_KSIZE=3, + CONV_WINO_SIZE=CONV_WINO_STEP+CONV_WINO_KSIZE-1, // 8 + CONV_WINO_AREA=CONV_WINO_SIZE*CONV_WINO_SIZE, - _FX_WINO_KBLOCK = 4, + CONV_WINO_KBLOCK = 4, #if (CV_NEON && CV_NEON_AARCH64) || CV_TRY_AVX2 - _FX_WINO_IBLOCK = 6, + CONV_WINO_IBLOCK = 6, #else - _FX_WINO_IBLOCK = 3, + CONV_WINO_IBLOCK = 3, #endif #if CV_TRY_AVX2 - _FX_WINO_ATOM_F32 = 8, + CONV_WINO_ATOM_F32 = 8, #else - _FX_WINO_ATOM_F32 = 4, + CONV_WINO_ATOM_F32 = 4, #endif - _FX_WINO_NATOMS_F32 = _FX_WINO_AREA / _FX_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16. + CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16. }; -enum { _FX_CONV_TYPE_GENERIC=0, _FX_CONV_TYPE_DEPTHWISE=1, _FX_CONV_TYPE_WINOGRAD3X3=2, _FX_CONV_TYPE_DEPTHWISE_REMAIN=3 }; + +// NOTE that: CONV_TYPE_DEPTHWISE is for 3x3 depthwise conv, and others depthwise will be set as CONV_TYPE_DEPTHWISE_REMAIN. 
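+// For example, a 5x5 depthwise convolution is classified as CONV_TYPE_DEPTHWISE_REMAIN.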
+enum { CONV_TYPE_GENERIC=0, CONV_TYPE_DEPTHWISE=1, CONV_TYPE_WINOGRAD3X3=2, CONV_TYPE_DEPTHWISE_REMAIN=3 }; enum { CONV_1D = 0, CONV_2D = 1, CONV_3D = 2 }; #endif @@ -105,22 +107,6 @@ void runDepthwise(InputArray _input, OutputArray _output, const Ptr& c int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr& conv, int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct); -namespace opt_AVX2 -{ -#if CV_TRY_AVX2 -void convBlock_AVX2(int np, const float* a, const float* b, float* c, int ldc, bool init_c); - -void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c, const float minval, - const float maxval, bool ifMinMaxAct); - -void _fx_winograd_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock); -void _fx_winograd_BtXB_8x8_f32(const float* inptr, int inpstep, float* outptr, int Cg); -void _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep, - float bias, float minval, float maxval, bool ifMinMaxAct); - -#endif -} // namespace opt_AVX2 - } // namespace dnn } // namespace cv diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp deleted file mode 100644 index c98fbe72bd..0000000000 --- a/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp +++ /dev/null @@ -1,499 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -#include "../../precomp.hpp" -#include "fast_convolution.hpp" - -namespace cv { -namespace dnn { -namespace opt_AVX2 -{ -#if CV_TRY_AVX2 -void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c, - const float minval, const float maxval, bool ifMinMaxAct) -{ -#if CONV_NR == 24 - __m256 c0 = _mm256_set1_ps(bias), c1 = c0, c2 = c0; - - for (int p = 0; p < np; p++, a++, b += CONV_NR) - { - __m256 a0 = _mm256_set1_ps(a[0]); - __m256 b0 = _mm256_loadu_ps(b), b1 = _mm256_loadu_ps(b + 8), b2 = _mm256_loadu_ps(b + 16); - - c0 = _mm256_fmadd_ps(b0, a0, c0); - c1 = _mm256_fmadd_ps(b1, a0, c1); - c2 = _mm256_fmadd_ps(b2, a0, c2); - } - - if (init_c) - { - c0 = _mm256_add_ps(_mm256_loadu_ps(c), c0); - c1 = _mm256_add_ps(_mm256_loadu_ps(c + 8), c1); - c2 = _mm256_add_ps(_mm256_loadu_ps(c + 16), c2); - } - - if (ifMinMaxAct) - { - __m256 vmax = _mm256_set1_ps(maxval); - __m256 vmin = _mm256_set1_ps(minval); - - c0 = _mm256_min_ps(_mm256_max_ps(c0, vmin), vmax); - c1 = _mm256_min_ps(_mm256_max_ps(c1, vmin), vmax); - c2 = _mm256_min_ps(_mm256_max_ps(c2, vmin), vmax); - } - - _mm256_storeu_ps(c, c0); - _mm256_storeu_ps(c + 8, c1); - _mm256_storeu_ps(c + 16, c2); - _mm256_zeroupper(); -#else -#error "unsupported CONV_NR in convBlockMR1." 
-#endif -} - -void convBlock_AVX2(int np, const float* a, const float* b, float* c, int ldc, bool init_c) -{ -#if CONV_MR == 4 && CONV_NR == 24 - __m256 c00 = _mm256_set1_ps(0.f), c01 = c00, c02 = c00; - __m256 c10 = c00, c11 = c00, c12 = c00; - __m256 c20 = c00, c21 = c00, c22 = c00; - __m256 c30 = c00, c31 = c00, c32 = c00; - - __m256 a0 = _mm256_setzero_ps(), a1 = _mm256_setzero_ps(); - __m256 b0 = _mm256_setzero_ps(), b1 = _mm256_setzero_ps(), b2 = _mm256_setzero_ps(); - - for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR) - { - a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]); - b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16); - - c00 = _mm256_fmadd_ps(b0, a0, c00); - c01 = _mm256_fmadd_ps(b1, a0, c01); - c02 = _mm256_fmadd_ps(b2, a0, c02); - - c10 = _mm256_fmadd_ps(b0, a1, c10); - c11 = _mm256_fmadd_ps(b1, a1, c11); - c12 = _mm256_fmadd_ps(b2, a1, c12); - - a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]); - - c20 = _mm256_fmadd_ps(b0, a0, c20); - c21 = _mm256_fmadd_ps(b1, a0, c21); - c22 = _mm256_fmadd_ps(b2, a0, c22); - - c30 = _mm256_fmadd_ps(b0, a1, c30); - c31 = _mm256_fmadd_ps(b1, a1, c31); - c32 = _mm256_fmadd_ps(b2, a1, c32); - } - - if (!init_c) - { - c00 = _mm256_add_ps(c00, _mm256_load_ps(c)); - c01 = _mm256_add_ps(c01, _mm256_load_ps(c + 8)); - c02 = _mm256_add_ps(c02, _mm256_load_ps(c + 16)); - - c10 = _mm256_add_ps(c10, _mm256_load_ps(c + ldc)); - c11 = _mm256_add_ps(c11, _mm256_load_ps(c + ldc + 8)); - c12 = _mm256_add_ps(c12, _mm256_load_ps(c + ldc + 16)); - - c20 = _mm256_add_ps(c20, _mm256_load_ps(c + ldc*2)); - c21 = _mm256_add_ps(c21, _mm256_load_ps(c + ldc*2 + 8)); - c22 = _mm256_add_ps(c22, _mm256_load_ps(c + ldc*2 + 16)); - - c30 = _mm256_add_ps(c30, _mm256_load_ps(c + ldc*3)); - c31 = _mm256_add_ps(c31, _mm256_load_ps(c + ldc*3 + 8)); - c32 = _mm256_add_ps(c32, _mm256_load_ps(c + ldc*3 + 16)); - } - - _mm256_storeu_ps(c, c00), _mm256_storeu_ps(c+8, c01), _mm256_storeu_ps(c+16, c02); - _mm256_storeu_ps(c + ldc, c10), _mm256_storeu_ps(c + ldc + 8, c11), _mm256_storeu_ps(c + ldc + 16, c12); - _mm256_storeu_ps(c + ldc*2, c20), _mm256_storeu_ps(c + ldc*2 + 8, c21), _mm256_storeu_ps(c + ldc*2 + 16, c22); - _mm256_storeu_ps(c + ldc*3, c30), _mm256_storeu_ps(c + ldc*3 + 8, c31), _mm256_storeu_ps(c + ldc*3 + 16, c32); - _mm256_zeroupper(); -#else -#error "unsupported CONV_MR and/or CONV_NR in convBlock_AVX2." 
-#endif -} - -void _fx_winograd_accum_f32(const float* inwptr, const float* wptr, - float* outbuf, int Cg, int iblock) -{ - CV_Assert(_FX_WINO_IBLOCK == 6 && _FX_WINO_KBLOCK == 4 && _FX_WINO_ATOM_F32 == 8); - if (iblock > 3) - { - for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++, - outbuf += _FX_WINO_ATOM_F32) - { - __m256 s00 = _mm256_set1_ps(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00; - __m256 s10 = _mm256_set1_ps(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00; - __m256 s20 = _mm256_set1_ps(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00; - __m256 s30 = _mm256_set1_ps(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00; - for (int c = 0; c < Cg; c++, inwptr += _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32, - wptr += _FX_WINO_KBLOCK*_FX_WINO_ATOM_F32) - { - __m256 w0 = _mm256_load_ps(wptr), w1 = _mm256_load_ps(wptr + 8); - __m256 w2 = _mm256_load_ps(wptr + 16), w3 = _mm256_load_ps(wptr + 24); - __m256 x0, x1; - x0 = _mm256_load_ps(inwptr); - x1 = _mm256_load_ps(inwptr + 8); - s00 = _mm256_fmadd_ps(w0, x0, s00); - s01 = _mm256_fmadd_ps(w0, x1, s01); - s10 = _mm256_fmadd_ps(w1, x0, s10); - s11 = _mm256_fmadd_ps(w1, x1, s11); - s20 = _mm256_fmadd_ps(w2, x0, s20); - s21 = _mm256_fmadd_ps(w2, x1, s21); - s30 = _mm256_fmadd_ps(w3, x0, s30); - s31 = _mm256_fmadd_ps(w3, x1, s31); - x0 = _mm256_load_ps(inwptr + 16); - x1 = _mm256_load_ps(inwptr + 24); - s02 = _mm256_fmadd_ps(w0, x0, s02); - s03 = _mm256_fmadd_ps(w0, x1, s03); - s12 = _mm256_fmadd_ps(w1, x0, s12); - s13 = _mm256_fmadd_ps(w1, x1, s13); - s22 = _mm256_fmadd_ps(w2, x0, s22); - s23 = _mm256_fmadd_ps(w2, x1, s23); - s32 = _mm256_fmadd_ps(w3, x0, s32); - s33 = _mm256_fmadd_ps(w3, x1, s33); - x0 = _mm256_load_ps(inwptr + 32); - x1 = _mm256_load_ps(inwptr + 40); - s04 = _mm256_fmadd_ps(w0, x0, s04); - s05 = _mm256_fmadd_ps(w0, x1, s05); - s14 = _mm256_fmadd_ps(w1, x0, s14); - s15 = _mm256_fmadd_ps(w1, x1, s15); - s24 = _mm256_fmadd_ps(w2, x0, s24); - s25 = _mm256_fmadd_ps(w2, x1, s25); - s34 = _mm256_fmadd_ps(w3, x0, s34); - s35 = _mm256_fmadd_ps(w3, x1, s35); - } - - _mm256_store_ps(outbuf, s00); - _mm256_store_ps(outbuf + 1*64, s01); - _mm256_store_ps(outbuf + 2*64, s02); - _mm256_store_ps(outbuf + 3*64, s03); - _mm256_store_ps(outbuf + 4*64, s04); - _mm256_store_ps(outbuf + 5*64, s05); - - _mm256_store_ps(outbuf + 6*64, s10); - _mm256_store_ps(outbuf + 7*64, s11); - _mm256_store_ps(outbuf + 8*64, s12); - _mm256_store_ps(outbuf + 9*64, s13); - _mm256_store_ps(outbuf + 10*64, s14); - _mm256_store_ps(outbuf + 11*64, s15); - - _mm256_store_ps(outbuf + 12*64, s20); - _mm256_store_ps(outbuf + 13*64, s21); - _mm256_store_ps(outbuf + 14*64, s22); - _mm256_store_ps(outbuf + 15*64, s23); - _mm256_store_ps(outbuf + 16*64, s24); - _mm256_store_ps(outbuf + 17*64, s25); - - _mm256_store_ps(outbuf + 18*64, s30); - _mm256_store_ps(outbuf + 19*64, s31); - _mm256_store_ps(outbuf + 20*64, s32); - _mm256_store_ps(outbuf + 21*64, s33); - _mm256_store_ps(outbuf + 22*64, s34); - _mm256_store_ps(outbuf + 23*64, s35); - } - } - else - { - for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++, - outbuf += _FX_WINO_ATOM_F32) - { - __m256 s00 = _mm256_set1_ps(0.f), s01 = s00, s02 = s00; - __m256 s10 = _mm256_set1_ps(0.f), s11 = s00, s12 = s00; - __m256 s20 = _mm256_set1_ps(0.f), s21 = s00, s22 = s00; - __m256 s30 = _mm256_set1_ps(0.f), s31 = s00, s32 = s00; - for (int c = 0; c < Cg; c++, inwptr += _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32, - wptr += _FX_WINO_KBLOCK*_FX_WINO_ATOM_F32) { - __m256 w0 = 
_mm256_load_ps(wptr), w1 = _mm256_load_ps(wptr + 8); - __m256 w2 = _mm256_load_ps(wptr + 16), w3 = _mm256_load_ps(wptr + 24); - __m256 x0, x1, x2; - x0 = _mm256_load_ps(inwptr); - x1 = _mm256_load_ps(inwptr + 8); - x2 = _mm256_load_ps(inwptr + 16); - s00 = _mm256_fmadd_ps(w0, x0, s00); - s01 = _mm256_fmadd_ps(w0, x1, s01); - s02 = _mm256_fmadd_ps(w0, x2, s02); - s10 = _mm256_fmadd_ps(w1, x0, s10); - s11 = _mm256_fmadd_ps(w1, x1, s11); - s12 = _mm256_fmadd_ps(w1, x2, s12); - s20 = _mm256_fmadd_ps(w2, x0, s20); - s21 = _mm256_fmadd_ps(w2, x1, s21); - s22 = _mm256_fmadd_ps(w2, x2, s22); - s30 = _mm256_fmadd_ps(w3, x0, s30); - s31 = _mm256_fmadd_ps(w3, x1, s31); - s32 = _mm256_fmadd_ps(w3, x2, s32); - } - - _mm256_store_ps(outbuf, s00); - _mm256_store_ps(outbuf + 1*64, s01); - _mm256_store_ps(outbuf + 2*64, s02); - _mm256_store_ps(outbuf + 6*64, s10); - _mm256_store_ps(outbuf + 7*64, s11); - _mm256_store_ps(outbuf + 8*64, s12); - _mm256_store_ps(outbuf + 12*64, s20); - _mm256_store_ps(outbuf + 13*64, s21); - _mm256_store_ps(outbuf + 14*64, s22); - _mm256_store_ps(outbuf + 18*64, s30); - _mm256_store_ps(outbuf + 19*64, s31); - _mm256_store_ps(outbuf + 20*64, s32); - } - } - _mm256_zeroupper(); -} -static inline -void transpose8_ps(__m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3, __m256 &row4, __m256 &row5, __m256 &row6, __m256 &row7) -{ - __m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7; - __m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7; - __t0 = _mm256_unpacklo_ps(row0, row1); - __t1 = _mm256_unpackhi_ps(row0, row1); - __t2 = _mm256_unpacklo_ps(row2, row3); - __t3 = _mm256_unpackhi_ps(row2, row3); - __t4 = _mm256_unpacklo_ps(row4, row5); - __t5 = _mm256_unpackhi_ps(row4, row5); - __t6 = _mm256_unpacklo_ps(row6, row7); - __t7 = _mm256_unpackhi_ps(row6, row7); - __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0)); - __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2)); - __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0)); - __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2)); - __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0)); - __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2)); - __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0)); - __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2)); - row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); - row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); - row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); - row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); - row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); - row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); - row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); - row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); -} - -/*Input transform*/ -void _fx_winograd_BtXB_8x8_f32(const float* inptr, int inpstep, float* outptr, int Cg) -{ - __m256 x00 = _mm256_loadu_ps(inptr); - __m256 x10 = _mm256_loadu_ps(inptr + inpstep); - __m256 x20 = _mm256_loadu_ps(inptr + inpstep*2); - __m256 x30 = _mm256_loadu_ps(inptr + inpstep*3); - __m256 x40 = _mm256_loadu_ps(inptr + inpstep*4); - __m256 x50 = _mm256_loadu_ps(inptr + inpstep*5); - __m256 x60 = _mm256_loadu_ps(inptr + inpstep*6); - __m256 x70 = _mm256_loadu_ps(inptr + inpstep*7); - - __m256 z00, z10, z20, z30, z40, z50, z60, z70; - - { - /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */ - /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */ - __m256 q5_25 = _mm256_set1_ps(5.25f), t00, t10; - t00 = _mm256_sub_ps(x40, x20); - t10 = _mm256_sub_ps(x30, x50); - - __m256 y00 = 
_mm256_fmadd_ps(t00, q5_25, _mm256_sub_ps(x00, x60)); - __m256 y70 = _mm256_fmadd_ps(t10, q5_25, _mm256_sub_ps(x70, x10)); - - /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */ - /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */ - __m256 qm4_25 = _mm256_set1_ps(-4.25f); - t00 = _mm256_fmadd_ps(x30, qm4_25, _mm256_add_ps(x10, x50)); - t10 = _mm256_fmadd_ps(x40, qm4_25, _mm256_add_ps(x20, x60)); - - __m256 y10 = _mm256_add_ps(t00, t10); - __m256 y20 = _mm256_sub_ps(t10, t00); - - /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */ - /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */ - __m256 q0_5 = _mm256_set1_ps(0.5f), q0_25 = _mm256_set1_ps(0.25f); - __m256 qm2_5 = _mm256_set1_ps(-2.5f), qm1_25 = _mm256_set1_ps(-1.25f); - t00 = _mm256_fmadd_ps(x10, q0_5, _mm256_add_ps(x50, x50)); - t10 = _mm256_fmadd_ps(x20, q0_25, x60); - t00 = _mm256_fmadd_ps(x30, qm2_5, t00); - t10 = _mm256_fmadd_ps(x40, qm1_25, t10); - - __m256 y30 = _mm256_add_ps(t00, t10); - __m256 y40 = _mm256_sub_ps(t10, t00); - - /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */ - /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */ - __m256 q4 = _mm256_set1_ps(4.f), qm5 = _mm256_set1_ps(-5.f); - t00 = _mm256_fmadd_ps(x50, q0_5, _mm256_add_ps(x10, x10)); - t10 = _mm256_fmadd_ps(x20, q4 , x60); - t00 = _mm256_fmadd_ps(x30, qm2_5, t00); - t10 = _mm256_fmadd_ps(x40, qm5 , t10); - - __m256 y50 = _mm256_add_ps(t00, t10); - __m256 y60 = _mm256_sub_ps(t10, t00); - - /* transpose 8x8 matrix in-place with some renumeration of the elements: */ - transpose8_ps(y00, y10, y20, y30, y40, y50, y60, y70); - - /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */ - /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */ - t00 = _mm256_sub_ps(y40, y20); - t10 = _mm256_sub_ps(y30, y50); - z00 = _mm256_fmadd_ps(t00, q5_25, _mm256_sub_ps(y00, y60)); - z70 = _mm256_fmadd_ps(t10, q5_25, _mm256_sub_ps(y70, y10)); - - /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */ - /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */ - t00 = _mm256_fmadd_ps(y30, qm4_25, _mm256_add_ps(y10, y50)); - t10 = _mm256_fmadd_ps(y40, qm4_25, _mm256_add_ps(y20, y60)); - z10 = _mm256_add_ps(t00, t10); - z20 = _mm256_sub_ps(t10, t00); - - /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */ - /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */ - t00 = _mm256_fmadd_ps(y10, q0_5, _mm256_add_ps(y50, y50)); - t10 = _mm256_fmadd_ps(y20, q0_25, y60); - t00 = _mm256_fmadd_ps(y30, qm2_5, t00); - t10 = _mm256_fmadd_ps(y40, qm1_25, t10); - - z30 = _mm256_add_ps(t00, t10); - z40 = _mm256_sub_ps(t10, t00); - - /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */ - /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */ - t00 = _mm256_fmadd_ps(y50, q0_5, _mm256_add_ps(y10, y10)); - t10 = _mm256_fmadd_ps(y20, q4, y60); - t00 = _mm256_fmadd_ps(y30, qm2_5, t00); - t10 = _mm256_fmadd_ps(y40, qm5, t10); - - z50 = _mm256_add_ps(t00, t10); - z60 = _mm256_sub_ps(t10, t00); - } - - const int outstep = _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32*Cg; - - _mm256_storeu_ps(outptr, z00); - _mm256_storeu_ps(outptr + outstep, z10); - _mm256_storeu_ps(outptr + outstep*2, z20); - _mm256_storeu_ps(outptr + outstep*3, z30); - _mm256_storeu_ps(outptr + outstep*4, z40); - _mm256_storeu_ps(outptr + outstep*5, z50); - _mm256_storeu_ps(outptr + outstep*6, z60); - _mm256_storeu_ps(outptr + outstep*7, z70); - _mm256_zeroupper(); -} - -#define STORE6_ELE_FROM_16(ptr, z00, lowM, highM) \ - 
lowM = _mm256_castps256_ps128(z00); \ - highM = _mm256_extractf128_ps(z00, 1); \ - _mm_storeu_ps(ptr, lowM); \ - _mm_storel_epi64((__m128i*)(ptr + 4), _mm_castps_si128(highM)) - -/* Inverse Winograd 8x8 transform: - out = (A'*inp*A)', where - inp is input 8x8 FP32 matrix, - A' is - [1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, - 0.f, 1.f, -1.f, 2.f, -2.f, 0.5f, -0.5f, 0.f, - 0.f, 1.f, 1.f, 4.f, 4.f, 0.25f, 0.25f, 0.f, - 0.f, 1.f, -1.f, 8.f, -8.f, 0.125f, -0.125f, 0.f, - 0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f, - 0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f] -*/ -void _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep, - float* bpptr, int bpstep, float* outptr, int outstep, - float bias, float minval, float maxval, bool ifMinMaxAct) -{ - - __m256 x00 = _mm256_load_ps(inptr); - __m256 x10 = _mm256_load_ps(inptr + inpstep); - __m256 x20 = _mm256_load_ps(inptr + inpstep*2); - __m256 x30 = _mm256_load_ps(inptr + inpstep*3); - __m256 x40 = _mm256_load_ps(inptr + inpstep*4); - __m256 x50 = _mm256_load_ps(inptr + inpstep*5); - __m256 x60 = _mm256_load_ps(inptr + inpstep*6); - __m256 x70 = _mm256_load_ps(inptr + inpstep*7); - __m256 z00, z10, z20, z30, z40, z50; - - { - __m256 s12_0, s34_0, s56_0; - s12_0 = _mm256_add_ps(x10, x20); - s34_0 = _mm256_add_ps(x30, x40); - s56_0 = _mm256_add_ps(x50, x60); - - __m256 y00 = _mm256_add_ps(x00, _mm256_add_ps(s12_0, _mm256_add_ps(s34_0, s56_0))); - __m256 y20 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.25f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(4.0f), s12_0)); - __m256 y40 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/16), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(16.0f), s12_0)); - - s12_0 = _mm256_sub_ps(x10, x20); - s34_0 = _mm256_sub_ps(x30, x40); - s56_0 = _mm256_sub_ps(x50, x60); - __m256 y50 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/32), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(32.f), _mm256_add_ps(x70, s12_0))); - __m256 y10 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.5f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(2.f), s12_0)); - __m256 y30 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.125f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(8.f), s12_0)); - __m256 y60 = _mm256_set1_ps(0.f), y70 = y60; - - /* transpose 8x8 matrix in-place with some renumeration of the elements: */ - - transpose8_ps(y00, y10, y20, y30, y40, y50, y60, y70); - - s12_0 = _mm256_add_ps(y10, y20); - s34_0 = _mm256_add_ps(y30, y40); - s56_0 = _mm256_add_ps(y50, y60); - - z00 = _mm256_add_ps(y00, _mm256_add_ps(s12_0, _mm256_add_ps(s34_0, s56_0))); - z20 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.25f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(4.0f), s12_0)); - z40 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/16), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(16.0f), s12_0)); - - s12_0 = _mm256_sub_ps(y10, y20); - s34_0 = _mm256_sub_ps(y30, y40); - s56_0 = _mm256_sub_ps(y50, y60); - - z50 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/32), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(32.0f), _mm256_add_ps(y70, s12_0))); - z10 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.5f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(2.0f), s12_0)); - z30 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.125f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(8.0f), s12_0)); - - __m256 vbias = _mm256_set1_ps(bias); - z00 = _mm256_add_ps(vbias, z00); - z10 = _mm256_add_ps(vbias, z10); - z20 = _mm256_add_ps(vbias, z20); - z30 = _mm256_add_ps(vbias, z30); - z40 = _mm256_add_ps(vbias, z40); - z50 = _mm256_add_ps(vbias, z50); - } - - if (bpptr) - { - z00 = _mm256_add_ps(z00, _mm256_loadu_ps(bpptr)); - z10 = _mm256_add_ps(z10, _mm256_loadu_ps(bpptr + bpstep)); - 
z20 = _mm256_add_ps(z20, _mm256_loadu_ps(bpptr + bpstep*2)); - z30 = _mm256_add_ps(z30, _mm256_loadu_ps(bpptr + bpstep*3)); - z40 = _mm256_add_ps(z40, _mm256_loadu_ps(bpptr + bpstep*4)); - z50 = _mm256_add_ps(z50, _mm256_loadu_ps(bpptr + bpstep*5)); - } - - if (ifMinMaxAct) - { - __m256 vmax = _mm256_set1_ps(maxval); - __m256 vmin = _mm256_set1_ps(minval); - - z00 = _mm256_min_ps(_mm256_max_ps(z00, vmin), vmax); - z10 = _mm256_min_ps(_mm256_max_ps(z10, vmin), vmax); - z20 = _mm256_min_ps(_mm256_max_ps(z20, vmin), vmax); - z30 = _mm256_min_ps(_mm256_max_ps(z30, vmin), vmax); - z40 = _mm256_min_ps(_mm256_max_ps(z40, vmin), vmax); - z50 = _mm256_min_ps(_mm256_max_ps(z50, vmin), vmax); - } - - __m128 lowM, highM; - STORE6_ELE_FROM_16(outptr, z00, lowM, highM); - STORE6_ELE_FROM_16(outptr + outstep, z10, lowM, highM); - STORE6_ELE_FROM_16(outptr + outstep * 2, z20, lowM, highM); - STORE6_ELE_FROM_16(outptr + outstep * 3, z30, lowM, highM); - STORE6_ELE_FROM_16(outptr + outstep * 4, z40, lowM, highM); - STORE6_ELE_FROM_16(outptr + outstep * 5, z50, lowM, highM); - _mm256_zeroupper(); -} - -#endif -} // namespace opt_AVX2 -} // namespace dnn -} // namespace cv \ No newline at end of file diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp deleted file mode 100644 index e146c0974e..0000000000 --- a/modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp +++ /dev/null @@ -1,567 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -#ifndef OPENCV_FAST_CONVOLUTION_SIMD_HPP -#define OPENCV_FAST_CONVOLUTION_SIMD_HPP - -#include "opencv2/core/hal/intrin.hpp" -#include - -namespace cv { -namespace dnn { - -static void convBlockMR1NoSIMD(int np, const float* a, const float* b, float *c, const float bias, bool init_c, - const float minval, const float maxval, bool ifMinMaxAct, const int outLen) -{ - std::vector cbuffer(outLen, 0); - float* cbuf = cbuffer.data(); - for( int p = 0; p < np; p++ ) - { - float ai = a[p]; - for( int j = 0; j < outLen; j++ ) - cbuf[j] += b[CONV_NR*p + j] * ai; - } - - if (init_c) - { - for(int j = 0; j < outLen; j++) - { - c[j] += cbuf[j] + bias; - if (ifMinMaxAct) - c[j] = std::min(std::max(c[j], minval), maxval); - } - } - else - { - for(int j = 0; j < outLen; j++) - { - c[j] = cbuf[j] + bias; - if (ifMinMaxAct) - c[j] = std::min(std::max(c[j], minval), maxval); - } - } -} - -void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c, - const float minval, const float maxval, bool ifMinMaxAct, const int outLen) -{ -#if CV_SIMD128 - // The outLen represents the valid output value in CONV_NR length. - // When outLen is very small, we use the no-SIMD branch. 
- const int CONV_NRby3 = CONV_NR/3; - if (outLen > CONV_NRby3) - { - v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0; // CONV_NR == 12 -#if CONV_NR == 28 || CONV_NR == 24 - v_float32x4 c3 = c0, c4 = c0, c5 = c0; -#endif -#if CONV_NR == 28 - v_float32x4 c6 = c0; -#endif - for (int p = 0; p < np; p++, a++, b += CONV_NR) - { - v_float32x4 a0 = v_setall_f32(a[0]); - v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8); -#if CONV_NR == 28 || CONV_NR == 24 - v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20); -#endif -#if CONV_NR == 28 - v_float32x4 b6 = v_load(b + 24); -#endif - - c0 = v_fma(b0, a0, c0); - c1 = v_fma(b1, a0, c1); - c2 = v_fma(b2, a0, c2); -#if CONV_NR == 28 || CONV_NR == 24 - c3 = v_fma(b3, a0, c3); - c4 = v_fma(b4, a0, c4); - c5 = v_fma(b5, a0, c5); -#endif -#if CONV_NR == 28 - c6 = v_fma(b6, a0, c6); -#endif - } - - if (init_c) - { - c0 += v_load(c); - c1 += v_load(c + 4); - c2 += v_load(c + 8); -#if CONV_NR == 28 || CONV_NR == 24 - c3 += v_load(c + 12); - c4 += v_load(c + 16); - c5 += v_load(c + 20); -#endif -#if CONV_NR == 28 - c6 += v_load(c + 24); -#endif - } - - if (ifMinMaxAct) - { - v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval); - c0 = v_min(v_max(c0, vmin), vmax); - c1 = v_min(v_max(c1, vmin), vmax); - c2 = v_min(v_max(c2, vmin), vmax); -#if CONV_NR == 28 || CONV_NR == 24 - c3 = v_min(v_max(c3, vmin), vmax); - c4 = v_min(v_max(c4, vmin), vmax); - c5 = v_min(v_max(c5, vmin), vmax); -#endif -#if CONV_NR == 28 - c6 = v_min(v_max(c6, vmin), vmax); -#endif - } - - v_store(c, c0); - v_store(c + 4, c1); - v_store(c + 8, c2); -#if CONV_NR == 28 || CONV_NR == 24 - v_store(c + 12, c3); - v_store(c + 16, c4); - v_store(c + 20, c5); -#endif -#if CONV_NR == 28 - v_store(c + 24, c6); -#endif - } - else - convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen); -#else - convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen); -#endif -} - -#if CV_SIMD128 -#if CONV_MR == 4 && CONV_NR == 24 -static void convBlock4x24(int np, const float* a, const float* b, float* c, int ldc, bool init_c) -{ - v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0; - v_float32x4 c6 = v_setzero_f32(), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6; - v_float32x4 c12 = v_setzero_f32(), c13 = c12, c14 = c12, c15 = c12, c16 = c12, c17 = c12; - v_float32x4 c18 = v_setzero_f32(), c19 = c18, c20 = c18, c21 = c18, c22 = c18, c23 = c18; - - for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR) - { - v_float32x4 a0 = v_setall_f32(a[0]); - v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8); - v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20); - - c0 = v_fma(b0, a0, c0); - c1 = v_fma(b1, a0, c1); - c2 = v_fma(b2, a0, c2); - c3 = v_fma(b3, a0, c3); - c4 = v_fma(b4, a0, c4); - c5 = v_fma(b5, a0, c5); - - a0 = v_setall_f32(a[1]); - c6 = v_fma(b0, a0, c6); - c7 = v_fma(b1, a0, c7); - c8 = v_fma(b2, a0, c8); - c9 = v_fma(b3, a0, c9); - c10 = v_fma(b4, a0, c10); - c11 = v_fma(b5, a0, c11); - - a0 = v_setall_f32(a[2]); - c12 = v_fma(b0, a0, c12); - c13 = v_fma(b1, a0, c13); - c14 = v_fma(b2, a0, c14); - c15 = v_fma(b3, a0, c15); - c16 = v_fma(b4, a0, c16); - c17 = v_fma(b5, a0, c17); - - a0 = v_setall_f32(a[3]); - c18 = v_fma(b0, a0, c18); - c19 = v_fma(b1, a0, c19); - c20 = v_fma(b2, a0, c20); - c21 = v_fma(b3, a0, c21); - c22 = v_fma(b4, a0, c22); - c23 = v_fma(b5, a0, c23); - } - - if (!init_c) - { - c0 += v_load(c); - c1 += v_load(c + 
4); - c2 += v_load(c + 8); - c3 += v_load(c + 12); - c4 += v_load(c + 16); - c5 += v_load(c + 20); - - c6 += v_load(c + ldc); - c7 += v_load(c + ldc + 4); - c8 += v_load(c + ldc + 8); - c9 += v_load(c + ldc + 12); - c10 += v_load(c + ldc + 16); - c11 += v_load(c + ldc + 20); - - c12 += v_load(c + ldc*2); - c13 += v_load(c + ldc*2 + 4); - c14 += v_load(c + ldc*2 + 8); - c15 += v_load(c + ldc*2 + 12); - c16 += v_load(c + ldc*2 + 16); - c17 += v_load(c + ldc*2 + 20); - - c18 += v_load(c + ldc*3); - c19 += v_load(c + ldc*3 + 4); - c20 += v_load(c + ldc*3 + 8); - c21 += v_load(c + ldc*3 + 12); - c22 += v_load(c + ldc*3 + 16); - c23 += v_load(c + ldc*3 + 20); - } - - v_store(c, c0); - v_store(c + 4, c1); - v_store(c + 8, c2); - v_store(c + 12, c3); - v_store(c + 16, c4); - v_store(c + 20, c5); - - v_store(c + ldc, c6); - v_store(c + ldc + 4, c7); - v_store(c + ldc + 8, c8); - v_store(c + ldc + 12, c9); - v_store(c + ldc + 16, c10); - v_store(c + ldc + 20, c11); - - v_store(c + ldc * 2, c12); - v_store(c + ldc * 2 + 4, c13); - v_store(c + ldc * 2 + 8, c14); - v_store(c + ldc * 2 + 12, c15); - v_store(c + ldc * 2 + 16, c16); - v_store(c + ldc * 2 + 20, c17); - - v_store(c + ldc * 3, c18); - v_store(c + ldc * 3 + 4, c19); - v_store(c + ldc * 3 + 8, c20); - v_store(c + ldc * 3 + 12, c21); - v_store(c + ldc * 3 + 16, c22); - v_store(c + ldc * 3 + 20, c23); -} -#endif - -static void convBlock4x8(int np, const float* a, const float* b, float* c, int ldc, bool init_c) -{ - CV_Assert(CONV_NR >= 4); - v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0; - v_float32x4 c4 = c0, c5 = c0, c6 = c0, c7 = c0; - - for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR) - { - v_float32x4 a0 = v_setall_f32(a[0]); - v_float32x4 a1 = v_setall_f32(a[1]); - v_float32x4 a2 = v_setall_f32(a[2]); - v_float32x4 a3 = v_setall_f32(a[3]); - - v_float32x4 b0 = v_load(b), b1 = v_load(b + 4); - - c0 = v_fma(b0, a0, c0); - c1 = v_fma(b1, a0, c1); - - c2 = v_fma(b0, a1, c2); - c3 = v_fma(b1, a1, c3); - - c4 = v_fma(b0, a2, c4); - c5 = v_fma(b1, a2, c5); - - c6 = v_fma(b0, a3, c6); - c7 = v_fma(b1, a3, c7); - } - - if (!init_c) - { - c0 += v_load(c); - c1 += v_load(c + 4); - - c2 += v_load(c + ldc); - c3 += v_load(c + ldc + 4); - - c4 += v_load(c + ldc*2); - c5 += v_load(c + ldc*2 + 4); - - c6 += v_load(c + ldc*3); - c7 += v_load(c + ldc*3 + 4); - } - - v_store(c, c0); - v_store(c + 4, c1); - v_store(c + ldc, c2); - v_store(c + ldc + 4, c3); - v_store(c + ldc * 2, c4); - v_store(c + ldc * 2 + 4, c5); - v_store(c + ldc * 3, c6); - v_store(c + ldc * 3 + 4, c7); -} - -static void convBlock4x4(int np, const float* a, const float* b, float* c, int ldc, bool init_c) -{ - CV_Assert(CONV_NR >= 4); - v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0; - - for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR) - { - v_float32x4 a0 = v_setall_f32(a[0]); - v_float32x4 a1 = v_setall_f32(a[1]); - v_float32x4 a2 = v_setall_f32(a[2]); - v_float32x4 a3 = v_setall_f32(a[3]); - - v_float32x4 b0 = v_load(b); - - c0 = v_fma(b0, a0, c0); - c1 = v_fma(b0, a1, c1); - c2 = v_fma(b0, a2, c2); - c3 = v_fma(b0, a3, c3); - } - - if (!init_c) - { - c0 += v_load(c); - c1 += v_load(c + ldc); - c2 += v_load(c + ldc*2); - c3 += v_load(c + ldc*3); - } - - v_store(c, c0); - v_store(c + ldc, c1); - v_store(c + ldc * 2, c2); - v_store(c + ldc * 3, c3); -} -#endif - -static void convBlockNoSIMD(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen) -{ - std::vector cbuffer(CONV_MR * outLen, 0); - float* cbuf = 
cbuffer.data(); - for( int p = 0; p < np; p++ ) - { - for( int i = 0; i < CONV_MR; i++ ) - { - float ai = a[CONV_MR*p + i]; - for( int j = 0; j < outLen; j++ ) - cbuf[i * outLen+j] += b[CONV_NR*p + j] * ai; - } - } - - if (!init_c) - { - for(int i = 0; i < CONV_MR; i++) - { - for(int j = 0; j < outLen; j++) - c[i*ldc + j] += cbuf[i*outLen + j]; - } - } - else - { - for(int i = 0; i < CONV_MR; i++) - { - for(int j = 0; j < outLen; j++) - c[i*ldc + j] = cbuf[i*outLen + j]; - } - } -} - -void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen) -{ - // The possible outLen range is [24, 8~1]. -#if CV_SIMD128 -#if CONV_MR == 4 && CONV_NR == 24 - const int CONV_NRby3 = CONV_NR/3; - if (outLen > CONV_NRby3) - { - convBlock4x24(np, a, b, c, ldc, init_c); - return; - } -#endif - - if (outLen <= 8 && outLen > 4) - { - convBlock4x8(np, a, b, c, ldc, init_c); - return; - } - - if (outLen <= 4 && outLen > 1) - { - convBlock4x4(np, a, b, c, ldc, init_c); - return; - } - convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen); -#else - convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen); -#endif -} -} // namespace dnn - -namespace opt_NEON -{ -#if CV_TRY_NEON -void convBlock_NEON(int np, const float* a, const float* b, float* c, int ldc, bool init_c) -{ -#if CONV_MR == 4 && CONV_NR == 28 // AARCH64 - { - float32x4_t c00 = vdupq_n_f32(0.f), c01 = c00, c02 = c00, c03 = c00, c04 = c00, c05 = c00, c06 = c00; - float32x4_t c10 = vdupq_n_f32(0.f), c11 = c10, c12 = c10, c13 = c10, c14 = c10, c15 = c10, c16 = c10; - float32x4_t c20 = vdupq_n_f32(0.f), c21 = c20, c22 = c20, c23 = c20, c24 = c20, c25 = c20, c26 = c20; - float32x4_t c30 = vdupq_n_f32(0.f), c31 = c30, c32 = c30, c33 = c30, c34 = c30, c35 = c30, c36 = c30; - - for( int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR ) - { - float32x4_t a0 = vld1q_f32(a), b0, b1, b2; - b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8); - - c00 = vfmaq_laneq_f32(c00, b0, a0, 0); - c01 = vfmaq_laneq_f32(c01, b1, a0, 0); - c02 = vfmaq_laneq_f32(c02, b2, a0, 0); - c10 = vfmaq_laneq_f32(c10, b0, a0, 1); - c11 = vfmaq_laneq_f32(c11, b1, a0, 1); - c12 = vfmaq_laneq_f32(c12, b2, a0, 1); - c20 = vfmaq_laneq_f32(c20, b0, a0, 2); - c21 = vfmaq_laneq_f32(c21, b1, a0, 2); - c22 = vfmaq_laneq_f32(c22, b2, a0, 2); - c30 = vfmaq_laneq_f32(c30, b0, a0, 3); - c31 = vfmaq_laneq_f32(c31, b1, a0, 3); - c32 = vfmaq_laneq_f32(c32, b2, a0, 3); - - b0 = vld1q_f32(b + 12); b1 = vld1q_f32(b + 16); b2 = vld1q_f32(b + 20); - - c03 = vfmaq_laneq_f32(c03, b0, a0, 0); - c04 = vfmaq_laneq_f32(c04, b1, a0, 0); - c05 = vfmaq_laneq_f32(c05, b2, a0, 0); - c13 = vfmaq_laneq_f32(c13, b0, a0, 1); - c14 = vfmaq_laneq_f32(c14, b1, a0, 1); - c15 = vfmaq_laneq_f32(c15, b2, a0, 1); - c23 = vfmaq_laneq_f32(c23, b0, a0, 2); - c24 = vfmaq_laneq_f32(c24, b1, a0, 2); - c25 = vfmaq_laneq_f32(c25, b2, a0, 2); - c33 = vfmaq_laneq_f32(c33, b0, a0, 3); - c34 = vfmaq_laneq_f32(c34, b1, a0, 3); - c35 = vfmaq_laneq_f32(c35, b2, a0, 3); - - b0 = vld1q_f32(b + 24); - c06 = vfmaq_laneq_f32(c06, b0, a0, 0); - c16 = vfmaq_laneq_f32(c16, b0, a0, 1); - c26 = vfmaq_laneq_f32(c26, b0, a0, 2); - c36 = vfmaq_laneq_f32(c36, b0, a0, 3); - } - - if (!init_c) - { - c00 = vaddq_f32(c00, vld1q_f32(c)); - c01 = vaddq_f32(c01, vld1q_f32(c + 4)); - c02 = vaddq_f32(c02, vld1q_f32(c + 8)); - c03 = vaddq_f32(c03, vld1q_f32(c + 12)); - c04 = vaddq_f32(c04, vld1q_f32(c + 16)); - c05 = vaddq_f32(c05, vld1q_f32(c + 20)); - c06 = vaddq_f32(c06, vld1q_f32(c + 24)); - - c10 = vaddq_f32(c10, vld1q_f32(c 
+ ldc)); - c11 = vaddq_f32(c11, vld1q_f32(c + ldc + 4)); - c12 = vaddq_f32(c12, vld1q_f32(c + ldc + 8)); - c13 = vaddq_f32(c13, vld1q_f32(c + ldc + 12)); - c14 = vaddq_f32(c14, vld1q_f32(c + ldc + 16)); - c15 = vaddq_f32(c15, vld1q_f32(c + ldc + 20)); - c16 = vaddq_f32(c16, vld1q_f32(c + ldc + 24)); - - c20 = vaddq_f32(c20, vld1q_f32(c + ldc*2)); - c21 = vaddq_f32(c21, vld1q_f32(c + ldc*2 + 4)); - c22 = vaddq_f32(c22, vld1q_f32(c + ldc*2 + 8)); - c23 = vaddq_f32(c23, vld1q_f32(c + ldc*2 + 12)); - c24 = vaddq_f32(c24, vld1q_f32(c + ldc*2 + 16)); - c25 = vaddq_f32(c25, vld1q_f32(c + ldc*2 + 20)); - c26 = vaddq_f32(c26, vld1q_f32(c + ldc*2 + 24)); - - c30 = vaddq_f32(c30, vld1q_f32(c + ldc*3)); - c31 = vaddq_f32(c31, vld1q_f32(c + ldc*3 + 4)); - c32 = vaddq_f32(c32, vld1q_f32(c + ldc*3 + 8)); - c33 = vaddq_f32(c33, vld1q_f32(c + ldc*3 + 12)); - c34 = vaddq_f32(c34, vld1q_f32(c + ldc*3 + 16)); - c35 = vaddq_f32(c35, vld1q_f32(c + ldc*3 + 20)); - c36 = vaddq_f32(c36, vld1q_f32(c + ldc*3 + 24)); - } - - vst1q_f32(c, c00); vst1q_f32(c+4, c01); - vst1q_f32(c+8, c02); vst1q_f32(c+12, c03); - vst1q_f32(c+16, c04); vst1q_f32(c+20, c05); - vst1q_f32(c+24, c06); - - vst1q_f32(c+ldc, c10); vst1q_f32(c+ldc+4, c11); - vst1q_f32(c+ldc+8, c12); vst1q_f32(c+ldc+12, c13); - vst1q_f32(c+ldc+16, c14); vst1q_f32(c+ldc+20, c15); - vst1q_f32(c+ldc+24, c16); - - vst1q_f32(c+ldc*2, c20); vst1q_f32(c+ldc*2+4, c21); - vst1q_f32(c+ldc*2+8, c22); vst1q_f32(c+ldc*2+12, c23); - vst1q_f32(c+ldc*2+16, c24); vst1q_f32(c+ldc*2+20, c25); - vst1q_f32(c+ldc*2+24, c26); - - vst1q_f32(c+ldc*3, c30); vst1q_f32(c+ldc*3+4, c31); - vst1q_f32(c+ldc*3+8, c32); vst1q_f32(c+ldc*3+12, c33); - vst1q_f32(c+ldc*3+16, c34); vst1q_f32(c+ldc*3+20, c35); - vst1q_f32(c+ldc*3+24, c36); - } -#elif CONV_MR == 4 && CONV_NR == 12 // ARMv7 - { - float32x4_t c0 = vdupq_n_f32(0.f), c1 = c0, c2 = c0; - float32x4_t c3 = vdupq_n_f32(0.f), c4 = c3, c5 = c3; - float32x4_t c6 = vdupq_n_f32(0.f), c7 = c6, c8 = c6; - float32x4_t c9 = vdupq_n_f32(0.f), c10 = c9, c11 = c9; - - - float32x2_t a0 = vdup_n_f32(0.0f), a1 = a0; - float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f); - - for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR) - { - a0 = vld1_f32(a), a1 = vld1_f32(a+2); - b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8); - - c0 = vmlaq_lane_f32(c0, b0, a0, 0); - c1 = vmlaq_lane_f32(c1, b1, a0, 0); - c2 = vmlaq_lane_f32(c2, b2, a0, 0); - - c3 = vmlaq_lane_f32(c3, b0, a0, 1); - c4 = vmlaq_lane_f32(c4, b1, a0, 1); - c5 = vmlaq_lane_f32(c5, b2, a0, 1); - - c6 = vmlaq_lane_f32(c6, b0, a1, 0); - c7 = vmlaq_lane_f32(c7, b1, a1, 0); - c8 = vmlaq_lane_f32(c8, b2, a1, 0); - - c9 = vmlaq_lane_f32(c9 , b0, a1, 1); - c10 = vmlaq_lane_f32(c10, b1, a1, 1); - c11 = vmlaq_lane_f32(c11, b2, a1, 1); - } - - if (!init_c) - { - c0 = vaddq_f32(c0, vld1q_f32(c)); - c1 = vaddq_f32(c1, vld1q_f32(c + 4)); - c2 = vaddq_f32(c2, vld1q_f32(c + 8)); - - c3 = vaddq_f32(c3, vld1q_f32(c + ldc)); - c4 = vaddq_f32(c4, vld1q_f32(c + ldc + 4)); - c5 = vaddq_f32(c5, vld1q_f32(c + ldc + 8)); - - c6 = vaddq_f32(c6, vld1q_f32(c + ldc * 2)); - c7 = vaddq_f32(c7, vld1q_f32(c + ldc * 2 + 4)); - c8 = vaddq_f32(c8, vld1q_f32(c + ldc * 2 + 8)); - - c9 = vaddq_f32(c9 , vld1q_f32(c + ldc * 3)); - c10 = vaddq_f32(c10, vld1q_f32(c + ldc * 3 + 4)); - c11 = vaddq_f32(c11, vld1q_f32(c + ldc * 3 + 8)); - } - - vst1q_f32(c, c0), vst1q_f32(c+4, c1), vst1q_f32(c+8, c2); - vst1q_f32(c + ldc, c3), vst1q_f32(c + ldc + 4, c4), vst1q_f32(c + ldc + 8, c5); - vst1q_f32(c + 
ldc*2, c6), vst1q_f32(c + ldc*2 + 4, c7), vst1q_f32(c + ldc*2 + 8, c8); - vst1q_f32(c + ldc*3, c9), vst1q_f32(c + ldc*3 + 4, c10), vst1q_f32(c + ldc*3 + 8, c11); - } -//#else -//#error "unsupported CONV_MR and/or CONV_NR in convBlock_NEON." -#endif -} -#endif -} // namespace opt_NEON - -} // namespace cv -#endif //OPENCV_FAST_CONVOLUTION_SIMD_HPP diff --git a/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp b/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp deleted file mode 100644 index b0ccfd0cd2..0000000000 --- a/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp +++ /dev/null @@ -1,1153 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv_Winograd.fx). -// Here is the original license: -/* - This file is a part of ficus language project. - See ficus/LICENSE for the licensing terms -*/ - -#include "../../precomp.hpp" -#include "fast_convolution.hpp" - -namespace cv { namespace dnn { - -#if CV_NEON || CV_SIMD128 || CV_TRY_AVX2 -enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment. - -static void -_fx_winograd_accum_f32(const float* inwptr, const float* wptr, - float* outbuf, int Cg, int iblock) - { -#if CV_NEON && CV_NEON_AARCH64 - CV_Assert(_FX_WINO_IBLOCK == 6 && _FX_WINO_KBLOCK == 4 && _FX_WINO_ATOM_F32 == 4); - if (iblock > 3) - { - for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++, - outbuf += _FX_WINO_ATOM_F32) - { - float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00; - float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00; - float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00; - float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00; - for (int c = 0; c < Cg; c++, inwptr += _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32, - wptr += _FX_WINO_KBLOCK*_FX_WINO_ATOM_F32) { - float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4); - float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12); - float32x4_t x0, x1; - x0 = vld1q_f32(inwptr); - x1 = vld1q_f32(inwptr + 4); - s00 = vfmaq_f32(s00, w0, x0); - s01 = vfmaq_f32(s01, w0, x1); - s10 = vfmaq_f32(s10, w1, x0); - s11 = vfmaq_f32(s11, w1, x1); - s20 = vfmaq_f32(s20, w2, x0); - s21 = vfmaq_f32(s21, w2, x1); - s30 = vfmaq_f32(s30, w3, x0); - s31 = vfmaq_f32(s31, w3, x1); - x0 = vld1q_f32(inwptr + 8); - x1 = vld1q_f32(inwptr + 12); - s02 = vfmaq_f32(s02, w0, x0); - s03 = vfmaq_f32(s03, w0, x1); - s12 = vfmaq_f32(s12, w1, x0); - s13 = vfmaq_f32(s13, w1, x1); - s22 = vfmaq_f32(s22, w2, x0); - s23 = vfmaq_f32(s23, w2, x1); - s32 = vfmaq_f32(s32, w3, x0); - s33 = vfmaq_f32(s33, w3, x1); - x0 = vld1q_f32(inwptr + 16); - x1 = vld1q_f32(inwptr + 20); - s04 = vfmaq_f32(s04, w0, x0); - s05 = vfmaq_f32(s05, w0, x1); - s14 = vfmaq_f32(s14, w1, x0); - s15 = vfmaq_f32(s15, w1, x1); - s24 = vfmaq_f32(s24, w2, x0); - s25 = vfmaq_f32(s25, w2, x1); - s34 = vfmaq_f32(s34, w3, x0); - s35 = vfmaq_f32(s35, w3, x1); - } - - vst1q_f32(outbuf, s00); - vst1q_f32(outbuf + 1*64, s01); - vst1q_f32(outbuf + 2*64, s02); - vst1q_f32(outbuf + 3*64, s03); - vst1q_f32(outbuf + 4*64, s04); - vst1q_f32(outbuf + 5*64, s05); - - vst1q_f32(outbuf + 6*64, s10); - vst1q_f32(outbuf + 7*64, s11); - vst1q_f32(outbuf + 8*64, 
s12); - vst1q_f32(outbuf + 9*64, s13); - vst1q_f32(outbuf + 10*64, s14); - vst1q_f32(outbuf + 11*64, s15); - - vst1q_f32(outbuf + 12*64, s20); - vst1q_f32(outbuf + 13*64, s21); - vst1q_f32(outbuf + 14*64, s22); - vst1q_f32(outbuf + 15*64, s23); - vst1q_f32(outbuf + 16*64, s24); - vst1q_f32(outbuf + 17*64, s25); - - vst1q_f32(outbuf + 18*64, s30); - vst1q_f32(outbuf + 19*64, s31); - vst1q_f32(outbuf + 20*64, s32); - vst1q_f32(outbuf + 21*64, s33); - vst1q_f32(outbuf + 22*64, s34); - vst1q_f32(outbuf + 23*64, s35); - } - } - else - { - for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++, - outbuf += _FX_WINO_ATOM_F32) - { - float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00; - float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00; - float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00; - float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00; - for (int c = 0; c < Cg; c++, inwptr += _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32, - wptr += _FX_WINO_KBLOCK*_FX_WINO_ATOM_F32) { - float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4); - float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12); - float32x4_t x0, x1, x2; - x0 = vld1q_f32(inwptr); - x1 = vld1q_f32(inwptr + 4); - x2 = vld1q_f32(inwptr + 8); - s00 = vfmaq_f32(s00, w0, x0); - s01 = vfmaq_f32(s01, w0, x1); - s02 = vfmaq_f32(s02, w0, x2); - s10 = vfmaq_f32(s10, w1, x0); - s11 = vfmaq_f32(s11, w1, x1); - s12 = vfmaq_f32(s12, w1, x2); - s20 = vfmaq_f32(s20, w2, x0); - s21 = vfmaq_f32(s21, w2, x1); - s22 = vfmaq_f32(s22, w2, x2); - s30 = vfmaq_f32(s30, w3, x0); - s31 = vfmaq_f32(s31, w3, x1); - s32 = vfmaq_f32(s32, w3, x2); - } - - vst1q_f32(outbuf, s00); - vst1q_f32(outbuf + 1*64, s01); - vst1q_f32(outbuf + 2*64, s02); - vst1q_f32(outbuf + 6*64, s10); - vst1q_f32(outbuf + 7*64, s11); - vst1q_f32(outbuf + 8*64, s12); - vst1q_f32(outbuf + 12*64, s20); - vst1q_f32(outbuf + 13*64, s21); - vst1q_f32(outbuf + 14*64, s22); - vst1q_f32(outbuf + 18*64, s30); - vst1q_f32(outbuf + 19*64, s31); - vst1q_f32(outbuf + 20*64, s32); - } - } -#elif CV_SIMD128 - CV_Assert(_FX_WINO_IBLOCK == 3 && _FX_WINO_KBLOCK == 4 && _FX_WINO_ATOM_F32 == 4); - for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++, - outbuf += _FX_WINO_ATOM_F32) - { - v_float32x4 s00 = v_setzero_f32(), s01 = s00, s02 = s00; - v_float32x4 s10 = v_setzero_f32(), s11 = s00, s12 = s00; - v_float32x4 s20 = v_setzero_f32(), s21 = s00, s22 = s00; - v_float32x4 s30 = v_setzero_f32(), s31 = s00, s32 = s00; - - for (int c = 0; c < Cg; c++, inwptr += _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32, - wptr += _FX_WINO_KBLOCK*_FX_WINO_ATOM_F32) - { - v_float32x4 x0, x1, x2; - x0 = v_load(inwptr); - x1 = v_load(inwptr + 4); - x2 = v_load(inwptr + 8); - - v_float32x4 w0 = v_load(wptr); - s00 = v_fma(w0, x0, s00); - s01 = v_fma(w0, x1, s01); - s02 = v_fma(w0, x2, s02); - - w0 = v_load(wptr + 4); - s10 = v_fma(w0, x0, s10); - s11 = v_fma(w0, x1, s11); - s12 = v_fma(w0, x2, s12); - - w0 = v_load(wptr + 8); - s20 = v_fma(w0, x0, s20); - s21 = v_fma(w0, x1, s21); - s22 = v_fma(w0, x2, s22); - - w0 = v_load(wptr + 12); - s30 = v_fma(w0, x0, s30); - s31 = v_fma(w0, x1, s31); - s32 = v_fma(w0, x2, s32); - } - - v_store(outbuf, s00); - v_store(outbuf + 1*64, s01); - v_store(outbuf + 2*64, s02); - v_store(outbuf + 3*64, s10); - v_store(outbuf + 4*64, s11); - v_store(outbuf + 5*64, s12); - v_store(outbuf + 6*64, s20); - v_store(outbuf + 7*64, s21); - v_store(outbuf + 8*64, s22); - v_store(outbuf + 9*64, s30); - v_store(outbuf + 10*64, s31); - v_store(outbuf + 11*64, s32); - } -#else - for (int 
atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; - atom_id++, outbuf += _FX_WINO_ATOM_F32) - { - float sumbuf[_FX_WINO_IBLOCK*_FX_WINO_KBLOCK*_FX_WINO_ATOM_F32]; - memset(sumbuf, 0, sizeof(sumbuf)); - for (int c = 0; c < Cg; c++, inwptr += _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32, - wptr += _FX_WINO_KBLOCK*_FX_WINO_ATOM_F32) - { - for (int i = 0; i < _FX_WINO_KBLOCK; i++) - { - for (int j = 0; j < _FX_WINO_IBLOCK; j++) - { - int i_ = i*_FX_WINO_ATOM_F32; - int j_ = j*_FX_WINO_ATOM_F32; - int ij_ = i_*_FX_WINO_IBLOCK + j_; - float s0 = inwptr[j_ + 0]*wptr[i_ + 0]; - float s1 = inwptr[j_ + 1]*wptr[i_ + 1]; - float s2 = inwptr[j_ + 2]*wptr[i_ + 2]; - float s3 = inwptr[j_ + 3]*wptr[i_ + 3]; - sumbuf[ij_ + 0] += s0; - sumbuf[ij_ + 1] += s1; - sumbuf[ij_ + 2] += s2; - sumbuf[ij_ + 3] += s3; - } - } - } - for (int ij = 0; ij < _FX_WINO_KBLOCK*_FX_WINO_IBLOCK; ij++) - { - int ij_ = ij*_FX_WINO_ATOM_F32; - int ij_out = ij*_FX_WINO_AREA; - outbuf[ij_out + 0] = sumbuf[ij_ + 0]; - outbuf[ij_out + 1] = sumbuf[ij_ + 1]; - outbuf[ij_out + 2] = sumbuf[ij_ + 2]; - outbuf[ij_out + 3] = sumbuf[ij_ + 3]; - } - } -#endif -} - -#if CV_NEON -#define T4x4(a, b, c, d, tr0, tr1) \ - tr0 = vtrnq_f32(a, b); \ - tr1 = vtrnq_f32(c, d); \ - a = vcombine_f32(vget_low_f32(tr0.val[0]), vget_low_f32(tr1.val[0])); \ - b = vcombine_f32(vget_low_f32(tr0.val[1]), vget_low_f32(tr1.val[1])); \ - c = vcombine_f32(vget_high_f32(tr0.val[0]), vget_high_f32(tr1.val[0])); \ - d = vcombine_f32(vget_high_f32(tr0.val[1]), vget_high_f32(tr1.val[1])) -#endif - -/*Input transform*/ -static void -_fx_winograd_BtXB_8x8_f32(const float* inptr, int inpstep, - float* outptr, int Cg) -{ -#if CV_NEON && CV_NEON_AARCH64 - float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4); - float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4); - float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4); - float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4); - float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4); - float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4); - float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4); - float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4); - - float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71; - - { - /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */ - /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */ - float32x4_t q5_25 = vdupq_n_f32(5.25f), t00, t01, t10, t11; - t00 = vsubq_f32(x40, x20); - t01 = vsubq_f32(x41, x21); - t10 = vsubq_f32(x30, x50); - t11 = vsubq_f32(x31, x51); - float32x4_t y00 = vfmaq_f32(vsubq_f32(x00, x60), t00, q5_25); - float32x4_t y01 = vfmaq_f32(vsubq_f32(x01, x61), t01, q5_25); - float32x4_t y70 = vfmaq_f32(vsubq_f32(x70, x10), t10, q5_25); - float32x4_t y71 = vfmaq_f32(vsubq_f32(x71, x11), t11, q5_25); - - /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */ - /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */ - float32x4_t qm4_25 = vdupq_n_f32(-4.25f); - t00 = vfmaq_f32(vaddq_f32(x10, x50), x30, qm4_25); - t01 = vfmaq_f32(vaddq_f32(x11, x51), x31, qm4_25); - t10 = vfmaq_f32(vaddq_f32(x20, x60), x40, qm4_25); - t11 = vfmaq_f32(vaddq_f32(x21, x61), x41, qm4_25); - - float32x4_t y10 = vaddq_f32(t00, t10), y11 = vaddq_f32(t01, t11); - float32x4_t y20 = vsubq_f32(t10, t00), y21 = vsubq_f32(t11, 
t01); - - /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */ - /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */ - float32x4_t q0_5 = vdupq_n_f32(0.5f), q0_25 = vdupq_n_f32(0.25f); - float32x4_t qm2_5 = vdupq_n_f32(-2.5f), qm1_25 = vdupq_n_f32(-1.25f); - t00 = vfmaq_f32(vaddq_f32(x50, x50), x10, q0_5); - t01 = vfmaq_f32(vaddq_f32(x51, x51), x11, q0_5); - t10 = vfmaq_f32(x60, x20, q0_25); - t11 = vfmaq_f32(x61, x21, q0_25); - t00 = vfmaq_f32(t00, x30, qm2_5); - t01 = vfmaq_f32(t01, x31, qm2_5); - t10 = vfmaq_f32(t10, x40, qm1_25); - t11 = vfmaq_f32(t11, x41, qm1_25); - - float32x4_t y30 = vaddq_f32(t00, t10), y31 = vaddq_f32(t01, t11); - float32x4_t y40 = vsubq_f32(t10, t00), y41 = vsubq_f32(t11, t01); - - /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */ - /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */ - float32x4_t q4 = vdupq_n_f32(4.f), qm5 = vdupq_n_f32(-5.f); - t00 = vfmaq_f32(vaddq_f32(x10, x10), x50, q0_5); - t01 = vfmaq_f32(vaddq_f32(x11, x11), x51, q0_5); - t10 = vfmaq_f32(x60, x20, q4); - t11 = vfmaq_f32(x61, x21, q4); - t00 = vfmaq_f32(t00, x30, qm2_5); - t01 = vfmaq_f32(t01, x31, qm2_5); - t10 = vfmaq_f32(t10, x40, qm5); - t11 = vfmaq_f32(t11, x41, qm5); - - float32x4_t y50 = vaddq_f32(t00, t10), y51 = vaddq_f32(t01, t11); - float32x4_t y60 = vsubq_f32(t10, t00), y61 = vsubq_f32(t11, t01); - - /* transpose 8x8 matrix in-place with some renumeration of the elements: */ - /* Y: */ - /* y00 y01 */ - /* y10 y11 */ - /* ... */ - /* y70 y71 */ - /* Y': */ - /* y00 y40 */ - /* y10 y50 */ - /* y20 y60 */ - /* y30 y70 */ - /* y01 y41 */ - /* y11 y51 */ - /* y21 y61 */ - /* y31 y71 */ - /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */ - float32x4x2_t tr0, tr1; - - T4x4(y00, y10, y20, y30, tr0, tr1); - T4x4(y01, y11, y21, y31, tr0, tr1); - T4x4(y40, y50, y60, y70, tr0, tr1); - T4x4(y41, y51, y61, y71, tr0, tr1); - - /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */ - /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */ - t00 = vsubq_f32(y01, y20); - t01 = vsubq_f32(y41, y60); - t10 = vsubq_f32(y30, y11); - t11 = vsubq_f32(y70, y51); - z00 = vfmaq_f32(vsubq_f32(y00, y21), t00, q5_25); - z01 = vfmaq_f32(vsubq_f32(y40, y61), t01, q5_25); - z70 = vfmaq_f32(vsubq_f32(y31, y10), t10, q5_25); - z71 = vfmaq_f32(vsubq_f32(y71, y50), t11, q5_25); - - /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */ - /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */ - t00 = vfmaq_f32(vaddq_f32(y10, y11), y30, qm4_25); - t01 = vfmaq_f32(vaddq_f32(y50, y51), y70, qm4_25); - t10 = vfmaq_f32(vaddq_f32(y20, y21), y01, qm4_25); - t11 = vfmaq_f32(vaddq_f32(y60, y61), y41, qm4_25); - - z10 = vaddq_f32(t00, t10); z11 = vaddq_f32(t01, t11); - z20 = vsubq_f32(t10, t00); z21 = vsubq_f32(t11, t01); - - /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */ - /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */ - t00 = vfmaq_f32(vaddq_f32(y11, y11), y10, q0_5); - t01 = vfmaq_f32(vaddq_f32(y51, y51), y50, q0_5); - t10 = vfmaq_f32(y21, y20, q0_25); - t11 = vfmaq_f32(y61, y60, q0_25); - t00 = vfmaq_f32(t00, y30, qm2_5); - t01 = vfmaq_f32(t01, y70, qm2_5); - t10 = vfmaq_f32(t10, y01, qm1_25); - t11 = vfmaq_f32(t11, y41, qm1_25); - - z30 = vaddq_f32(t00, t10); z31 = vaddq_f32(t01, t11); - z40 = vsubq_f32(t10, t00); z41 = vsubq_f32(t11, t01); - - /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */ - /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */ - t00 = 
vfmaq_f32(vaddq_f32(y10, y10), y11, q0_5); - t01 = vfmaq_f32(vaddq_f32(y50, y50), y51, q0_5); - t10 = vfmaq_f32(y21, y20, q4); - t11 = vfmaq_f32(y61, y60, q4); - t00 = vfmaq_f32(t00, y30, qm2_5); - t01 = vfmaq_f32(t01, y70, qm2_5); - t10 = vfmaq_f32(t10, y01, qm5); - t11 = vfmaq_f32(t11, y41, qm5); - - z50 = vaddq_f32(t00, t10); z51 = vaddq_f32(t01, t11); - z60 = vsubq_f32(t10, t00); z61 = vsubq_f32(t11, t01); - } - - const int outstep = _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32*Cg; - - vst1q_f32(outptr, z00); - vst1q_f32(outptr + outstep, z01); - vst1q_f32(outptr + outstep*2, z10); - vst1q_f32(outptr + outstep*3, z11); - vst1q_f32(outptr + outstep*4, z20); - vst1q_f32(outptr + outstep*5, z21); - vst1q_f32(outptr + outstep*6, z30); - vst1q_f32(outptr + outstep*7, z31); - vst1q_f32(outptr + outstep*8, z40); - vst1q_f32(outptr + outstep*9, z41); - vst1q_f32(outptr + outstep*10, z50); - vst1q_f32(outptr + outstep*11, z51); - vst1q_f32(outptr + outstep*12, z60); - vst1q_f32(outptr + outstep*13, z61); - vst1q_f32(outptr + outstep*14, z70); - vst1q_f32(outptr + outstep*15, z71); -#elif CV_SIMD128 - v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4); - v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4); - v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4); - v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4); - v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4); - v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4); - v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4); - v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4); - - v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71; - - { - /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */ - /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */ - v_float32x4 q5_25 = v_setall_f32(5.25f), t00, t01, t10, t11; - t00 = x40 - x20; - t01 = x41 - x21; - t10 = x30 - x50; - t11 = x31 - x51; - v_float32x4 y00 = v_fma(t00, q5_25, x00 - x60); - v_float32x4 y01 = v_fma(t01, q5_25, x01 - x61); - v_float32x4 y70 = v_fma(t10, q5_25, x70 - x10); - v_float32x4 y71 = v_fma(t11, q5_25, x71 - x11); - - /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */ - /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */ - v_float32x4 qm4_25 = v_setall_f32(-4.25f); - t00 = v_fma(x30, qm4_25, x10 + x50); - t01 = v_fma(x31, qm4_25, x11 + x51); - t10 = v_fma(x40, qm4_25, x20 + x60); - t11 = v_fma(x41, qm4_25, x21 + x61); - - v_float32x4 y10 = t00 + t10, y11 = t01 + t11; - v_float32x4 y20 = t10 - t00, y21 = t11 - t01; - - /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */ - /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */ - v_float32x4 q0_5 = v_setall_f32(0.5f), q0_25 = v_setall_f32(0.25f); - v_float32x4 qm2_5 = v_setall_f32(-2.5f), qm1_25 = v_setall_f32(-1.25f); - t00 = v_fma(x10, q0_5, x50 + x50); - t01 = v_fma(x11, q0_5, x51 + x51); - t10 = v_fma(x20, q0_25, x60); - t11 = v_fma(x21, q0_25, x61); - t00 = v_fma(x30, qm2_5, t00); - t01 = v_fma(x31, qm2_5, t01); - t10 = v_fma(x40, qm1_25, t10); - t11 = v_fma(x41, qm1_25, t11); - - v_float32x4 y30 = t00 + t10, y31 = t01 + t11; - v_float32x4 y40 = t10 - t00, y41 = t11 - t01; - - /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */ - /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */ - v_float32x4 q4 = v_setall_f32(4.f), 
qm5 = v_setall_f32(-5.f); - t00 = v_fma(x50, q0_5, x10 + x10); - t01 = v_fma(x51, q0_5, x11 + x11); - t10 = v_fma(x20, q4 , x60); - t11 = v_fma(x21, q4 , x61); - t00 = v_fma(x30, qm2_5, t00); - t01 = v_fma(x31, qm2_5, t01); - t10 = v_fma(x40, qm5 , t10); - t11 = v_fma(x41, qm5 , t11); - - v_float32x4 y50 = t00 + t10, y51 = t01 + t11; - v_float32x4 y60 = t10 - t00, y61 = t11 - t01; - - /* transpose 8x8 matrix in-place with some renumeration of the elements: */ - /* Y: */ - /* y00 y01 */ - /* y10 y11 */ - /* ... */ - /* y70 y71 */ - /* Y': */ - /* y00 y40 */ - /* y10 y50 */ - /* y20 y60 */ - /* y30 y70 */ - /* y01 y41 */ - /* y11 y51 */ - /* y21 y61 */ - /* y31 y71 */ - /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */ - - v_transpose4x4(y00, y10, y20, y30, y00, y10, y20, y30); - v_transpose4x4(y01, y11, y21, y31, y01, y11, y21, y31); - v_transpose4x4(y40, y50, y60, y70, y40, y50, y60, y70); - v_transpose4x4(y41, y51, y61, y71, y41, y51, y61, y71); - - /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */ - /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */ - t00 = y01 - y20; - t01 = y41 - y60; - t10 = y30 - y11; - t11 = y70 - y51; - z00 = v_fma(t00, q5_25, y00 - y21); - z01 = v_fma(t01, q5_25, y40 - y61); - z70 = v_fma(t10, q5_25, y31 - y10); - z71 = v_fma(t11, q5_25, y71 - y50); - - /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */ - /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */ - t00 = v_fma(y30, qm4_25, y10 + y11); - t01 = v_fma(y70, qm4_25, y50 + y51); - t10 = v_fma(y01, qm4_25, y20 + y21); - t11 = v_fma(y41, qm4_25, y60 + y61); - - z10 = t00 + t10; z11 = t01 + t11; - z20 = t10 - t00; z21 = t11 - t01; - - /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */ - /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */ - t00 = v_fma(y10, q0_5, y11 + y11); - t01 = v_fma(y50, q0_5, y51 + y51); - t10 = v_fma(y20, q0_25, y21); - t11 = v_fma(y60, q0_25, y61); - t00 = v_fma(y30, qm2_5, t00); - t01 = v_fma(y70, qm2_5, t01); - t10 = v_fma(y01, qm1_25, t10); - t11 = v_fma(y41, qm1_25, t11); - - z30 = t00 + t10; z31 = t01 + t11; - z40 = t10 - t00; z41 = t11 - t01; - - /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */ - /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */ - t00 = v_fma(y11, q0_5, y10 + y10); - t01 = v_fma(y51, q0_5, y50 + y50); - t10 = v_fma(y20, q4, y21); - t11 = v_fma(y60, q4, y61); - t00 = v_fma(y30, qm2_5, t00); - t01 = v_fma(y70, qm2_5, t01); - t10 = v_fma(y01, qm5, t10); - t11 = v_fma(y41, qm5, t11); - - z50 = t00 + t10; z51 = t01 + t11; - z60 = t10 - t00; z61 = t11 - t01; - } - - const int outstep = _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32*Cg; - - v_store(outptr, z00); - v_store(outptr + outstep, z01); - v_store(outptr + outstep*2, z10); - v_store(outptr + outstep*3, z11); - v_store(outptr + outstep*4, z20); - v_store(outptr + outstep*5, z21); - v_store(outptr + outstep*6, z30); - v_store(outptr + outstep*7, z31); - v_store(outptr + outstep*8, z40); - v_store(outptr + outstep*9, z41); - v_store(outptr + outstep*10, z50); - v_store(outptr + outstep*11, z51); - v_store(outptr + outstep*12, z60); - v_store(outptr + outstep*13, z61); - v_store(outptr + outstep*14, z70); - v_store(outptr + outstep*15, z71); -#else -#error "Only SIMD128, AVX2 and NEON are supported in Winograd." 
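/* For reference, a minimal scalar sketch of the 8x8 Winograd input transform
   computed above (V = Bt * d * Bt'), assuming the Bt rows listed in the
   Y[0]..Y[7] comments and ignoring the transposed/renumbered register layout
   the vectorized code uses internally. The name and the dense 8x8 output are
   illustrative only; the code above scatters its result with an outstep of
   _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32*Cg so that _fx_winograd_accum_f32 can
   consume it blockwise. */
static void winograd_BtXB_8x8_ref(const float* d, int dstep, float* V)
{
    static const float Bt[8][8] = {
        {1.f,  0.f,   -5.25f,  0.f,    5.25f,  0.f,    -1.f, 0.f},
        {0.f,  1.f,    1.f,   -4.25f, -4.25f,  1.f,     1.f, 0.f},
        {0.f, -1.f,    1.f,    4.25f, -4.25f, -1.f,     1.f, 0.f},
        {0.f,  0.5f,   0.25f, -2.5f,  -1.25f,  2.f,     1.f, 0.f},
        {0.f, -0.5f,   0.25f,  2.5f,  -1.25f, -2.f,     1.f, 0.f},
        {0.f,  2.f,    4.f,   -2.5f,  -5.f,    0.5f,    1.f, 0.f},
        {0.f, -2.f,    4.f,    2.5f,  -5.f,   -0.5f,    1.f, 0.f},
        {0.f, -1.f,    0.f,    5.25f,  0.f,   -5.25f,   0.f, 1.f}
    };
    float t[8][8];
    // pass 1 (rows): t = Bt * d
    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++)
        {
            float s = 0.f;
            for (int k = 0; k < 8; k++)
                s += Bt[i][k] * d[k*dstep + j];
            t[i][j] = s;
        }
    // pass 2 (columns): V = t * Bt^T
    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++)
        {
            float s = 0.f;
            for (int k = 0; k < 8; k++)
                s += t[i][k] * Bt[j][k];
            V[i*8 + j] = s;
        }
}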
-#endif -} - -/* Inverse Winograd 8x8 transform: - out = (A'*inp*A)', where - inp is input 8x8 FP32 matrix, - A' is - [1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, - 0.f, 1.f, -1.f, 2.f, -2.f, 0.5f, -0.5f, 0.f, - 0.f, 1.f, 1.f, 4.f, 4.f, 0.25f, 0.25f, 0.f, - 0.f, 1.f, -1.f, 8.f, -8.f, 0.125f, -0.125f, 0.f, - 0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f, - 0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f] - - inp is pre-loaded into xij registers, - out will be stored in zij, where (0<=i<=7 for x, 0<=i<=5 for z), 0<=j<=1. - - After the inverse transform is done, we add bias, - optionally add results from the earlier tensors (by-pass), - optionally apply activation function and then - store the final results. - - Note that both _FX_WINOGRAD_FWD_8x8() and - _FX_WINOGRAD_INV_8x8() produce tranposed output. - That is, after both forward and then inverse transformation, - we get non-transposed result. - Of course, for the correct work of Winograd-based convolution, - the Winograd-transformed weights should also be transposed. - init_conv() (see OpConv.fx) takes care of that. -*/ -static void -_fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep, - float* bpptr, int bpstep, float* outptr, int outstep, - float bias, float minval, float maxval, bool ifMinMaxAct) -{ -#if CV_NEON && CV_NEON_AARCH64 - float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4); - float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4); - float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4); - float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4); - float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4); - float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4); - float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4); - float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4); - float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51; - - { - float32x4_t s12_0, s12_1, s34_0, s34_1, s56_0, s56_1; - s12_0 = vaddq_f32(x10, x20); s12_1 = vaddq_f32(x11, x21); - s34_0 = vaddq_f32(x30, x40); s34_1 = vaddq_f32(x31, x41); - s56_0 = vaddq_f32(x50, x60); s56_1 = vaddq_f32(x51, x61); - - float32x4_t y00 = vaddq_f32(vaddq_f32(vaddq_f32(x00, s12_0), s34_0), s56_0); - float32x4_t y01 = vaddq_f32(vaddq_f32(vaddq_f32(x01, s12_1), s34_1), s56_1); - float32x4_t y20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f); - float32x4_t y21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f); - float32x4_t y40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16); - float32x4_t y41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16); - - s12_0 = vsubq_f32(x10, x20); s12_1 = vsubq_f32(x11, x21); - s34_0 = vsubq_f32(x30, x40); s34_1 = vsubq_f32(x31, x41); - s56_0 = vsubq_f32(x50, x60); s56_1 = vsubq_f32(x51, x61); - - float32x4_t y50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x70, s12_0), - s34_0, 32.f), s56_0, 1.f/32); - float32x4_t y51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x71, s12_1), - s34_1, 32.f), s56_1, 1.f/32); - float32x4_t y10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f); - float32x4_t y11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f); - float32x4_t y30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f); - float32x4_t y31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f); - float32x4_t y60 = vdupq_n_f32(0.f), y61 = y60, 
y70 = y60, y71 = y60; - - /* transpose 8x8 matrix in-place with some renumeration of the elements: */ - /* Y: */ - /* y00 y01 */ - /* y10 y11 */ - /* ... */ - /* y50 y51 */ - /* 0 0 */ - /* 0 0 */ - /* Y': */ - /* y00 y40 */ - /* y10 y50 */ - /* y20 y60 */ - /* y30 y70 */ - /* y01 y41 */ - /* y11 y51 */ - /* y21 y61 */ - /* y31 y71 */ - /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */ - float32x4x2_t tr0, tr1; - - T4x4(y00, y10, y20, y30, tr0, tr1); - T4x4(y01, y11, y21, y31, tr0, tr1); - T4x4(y40, y50, y60, y70, tr0, tr1); - T4x4(y41, y51, y61, y71, tr0, tr1); - - s12_0 = vaddq_f32(y10, y20); s12_1 = vaddq_f32(y50, y60); - s34_0 = vaddq_f32(y30, y01); s34_1 = vaddq_f32(y70, y41); - s56_0 = vaddq_f32(y11, y21); s56_1 = vaddq_f32(y51, y61); - - z00 = vaddq_f32(vaddq_f32(vaddq_f32(y00, s12_0), s34_0), s56_0); - z01 = vaddq_f32(vaddq_f32(vaddq_f32(y40, s12_1), s34_1), s56_1); - z20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f); - z21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f); - z40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16); - z41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16); - - s12_0 = vsubq_f32(y10, y20); s12_1 = vsubq_f32(y50, y60); - s34_0 = vsubq_f32(y30, y01); s34_1 = vsubq_f32(y70, y41); - s56_0 = vsubq_f32(y11, y21); s56_1 = vsubq_f32(y51, y61); - - z50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y31, s12_0), - s34_0, 32.f), s56_0, 1.f/32); - z51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y71, s12_1), - s34_1, 32.f), s56_1, 1.f/32); - z10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f); - z11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f); - z30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f); - z31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f); - float32x4_t vbias = vdupq_n_f32(bias); - - z00 = vaddq_f32(z00, vbias); - z01 = vaddq_f32(z01, vbias); - z10 = vaddq_f32(z10, vbias); - z11 = vaddq_f32(z11, vbias); - z20 = vaddq_f32(z20, vbias); - z21 = vaddq_f32(z21, vbias); - z30 = vaddq_f32(z30, vbias); - z31 = vaddq_f32(z31, vbias); - z40 = vaddq_f32(z40, vbias); - z41 = vaddq_f32(z41, vbias); - z50 = vaddq_f32(z50, vbias); - z51 = vaddq_f32(z51, vbias); - } - - if (bpptr) - { - float32x2_t zhalf = vdup_n_f32(0.f); - z00 = vaddq_f32(z00, vld1q_f32(bpptr)); - z01 = vaddq_f32(z01, vcombine_f32(vld1_f32(bpptr + 4), zhalf)); - z10 = vaddq_f32(z10, vld1q_f32(bpptr + bpstep)); - z11 = vaddq_f32(z11, vcombine_f32(vld1_f32(bpptr + bpstep + 4), zhalf)); - z20 = vaddq_f32(z20, vld1q_f32(bpptr + bpstep*2)); - z21 = vaddq_f32(z21, vcombine_f32(vld1_f32(bpptr + bpstep*2 + 4), zhalf)); - z30 = vaddq_f32(z30, vld1q_f32(bpptr + bpstep*3)); - z31 = vaddq_f32(z31, vcombine_f32(vld1_f32(bpptr + bpstep*3 + 4), zhalf)); - z40 = vaddq_f32(z40, vld1q_f32(bpptr + bpstep*4)); - z41 = vaddq_f32(z41, vcombine_f32(vld1_f32(bpptr + bpstep*4 + 4), zhalf)); - z50 = vaddq_f32(z50, vld1q_f32(bpptr + bpstep*5)); - z51 = vaddq_f32(z51, vcombine_f32(vld1_f32(bpptr + bpstep*5 + 4), zhalf)); - } - - if (ifMinMaxAct) - { - float32x4_t vmax = vdupq_n_f32(maxval); - float32x4_t vmin = vdupq_n_f32(minval); - - z00 = vminq_f32(vmaxq_f32(z00, vmin), vmax); - z01 = vminq_f32(vmaxq_f32(z01, vmin), vmax); - z10 = vminq_f32(vmaxq_f32(z10, vmin), vmax); - z11 = vminq_f32(vmaxq_f32(z11, vmin), vmax); - z20 = vminq_f32(vmaxq_f32(z20, vmin), vmax); - z21 = vminq_f32(vmaxq_f32(z21, vmin), vmax); - z30 = vminq_f32(vmaxq_f32(z30, vmin), vmax); - z31 = vminq_f32(vmaxq_f32(z31, vmin), vmax); - z40 = 
vminq_f32(vmaxq_f32(z40, vmin), vmax); - z41 = vminq_f32(vmaxq_f32(z41, vmin), vmax); - z50 = vminq_f32(vmaxq_f32(z50, vmin), vmax); - z51 = vminq_f32(vmaxq_f32(z51, vmin), vmax); - } - - vst1q_f32(outptr, z00); - vst1_f32(outptr + 4, vget_low_f32(z01)); - vst1q_f32(outptr + outstep, z10); - vst1_f32(outptr + outstep + 4, vget_low_f32(z11)); - vst1q_f32(outptr + outstep*2, z20); - vst1_f32(outptr + outstep*2 + 4, vget_low_f32(z21)); - vst1q_f32(outptr + outstep*3, z30); - vst1_f32(outptr + outstep*3 + 4, vget_low_f32(z31)); - vst1q_f32(outptr + outstep*4, z40); - vst1_f32(outptr + outstep*4 + 4, vget_low_f32(z41)); - vst1q_f32(outptr + outstep*5, z50); - vst1_f32(outptr + outstep*5 + 4, vget_low_f32(z51)); -#elif CV_SIMD128 - v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4); - v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4); - v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4); - v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4); - v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4); - v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4); - v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4); - v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4); - v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51; - - { - v_float32x4 s12_0, s12_1, s34_0, s34_1, s56_0, s56_1; - s12_0 = x10 + x20; s12_1 = x11 + x21; - s34_0 = x30 + x40; s34_1 = x31 + x41; - s56_0 = x50 + x60; s56_1 = x51 + x61; - - v_float32x4 y00 = x00 + s12_0 + s34_0 + s56_0; - v_float32x4 y01 = x01 + s12_1 + s34_1 + s56_1; - - v_float32x4 a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f); - v_float32x4 y20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - v_float32x4 y21 = v_fma(s56_1, a0 ,v_fma(s34_1, a1, s12_1) ); - - a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f); - v_float32x4 y40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - v_float32x4 y41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - s12_0 = x10 - x20; s12_1 = x11 - x21; - s34_0 = x30 - x40; s34_1 = x31 - x41; - s56_0 = x50 - x60; s56_1 = x51 - x61; - - a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.f); - v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, x70 + s12_0)); - v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, x71 + s12_1)); - - a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.f); - v_float32x4 y10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - v_float32x4 y11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.f); - v_float32x4 y30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - v_float32x4 y31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - v_float32x4 y60 = v_setall_f32(0.f), y61 = y60, y70 = y60, y71 = y60; - - /* transpose 8x8 matrix in-place with some renumeration of the elements: */ - /* Y: */ - /* y00 y01 */ - /* y10 y11 */ - /* ... 
*/ - /* y50 y51 */ - /* 0 0 */ - /* 0 0 */ - /* Y': */ - /* y00 y40 */ - /* y10 y50 */ - /* y20 y60 */ - /* y30 y70 */ - /* y01 y41 */ - /* y11 y51 */ - /* y21 y61 */ - /* y31 y71 */ - /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */ - - v_transpose4x4(y00, y10, y20, y30, y00, y10, y20, y30); - v_transpose4x4(y01, y11, y21, y31, y01, y11, y21, y31); - v_transpose4x4(y40, y50, y60, y70, y40, y50, y60, y70); - v_transpose4x4(y41, y51, y61, y71, y41, y51, y61, y71); - - s12_0 = y10 + y20; s12_1 = y50 + y60; - s34_0 = y30 + y01; s34_1 = y70 + y41; - s56_0 = y11 + y21; s56_1 = y51 + y61; - - z00 = y00 + s12_0 + s34_0 + s56_0; - z01 = y40 + s12_1 + s34_1 + s56_1; - - a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f); - z20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - z21 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f); - z40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - z41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - s12_0 = y10 - y20; s12_1 = y50 - y60; - s34_0 = y30 - y01; s34_1 = y70 - y41; - s56_0 = y11 - y21; s56_1 = y51 - y61; - - a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.0f); - z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, y31 + s12_0)); - z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, y71 + s12_1)); - - a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.0f); - z10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - z11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.0f); - z30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - z31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - v_float32x4 vbias = v_setall_f32(bias); - z00 += vbias; - z01 += vbias; - z10 += vbias; - z11 += vbias; - z20 += vbias; - z21 += vbias; - z30 += vbias; - z31 += vbias; - z40 += vbias; - z41 += vbias; - z50 += vbias; - z51 += vbias; - } - - if (bpptr) - { - z00 += v_load(bpptr); - z01 += v_load_low(bpptr + 4); - z10 += v_load(bpptr + bpstep); - z11 += v_load_low(bpptr + bpstep + 4); - z20 += v_load(bpptr + bpstep*2); - z21 += v_load_low(bpptr + bpstep*2 + 4); - z30 += v_load(bpptr + bpstep*3); - z31 += v_load_low(bpptr + bpstep*3 + 4); - z40 += v_load(bpptr + bpstep*4); - z41 += v_load_low(bpptr + bpstep*4 + 4); - z50 += v_load(bpptr + bpstep*5); - z51 += v_load_low(bpptr + bpstep*5 + 4); - } - - if (ifMinMaxAct) - { - v_float32x4 vmax = v_setall_f32(maxval); - v_float32x4 vmin = v_setall_f32(minval); - - z00 = v_min(v_max(z00, vmin), vmax); - z01 = v_min(v_max(z01, vmin), vmax); - z10 = v_min(v_max(z10, vmin), vmax); - z11 = v_min(v_max(z11, vmin), vmax); - z20 = v_min(v_max(z20, vmin), vmax); - z21 = v_min(v_max(z21, vmin), vmax); - z30 = v_min(v_max(z30, vmin), vmax); - z31 = v_min(v_max(z31, vmin), vmax); - z40 = v_min(v_max(z40, vmin), vmax); - z41 = v_min(v_max(z41, vmin), vmax); - z50 = v_min(v_max(z50, vmin), vmax); - z51 = v_min(v_max(z51, vmin), vmax); - } - - v_store(outptr, z00); - v_store_low(outptr + 4, z01); - v_store(outptr + outstep, z10); - v_store_low(outptr + outstep + 4, z11); - v_store(outptr + outstep*2, z20); - v_store_low(outptr + outstep*2 + 4, z21); - v_store(outptr + outstep*3, z30); - v_store_low(outptr + outstep*3 + 4, z31); - v_store(outptr + outstep*4, z40); - v_store_low(outptr + outstep*4 + 4, z41); - v_store(outptr + outstep*5, z50); - v_store_low(outptr + outstep*5 + 4, z51); -#else -#error "Only SIMD128, AVX2 and NEON are supported in Winograd." 
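/* Likewise, a minimal scalar sketch of the inverse transform above
   (Y = At * m * At', then bias, optional by-pass add and optional clamping),
   assuming the At rows given in the block comment before
   _fx_winograd_AtXA_8x8_f32 and ignoring the transposition bookkeeping it
   describes. The name and the dense layouts are illustrative; in the real
   code the caller (runWinograd63) stages partial output tiles through a
   temporary buffer. */
static void winograd_AtXA_8x8_ref(const float* m, int mstep,
                                  const float* bp, int bpstep,
                                  float* out, int outstep,
                                  float bias, float minval, float maxval,
                                  bool ifMinMaxAct)
{
    static const float At[6][8] = {
        {1.f, 1.f,  1.f,   1.f,   1.f,   1.f,      1.f,      0.f},
        {0.f, 1.f, -1.f,   2.f,  -2.f,   0.5f,    -0.5f,     0.f},
        {0.f, 1.f,  1.f,   4.f,   4.f,   0.25f,    0.25f,    0.f},
        {0.f, 1.f, -1.f,   8.f,  -8.f,   0.125f,  -0.125f,   0.f},
        {0.f, 1.f,  1.f,  16.f,  16.f,   1.f/16,   1.f/16,   0.f},
        {0.f, 1.f, -1.f,  32.f, -32.f,   1.f/32,  -1.f/32,   1.f}
    };
    float t[6][8];
    // pass 1 (rows): t = At * m
    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 8; j++)
        {
            float s = 0.f;
            for (int k = 0; k < 8; k++)
                s += At[i][k] * m[k*mstep + j];
            t[i][j] = s;
        }
    // pass 2 (columns): out = t * At^T, followed by the epilogue fused above
    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
        {
            float s = bias;
            for (int k = 0; k < 8; k++)
                s += t[i][k] * At[j][k];
            if (bp)
                s += bp[i*bpstep + j];
            if (ifMinMaxAct)
                s = s < minval ? minval : (s > maxval ? maxval : s);
            out[i*outstep + j] = s;
        }
}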
-#endif -} - -int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr& conv, - int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct) -{ - Mat input = _input.getMat(); - Mat output = _output.getMat(); - Mat fusedAddMat = _fusedAddMat.getMat(); - - MatShape inputShape = shape(input); - MatShape outputShape = shape(output); - CV_Assert(inputShape.size() == 4 && outputShape.size() == 4); - - int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3]; // [N, C, H, W] - int K = conv->K; - int H0 = outputShape[2], W0 = outputShape[3]; - - int pad_top = conv->pad_top; - int pad_left = conv->pad_left; - - int ngroups = conv->ngroups, Cg = C/ngroups, Kg = K/ngroups; - int Kg_nblocks = (Kg + _FX_WINO_KBLOCK - 1)/_FX_WINO_KBLOCK; - const size_t inp_planesize = (size_t)Hi*Wi; - const size_t out_planesize = (size_t)H0*W0; - - int blocks_per_row = (W0+_FX_WINO_STEP-1)/_FX_WINO_STEP; - int blocks_per_plane = ((H0+_FX_WINO_STEP-1)/_FX_WINO_STEP)*blocks_per_row; - int blocks_per_plane_aligned = ((blocks_per_plane + - _FX_WINO_IBLOCK-1)/_FX_WINO_IBLOCK)*_FX_WINO_IBLOCK; - - size_t totalbufsize = (size_t)N*C*blocks_per_plane_aligned*_FX_WINO_AREA; - - AutoBuffer _buf; - _buf.allocate(totalbufsize + VEC_ALIGN); - float* wbuf_all = alignPtr(_buf.data(), VEC_ALIGN); - - float* inp = input.ptr(); - float* out = output.ptr(); - - float* fusedAddPtr = fusedAddMat.empty() ? nullptr : fusedAddMat.ptr(); - - // Phase 1. compute forward Winograd transforms for all input blocks, - // all input planes, all samples in the batch. - // [TODO]: maybe, if there are too many input channels, it makes sense to - // transform only part of input channels at once and then compute the partial - // accumulated sums (i.e. update the output buffers several times, - // rather than compute them in one pass). - parallel_for_(Range(0, ntasks), [&](const Range& r0) { - for (int task_id = r0.start; task_id < r0.end; task_id++) - { - int nc0 = (N*C)*task_id/ntasks; - int nc1 = (N*C)*(task_id+1)/ntasks; - for(; nc0 < nc1; nc0++) - { - int n = nc0 / C; - int c = nc0 - n*C; - int g = c / Cg; - c -= g*Cg; - for (int block_id = 0; block_id < blocks_per_plane; block_id += _FX_WINO_IBLOCK) - { - for (int db = 0; db < _FX_WINO_IBLOCK; db++) - { - size_t inwofs = ((n*ngroups + g)*blocks_per_plane_aligned + - block_id)*Cg*_FX_WINO_AREA + - (c*_FX_WINO_IBLOCK + db)*_FX_WINO_ATOM_F32; - float* inwptr = (float*)wbuf_all + inwofs; - - if (block_id + db < blocks_per_plane) - { - int y0 = (block_id + db) / blocks_per_row; - int x0 = (block_id + db) - y0 * blocks_per_row; - y0 = y0*_FX_WINO_STEP - pad_top; - x0 = x0*_FX_WINO_STEP - pad_left; - bool partial = y0 < 0 || y0 + _FX_WINO_SIZE > Hi || - x0 < 0 || x0 + _FX_WINO_SIZE > Wi; - int dx1 = 0, dx2 = _FX_WINO_SIZE, dy1 = 0, dy2 = _FX_WINO_SIZE; - int inpstep = Wi; - - float inpbuf[_FX_WINO_AREA]; - float* inptr0 = (float*)inp + nc0*inp_planesize + y0*Wi + x0; - float* inptr = inptr0; - - if (partial) - { - memset(inpbuf, 0, sizeof(inpbuf)); - dy1 = -y0 > 0 ? -y0 : 0; - dy2 = Hi - y0 < _FX_WINO_SIZE ? Hi - y0 : _FX_WINO_SIZE; - - if (dy2 < dy1) {dy2 = dy1 = 0;} - dx1 = -x0 > 0 ? -x0 : 0; - dx2 = Wi - x0 < _FX_WINO_SIZE ? 
Wi - x0 : _FX_WINO_SIZE; - - if (dx2 < dx1) {dx2 = dx1 = 0;} - inptr0 -= y0*Wi + x0; - - if (dx1 < dx2 && dy1 < dy2) - { - for(int dy = dy1; dy < dy2; dy++) - memcpy(&inpbuf[dy*_FX_WINO_SIZE + dx1], - inptr0 + (y0+dy)*Wi + (x0+dx1), - (dx2-dx1)*sizeof(inpbuf[0])); - } - - inptr = inpbuf; - inpstep = _FX_WINO_SIZE; - } -#if CV_TRY_AVX2 - if (conv->useAVX2) - opt_AVX2::_fx_winograd_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg); - else -#endif - _fx_winograd_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg); - } - else - { - for (int i = 0; i < _FX_WINO_NATOMS_F32; i++, inwptr += _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32) - memset(inwptr, 0, _FX_WINO_ATOM_F32*sizeof(inwptr[0])); - } - } - } - } - }}); - - // Phase 2. compute elemwise-weighted sums of transformed blocks, - // apply inverse Winograd transforms to the sums, - // add bias, apply activation function if any and store the results. - parallel_for_(Range(0, ntasks), [&](const Range& r0) { - for (int task_id = r0.start; task_id < r0.end; task_id++) - { - size_t out_wbuf_size = _FX_WINO_AREA*_FX_WINO_KBLOCK*_FX_WINO_IBLOCK; - size_t outbuf_size = _FX_WINO_AREA; - AutoBuffer out_wbuf_, outbuf_; - out_wbuf_.allocate(out_wbuf_size + VEC_ALIGN); - float* out_wbuf = alignPtr(out_wbuf_.data(), VEC_ALIGN); - outbuf_.allocate(outbuf_size + VEC_ALIGN); - float* outbuf = alignPtr(outbuf_.data(), VEC_ALIGN); - - memset(out_wbuf, 0, out_wbuf_size * sizeof(float)); - memset(outbuf, 0, outbuf_size * sizeof(float)); - - int ngk0 = (int)(((int64_t)N*Kg_nblocks*ngroups)*task_id/ntasks); - int ngk1 = (int)(((int64_t)N*Kg_nblocks*ngroups)*(task_id+1)/ntasks); - - for(; ngk0 < ngk1; ngk0++) - { - int n = ngk0 / (Kg_nblocks*ngroups); - int gk0 = ngk0 % (Kg_nblocks*ngroups); - int g = gk0 / Kg_nblocks; - int k0 = (gk0 % Kg_nblocks)*_FX_WINO_KBLOCK; - int k1 = k0 + _FX_WINO_KBLOCK <= Kg ? k0 + _FX_WINO_KBLOCK : Kg; - - for (int block_id0 = 0; block_id0 < blocks_per_plane; block_id0 += _FX_WINO_IBLOCK) - { - int block_id1 = block_id0 + _FX_WINO_IBLOCK; - block_id1 = block_id1 < blocks_per_plane ? block_id1 : blocks_per_plane; - size_t inwofs = ((n*ngroups + g)*blocks_per_plane_aligned + block_id0)*Cg*_FX_WINO_AREA; - size_t wofs = (g*Kg_nblocks*_FX_WINO_KBLOCK + k0)*Cg*_FX_WINO_AREA; - - float* inwptr = wbuf_all + inwofs; - const float* wptr = conv->weightsWinoBufPtr + wofs; - -#if CV_TRY_AVX2 - if (conv->useAVX2) - opt_AVX2::_fx_winograd_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0); - else -#endif - _fx_winograd_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0); - for (int k = k0; k < k1; k++) - { - float biasv = conv->biasBuf[g*Kg + k]; - for (int block_id = block_id0; block_id < block_id1; block_id++) - { - int y0 = block_id / blocks_per_row; - int x0 = block_id - y0 * blocks_per_row; - y0 = y0*_FX_WINO_STEP; - x0 = x0*_FX_WINO_STEP; - int dy1 = H0 - y0; - if (dy1 > _FX_WINO_STEP) dy1 = _FX_WINO_STEP; - int dx1 = W0 - x0; - if (dx1 > _FX_WINO_STEP) dx1 = _FX_WINO_STEP; - assert(dx1 > 0 && dy1 > 0); - bool partial = activ || dy1 < _FX_WINO_STEP || dx1 < _FX_WINO_STEP; - size_t outofs = (n*K + g*Kg + k)*out_planesize + y0*W0 + x0; - int outstep = W0; - - float* outptr0 = (float*)out + outofs; - float* pbptr0 = fusedAddPtr ? 
fusedAddPtr + outofs : nullptr; - float *outptr = outptr0, *bpptr = pbptr0; - - if (partial) - { - outptr = outbuf; - outstep = _FX_WINO_SIZE; - if (pbptr0) - { - bpptr = outbuf; - for (int y = 0; y < dy1; y++) - memcpy(outbuf + y*_FX_WINO_SIZE, pbptr0 + y*W0, - dx1*sizeof(pbptr0[0])); - } - } -#if CV_TRY_AVX2 - if (conv->useAVX2) - opt_AVX2::_fx_winograd_AtXA_8x8_f32(out_wbuf + ((k - k0)*_FX_WINO_IBLOCK + (block_id - block_id0))*_FX_WINO_AREA, _FX_WINO_SIZE, - bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct); - else -#endif - _fx_winograd_AtXA_8x8_f32(out_wbuf + ((k - k0)*_FX_WINO_IBLOCK + (block_id - block_id0))*_FX_WINO_AREA, _FX_WINO_SIZE, - bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct); - if (partial) - { - if (activ) - activ->forwardSlice(outptr, outptr, _FX_WINO_SIZE*_FX_WINO_STEP, 0, g*Kg + k, g*Kg + k + 1); - for (int y = 0; y < dy1; y++) - memcpy(outptr0 + y*W0, outptr + y*_FX_WINO_SIZE,dx1*sizeof(outptr0[0])); - } - } - } - } - } - }}); - return 1; -} - -#else - -int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr& conv, - int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct) -{ - return 0; -} -#endif -}} // namespace cv::dnn diff --git a/modules/dnn/src/layers/layers_common.simd.hpp b/modules/dnn/src/layers/layers_common.simd.hpp index eb1735639e..4bae86911c 100644 --- a/modules/dnn/src/layers/layers_common.simd.hpp +++ b/modules/dnn/src/layers/layers_common.simd.hpp @@ -46,16 +46,6 @@ namespace cv { namespace dnn { CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN -void fastDepthwiseConv( const float* weights, - int kernel_h, int kernel_w, - int stride_h, int stride_w, - int dilation_h, int dilation_w, - int pad_t, int pad_l, - const float* bias, const float* relu, - const float* inptr, - int height, int width, - float* outptr, - int out_d, int outH, int outW ); void fastGEMM1T( const float* vec, const float* weights, size_t wstep, const float* bias, float* dst, int nvecs, int vecsize ); @@ -70,185 +60,6 @@ void fastGEMM( const float* aptr, size_t astep, const float* bptr, #define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) #endif -static inline void _mm256_load_deinterleave(const float* ptr, __m256& a, __m256& b) -{ - __m256 t0 = _mm256_loadu_ps(ptr); - __m256 t1 = _mm256_loadu_ps(ptr + 8); - - __m256 lo = _mm256_permute2f128_ps(t0, t1, 0+2*16); - __m256 hi = _mm256_permute2f128_ps(t0, t1, 1+3*16); - a = _mm256_shuffle_ps(lo, hi, 0x88); - b = _mm256_shuffle_ps(lo, hi, 0xdd); -} - -void fastDepthwiseConv( const float* wptr, - int kernel_h, int kernel_w, - int stride_h, int stride_w, - int dilation_h, int dilation_w, - int pad_t, int pad_l, - const float* biasptr, const float* relu, - const float* inptr_, - int height, int width, - float* outptr_, - int out_d, int outH, int outW ) -{ - const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], - w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], - w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; - int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); - float relu_coeff = relu ? 
relu[out_d] : 1.f, bias = biasptr[out_d]; - - for (int out_i = 0; out_i < outH; out_i++) - { - int in_i = out_i * stride_h - pad_t, out_j = 0; - const float* imgptr0 = inptr_ + in_i*width; - const float* imgptr1 = imgptr0 + dilation_h*width; - const float* imgptr2 = imgptr0 + (dilation_h*2)*width; - float out, w00 = w00_, w01 = w01_, w02 = w02_; - float w20 = w20_, w21 = w21_, w22 = w22_; - if (in_i < 0) - { - w00 = w01 = w02 = 0.f; - imgptr0 = imgptr1; - } - else if (in_i + dilation_h*(kernel_h-1) >= height) - { - w20 = w21 = w22 = 0.f; - imgptr2 = imgptr1; - } - float* outptr = outptr_ + out_i*outW; - if (pad_l > 0) - { - out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + - imgptr1[0]*w11 + imgptr1[dilation_w]*w12 + - imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias; - if (relu) - out = out > 0.f ? out : out*relu_coeff; - outptr[0] = out; - out_j = 1; - } - - if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) - { - const int VECSZ = 8; - __m256 vw00 = _mm256_set1_ps(w00), vw01 = _mm256_set1_ps(w01), vw02 = _mm256_set1_ps(w02), - vw10 = _mm256_set1_ps(w10), vw11 = _mm256_set1_ps(w11), vw12 = _mm256_set1_ps(w12), - vw20 = _mm256_set1_ps(w20), vw21 = _mm256_set1_ps(w21), vw22 = _mm256_set1_ps(w22); - __m256 z = _mm256_setzero_ps(), vbias = _mm256_set1_ps(bias), vrc = _mm256_set1_ps(relu_coeff); - - if( stride_w == 1 ) - for( ; out_j < outW1; out_j += VECSZ ) - { - if (out_j + VECSZ > outW1 && out_j > pad_l) - out_j = outW1 - VECSZ; - int in_j = out_j * stride_w - pad_l; - __m256 v00 = _mm256_loadu_ps(imgptr0 + in_j), - v01 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w), - v02 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w*2), - v10 = _mm256_loadu_ps(imgptr1 + in_j), - v11 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w), - v12 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w*2), - v20 = _mm256_loadu_ps(imgptr2 + in_j), - v21 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w), - v22 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w*2); - - __m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias); - __m256 vout1 = _mm256_mul_ps(v01, vw01); - __m256 vout2 = _mm256_mul_ps(v02, vw02); - - vout0 = _mm256_fmadd_ps(v10, vw10, vout0); - vout1 = _mm256_fmadd_ps(v11, vw11, vout1); - vout2 = _mm256_fmadd_ps(v12, vw12, vout2); - - vout0 = _mm256_fmadd_ps(v20, vw20, vout0); - vout1 = _mm256_fmadd_ps(v21, vw21, vout1); - vout2 = _mm256_fmadd_ps(v22, vw22, vout2); - - vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2); - if (relu) - { - __m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ); - vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m); - } - _mm256_storeu_ps(outptr + out_j, vout0); - } - else - for( ; out_j < outW1; out_j += VECSZ ) - { - if (out_j + VECSZ > outW1 && out_j > pad_l) - out_j = outW1 - VECSZ; - int in_j = out_j * stride_w - pad_l; - __m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; - _mm256_load_deinterleave(imgptr0 + in_j, v00, v01); - _mm256_load_deinterleave(imgptr0 + in_j + 2, v02, unused); - _mm256_load_deinterleave(imgptr1 + in_j, v10, v11); - _mm256_load_deinterleave(imgptr1 + in_j + 2, v12, unused); - _mm256_load_deinterleave(imgptr2 + in_j, v20, v21); - _mm256_load_deinterleave(imgptr2 + in_j + 2, v22, unused); - - __m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias); - __m256 vout1 = _mm256_mul_ps(v01, vw01); - __m256 vout2 = _mm256_mul_ps(v02, vw02); - - vout0 = _mm256_fmadd_ps(v10, vw10, vout0); - vout1 = _mm256_fmadd_ps(v11, vw11, vout1); - vout2 = _mm256_fmadd_ps(v12, vw12, vout2); - - vout0 = _mm256_fmadd_ps(v20, vw20, vout0); - vout1 = _mm256_fmadd_ps(v21, vw21, vout1); 
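// Note on the accumulation pattern in this kernel: the nine 3x3 taps are
// spread over three independent accumulators (vout0/vout1/vout2, one per
// kernel column) to shorten the FMA dependency chains, and are only summed
// into a single vector right before the optional leaky-ReLU and the store.
// In this stride-2, dilation-1 branch the deinterleaving loads split even and
// odd input columns so that one iteration still produces 8 contiguous outputs.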
- vout2 = _mm256_fmadd_ps(v22, vw22, vout2); - - vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2); - if (relu) - { - __m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ); - vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m); - } - _mm256_storeu_ps(outptr + out_j, vout0); - } - } - - for (; out_j < outW1; out_j++) - { - int in_j = out_j * stride_w - pad_l; - out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + - imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 + - imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias; - if (relu) - out = out > 0.f ? out : out*relu_coeff; - outptr[out_j] = out; - } - - for (; out_j < outW; out_j++ ) - { - int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; - float s0 = 1.f, s1 = 1.f, s2 = 1.f; - if (in_j0 >= width) - { - in_j0 = 0; - s0 = 0.f; - } - if (in_j1 >= width) - { - in_j1 = 0; - s1 = 0.f; - } - if (in_j2 >= width) - { - in_j2 = 0; - s2 = 0.f; - } - out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + - imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 + - imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias; - if (relu) - out = out > 0.f ? out : out*relu_coeff; - outptr[out_j] = out; - } - } - _mm256_zeroupper(); -} - // Used to generate the mask used when calculating tails static const uint32_t tailMaskArray[15] = { 0, 0, 0, 0, 0, 0, 0, 0, @@ -654,382 +465,10 @@ void fastGEMM1T( const float* vec, const float* weights, } } -/* -Example for load_deinterleave: - input: ptr[16] = {1,2,3, ... ,14,15,16} - output: a = {1, 3, 5, 7, 9, 11, 13, 15} - output: b = {2, 4, 6, 8,10, 12, 14, 16} -*/ -static inline void vfloat32m2_load_deinterleave(const float* ptr, vfloat32m2_t& a, vfloat32m2_t& b, int vl) -{ - vuint64m4_t mask = vmv_v_x_u64m4(1,vl*2); - vuint32m4_t mask_re = vreinterpret_v_u64m4_u32m4(mask); - vbool8_t mask0 = vmseq_vx_u32m4_b8 (mask_re, 1, vl*2); - vbool8_t mask1 = vmseq_vx_u32m4_b8 (mask_re, 0, vl*2); - vfloat32m4_t tempa = vundefined_f32m4(), tempb = vundefined_f32m4(); - vfloat32m4_t vw = vle32_v_f32m4(ptr, vl*2); - tempa = vcompress_vm_f32m4(mask0, tempa, vw, vl*2); - tempb = vcompress_vm_f32m4(mask1, tempb, vw, vl*2); - /* The following instructions have not to be supported by the GNU toolchain. - So we temporarily use store and load instead. - // a = vlmul_trunc_v_f32m4_f32m2(tempa); - // b = vlmul_trunc_v_f32m4_f32m2(tempb); - */ - cv::AutoBuffer cvBuffer(sizeof(float)*vl*2); - float* buffer = (float*)cvBuffer.data(); - vse32_v_f32m4(buffer, tempa, vl); - a = vle32_v_f32m2(buffer, vl); - vse32_v_f32m4(buffer, tempb, vl); - b = vle32_v_f32m2(buffer, vl); -} - -void fastDepthwiseConv( const float* wptr, - int kernel_h, int kernel_w, - int stride_h, int stride_w, - int dilation_h, int dilation_w, - int pad_t, int pad_l, - const float* biasptr, const float* relu, - const float* inptr_, - int height, int width, - float* outptr_, - int out_d, int outH, int outW ) -{ - int vl; - const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], - w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], - w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; - int outW1 = std::min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); - float relu_coeff = relu ? 
relu[out_d] : 1.f, bias = biasptr[out_d]; - - for (int out_i = 0; out_i < outH; out_i++) - { - int in_i = out_i * stride_h - pad_t, out_j = 0; - const float* imgptr0 = inptr_ + in_i*width; - const float* imgptr1 = imgptr0 + dilation_h*width; - const float* imgptr2 = imgptr0 + (dilation_h*2)*width; - float out, w00 = w00_, w01 = w01_, w02 = w02_; - float w20 = w20_, w21 = w21_, w22 = w22_; - if (in_i < 0) - { - w00 = w01 = w02 = 0.f; - imgptr0 = imgptr1; - } - else if (in_i + dilation_h*(kernel_h-1) >= height) - { - w20 = w21 = w22 = 0.f; - imgptr2 = imgptr1; - } - float* outptr = outptr_ + out_i*outW; - if (pad_l > 0) - { - out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + - imgptr1[0]*w11 + imgptr1[dilation_w]*w12 + - imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias; - if (relu) - out = out > 0.f ? out : out*relu_coeff; - outptr[0] = out; - out_j = 1; - } - - if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) - { - int avl = outW1 - out_j; - if( stride_w == 1 ) - for( ; out_j < outW1; out_j += vl, avl -= vl) - { - vl = vsetvl_e32m2(avl); - int in_j = out_j * stride_w - pad_l; - vfloat32m2_t v00 = vle32_v_f32m2(imgptr0 + in_j, vl), - v01 = vle32_v_f32m2(imgptr0 + in_j + dilation_w, vl), - v02 = vle32_v_f32m2(imgptr0 + in_j + dilation_w*2, vl), - v10 = vle32_v_f32m2(imgptr1 + in_j, vl), - v11 = vle32_v_f32m2(imgptr1 + in_j + dilation_w, vl), - v12 = vle32_v_f32m2(imgptr1 + in_j + dilation_w*2, vl), - v20 = vle32_v_f32m2(imgptr2 + in_j, vl), - v21 = vle32_v_f32m2(imgptr2 + in_j + dilation_w, vl), - v22 = vle32_v_f32m2(imgptr2 + in_j + dilation_w*2, vl); - - vfloat32m2_t vout0 = vfmul_vf_f32m2(v00, w00, vl); - vfloat32m2_t vout1 = vfmul_vf_f32m2(v01, w01, vl); - vfloat32m2_t vout2 = vfmul_vf_f32m2(v02, w02, vl); - vout0 = vfadd_vf_f32m2(vout0, bias, vl); - - vout0 = vfmacc_vf_f32m2(vout0, w10, v10, vl); - vout1 = vfmacc_vf_f32m2(vout1, w11, v11, vl); - vout2 = vfmacc_vf_f32m2(vout2, w12, v12, vl); - - vout0 = vfmacc_vf_f32m2(vout0, w20, v20, vl); - vout1 = vfmacc_vf_f32m2(vout1, w21, v21, vl); - vout2 = vfmacc_vf_f32m2(vout2, w22, v22, vl); - - vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl); - if (relu) - { - vbool16_t m = vmfgt_vf_f32m2_b16(vout0, 0, vl); - vout0 = vmerge_vvm_f32m2(m, vfmul_vf_f32m2(vout0, relu_coeff, vl), vout0, vl); - } - vse32_v_f32m2(outptr + out_j, vout0, vl); - } - else //stride_w == 2 && dilation_w == 1 - for( ; out_j < outW1; out_j += vl, avl -= vl) - { - vl = vsetvl_e32m2(avl); - int in_j = out_j * stride_w - pad_l; - vfloat32m2_t v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; - vfloat32m2_load_deinterleave(imgptr0 + in_j, v00, v01, vl); - vfloat32m2_load_deinterleave(imgptr0 + in_j + 2, v02, unused, vl); - vfloat32m2_load_deinterleave(imgptr1 + in_j, v10, v11, vl); - vfloat32m2_load_deinterleave(imgptr1 + in_j + 2, v12, unused, vl); - vfloat32m2_load_deinterleave(imgptr2 + in_j, v20, v21, vl); - vfloat32m2_load_deinterleave(imgptr2 + in_j + 2, v22, unused, vl); - - vfloat32m2_t vout0 = vfmul_vf_f32m2(v00, w00, vl); - vfloat32m2_t vout1 = vfmul_vf_f32m2(v01, w01, vl); - vfloat32m2_t vout2 = vfmul_vf_f32m2(v02, w02, vl); - vout0 = vfadd_vf_f32m2(vout0, bias, vl); - - vout0 = vfmacc_vf_f32m2(vout0, w10, v10, vl); - vout1 = vfmacc_vf_f32m2(vout1, w11, v11, vl); - vout2 = vfmacc_vf_f32m2(vout2, w12, v12, vl); - - vout0 = vfmacc_vf_f32m2(vout0, w20, v20, vl); - vout1 = vfmacc_vf_f32m2(vout1, w21, v21, vl); - vout2 = vfmacc_vf_f32m2(vout2, w22, v22, vl); - - vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl); - if (relu) - { 
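// Branch-free per-channel leaky ReLU: the mask keeps vout0 where it is
// positive and substitutes vout0*relu_coeff where it is not, i.e.
// out = x > 0 ? x : slope*x with relu[out_d] as the slope (a slope of 0
// reproduces a plain ReLU). For example, with relu_coeff = 0.1f an
// accumulated value of -2.0f becomes -0.2f, while 3.0f passes through.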
- vbool16_t m = vmfgt_vf_f32m2_b16(vout0, 0, vl); - vout0 = vmerge_vvm_f32m2(m, vfmul_vf_f32m2(vout0, relu_coeff, vl), vout0, vl); - } - vse32_v_f32m2(outptr + out_j, vout0, vl); - } - } - - for (; out_j < outW1; out_j++) - { - int in_j = out_j * stride_w - pad_l; - out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + - imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 + - imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias; - if (relu) - out = out > 0.f ? out : out*relu_coeff; - outptr[out_j] = out; - } - - for (; out_j < outW; out_j++ ) - { - int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; - float s0 = 1.f, s1 = 1.f, s2 = 1.f; - if (in_j0 >= width) - { - in_j0 = 0; - s0 = 0.f; - } - if (in_j1 >= width) - { - in_j1 = 0; - s1 = 0.f; - } - if (in_j2 >= width) - { - in_j2 = 0; - s2 = 0.f; - } - out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + - imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 + - imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias; - if (relu) - out = out > 0.f ? out : out*relu_coeff; - outptr[out_j] = out; - } - } -} - #endif // CV_RVV #if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX -static inline void _v256_load_deinterleave(const float* ptr, __m256& a, __m256& b) -{ - __m256 t0 = (__m256)__lasx_xvld(ptr, 0); - __m256 t1 = (__m256)__lasx_xvld(ptr, 8*4); - - __m256 lo = (__m256)__lasx_xvpermi_q(t0, t1, 2+0*16); - __m256 hi = (__m256)__lasx_xvpermi_q(t0, t1, 3+1*16); - - a = (__m256)__lasx_xvpermi_w(hi, lo, 0x88); - b = (__m256)__lasx_xvpermi_w(hi, lo, 0xdd); -} - -void fastDepthwiseConv( const float* wptr, - int kernel_h, int kernel_w, - int stride_h, int stride_w, - int dilation_h, int dilation_w, - int pad_t, int pad_l, - const float* biasptr, const float* relu, - const float* inptr_, - int height, int width, - float* outptr_, - int out_d, int outH, int outW ) -{ - const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], - w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], - w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; - int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); - float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d]; - - for (int out_i = 0; out_i < outH; out_i++) - { - int in_i = out_i * stride_h - pad_t, out_j = 0; - const float* imgptr0 = inptr_ + in_i*width; - const float* imgptr1 = imgptr0 + dilation_h*width; - const float* imgptr2 = imgptr0 + (dilation_h*2)*width; - float out, w00 = w00_, w01 = w01_, w02 = w02_; - float w20 = w20_, w21 = w21_, w22 = w22_; - if (in_i < 0) - { - w00 = w01 = w02 = 0.f; - imgptr0 = imgptr1; - } - else if (in_i + dilation_h*(kernel_h-1) >= height) - { - w20 = w21 = w22 = 0.f; - imgptr2 = imgptr1; - } - float* outptr = outptr_ + out_i*outW; - if (pad_l > 0) - { - out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + - imgptr1[0]*w11 + imgptr1[dilation_w]*w12 + - imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias; - if (relu) - out = out > 0.f ? 
out : out*relu_coeff; - outptr[0] = out; - out_j = 1; - } - - if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) - { - const int VECSZ = 8; - __m256 vw00 = _v256_setall_ps(w00), vw01 = _v256_setall_ps(w01), vw02 = _v256_setall_ps(w02), - vw10 = _v256_setall_ps(w10), vw11 = _v256_setall_ps(w11), vw12 = _v256_setall_ps(w12), - vw20 = _v256_setall_ps(w20), vw21 = _v256_setall_ps(w21), vw22 = _v256_setall_ps(w22); - __m256 z = (__m256)__lasx_xvxor_v((__m256i)vw00, (__m256i)vw00), - vbias = _v256_setall_ps(bias), vrc = _v256_setall_ps(relu_coeff); - - if( stride_w == 1 ) - for( ; out_j < outW1; out_j += VECSZ ) - { - if (out_j + VECSZ > outW1 && out_j > pad_l) - out_j = outW1 - VECSZ; - int in_j = out_j * stride_w - pad_l; - __m256 v00 = (__m256)__lasx_xvld(imgptr0 + in_j, 0), - v01 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w, 0), - v02 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w*2, 0), - v10 = (__m256)__lasx_xvld(imgptr1 + in_j, 0), - v11 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w, 0), - v12 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w*2, 0), - v20 = (__m256)__lasx_xvld(imgptr2 + in_j, 0), - v21 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w, 0), - v22 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w*2, 0); - - __m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias); - __m256 vout1 = __lasx_xvfmul_s(v01, vw01); - __m256 vout2 = __lasx_xvfmul_s(v02, vw02); - - vout0 = __lasx_xvfmadd_s(v10, vw10, vout0); - vout1 = __lasx_xvfmadd_s(v11, vw11, vout1); - vout2 = __lasx_xvfmadd_s(v12, vw12, vout2); - - vout0 = __lasx_xvfmadd_s(v20, vw20, vout0); - vout1 = __lasx_xvfmadd_s(v21, vw21, vout1); - vout2 = __lasx_xvfmadd_s(v22, vw22, vout2); - - vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2); - if (relu) - { - __m256i m = __lasx_xvfcmp_clt_s(z, vout0); - vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m); - } - __lasx_xvst(vout0, outptr + out_j, 0); - } - else - for( ; out_j < outW1; out_j += VECSZ ) - { - if (out_j + VECSZ > outW1 && out_j > pad_l) - out_j = outW1 - VECSZ; - int in_j = out_j * stride_w - pad_l; - __m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; - _v256_load_deinterleave(imgptr0 + in_j, v00, v01); - _v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused); - _v256_load_deinterleave(imgptr1 + in_j, v10, v11); - _v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused); - _v256_load_deinterleave(imgptr2 + in_j, v20, v21); - _v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused); - - __m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias); - __m256 vout1 = __lasx_xvfmul_s(v01, vw01); - __m256 vout2 = __lasx_xvfmul_s(v02, vw02); - - vout0 = __lasx_xvfmadd_s(v10, vw10, vout0); - vout1 = __lasx_xvfmadd_s(v11, vw11, vout1); - vout2 = __lasx_xvfmadd_s(v12, vw12, vout2); - - vout0 = __lasx_xvfmadd_s(v20, vw20, vout0); - vout1 = __lasx_xvfmadd_s(v21, vw21, vout1); - vout2 = __lasx_xvfmadd_s(v22, vw22, vout2); - - vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2); - if (relu) - { - __m256i m = __lasx_xvfcmp_clt_s(z, vout0); - vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m); - } - __lasx_xvst(vout0, outptr + out_j, 0); - } - } - - for (; out_j < outW1; out_j++) - { - int in_j = out_j * stride_w - pad_l; - out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + - imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 + - imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + 
dilation_w*2]*w22 + bias; - if (relu) - out = out > 0.f ? out : out*relu_coeff; - outptr[out_j] = out; - } - - for (; out_j < outW; out_j++ ) - { - int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; - float s0 = 1.f, s1 = 1.f, s2 = 1.f; - if (in_j0 >= width) - { - in_j0 = 0; - s0 = 0.f; - } - if (in_j1 >= width) - { - in_j1 = 0; - s1 = 0.f; - } - if (in_j2 >= width) - { - in_j2 = 0; - s2 = 0.f; - } - out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + - imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 + - imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias; - if (relu) - out = out > 0.f ? out : out*relu_coeff; - outptr[out_j] = out; - } - } -} - // dst = vec * weights^t + bias void fastGEMM1T( const float* vec, const float* weights, size_t wstep, const float* bias,