diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt index d285e544c0..88f4347bb6 100644 --- a/modules/dnn/CMakeLists.txt +++ b/modules/dnn/CMakeLists.txt @@ -10,6 +10,9 @@ set(the_description "Deep neural network module. It allows to load models from d ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV LASX) ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX LASX) +ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_block" AVX AVX2) +ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_depthwise" AVX AVX2 RVV LASX) +ocv_add_dispatched_file_force_all("layers/cpu_kernels/conv_winograd_f63" AVX AVX2) ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js) diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 5567a58a2a..3e62887bd7 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -72,7 +72,7 @@ using namespace cv::dnn::ocl4dnn; using namespace cv::dnn::cuda4dnn; #endif -#include "fast_convolution/fast_convolution.hpp" +#include "cpu_kernels/convolution.hpp" namespace cv { diff --git a/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp b/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp new file mode 100644 index 0000000000..71b17dcc9b --- /dev/null +++ b/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp @@ -0,0 +1,259 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "opencv2/core/hal/intrin.hpp" + +namespace cv { +namespace dnn { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN + +void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR); + +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX + +#if !CV_FMA3 // AVX workaround +#undef _mm256_fmadd_ps +#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) +#endif + +void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) +{ + CV_Assert(convMR == 4 && convNR == 24); + __m256 c00 = _mm256_set1_ps(0.f), c01 = c00, c02 = c00; + __m256 c10 = c00, c11 = c00, c12 = c00; + __m256 c20 = c00, c21 = c00, c22 = c00; + __m256 c30 = c00, c31 = c00, c32 = c00; + + __m256 a0 = _mm256_setzero_ps(), a1 = _mm256_setzero_ps(); + __m256 b0 = _mm256_setzero_ps(), b1 = _mm256_setzero_ps(), b2 = _mm256_setzero_ps(); + + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]); + b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16); + + c00 = _mm256_fmadd_ps(b0, a0, c00); + c01 = _mm256_fmadd_ps(b1, a0, c01); + c02 = _mm256_fmadd_ps(b2, a0, c02); + + c10 = _mm256_fmadd_ps(b0, a1, c10); + c11 = _mm256_fmadd_ps(b1, a1, c11); + c12 = _mm256_fmadd_ps(b2, a1, c12); + + a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]); + + c20 = _mm256_fmadd_ps(b0, a0, c20); + c21 = _mm256_fmadd_ps(b1, a0, c21); + c22 = _mm256_fmadd_ps(b2, a0, c22); + + c30 = _mm256_fmadd_ps(b0, a1, c30); + c31 = _mm256_fmadd_ps(b1, a1, c31); + c32 = _mm256_fmadd_ps(b2, a1, c32); + } + + if (!init_c) + { + c00 = _mm256_add_ps(c00, _mm256_load_ps(c)); + c01 = _mm256_add_ps(c01, _mm256_load_ps(c + 8)); + c02 = _mm256_add_ps(c02, _mm256_load_ps(c + 16)); + + c10 = _mm256_add_ps(c10, 
_mm256_load_ps(c + ldc)); + c11 = _mm256_add_ps(c11, _mm256_load_ps(c + ldc + 8)); + c12 = _mm256_add_ps(c12, _mm256_load_ps(c + ldc + 16)); + + c20 = _mm256_add_ps(c20, _mm256_load_ps(c + ldc*2)); + c21 = _mm256_add_ps(c21, _mm256_load_ps(c + ldc*2 + 8)); + c22 = _mm256_add_ps(c22, _mm256_load_ps(c + ldc*2 + 16)); + + c30 = _mm256_add_ps(c30, _mm256_load_ps(c + ldc*3)); + c31 = _mm256_add_ps(c31, _mm256_load_ps(c + ldc*3 + 8)); + c32 = _mm256_add_ps(c32, _mm256_load_ps(c + ldc*3 + 16)); + } + + _mm256_storeu_ps(c, c00), _mm256_storeu_ps(c+8, c01), _mm256_storeu_ps(c+16, c02); + _mm256_storeu_ps(c + ldc, c10), _mm256_storeu_ps(c + ldc + 8, c11), _mm256_storeu_ps(c + ldc + 16, c12); + _mm256_storeu_ps(c + ldc*2, c20), _mm256_storeu_ps(c + ldc*2 + 8, c21), _mm256_storeu_ps(c + ldc*2 + 16, c22); + _mm256_storeu_ps(c + ldc*3, c30), _mm256_storeu_ps(c + ldc*3 + 8, c31), _mm256_storeu_ps(c + ldc*3 + 16, c32); + _mm256_zeroupper(); +} + +#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +CV_CPU_OPTIMIZATION_NAMESPACE_END + +// NEON code work around. +namespace opt_NEON +{ +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON + +void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) +{ +#if CV_NEON_AARCH64 + if (convMR == 4 && convNR == 28) // AARCH64 + { + float32x4_t c00 = vdupq_n_f32(0.f), c01 = c00, c02 = c00, c03 = c00, c04 = c00, c05 = c00, c06 = c00; + float32x4_t c10 = vdupq_n_f32(0.f), c11 = c10, c12 = c10, c13 = c10, c14 = c10, c15 = c10, c16 = c10; + float32x4_t c20 = vdupq_n_f32(0.f), c21 = c20, c22 = c20, c23 = c20, c24 = c20, c25 = c20, c26 = c20; + float32x4_t c30 = vdupq_n_f32(0.f), c31 = c30, c32 = c30, c33 = c30, c34 = c30, c35 = c30, c36 = c30; + + for( int p = 0; p < np; p++, a += convMR, b += convNR ) + { + float32x4_t a0 = vld1q_f32(a), b0, b1, b2; + b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8); + + c00 = vfmaq_laneq_f32(c00, b0, a0, 0); + c01 = vfmaq_laneq_f32(c01, b1, a0, 0); + c02 = vfmaq_laneq_f32(c02, b2, a0, 0); + c10 = vfmaq_laneq_f32(c10, b0, a0, 1); + c11 = vfmaq_laneq_f32(c11, b1, a0, 1); + c12 = vfmaq_laneq_f32(c12, b2, a0, 1); + c20 = vfmaq_laneq_f32(c20, b0, a0, 2); + c21 = vfmaq_laneq_f32(c21, b1, a0, 2); + c22 = vfmaq_laneq_f32(c22, b2, a0, 2); + c30 = vfmaq_laneq_f32(c30, b0, a0, 3); + c31 = vfmaq_laneq_f32(c31, b1, a0, 3); + c32 = vfmaq_laneq_f32(c32, b2, a0, 3); + + b0 = vld1q_f32(b + 12); b1 = vld1q_f32(b + 16); b2 = vld1q_f32(b + 20); + + c03 = vfmaq_laneq_f32(c03, b0, a0, 0); + c04 = vfmaq_laneq_f32(c04, b1, a0, 0); + c05 = vfmaq_laneq_f32(c05, b2, a0, 0); + c13 = vfmaq_laneq_f32(c13, b0, a0, 1); + c14 = vfmaq_laneq_f32(c14, b1, a0, 1); + c15 = vfmaq_laneq_f32(c15, b2, a0, 1); + c23 = vfmaq_laneq_f32(c23, b0, a0, 2); + c24 = vfmaq_laneq_f32(c24, b1, a0, 2); + c25 = vfmaq_laneq_f32(c25, b2, a0, 2); + c33 = vfmaq_laneq_f32(c33, b0, a0, 3); + c34 = vfmaq_laneq_f32(c34, b1, a0, 3); + c35 = vfmaq_laneq_f32(c35, b2, a0, 3); + + b0 = vld1q_f32(b + 24); + c06 = vfmaq_laneq_f32(c06, b0, a0, 0); + c16 = vfmaq_laneq_f32(c16, b0, a0, 1); + c26 = vfmaq_laneq_f32(c26, b0, a0, 2); + c36 = vfmaq_laneq_f32(c36, b0, a0, 3); + } + + if (!init_c) + { + c00 = vaddq_f32(c00, vld1q_f32(c)); + c01 = vaddq_f32(c01, vld1q_f32(c + 4)); + c02 = vaddq_f32(c02, vld1q_f32(c + 8)); + c03 = vaddq_f32(c03, vld1q_f32(c + 12)); + c04 = vaddq_f32(c04, vld1q_f32(c + 16)); + c05 = vaddq_f32(c05, vld1q_f32(c + 20)); + c06 = vaddq_f32(c06, vld1q_f32(c + 24)); + + c10 = vaddq_f32(c10, vld1q_f32(c + 
ldc)); + c11 = vaddq_f32(c11, vld1q_f32(c + ldc + 4)); + c12 = vaddq_f32(c12, vld1q_f32(c + ldc + 8)); + c13 = vaddq_f32(c13, vld1q_f32(c + ldc + 12)); + c14 = vaddq_f32(c14, vld1q_f32(c + ldc + 16)); + c15 = vaddq_f32(c15, vld1q_f32(c + ldc + 20)); + c16 = vaddq_f32(c16, vld1q_f32(c + ldc + 24)); + + c20 = vaddq_f32(c20, vld1q_f32(c + ldc*2)); + c21 = vaddq_f32(c21, vld1q_f32(c + ldc*2 + 4)); + c22 = vaddq_f32(c22, vld1q_f32(c + ldc*2 + 8)); + c23 = vaddq_f32(c23, vld1q_f32(c + ldc*2 + 12)); + c24 = vaddq_f32(c24, vld1q_f32(c + ldc*2 + 16)); + c25 = vaddq_f32(c25, vld1q_f32(c + ldc*2 + 20)); + c26 = vaddq_f32(c26, vld1q_f32(c + ldc*2 + 24)); + + c30 = vaddq_f32(c30, vld1q_f32(c + ldc*3)); + c31 = vaddq_f32(c31, vld1q_f32(c + ldc*3 + 4)); + c32 = vaddq_f32(c32, vld1q_f32(c + ldc*3 + 8)); + c33 = vaddq_f32(c33, vld1q_f32(c + ldc*3 + 12)); + c34 = vaddq_f32(c34, vld1q_f32(c + ldc*3 + 16)); + c35 = vaddq_f32(c35, vld1q_f32(c + ldc*3 + 20)); + c36 = vaddq_f32(c36, vld1q_f32(c + ldc*3 + 24)); + } + + vst1q_f32(c, c00); vst1q_f32(c+4, c01); + vst1q_f32(c+8, c02); vst1q_f32(c+12, c03); + vst1q_f32(c+16, c04); vst1q_f32(c+20, c05); + vst1q_f32(c+24, c06); + + vst1q_f32(c+ldc, c10); vst1q_f32(c+ldc+4, c11); + vst1q_f32(c+ldc+8, c12); vst1q_f32(c+ldc+12, c13); + vst1q_f32(c+ldc+16, c14); vst1q_f32(c+ldc+20, c15); + vst1q_f32(c+ldc+24, c16); + + vst1q_f32(c+ldc*2, c20); vst1q_f32(c+ldc*2+4, c21); + vst1q_f32(c+ldc*2+8, c22); vst1q_f32(c+ldc*2+12, c23); + vst1q_f32(c+ldc*2+16, c24); vst1q_f32(c+ldc*2+20, c25); + vst1q_f32(c+ldc*2+24, c26); + + vst1q_f32(c+ldc*3, c30); vst1q_f32(c+ldc*3+4, c31); + vst1q_f32(c+ldc*3+8, c32); vst1q_f32(c+ldc*3+12, c33); + vst1q_f32(c+ldc*3+16, c34); vst1q_f32(c+ldc*3+20, c35); + vst1q_f32(c+ldc*3+24, c36); + } + else +#endif + if (convMR == 4 && convNR == 12) // ARMv7 + { + float32x4_t c0 = vdupq_n_f32(0.f), c1 = c0, c2 = c0; + float32x4_t c3 = vdupq_n_f32(0.f), c4 = c3, c5 = c3; + float32x4_t c6 = vdupq_n_f32(0.f), c7 = c6, c8 = c6; + float32x4_t c9 = vdupq_n_f32(0.f), c10 = c9, c11 = c9; + + float32x2_t a0 = vdup_n_f32(0.0f), a1 = a0; + float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f); + + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + a0 = vld1_f32(a), a1 = vld1_f32(a+2); + b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8); + + c0 = vmlaq_lane_f32(c0, b0, a0, 0); + c1 = vmlaq_lane_f32(c1, b1, a0, 0); + c2 = vmlaq_lane_f32(c2, b2, a0, 0); + + c3 = vmlaq_lane_f32(c3, b0, a0, 1); + c4 = vmlaq_lane_f32(c4, b1, a0, 1); + c5 = vmlaq_lane_f32(c5, b2, a0, 1); + + c6 = vmlaq_lane_f32(c6, b0, a1, 0); + c7 = vmlaq_lane_f32(c7, b1, a1, 0); + c8 = vmlaq_lane_f32(c8, b2, a1, 0); + + c9 = vmlaq_lane_f32(c9 , b0, a1, 1); + c10 = vmlaq_lane_f32(c10, b1, a1, 1); + c11 = vmlaq_lane_f32(c11, b2, a1, 1); + } + + if (!init_c) + { + c0 = vaddq_f32(c0, vld1q_f32(c)); + c1 = vaddq_f32(c1, vld1q_f32(c + 4)); + c2 = vaddq_f32(c2, vld1q_f32(c + 8)); + + c3 = vaddq_f32(c3, vld1q_f32(c + ldc)); + c4 = vaddq_f32(c4, vld1q_f32(c + ldc + 4)); + c5 = vaddq_f32(c5, vld1q_f32(c + ldc + 8)); + + c6 = vaddq_f32(c6, vld1q_f32(c + ldc * 2)); + c7 = vaddq_f32(c7, vld1q_f32(c + ldc * 2 + 4)); + c8 = vaddq_f32(c8, vld1q_f32(c + ldc * 2 + 8)); + + c9 = vaddq_f32(c9 , vld1q_f32(c + ldc * 3)); + c10 = vaddq_f32(c10, vld1q_f32(c + ldc * 3 + 4)); + c11 = vaddq_f32(c11, vld1q_f32(c + ldc * 3 + 8)); + } + + vst1q_f32(c, c0), vst1q_f32(c+4, c1), vst1q_f32(c+8, c2); + vst1q_f32(c + ldc, c3), vst1q_f32(c + ldc + 4, c4), vst1q_f32(c + ldc + 8, c5); + 
vst1q_f32(c + ldc*2, c6), vst1q_f32(c + ldc*2 + 4, c7), vst1q_f32(c + ldc*2 + 8, c8); + vst1q_f32(c + ldc*3, c9), vst1q_f32(c + ldc*3 + 4, c10), vst1q_f32(c + ldc*3 + 8, c11); + } + else + CV_Error(Error::StsNotImplemented, "Unsupported convMR and/or convNR in opt_NEON::convBlock"); +} + +#endif +} +}} // namespace cv::dnn diff --git a/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp b/modules/dnn/src/layers/cpu_kernels/conv_depthwise.cpp similarity index 91% rename from modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp rename to modules/dnn/src/layers/cpu_kernels/conv_depthwise.cpp index b690156941..3e969336ad 100644 --- a/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp +++ b/modules/dnn/src/layers/cpu_kernels/conv_depthwise.cpp @@ -2,20 +2,147 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. -// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv.fx). -// Here is the original license: -/* - This file is a part of ficus language project. - See ficus/LICENSE for the licensing terms -*/ - #include "../../precomp.hpp" -#include "fast_convolution.hpp" -#include "../layers_common.hpp" +#include "convolution.hpp" + +#include "conv_depthwise.simd.hpp" +#include "layers/cpu_kernels/conv_depthwise.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content namespace cv { namespace dnn { -static void depthWiseBlockConv2D(const float* wptr, +void depthWiseBlockConv2D(const float* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const float* biasptr, const float* relu, + const float* inptr_, + int height, int width, + float* outptr_, + int out_d, int outH, int outW, bool fusedAdd); + +void depthWiseBlockConv1D(const float* wptr, + int kernel_w, int stride_w, int dilation_w, int pad_l, + const float* biasptr, const float* relu, + const float* inptr_, int width, + float* outptr_, + int out_d, int outW, bool fusedAdd); + +void runDepthwise(InputArray _input, OutputArray _output, const Ptr& conv, ActivationLayer* activ_, + const std::vector& reluslope, bool fusedAdd) +{ + Mat input = _input.getMat(); + Mat output = _output.getMat(); + MatShape inputShape = shape(input); + MatShape outputShape = shape(output); + + CV_Assert(inputShape.size() == 3 || inputShape.size() == 4); + CV_Assert(inputShape.size() == outputShape.size()); + + int conv_dim = conv->conv_dim; + CV_Assert((conv_dim == CONV_2D || conv_dim == CONV_1D) && + "DNN: Currently we do not support depth-wise for Convolution 3D!"); + + ActivationLayer* activ = reluslope.empty() ? activ_ : nullptr; + int N = inputShape[0], C = inputShape[1]; + + int Hi = conv_dim == CONV_1D ? 1 : inputShape[inputShape.size() - 2]; + int Wi = inputShape[inputShape.size() - 1]; + + int K = conv->K, Hk = conv->Hk, Wk = conv->Wk; + + int H0 = conv_dim == CONV_1D ? 
1 : outputShape[outputShape.size() - 2]; + int W0 = outputShape[outputShape.size() - 1]; + int ngroups = conv->ngroups; + + const size_t inp_planesize = (size_t) Hi * Wi; + const size_t out_planesize = (size_t) H0 * W0; + + CV_Assert(ngroups > 1 && ngroups == K && ngroups == C); + + int stride_h = conv->stride_h, stride_w = conv->stride_w; + int dilation_h = conv->dilation_h, dilation_w = conv->dilation_w; + + int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom; + int pad_left = conv->pad_left, pad_right = conv->pad_right; + + int ksize = Hk * Wk; + + const int VEC_NLANES = 32; + int padded_ksize = ((ksize + VEC_NLANES-1) / VEC_NLANES) * VEC_NLANES; + + const float *inp = input.ptr(); + float *out = output.ptr(); + +#if CV_TRY_AVX2 || CV_TRY_AVX || CV_TRY_RVV + // TODO: remove the following limitation, need change code in conv_depthwise.simd.hpp. + bool canRunOpt = Wi >= 16 + dilation_w*(Wk - 1) && !fusedAdd; +#endif + std::vector ofstab_(3 * ksize, 0); + int *ofstab = ofstab_.data(); + int *yxtab = ofstab + ksize; + + for (int k = 0; k < ksize; k++) + { + int y = k < ksize ? k / Wk : 0; + int x = k < ksize ? k % Wk : 0; + int dy = y * dilation_h, dx = x * dilation_w; + yxtab[k * 2] = dy; + yxtab[k * 2 + 1] = dx; + ofstab[k] = dy * Wi + dx; + } + + const float *weights0 = conv->weightsBufPtr, *bias = conv->biasBuf.data(); + const float* relu = reluslope.data(); + CV_Assert(ksize > 1 || (pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0)); + + parallel_for_(Range(0, N * C), [&](const Range &r0) { + for (int nc = r0.start; nc < r0.end; nc++) + { + int c = nc % C; + const float *inptr0 = inp + inp_planesize * nc; + float *outptr0 = out + out_planesize * nc; + + const float *weights = weights0 + c * padded_ksize; + + if (conv_dim == CONV_2D) + { +#if CV_TRY_AVX2 + if(canRunOpt && conv->useAVX2) + opt_AVX2::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, + pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0); + else +#endif +#if CV_TRY_AVX + if(canRunOpt && conv->useAVX) + opt_AVX::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, + pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0); + else +#endif +#if CV_TRY_RVV + if(canRunOpt && conv->useRVV) + opt_RVV::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, + pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0); + else +#endif + depthWiseBlockConv2D(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, + pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0, fusedAdd); + } + else // conv_dim == CONV_1D, spatial branch for depth-wise Conv1D. 
+ { + depthWiseBlockConv1D(weights, Wk, stride_w, dilation_w, pad_left, bias, relu, inptr0, Wi, outptr0, c, W0, fusedAdd); + } + + if (activ) + activ->forwardSlice(outptr0, outptr0, (int) out_planesize, out_planesize, c, c+1); + }}); +} + +/****************************************************************************************\ + SIMD and no-SIMD code for depthWiseBlockConv +\****************************************************************************************/ + +void depthWiseBlockConv2D(const float* wptr, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w, @@ -199,7 +326,7 @@ static void depthWiseBlockConv2D(const float* wptr, } } -static void depthWiseBlockConv1D(const float* wptr, +void depthWiseBlockConv1D(const float* wptr, int kernel_w, int stride_w, int dilation_w, int pad_l, const float* biasptr, const float* relu, const float* inptr_, int width, @@ -332,114 +459,5 @@ static void depthWiseBlockConv1D(const float* wptr, } } -void runDepthwise(InputArray _input, OutputArray _output, const Ptr& conv, ActivationLayer* activ_, - const std::vector& reluslope, bool fusedAdd) -{ - Mat input = _input.getMat(); - Mat output = _output.getMat(); - MatShape inputShape = shape(input); - MatShape outputShape = shape(output); - - CV_Assert(inputShape.size() == 3 || inputShape.size() == 4); - CV_Assert(inputShape.size() == outputShape.size()); - - int conv_dim = conv->conv_dim; - CV_Assert((conv_dim == CONV_2D || conv_dim == CONV_1D) && - "DNN: Currently we do not support depth-wise for Convolution 3D!"); - - ActivationLayer* activ = reluslope.empty() ? activ_ : nullptr; - int N = inputShape[0], C = inputShape[1]; - - int Hi = conv_dim == CONV_1D ? 1 : inputShape[inputShape.size() - 2]; - int Wi = inputShape[inputShape.size() - 1]; - - int K = conv->K, Hk = conv->Hk, Wk = conv->Wk; - - int H0 = conv_dim == CONV_1D ? 1 : outputShape[outputShape.size() - 2]; - int W0 = outputShape[outputShape.size() - 1]; - int ngroups = conv->ngroups; - - const size_t inp_planesize = (size_t) Hi * Wi; - const size_t out_planesize = (size_t) H0 * W0; - - CV_Assert(ngroups > 1 && ngroups == K && ngroups == C); - - int stride_h = conv->stride_h, stride_w = conv->stride_w; - int dilation_h = conv->dilation_h, dilation_w = conv->dilation_w; - - int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom; - int pad_left = conv->pad_left, pad_right = conv->pad_right; - - int ksize = Hk * Wk; - - const int VEC_NLANES = 32; - int padded_ksize = ((ksize + VEC_NLANES-1) / VEC_NLANES) * VEC_NLANES; - - const float *inp = input.ptr(); - float *out = output.ptr(); - -#if CV_TRY_AVX2 || CV_TRY_AVX || CV_TRY_RVV - // TODO: remove the following limitation, need change code in layers_common.simd.hpp. - bool canRunOpt = Wi >= 16 + dilation_w*(Wk - 1) && !fusedAdd; -#endif - std::vector ofstab_(3 * ksize, 0); - int *ofstab = ofstab_.data(); - int *yxtab = ofstab + ksize; - - for (int k = 0; k < ksize; k++) - { - int y = k < ksize ? k / Wk : 0; - int x = k < ksize ? 
k % Wk : 0; - int dy = y * dilation_h, dx = x * dilation_w; - yxtab[k * 2] = dy; - yxtab[k * 2 + 1] = dx; - ofstab[k] = dy * Wi + dx; - } - - const float *weights0 = conv->weightsBufPtr, *bias = conv->biasBuf.data(); - const float* relu = reluslope.data(); - CV_Assert(ksize > 1 || (pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0)); - - parallel_for_(Range(0, N * C), [&](const Range &r0) { - for (int nc = r0.start; nc < r0.end; nc++) - { - int c = nc % C; - const float *inptr0 = inp + inp_planesize * nc; - float *outptr0 = out + out_planesize * nc; - - const float *weights = weights0 + c * padded_ksize; - - if (conv_dim == CONV_2D) - { -#if CV_TRY_AVX2 - if(canRunOpt && conv->useAVX2) - opt_AVX2::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, - pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0); - else -#endif -#if CV_TRY_AVX - if(canRunOpt && conv->useAVX) - opt_AVX::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, - pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0); - else -#endif -#if CV_TRY_RVV - if(canRunOpt && conv->useRVV) - opt_RVV::fastDepthwiseConv(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, - pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0); - else -#endif - depthWiseBlockConv2D(weights, Hk, Wk, stride_h, stride_w, dilation_h, dilation_w, - pad_top, pad_left, bias, relu, inptr0, Hi, Wi, outptr0, c, H0, W0, fusedAdd); - } - else // conv_dim == CONV_1D, spatial branch for depth-wise Conv1D. - { - depthWiseBlockConv1D(weights, Wk, stride_w, dilation_w, pad_left, bias, relu, inptr0, Wi, outptr0, c, W0, fusedAdd); - } - - if (activ) - activ->forwardSlice(outptr0, outptr0, (int) out_planesize, out_planesize, c, c+1); - }}); -} }} // namespace cv::dnn diff --git a/modules/dnn/src/layers/cpu_kernels/conv_depthwise.simd.hpp b/modules/dnn/src/layers/cpu_kernels/conv_depthwise.simd.hpp new file mode 100644 index 0000000000..1d561e9864 --- /dev/null +++ b/modules/dnn/src/layers/cpu_kernels/conv_depthwise.simd.hpp @@ -0,0 +1,591 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
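// Editor's note (illustrative sketch, not part of the patch): this *.simd.hpp is compiled
// once per ISA listed in the dnn CMakeLists (AVX, AVX2, RVV, LASX), each time inside its
// own opt_* namespace, and conv_depthwise.cpp picks an implementation at run time via the
// conv->useAVX2 / useAVX / useRVV flags. Every ISA branch of fastDepthwiseConv() computes
// the same 3x3 depth-wise convolution of a single plane; a scalar reference of that
// contract is sketched below (hypothetical helper, ad-hoc names, zero padding assumed).
static inline void depthwise3x3_ref(const float w[9], float bias, const float* relu, int out_d,
                                    const float* img, int H, int W,
                                    float* out, int outH, int outW,
                                    int stride_h, int stride_w, int dilation_h, int dilation_w,
                                    int pad_t, int pad_l)
{
    const float slope = relu ? relu[out_d] : 1.f;  // per-channel leaky-ReLU slope
    for (int oy = 0; oy < outH; oy++)
        for (int ox = 0; ox < outW; ox++)
        {
            float s = bias;
            for (int ky = 0; ky < 3; ky++)
                for (int kx = 0; kx < 3; kx++)
                {
                    int iy = oy*stride_h - pad_t + ky*dilation_h;
                    int ix = ox*stride_w - pad_l + kx*dilation_w;
                    if ((unsigned)iy < (unsigned)H && (unsigned)ix < (unsigned)W)
                        s += img[iy*W + ix] * w[ky*3 + kx];  // contributions outside the plane are zero
                }
            if (relu)
                s = s > 0.f ? s : s*slope;
            out[oy*outW + ox] = s;
        }
}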
+ +#include "opencv2/core/hal/intrin.hpp" + +namespace cv { +namespace dnn { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN + +void fastDepthwiseConv(const float* weights, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const float* bias, const float* relu, + const float* inptr, + int height, int width, + float* outptr, + int out_d, int outH, int outW); + +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX + +#if !CV_FMA3 // AVX workaround +#undef _mm256_fmadd_ps +#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) +#endif + +static inline void _mm256_load_deinterleave(const float* ptr, __m256& a, __m256& b) +{ + __m256 t0 = _mm256_loadu_ps(ptr); + __m256 t1 = _mm256_loadu_ps(ptr + 8); + + __m256 lo = _mm256_permute2f128_ps(t0, t1, 0+2*16); + __m256 hi = _mm256_permute2f128_ps(t0, t1, 1+3*16); + a = _mm256_shuffle_ps(lo, hi, 0x88); + b = _mm256_shuffle_ps(lo, hi, 0xdd); +} + +void fastDepthwiseConv( const float* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const float* biasptr, const float* relu, + const float* inptr_, + int height, int width, + float* outptr_, + int out_d, int outH, int outW ) +{ + const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], + w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], + w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; + int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); + float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d]; + + for (int out_i = 0; out_i < outH; out_i++) + { + int in_i = out_i * stride_h - pad_t, out_j = 0; + const float* imgptr0 = inptr_ + in_i*width; + const float* imgptr1 = imgptr0 + dilation_h*width; + const float* imgptr2 = imgptr0 + (dilation_h*2)*width; + float out, w00 = w00_, w01 = w01_, w02 = w02_; + float w20 = w20_, w21 = w21_, w22 = w22_; + if (in_i < 0) + { + w00 = w01 = w02 = 0.f; + imgptr0 = imgptr1; + } + else if (in_i + dilation_h*(kernel_h-1) >= height) + { + w20 = w21 = w22 = 0.f; + imgptr2 = imgptr1; + } + float* outptr = outptr_ + out_i*outW; + if (pad_l > 0) + { + out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + + imgptr1[0]*w11 + imgptr1[dilation_w]*w12 + + imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias; + if (relu) + out = out > 0.f ? 
out : out*relu_coeff; + outptr[0] = out; + out_j = 1; + } + + if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) + { + const int VECSZ = 8; + __m256 vw00 = _mm256_set1_ps(w00), vw01 = _mm256_set1_ps(w01), vw02 = _mm256_set1_ps(w02), + vw10 = _mm256_set1_ps(w10), vw11 = _mm256_set1_ps(w11), vw12 = _mm256_set1_ps(w12), + vw20 = _mm256_set1_ps(w20), vw21 = _mm256_set1_ps(w21), vw22 = _mm256_set1_ps(w22); + __m256 z = _mm256_setzero_ps(), vbias = _mm256_set1_ps(bias), vrc = _mm256_set1_ps(relu_coeff); + + if( stride_w == 1 ) + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1 && out_j > pad_l) + out_j = outW1 - VECSZ; + int in_j = out_j * stride_w - pad_l; + __m256 v00 = _mm256_loadu_ps(imgptr0 + in_j), + v01 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w), + v02 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w*2), + v10 = _mm256_loadu_ps(imgptr1 + in_j), + v11 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w), + v12 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w*2), + v20 = _mm256_loadu_ps(imgptr2 + in_j), + v21 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w), + v22 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w*2); + + __m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias); + __m256 vout1 = _mm256_mul_ps(v01, vw01); + __m256 vout2 = _mm256_mul_ps(v02, vw02); + + vout0 = _mm256_fmadd_ps(v10, vw10, vout0); + vout1 = _mm256_fmadd_ps(v11, vw11, vout1); + vout2 = _mm256_fmadd_ps(v12, vw12, vout2); + + vout0 = _mm256_fmadd_ps(v20, vw20, vout0); + vout1 = _mm256_fmadd_ps(v21, vw21, vout1); + vout2 = _mm256_fmadd_ps(v22, vw22, vout2); + + vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2); + if (relu) + { + __m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ); + vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m); + } + _mm256_storeu_ps(outptr + out_j, vout0); + } + else + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1 && out_j > pad_l) + out_j = outW1 - VECSZ; + int in_j = out_j * stride_w - pad_l; + __m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; + _mm256_load_deinterleave(imgptr0 + in_j, v00, v01); + _mm256_load_deinterleave(imgptr0 + in_j + 2, v02, unused); + _mm256_load_deinterleave(imgptr1 + in_j, v10, v11); + _mm256_load_deinterleave(imgptr1 + in_j + 2, v12, unused); + _mm256_load_deinterleave(imgptr2 + in_j, v20, v21); + _mm256_load_deinterleave(imgptr2 + in_j + 2, v22, unused); + + __m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias); + __m256 vout1 = _mm256_mul_ps(v01, vw01); + __m256 vout2 = _mm256_mul_ps(v02, vw02); + + vout0 = _mm256_fmadd_ps(v10, vw10, vout0); + vout1 = _mm256_fmadd_ps(v11, vw11, vout1); + vout2 = _mm256_fmadd_ps(v12, vw12, vout2); + + vout0 = _mm256_fmadd_ps(v20, vw20, vout0); + vout1 = _mm256_fmadd_ps(v21, vw21, vout1); + vout2 = _mm256_fmadd_ps(v22, vw22, vout2); + + vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2); + if (relu) + { + __m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ); + vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m); + } + _mm256_storeu_ps(outptr + out_j, vout0); + } + } + + for (; out_j < outW1; out_j++) + { + int in_j = out_j * stride_w - pad_l; + out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + + imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 + + imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias; + if (relu) + out = out > 0.f ? 
out : out*relu_coeff; + outptr[out_j] = out; + } + + for (; out_j < outW; out_j++ ) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; + float s0 = 1.f, s1 = 1.f, s2 = 1.f; + if (in_j0 >= width) + { + in_j0 = 0; + s0 = 0.f; + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0.f; + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0.f; + } + out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + + imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 + + imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias; + if (relu) + out = out > 0.f ? out : out*relu_coeff; + outptr[out_j] = out; + } + } + _mm256_zeroupper(); +} + +#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_RVV + +/* +Example for load_deinterleave: + input: ptr[16] = {1,2,3, ... ,14,15,16} + output: a = {1, 3, 5, 7, 9, 11, 13, 15} + output: b = {2, 4, 6, 8,10, 12, 14, 16} +*/ +static inline void vfloat32m2_load_deinterleave(const float* ptr, vfloat32m2_t& a, vfloat32m2_t& b, int vl) +{ + vuint64m4_t mask = vmv_v_x_u64m4(1,vl*2); + vuint32m4_t mask_re = vreinterpret_v_u64m4_u32m4(mask); + vbool8_t mask0 = vmseq_vx_u32m4_b8 (mask_re, 1, vl*2); + vbool8_t mask1 = vmseq_vx_u32m4_b8 (mask_re, 0, vl*2); + vfloat32m4_t tempa = vundefined_f32m4(), tempb = vundefined_f32m4(); + vfloat32m4_t vw = vle32_v_f32m4(ptr, vl*2); + tempa = vcompress_vm_f32m4(mask0, tempa, vw, vl*2); + tempb = vcompress_vm_f32m4(mask1, tempb, vw, vl*2); + /* The following instructions have not to be supported by the GNU toolchain. + So we temporarily use store and load instead. + // a = vlmul_trunc_v_f32m4_f32m2(tempa); + // b = vlmul_trunc_v_f32m4_f32m2(tempb); + */ + cv::AutoBuffer cvBuffer(sizeof(float)*vl*2); + float* buffer = (float*)cvBuffer.data(); + vse32_v_f32m4(buffer, tempa, vl); + a = vle32_v_f32m2(buffer, vl); + vse32_v_f32m4(buffer, tempb, vl); + b = vle32_v_f32m2(buffer, vl); +} + +void fastDepthwiseConv( const float* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const float* biasptr, const float* relu, + const float* inptr_, + int height, int width, + float* outptr_, + int out_d, int outH, int outW ) +{ + int vl; + const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], + w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], + w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; + int outW1 = std::min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); + float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d]; + + for (int out_i = 0; out_i < outH; out_i++) + { + int in_i = out_i * stride_h - pad_t, out_j = 0; + const float* imgptr0 = inptr_ + in_i*width; + const float* imgptr1 = imgptr0 + dilation_h*width; + const float* imgptr2 = imgptr0 + (dilation_h*2)*width; + float out, w00 = w00_, w01 = w01_, w02 = w02_; + float w20 = w20_, w21 = w21_, w22 = w22_; + if (in_i < 0) + { + w00 = w01 = w02 = 0.f; + imgptr0 = imgptr1; + } + else if (in_i + dilation_h*(kernel_h-1) >= height) + { + w20 = w21 = w22 = 0.f; + imgptr2 = imgptr1; + } + float* outptr = outptr_ + out_i*outW; + if (pad_l > 0) + { + out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + + imgptr1[0]*w11 + imgptr1[dilation_w]*w12 + + imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias; + if (relu) + out = out > 0.f ? 
out : out*relu_coeff; + outptr[0] = out; + out_j = 1; + } + + if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) + { + int avl = outW1 - out_j; + if( stride_w == 1 ) + for( ; out_j < outW1; out_j += vl, avl -= vl) + { + vl = vsetvl_e32m2(avl); + int in_j = out_j * stride_w - pad_l; + vfloat32m2_t v00 = vle32_v_f32m2(imgptr0 + in_j, vl), + v01 = vle32_v_f32m2(imgptr0 + in_j + dilation_w, vl), + v02 = vle32_v_f32m2(imgptr0 + in_j + dilation_w*2, vl), + v10 = vle32_v_f32m2(imgptr1 + in_j, vl), + v11 = vle32_v_f32m2(imgptr1 + in_j + dilation_w, vl), + v12 = vle32_v_f32m2(imgptr1 + in_j + dilation_w*2, vl), + v20 = vle32_v_f32m2(imgptr2 + in_j, vl), + v21 = vle32_v_f32m2(imgptr2 + in_j + dilation_w, vl), + v22 = vle32_v_f32m2(imgptr2 + in_j + dilation_w*2, vl); + + vfloat32m2_t vout0 = vfmul_vf_f32m2(v00, w00, vl); + vfloat32m2_t vout1 = vfmul_vf_f32m2(v01, w01, vl); + vfloat32m2_t vout2 = vfmul_vf_f32m2(v02, w02, vl); + vout0 = vfadd_vf_f32m2(vout0, bias, vl); + + vout0 = vfmacc_vf_f32m2(vout0, w10, v10, vl); + vout1 = vfmacc_vf_f32m2(vout1, w11, v11, vl); + vout2 = vfmacc_vf_f32m2(vout2, w12, v12, vl); + + vout0 = vfmacc_vf_f32m2(vout0, w20, v20, vl); + vout1 = vfmacc_vf_f32m2(vout1, w21, v21, vl); + vout2 = vfmacc_vf_f32m2(vout2, w22, v22, vl); + + vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl); + if (relu) + { + vbool16_t m = vmfgt_vf_f32m2_b16(vout0, 0, vl); + vout0 = vmerge_vvm_f32m2(m, vfmul_vf_f32m2(vout0, relu_coeff, vl), vout0, vl); + } + vse32_v_f32m2(outptr + out_j, vout0, vl); + } + else //stride_w == 2 && dilation_w == 1 + for( ; out_j < outW1; out_j += vl, avl -= vl) + { + vl = vsetvl_e32m2(avl); + int in_j = out_j * stride_w - pad_l; + vfloat32m2_t v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; + vfloat32m2_load_deinterleave(imgptr0 + in_j, v00, v01, vl); + vfloat32m2_load_deinterleave(imgptr0 + in_j + 2, v02, unused, vl); + vfloat32m2_load_deinterleave(imgptr1 + in_j, v10, v11, vl); + vfloat32m2_load_deinterleave(imgptr1 + in_j + 2, v12, unused, vl); + vfloat32m2_load_deinterleave(imgptr2 + in_j, v20, v21, vl); + vfloat32m2_load_deinterleave(imgptr2 + in_j + 2, v22, unused, vl); + + vfloat32m2_t vout0 = vfmul_vf_f32m2(v00, w00, vl); + vfloat32m2_t vout1 = vfmul_vf_f32m2(v01, w01, vl); + vfloat32m2_t vout2 = vfmul_vf_f32m2(v02, w02, vl); + vout0 = vfadd_vf_f32m2(vout0, bias, vl); + + vout0 = vfmacc_vf_f32m2(vout0, w10, v10, vl); + vout1 = vfmacc_vf_f32m2(vout1, w11, v11, vl); + vout2 = vfmacc_vf_f32m2(vout2, w12, v12, vl); + + vout0 = vfmacc_vf_f32m2(vout0, w20, v20, vl); + vout1 = vfmacc_vf_f32m2(vout1, w21, v21, vl); + vout2 = vfmacc_vf_f32m2(vout2, w22, v22, vl); + + vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl); + if (relu) + { + vbool16_t m = vmfgt_vf_f32m2_b16(vout0, 0, vl); + vout0 = vmerge_vvm_f32m2(m, vfmul_vf_f32m2(vout0, relu_coeff, vl), vout0, vl); + } + vse32_v_f32m2(outptr + out_j, vout0, vl); + } + } + + for (; out_j < outW1; out_j++) + { + int in_j = out_j * stride_w - pad_l; + out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + + imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 + + imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias; + if (relu) + out = out > 0.f ? 
out : out*relu_coeff; + outptr[out_j] = out; + } + + for (; out_j < outW; out_j++ ) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; + float s0 = 1.f, s1 = 1.f, s2 = 1.f; + if (in_j0 >= width) + { + in_j0 = 0; + s0 = 0.f; + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0.f; + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0.f; + } + out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + + imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 + + imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias; + if (relu) + out = out > 0.f ? out : out*relu_coeff; + outptr[out_j] = out; + } + } +} + +#endif // CV_RVV + +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX + +static inline void _v256_load_deinterleave(const float* ptr, __m256& a, __m256& b) +{ + __m256 t0 = (__m256)__lasx_xvld(ptr, 0); + __m256 t1 = (__m256)__lasx_xvld(ptr, 8*4); + + __m256 lo = (__m256)__lasx_xvpermi_q(t0, t1, 2+0*16); + __m256 hi = (__m256)__lasx_xvpermi_q(t0, t1, 3+1*16); + + a = (__m256)__lasx_xvpermi_w(hi, lo, 0x88); + b = (__m256)__lasx_xvpermi_w(hi, lo, 0xdd); +} + +void fastDepthwiseConv( const float* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const float* biasptr, const float* relu, + const float* inptr_, + int height, int width, + float* outptr_, + int out_d, int outH, int outW ) +{ + const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], + w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], + w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; + int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); + float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d]; + + for (int out_i = 0; out_i < outH; out_i++) + { + int in_i = out_i * stride_h - pad_t, out_j = 0; + const float* imgptr0 = inptr_ + in_i*width; + const float* imgptr1 = imgptr0 + dilation_h*width; + const float* imgptr2 = imgptr0 + (dilation_h*2)*width; + float out, w00 = w00_, w01 = w01_, w02 = w02_; + float w20 = w20_, w21 = w21_, w22 = w22_; + if (in_i < 0) + { + w00 = w01 = w02 = 0.f; + imgptr0 = imgptr1; + } + else if (in_i + dilation_h*(kernel_h-1) >= height) + { + w20 = w21 = w22 = 0.f; + imgptr2 = imgptr1; + } + float* outptr = outptr_ + out_i*outW; + if (pad_l > 0) + { + out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + + imgptr1[0]*w11 + imgptr1[dilation_w]*w12 + + imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias; + if (relu) + out = out > 0.f ? 
out : out*relu_coeff; + outptr[0] = out; + out_j = 1; + } + + if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) + { + const int VECSZ = 8; + __m256 vw00 = _v256_setall_ps(w00), vw01 = _v256_setall_ps(w01), vw02 = _v256_setall_ps(w02), + vw10 = _v256_setall_ps(w10), vw11 = _v256_setall_ps(w11), vw12 = _v256_setall_ps(w12), + vw20 = _v256_setall_ps(w20), vw21 = _v256_setall_ps(w21), vw22 = _v256_setall_ps(w22); + __m256 z = (__m256)__lasx_xvxor_v((__m256i)vw00, (__m256i)vw00), + vbias = _v256_setall_ps(bias), vrc = _v256_setall_ps(relu_coeff); + + if( stride_w == 1 ) + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1 && out_j > pad_l) + out_j = outW1 - VECSZ; + int in_j = out_j * stride_w - pad_l; + __m256 v00 = (__m256)__lasx_xvld(imgptr0 + in_j, 0), + v01 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w, 0), + v02 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w*2, 0), + v10 = (__m256)__lasx_xvld(imgptr1 + in_j, 0), + v11 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w, 0), + v12 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w*2, 0), + v20 = (__m256)__lasx_xvld(imgptr2 + in_j, 0), + v21 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w, 0), + v22 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w*2, 0); + + __m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias); + __m256 vout1 = __lasx_xvfmul_s(v01, vw01); + __m256 vout2 = __lasx_xvfmul_s(v02, vw02); + + vout0 = __lasx_xvfmadd_s(v10, vw10, vout0); + vout1 = __lasx_xvfmadd_s(v11, vw11, vout1); + vout2 = __lasx_xvfmadd_s(v12, vw12, vout2); + + vout0 = __lasx_xvfmadd_s(v20, vw20, vout0); + vout1 = __lasx_xvfmadd_s(v21, vw21, vout1); + vout2 = __lasx_xvfmadd_s(v22, vw22, vout2); + + vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2); + if (relu) + { + __m256i m = __lasx_xvfcmp_clt_s(z, vout0); + vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m); + } + __lasx_xvst(vout0, outptr + out_j, 0); + } + else + for( ; out_j < outW1; out_j += VECSZ ) + { + if (out_j + VECSZ > outW1 && out_j > pad_l) + out_j = outW1 - VECSZ; + int in_j = out_j * stride_w - pad_l; + __m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; + _v256_load_deinterleave(imgptr0 + in_j, v00, v01); + _v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused); + _v256_load_deinterleave(imgptr1 + in_j, v10, v11); + _v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused); + _v256_load_deinterleave(imgptr2 + in_j, v20, v21); + _v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused); + + __m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias); + __m256 vout1 = __lasx_xvfmul_s(v01, vw01); + __m256 vout2 = __lasx_xvfmul_s(v02, vw02); + + vout0 = __lasx_xvfmadd_s(v10, vw10, vout0); + vout1 = __lasx_xvfmadd_s(v11, vw11, vout1); + vout2 = __lasx_xvfmadd_s(v12, vw12, vout2); + + vout0 = __lasx_xvfmadd_s(v20, vw20, vout0); + vout1 = __lasx_xvfmadd_s(v21, vw21, vout1); + vout2 = __lasx_xvfmadd_s(v22, vw22, vout2); + + vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2); + if (relu) + { + __m256i m = __lasx_xvfcmp_clt_s(z, vout0); + vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m); + } + __lasx_xvst(vout0, outptr + out_j, 0); + } + } + + for (; out_j < outW1; out_j++) + { + int in_j = out_j * stride_w - pad_l; + out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + + imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 + + imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + 
dilation_w*2]*w22 + bias; + if (relu) + out = out > 0.f ? out : out*relu_coeff; + outptr[out_j] = out; + } + + for (; out_j < outW; out_j++ ) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; + float s0 = 1.f, s1 = 1.f, s2 = 1.f; + if (in_j0 >= width) + { + in_j0 = 0; + s0 = 0.f; + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0.f; + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0.f; + } + out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + + imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 + + imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias; + if (relu) + out = out > 0.f ? out : out*relu_coeff; + outptr[out_j] = out; + } + } +} + +#endif // CV_LASX + +CV_CPU_OPTIMIZATION_NAMESPACE_END +}} // namespace diff --git a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp new file mode 100644 index 0000000000..27998e4bcc --- /dev/null +++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp @@ -0,0 +1,764 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv_Winograd.fx). +// Here is the original license: +/* + This file is a part of ficus language project. + See ficus/LICENSE for the licensing terms +*/ + +#include "../../precomp.hpp" +#include "convolution.hpp" + +#include "conv_winograd_f63.simd.hpp" +#include "layers/cpu_kernels/conv_winograd_f63.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + +namespace cv { namespace dnn { + +#if CV_NEON || CV_SIMD128 || CV_TRY_AVX2 +enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment. 
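// Editor's note (illustrative sketch, not part of the patch): this file implements Winograd
// F(6x6, 3x3): every 8x8 input tile is forward-transformed (B^T X B), multiplied element-wise
// with transformed 3x3 kernels, and inverse-transformed (A^T M A) into a 6x6 output tile.
// The CONV_WINO_* constants are defined in convolution.hpp; the tile geometry assumed below
// (tile 8, step 6) follows from that scheme.
enum { kWinoTileSketch = 8 /* assumed CONV_WINO_SIZE */, kWinoStepSketch = 6 /* assumed CONV_WINO_STEP */ };

// Number of 6x6 tiles needed to cover an H0 x W0 output plane; matches the
// blocks_per_row * blocks_per_plane computation in runWinograd63() below.
// For example, a 14x14 output plane needs 3*3 = 9 tiles.
static inline int winoTilesSketch(int H0, int W0)
{
    return ((H0 + kWinoStepSketch - 1) / kWinoStepSketch)
         * ((W0 + kWinoStepSketch - 1) / kWinoStepSketch);
}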
+ +void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock, + const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32); + +/*Input transform*/ +void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, + float* outptr, int Cg, const int winoIblock, const int winoAtomF32); + +/*Output transform*/ +void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep, + float bias, float minval, float maxval, bool ifMinMaxAct); + + +int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr& conv, + int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct) +{ + Mat input = _input.getMat(); + Mat output = _output.getMat(); + Mat fusedAddMat = _fusedAddMat.getMat(); + + MatShape inputShape = shape(input); + MatShape outputShape = shape(output); + CV_Assert(inputShape.size() == 4 && outputShape.size() == 4); + + int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3]; // [N, C, H, W] + int K = conv->K; + int H0 = outputShape[2], W0 = outputShape[3]; + + int pad_top = conv->pad_top; + int pad_left = conv->pad_left; + + int ngroups = conv->ngroups, Cg = C/ngroups, Kg = K/ngroups; + int Kg_nblocks = (Kg + CONV_WINO_KBLOCK - 1)/CONV_WINO_KBLOCK; + const size_t inp_planesize = (size_t)Hi*Wi; + const size_t out_planesize = (size_t)H0*W0; + + int blocks_per_row = (W0+CONV_WINO_STEP-1)/CONV_WINO_STEP; + int blocks_per_plane = ((H0+CONV_WINO_STEP-1)/CONV_WINO_STEP)*blocks_per_row; + int blocks_per_plane_aligned = ((blocks_per_plane + + CONV_WINO_IBLOCK-1)/CONV_WINO_IBLOCK)*CONV_WINO_IBLOCK; + + size_t totalbufsize = (size_t)N*C*blocks_per_plane_aligned*CONV_WINO_AREA; + + AutoBuffer _buf; + _buf.allocate(totalbufsize + VEC_ALIGN); + float* wbuf_all = alignPtr(_buf.data(), VEC_ALIGN); + + float* inp = input.ptr(); + float* out = output.ptr(); + + float* fusedAddPtr = fusedAddMat.empty() ? nullptr : fusedAddMat.ptr(); + + // Phase 1. compute forward Winograd transforms for all input blocks, + // all input planes, all samples in the batch. + // [TODO]: maybe, if there are too many input channels, it makes sense to + // transform only part of input channels at once and then compute the partial + // accumulated sums (i.e. update the output buffers several times, + // rather than compute them in one pass). 
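// Editor's sketch (not part of the patch): wbuf_all holds the Winograd-transformed inputs.
// The `inwofs` expression below implies an (assumed) interleaved layout: within each group
// of CONV_WINO_IBLOCK tiles, each four-float atom of the 8x8 transform is stored so that all
// Cg channels and all tiles of the group lie next to each other, which lets Phase 2 stream
// CONV_WINO_IBLOCK x CONV_WINO_ATOM_F32 values per channel. A hypothetical lambda that
// reproduces the same element offset (with explicit widening):
auto winoInputOffset = [&](int n, int g, int c, int block_id, int db) -> size_t
{
    return ((size_t)(n*ngroups + g)*blocks_per_plane_aligned + block_id)*Cg*CONV_WINO_AREA
            + ((size_t)c*CONV_WINO_IBLOCK + db)*CONV_WINO_ATOM_F32;
};
CV_UNUSED(winoInputOffset);  // illustration only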
+ parallel_for_(Range(0, ntasks), [&](const Range& r0) { + for (int task_id = r0.start; task_id < r0.end; task_id++) + { + int nc0 = (N*C)*task_id/ntasks; + int nc1 = (N*C)*(task_id+1)/ntasks; + for(; nc0 < nc1; nc0++) + { + int n = nc0 / C; + int c = nc0 - n*C; + int g = c / Cg; + c -= g*Cg; + for (int block_id = 0; block_id < blocks_per_plane; block_id += CONV_WINO_IBLOCK) + { + for (int db = 0; db < CONV_WINO_IBLOCK; db++) + { + size_t inwofs = ((n*ngroups + g)*blocks_per_plane_aligned + + block_id)*Cg*CONV_WINO_AREA + + (c*CONV_WINO_IBLOCK + db)*CONV_WINO_ATOM_F32; + float* inwptr = (float*)wbuf_all + inwofs; + + if (block_id + db < blocks_per_plane) + { + int y0 = (block_id + db) / blocks_per_row; + int x0 = (block_id + db) - y0 * blocks_per_row; + y0 = y0*CONV_WINO_STEP - pad_top; + x0 = x0*CONV_WINO_STEP - pad_left; + bool partial = y0 < 0 || y0 + CONV_WINO_SIZE > Hi || + x0 < 0 || x0 + CONV_WINO_SIZE > Wi; + int dx1 = 0, dx2 = CONV_WINO_SIZE, dy1 = 0, dy2 = CONV_WINO_SIZE; + int inpstep = Wi; + + float inpbuf[CONV_WINO_AREA]; + float* inptr0 = (float*)inp + nc0*inp_planesize + y0*Wi + x0; + float* inptr = inptr0; + + if (partial) + { + memset(inpbuf, 0, sizeof(inpbuf)); + dy1 = -y0 > 0 ? -y0 : 0; + dy2 = Hi - y0 < CONV_WINO_SIZE ? Hi - y0 : CONV_WINO_SIZE; + + if (dy2 < dy1) {dy2 = dy1 = 0;} + dx1 = -x0 > 0 ? -x0 : 0; + dx2 = Wi - x0 < CONV_WINO_SIZE ? Wi - x0 : CONV_WINO_SIZE; + + if (dx2 < dx1) {dx2 = dx1 = 0;} + inptr0 -= y0*Wi + x0; + + if (dx1 < dx2 && dy1 < dy2) + { + for(int dy = dy1; dy < dy2; dy++) + memcpy(&inpbuf[dy*CONV_WINO_SIZE + dx1], + inptr0 + (y0+dy)*Wi + (x0+dx1), + (dx2-dx1)*sizeof(inpbuf[0])); + } + + inptr = inpbuf; + inpstep = CONV_WINO_SIZE; + } +#if CV_TRY_AVX2 + if (conv->useAVX2) + opt_AVX2::winofunc_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM_F32); + else +#endif +#if CV_TRY_AVX + if (conv->useAVX) + opt_AVX::winofunc_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM_F32); + else +#endif +#if CV_NEON && CV_NEON_AARCH64 + if (conv->useNEON) + opt_NEON::winofunc_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM_F32); + else +#endif + winofunc_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg, CONV_WINO_IBLOCK, CONV_WINO_ATOM_F32); + } + else + { + for (int i = 0; i < CONV_WINO_NATOMS_F32; i++, inwptr += CONV_WINO_IBLOCK*CONV_WINO_ATOM_F32) + memset(inwptr, 0, CONV_WINO_ATOM_F32*sizeof(inwptr[0])); + } + } + } + } + }}); + + // Phase 2. compute elemwise-weighted sums of transformed blocks, + // apply inverse Winograd transforms to the sums, + // add bias, apply activation function if any and store the results. 
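// Editor's sketch (not part of the patch): for one output channel k and one 8x8 tile,
// Phase 2 evaluates, in the transformed domain,
//     M[u][v] = sum_{c=0..Cg-1} Uw[k][c][u][v] * Ux[c][u][v],   u, v = 0..7,
// and the spatial 6x6 tile is then A^T * M * A (winofunc_AtXA_8x8_f32), followed by bias,
// the optional by-pass add and the activation. winofunc_accum_f32() computes the first line
// for CONV_WINO_KBLOCK output channels and CONV_WINO_IBLOCK tiles at once. A hypothetical
// scalar reference of that accumulation (ad-hoc flat [Cg][64] layout):
auto winoAccumScalarRef = [](const float* Uw, const float* Ux, int Cg_, float* M)
{
    for (int i = 0; i < CONV_WINO_AREA; i++) M[i] = 0.f;
    for (int c = 0; c < Cg_; c++)
        for (int i = 0; i < CONV_WINO_AREA; i++)
            M[i] += Uw[c*CONV_WINO_AREA + i] * Ux[c*CONV_WINO_AREA + i];
};
CV_UNUSED(winoAccumScalarRef);  // illustration only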
+ parallel_for_(Range(0, ntasks), [&](const Range& r0) { + for (int task_id = r0.start; task_id < r0.end; task_id++) + { + size_t out_wbuf_size = CONV_WINO_AREA*CONV_WINO_KBLOCK*CONV_WINO_IBLOCK; + size_t outbuf_size = CONV_WINO_AREA; + AutoBuffer out_wbuf_, outbuf_; + out_wbuf_.allocate(out_wbuf_size + VEC_ALIGN); + float* out_wbuf = alignPtr(out_wbuf_.data(), VEC_ALIGN); + outbuf_.allocate(outbuf_size + VEC_ALIGN); + float* outbuf = alignPtr(outbuf_.data(), VEC_ALIGN); + + memset(out_wbuf, 0, out_wbuf_size * sizeof(float)); + memset(outbuf, 0, outbuf_size * sizeof(float)); + + int ngk0 = (int)(((int64_t)N*Kg_nblocks*ngroups)*task_id/ntasks); + int ngk1 = (int)(((int64_t)N*Kg_nblocks*ngroups)*(task_id+1)/ntasks); + + for(; ngk0 < ngk1; ngk0++) + { + int n = ngk0 / (Kg_nblocks*ngroups); + int gk0 = ngk0 % (Kg_nblocks*ngroups); + int g = gk0 / Kg_nblocks; + int k0 = (gk0 % Kg_nblocks)*CONV_WINO_KBLOCK; + int k1 = k0 + CONV_WINO_KBLOCK <= Kg ? k0 + CONV_WINO_KBLOCK : Kg; + + for (int block_id0 = 0; block_id0 < blocks_per_plane; block_id0 += CONV_WINO_IBLOCK) + { + int block_id1 = block_id0 + CONV_WINO_IBLOCK; + block_id1 = block_id1 < blocks_per_plane ? block_id1 : blocks_per_plane; + size_t inwofs = ((n*ngroups + g)*blocks_per_plane_aligned + block_id0)*Cg*CONV_WINO_AREA; + size_t wofs = (g*Kg_nblocks*CONV_WINO_KBLOCK + k0)*Cg*CONV_WINO_AREA; + + float* inwptr = wbuf_all + inwofs; + const float* wptr = conv->weightsWinoBufPtr + wofs; + +#if CV_TRY_AVX2 + if (conv->useAVX2) + opt_AVX2::winofunc_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK, + CONV_WINO_KBLOCK, CONV_WINO_ATOM_F32, CONV_WINO_NATOMS_F32); + else +#endif +#if CV_TRY_AVX + if (conv->useAVX) + opt_AVX::winofunc_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK, + CONV_WINO_KBLOCK, CONV_WINO_ATOM_F32, CONV_WINO_NATOMS_F32); + else +#endif +#if CV_NEON && CV_NEON_AARCH64 + if (conv->useNEON) + opt_NEON::winofunc_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK, + CONV_WINO_KBLOCK, CONV_WINO_ATOM_F32, CONV_WINO_NATOMS_F32); + else +#endif + + winofunc_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0, CONV_WINO_IBLOCK, + CONV_WINO_KBLOCK, CONV_WINO_ATOM_F32, CONV_WINO_NATOMS_F32); + for (int k = k0; k < k1; k++) + { + float biasv = conv->biasBuf[g*Kg + k]; + for (int block_id = block_id0; block_id < block_id1; block_id++) + { + int y0 = block_id / blocks_per_row; + int x0 = block_id - y0 * blocks_per_row; + y0 = y0*CONV_WINO_STEP; + x0 = x0*CONV_WINO_STEP; + int dy1 = H0 - y0; + if (dy1 > CONV_WINO_STEP) dy1 = CONV_WINO_STEP; + int dx1 = W0 - x0; + if (dx1 > CONV_WINO_STEP) dx1 = CONV_WINO_STEP; + assert(dx1 > 0 && dy1 > 0); + bool partial = activ || dy1 < CONV_WINO_STEP || dx1 < CONV_WINO_STEP; + size_t outofs = (n*K + g*Kg + k)*out_planesize + y0*W0 + x0; + int outstep = W0; + + float* outptr0 = (float*)out + outofs; + float* pbptr0 = fusedAddPtr ? 
fusedAddPtr + outofs : nullptr; + float *outptr = outptr0, *bpptr = pbptr0; + + if (partial) + { + outptr = outbuf; + outstep = CONV_WINO_SIZE; + if (pbptr0) + { + bpptr = outbuf; + for (int y = 0; y < dy1; y++) + memcpy(outbuf + y*CONV_WINO_SIZE, pbptr0 + y*W0, + dx1*sizeof(pbptr0[0])); + } + } +#if CV_TRY_AVX2 + if (conv->useAVX2) + opt_AVX::winofunc_AtXA_8x8_f32(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE, + bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct); + else +#endif +#if CV_TRY_AVX + if (conv->useAVX) + opt_AVX::winofunc_AtXA_8x8_f32(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE, + bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct); + else +#endif +#if CV_NEON && CV_NEON_AARCH64 + if (conv->useNEON) + // NEON optimization is only for ARMv8 device, and for ARMv7 device, we use the Universal intrinsics. + opt_NEON::winofunc_AtXA_8x8_f32(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE, + bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct); + else +#endif + winofunc_AtXA_8x8_f32(out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE, + bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct); + if (partial) + { + if (activ) + activ->forwardSlice(outptr, outptr, CONV_WINO_SIZE*CONV_WINO_STEP, 0, g*Kg + k, g*Kg + k + 1); + for (int y = 0; y < dy1; y++) + memcpy(outptr0 + y*W0, outptr + y*CONV_WINO_SIZE,dx1*sizeof(outptr0[0])); + } + } + } + } + } + }}); + return 1; +} + +/****************************************************************************************\ + SIMD for winograd function +\****************************************************************************************/ + +#if CV_SIMD128 + +void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock, + const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32) +{ +#if 1 + CV_Assert(winoIblock == 3 && winoKblock == 4 && winoAtomF32 == 4); + for (int atom_id = 0; atom_id < winoNatomF32; atom_id++, + outbuf += winoAtomF32) + { + v_float32x4 s00 = v_setzero_f32(), s01 = s00, s02 = s00; + v_float32x4 s10 = v_setzero_f32(), s11 = s00, s12 = s00; + v_float32x4 s20 = v_setzero_f32(), s21 = s00, s22 = s00; + v_float32x4 s30 = v_setzero_f32(), s31 = s00, s32 = s00; + + for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32, + wptr += winoKblock*winoAtomF32) + { + v_float32x4 x0, x1, x2; + x0 = v_load(inwptr); + x1 = v_load(inwptr + 4); + x2 = v_load(inwptr + 8); + + v_float32x4 w0 = v_load(wptr); + s00 = v_fma(w0, x0, s00); + s01 = v_fma(w0, x1, s01); + s02 = v_fma(w0, x2, s02); + + w0 = v_load(wptr + 4); + s10 = v_fma(w0, x0, s10); + s11 = v_fma(w0, x1, s11); + s12 = v_fma(w0, x2, s12); + + w0 = v_load(wptr + 8); + s20 = v_fma(w0, x0, s20); + s21 = v_fma(w0, x1, s21); + s22 = v_fma(w0, x2, s22); + + w0 = v_load(wptr + 12); + s30 = v_fma(w0, x0, s30); + s31 = v_fma(w0, x1, s31); + s32 = v_fma(w0, x2, s32); + } + + v_store(outbuf, s00); + v_store(outbuf + 1*64, s01); + v_store(outbuf + 2*64, s02); + v_store(outbuf + 3*64, s10); + v_store(outbuf + 4*64, s11); + v_store(outbuf + 5*64, s12); + v_store(outbuf + 6*64, s20); + v_store(outbuf + 7*64, s21); + v_store(outbuf + 8*64, s22); + v_store(outbuf + 9*64, s30); + v_store(outbuf + 10*64, s31); + v_store(outbuf + 11*64, s32); + } +#else + // Naive C++ code, the code 
should never be run here. + for (int atom_id = 0; atom_id < winoNatomF32; + atom_id++, outbuf += winoAtomF32) + { + float sumbuf[winoIblock*winoKblock*winoAtomF32]; + memset(sumbuf, 0, sizeof(sumbuf)); + for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32, + wptr += winoKblock*winoAtomF32) + { + for (int i = 0; i < winoKblock; i++) + { + for (int j = 0; j < winoIblock; j++) + { + int i_ = i*winoAtomF32; + int j_ = j*winoAtomF32; + int ij_ = i_*winoIblock + j_; + float s0 = inwptr[j_ + 0]*wptr[i_ + 0]; + float s1 = inwptr[j_ + 1]*wptr[i_ + 1]; + float s2 = inwptr[j_ + 2]*wptr[i_ + 2]; + float s3 = inwptr[j_ + 3]*wptr[i_ + 3]; + sumbuf[ij_ + 0] += s0; + sumbuf[ij_ + 1] += s1; + sumbuf[ij_ + 2] += s2; + sumbuf[ij_ + 3] += s3; + } + } + } + for (int ij = 0; ij < winoKblock*winoIblock; ij++) + { + int ij_ = ij*winoAtomF32; + int ij_out = ij*CONV_WINO_AREA; + outbuf[ij_out + 0] = sumbuf[ij_ + 0]; + outbuf[ij_out + 1] = sumbuf[ij_ + 1]; + outbuf[ij_out + 2] = sumbuf[ij_ + 2]; + outbuf[ij_out + 3] = sumbuf[ij_ + 3]; + } + } +#endif +} + +/*Input transform*/ +void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, + float* outptr, int Cg, const int winoIblock, const int winoAtomF32) +{ + CV_Assert(CONV_WINO_IBLOCK == 3 && CONV_WINO_KBLOCK == 4 && CONV_WINO_ATOM_F32 == 4); + v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4); + v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4); + v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4); + v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4); + v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4); + v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4); + v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4); + v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4); + + v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71; + + { + /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */ + /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */ + v_float32x4 q5_25 = v_setall_f32(5.25f), t00, t01, t10, t11; + t00 = x40 - x20; + t01 = x41 - x21; + t10 = x30 - x50; + t11 = x31 - x51; + v_float32x4 y00 = v_fma(t00, q5_25, x00 - x60); + v_float32x4 y01 = v_fma(t01, q5_25, x01 - x61); + v_float32x4 y70 = v_fma(t10, q5_25, x70 - x10); + v_float32x4 y71 = v_fma(t11, q5_25, x71 - x11); + + /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */ + /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */ + v_float32x4 qm4_25 = v_setall_f32(-4.25f); + t00 = v_fma(x30, qm4_25, x10 + x50); + t01 = v_fma(x31, qm4_25, x11 + x51); + t10 = v_fma(x40, qm4_25, x20 + x60); + t11 = v_fma(x41, qm4_25, x21 + x61); + + v_float32x4 y10 = t00 + t10, y11 = t01 + t11; + v_float32x4 y20 = t10 - t00, y21 = t11 - t01; + + /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */ + /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */ + v_float32x4 q0_5 = v_setall_f32(0.5f), q0_25 = v_setall_f32(0.25f); + v_float32x4 qm2_5 = v_setall_f32(-2.5f), qm1_25 = v_setall_f32(-1.25f); + t00 = v_fma(x10, q0_5, x50 + x50); + t01 = v_fma(x11, q0_5, x51 + x51); + t10 = v_fma(x20, q0_25, x60); + t11 = v_fma(x21, q0_25, x61); + t00 = v_fma(x30, qm2_5, t00); + t01 = v_fma(x31, qm2_5, t01); + t10 = v_fma(x40, qm1_25, t10); + t11 = v_fma(x41, qm1_25, t11); + + v_float32x4 y30 = t00 + t10, y31 = t01 + t11; + v_float32x4 
y40 = t10 - t00, y41 = t11 - t01; + + /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */ + /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */ + v_float32x4 q4 = v_setall_f32(4.f), qm5 = v_setall_f32(-5.f); + t00 = v_fma(x50, q0_5, x10 + x10); + t01 = v_fma(x51, q0_5, x11 + x11); + t10 = v_fma(x20, q4 , x60); + t11 = v_fma(x21, q4 , x61); + t00 = v_fma(x30, qm2_5, t00); + t01 = v_fma(x31, qm2_5, t01); + t10 = v_fma(x40, qm5 , t10); + t11 = v_fma(x41, qm5 , t11); + + v_float32x4 y50 = t00 + t10, y51 = t01 + t11; + v_float32x4 y60 = t10 - t00, y61 = t11 - t01; + + /* transpose 8x8 matrix in-place with some renumeration of the elements: */ + /* Y: */ + /* y00 y01 */ + /* y10 y11 */ + /* ... */ + /* y70 y71 */ + /* Y': */ + /* y00 y40 */ + /* y10 y50 */ + /* y20 y60 */ + /* y30 y70 */ + /* y01 y41 */ + /* y11 y51 */ + /* y21 y61 */ + /* y31 y71 */ + /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */ + + v_transpose4x4(y00, y10, y20, y30, y00, y10, y20, y30); + v_transpose4x4(y01, y11, y21, y31, y01, y11, y21, y31); + v_transpose4x4(y40, y50, y60, y70, y40, y50, y60, y70); + v_transpose4x4(y41, y51, y61, y71, y41, y51, y61, y71); + + /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */ + /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */ + t00 = y01 - y20; + t01 = y41 - y60; + t10 = y30 - y11; + t11 = y70 - y51; + z00 = v_fma(t00, q5_25, y00 - y21); + z01 = v_fma(t01, q5_25, y40 - y61); + z70 = v_fma(t10, q5_25, y31 - y10); + z71 = v_fma(t11, q5_25, y71 - y50); + + /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */ + /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */ + t00 = v_fma(y30, qm4_25, y10 + y11); + t01 = v_fma(y70, qm4_25, y50 + y51); + t10 = v_fma(y01, qm4_25, y20 + y21); + t11 = v_fma(y41, qm4_25, y60 + y61); + + z10 = t00 + t10; z11 = t01 + t11; + z20 = t10 - t00; z21 = t11 - t01; + + /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */ + /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */ + t00 = v_fma(y10, q0_5, y11 + y11); + t01 = v_fma(y50, q0_5, y51 + y51); + t10 = v_fma(y20, q0_25, y21); + t11 = v_fma(y60, q0_25, y61); + t00 = v_fma(y30, qm2_5, t00); + t01 = v_fma(y70, qm2_5, t01); + t10 = v_fma(y01, qm1_25, t10); + t11 = v_fma(y41, qm1_25, t11); + + z30 = t00 + t10; z31 = t01 + t11; + z40 = t10 - t00; z41 = t11 - t01; + + /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */ + /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */ + t00 = v_fma(y11, q0_5, y10 + y10); + t01 = v_fma(y51, q0_5, y50 + y50); + t10 = v_fma(y20, q4, y21); + t11 = v_fma(y60, q4, y61); + t00 = v_fma(y30, qm2_5, t00); + t01 = v_fma(y70, qm2_5, t01); + t10 = v_fma(y01, qm5, t10); + t11 = v_fma(y41, qm5, t11); + + z50 = t00 + t10; z51 = t01 + t11; + z60 = t10 - t00; z61 = t11 - t01; + } + + const int outstep = winoIblock*winoAtomF32*Cg; + + v_store(outptr, z00); + v_store(outptr + outstep, z01); + v_store(outptr + outstep*2, z10); + v_store(outptr + outstep*3, z11); + v_store(outptr + outstep*4, z20); + v_store(outptr + outstep*5, z21); + v_store(outptr + outstep*6, z30); + v_store(outptr + outstep*7, z31); + v_store(outptr + outstep*8, z40); + v_store(outptr + outstep*9, z41); + v_store(outptr + outstep*10, z50); + v_store(outptr + outstep*11, z51); + v_store(outptr + outstep*12, z60); + v_store(outptr + outstep*13, z61); + v_store(outptr + outstep*14, z70); + v_store(outptr + outstep*15, z71); +} + +/*Output transform*/ +/* Inverse Winograd 8x8 transform: + out = (A'*inp*A)', where + 
inp is input 8x8 FP32 matrix, + A' is + [1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, + 0.f, 1.f, -1.f, 2.f, -2.f, 0.5f, -0.5f, 0.f, + 0.f, 1.f, 1.f, 4.f, 4.f, 0.25f, 0.25f, 0.f, + 0.f, 1.f, -1.f, 8.f, -8.f, 0.125f, -0.125f, 0.f, + 0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f, + 0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f] + + inp is pre-loaded into xij registers, + out will be stored in zij, where (0<=i<=7 for x, 0<=i<=5 for z), 0<=j<=1. + + After the inverse transform is done, we add bias, + optionally add results from the earlier tensors (by-pass), + optionally apply activation function and then + store the final results. + + That is, after both forward and then inverse transformation, + we get non-transposed result. + Of course, for the correct work of Winograd-based convolution, + the Winograd-transformed weights should also be transposed. + init_conv() (see OpConv.fx) takes care of that. +*/ +void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, + float* bpptr, int bpstep, float* outptr, int outstep, + float bias, float minval, float maxval, bool ifMinMaxAct) +{ + CV_Assert(CONV_WINO_IBLOCK == 3 && CONV_WINO_KBLOCK == 4 && CONV_WINO_ATOM_F32 == 4); + v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4); + v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4); + v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4); + v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4); + v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4); + v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4); + v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4); + v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4); + v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51; + + { + v_float32x4 s12_0, s12_1, s34_0, s34_1, s56_0, s56_1; + s12_0 = x10 + x20; s12_1 = x11 + x21; + s34_0 = x30 + x40; s34_1 = x31 + x41; + s56_0 = x50 + x60; s56_1 = x51 + x61; + + v_float32x4 y00 = x00 + s12_0 + s34_0 + s56_0; + v_float32x4 y01 = x01 + s12_1 + s34_1 + s56_1; + + v_float32x4 a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f); + v_float32x4 y20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + v_float32x4 y21 = v_fma(s56_1, a0 ,v_fma(s34_1, a1, s12_1) ); + + a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f); + v_float32x4 y40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + v_float32x4 y41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + s12_0 = x10 - x20; s12_1 = x11 - x21; + s34_0 = x30 - x40; s34_1 = x31 - x41; + s56_0 = x50 - x60; s56_1 = x51 - x61; + + a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.f); + v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, x70 + s12_0)); + v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, x71 + s12_1)); + + a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.f); + v_float32x4 y10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + v_float32x4 y11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.f); + v_float32x4 y30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + v_float32x4 y31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + v_float32x4 y60 = v_setall_f32(0.f), y61 = y60, y70 = y60, y71 = y60; + + /* transpose 8x8 matrix in-place with some renumeration of the elements: */ + /* Y: */ + /* y00 y01 */ + /* y10 y11 */ + /* ... 
*/ + /* y50 y51 */ + /* 0 0 */ + /* 0 0 */ + /* Y': */ + /* y00 y40 */ + /* y10 y50 */ + /* y20 y60 */ + /* y30 y70 */ + /* y01 y41 */ + /* y11 y51 */ + /* y21 y61 */ + /* y31 y71 */ + /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */ + + v_transpose4x4(y00, y10, y20, y30, y00, y10, y20, y30); + v_transpose4x4(y01, y11, y21, y31, y01, y11, y21, y31); + v_transpose4x4(y40, y50, y60, y70, y40, y50, y60, y70); + v_transpose4x4(y41, y51, y61, y71, y41, y51, y61, y71); + + s12_0 = y10 + y20; s12_1 = y50 + y60; + s34_0 = y30 + y01; s34_1 = y70 + y41; + s56_0 = y11 + y21; s56_1 = y51 + y61; + + z00 = y00 + s12_0 + s34_0 + s56_0; + z01 = y40 + s12_1 + s34_1 + s56_1; + + a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f); + z20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + z21 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f); + z40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + z41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + s12_0 = y10 - y20; s12_1 = y50 - y60; + s34_0 = y30 - y01; s34_1 = y70 - y41; + s56_0 = y11 - y21; s56_1 = y51 - y61; + + a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.0f); + z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, y31 + s12_0)); + z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, y71 + s12_1)); + + a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.0f); + z10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + z11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.0f); + z30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); + z31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); + + v_float32x4 vbias = v_setall_f32(bias); + z00 += vbias; + z01 += vbias; + z10 += vbias; + z11 += vbias; + z20 += vbias; + z21 += vbias; + z30 += vbias; + z31 += vbias; + z40 += vbias; + z41 += vbias; + z50 += vbias; + z51 += vbias; + } + + if (bpptr) + { + z00 += v_load(bpptr); + z01 += v_load_low(bpptr + 4); + z10 += v_load(bpptr + bpstep); + z11 += v_load_low(bpptr + bpstep + 4); + z20 += v_load(bpptr + bpstep*2); + z21 += v_load_low(bpptr + bpstep*2 + 4); + z30 += v_load(bpptr + bpstep*3); + z31 += v_load_low(bpptr + bpstep*3 + 4); + z40 += v_load(bpptr + bpstep*4); + z41 += v_load_low(bpptr + bpstep*4 + 4); + z50 += v_load(bpptr + bpstep*5); + z51 += v_load_low(bpptr + bpstep*5 + 4); + } + + if (ifMinMaxAct) + { + v_float32x4 vmax = v_setall_f32(maxval); + v_float32x4 vmin = v_setall_f32(minval); + + z00 = v_min(v_max(z00, vmin), vmax); + z01 = v_min(v_max(z01, vmin), vmax); + z10 = v_min(v_max(z10, vmin), vmax); + z11 = v_min(v_max(z11, vmin), vmax); + z20 = v_min(v_max(z20, vmin), vmax); + z21 = v_min(v_max(z21, vmin), vmax); + z30 = v_min(v_max(z30, vmin), vmax); + z31 = v_min(v_max(z31, vmin), vmax); + z40 = v_min(v_max(z40, vmin), vmax); + z41 = v_min(v_max(z41, vmin), vmax); + z50 = v_min(v_max(z50, vmin), vmax); + z51 = v_min(v_max(z51, vmin), vmax); + } + + v_store(outptr, z00); + v_store_low(outptr + 4, z01); + v_store(outptr + outstep, z10); + v_store_low(outptr + outstep + 4, z11); + v_store(outptr + outstep*2, z20); + v_store_low(outptr + outstep*2 + 4, z21); + v_store(outptr + outstep*3, z30); + v_store_low(outptr + outstep*3 + 4, z31); + v_store(outptr + outstep*4, z40); + v_store_low(outptr + outstep*4 + 4, z41); + v_store(outptr + outstep*5, z50); + v_store_low(outptr + outstep*5 + 4, z51); +} +#endif + +#else +int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr& conv, + int ntasks, float minval, float maxval, ActivationLayer* activ, bool 
ifMinMaxAct) +{ + return 0; +} +#endif + +}} // namespace cv::dnn diff --git a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp new file mode 100644 index 0000000000..2688c75785 --- /dev/null +++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.simd.hpp @@ -0,0 +1,886 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "opencv2/core/hal/intrin.hpp" + +namespace cv { +namespace dnn { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN + +/* Accumulate */ +void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock, + const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32); + +/*Input transform*/ +void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, + float* outptr, int Cg, const int winoIblock, const int winoAtomF32); + +/*Output transform*/ +void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, + float* bpptr, int bpstep, float* outptr, int outstep, + float bias, float minval, float maxval, bool ifMinMaxAct); + +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX + +#if !CV_FMA3 // AVX workaround +#undef _mm256_fmadd_ps +#define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) +#endif + +void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock, + const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32) +{ + CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF32 == 8); + if (iblock > 3) + { + for (int atom_id = 0; atom_id < winoNatomF32; atom_id++, + outbuf += winoAtomF32) + { + __m256 s00 = _mm256_set1_ps(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00; + __m256 s10 = _mm256_set1_ps(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00; + __m256 s20 = _mm256_set1_ps(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00; + __m256 s30 = _mm256_set1_ps(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00; + for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32, + wptr += winoKblock*winoAtomF32) + { + __m256 w0 = _mm256_load_ps(wptr), w1 = _mm256_load_ps(wptr + 8); + __m256 w2 = _mm256_load_ps(wptr + 16), w3 = _mm256_load_ps(wptr + 24); + __m256 x0, x1; + x0 = _mm256_load_ps(inwptr); + x1 = _mm256_load_ps(inwptr + 8); + s00 = _mm256_fmadd_ps(w0, x0, s00); + s01 = _mm256_fmadd_ps(w0, x1, s01); + s10 = _mm256_fmadd_ps(w1, x0, s10); + s11 = _mm256_fmadd_ps(w1, x1, s11); + s20 = _mm256_fmadd_ps(w2, x0, s20); + s21 = _mm256_fmadd_ps(w2, x1, s21); + s30 = _mm256_fmadd_ps(w3, x0, s30); + s31 = _mm256_fmadd_ps(w3, x1, s31); + x0 = _mm256_load_ps(inwptr + 16); + x1 = _mm256_load_ps(inwptr + 24); + s02 = _mm256_fmadd_ps(w0, x0, s02); + s03 = _mm256_fmadd_ps(w0, x1, s03); + s12 = _mm256_fmadd_ps(w1, x0, s12); + s13 = _mm256_fmadd_ps(w1, x1, s13); + s22 = _mm256_fmadd_ps(w2, x0, s22); + s23 = _mm256_fmadd_ps(w2, x1, s23); + s32 = _mm256_fmadd_ps(w3, x0, s32); + s33 = _mm256_fmadd_ps(w3, x1, s33); + x0 = _mm256_load_ps(inwptr + 32); + x1 = _mm256_load_ps(inwptr + 40); + s04 = _mm256_fmadd_ps(w0, x0, s04); + s05 = _mm256_fmadd_ps(w0, x1, s05); + s14 = _mm256_fmadd_ps(w1, x0, s14); + s15 = _mm256_fmadd_ps(w1, x1, s15); + s24 = _mm256_fmadd_ps(w2, x0, s24); + s25 = _mm256_fmadd_ps(w2, x1, s25); + s34 = _mm256_fmadd_ps(w3, x0, s34); + s35 = _mm256_fmadd_ps(w3, 
x1, s35); + } + + _mm256_store_ps(outbuf, s00); + _mm256_store_ps(outbuf + 1*64, s01); + _mm256_store_ps(outbuf + 2*64, s02); + _mm256_store_ps(outbuf + 3*64, s03); + _mm256_store_ps(outbuf + 4*64, s04); + _mm256_store_ps(outbuf + 5*64, s05); + + _mm256_store_ps(outbuf + 6*64, s10); + _mm256_store_ps(outbuf + 7*64, s11); + _mm256_store_ps(outbuf + 8*64, s12); + _mm256_store_ps(outbuf + 9*64, s13); + _mm256_store_ps(outbuf + 10*64, s14); + _mm256_store_ps(outbuf + 11*64, s15); + + _mm256_store_ps(outbuf + 12*64, s20); + _mm256_store_ps(outbuf + 13*64, s21); + _mm256_store_ps(outbuf + 14*64, s22); + _mm256_store_ps(outbuf + 15*64, s23); + _mm256_store_ps(outbuf + 16*64, s24); + _mm256_store_ps(outbuf + 17*64, s25); + + _mm256_store_ps(outbuf + 18*64, s30); + _mm256_store_ps(outbuf + 19*64, s31); + _mm256_store_ps(outbuf + 20*64, s32); + _mm256_store_ps(outbuf + 21*64, s33); + _mm256_store_ps(outbuf + 22*64, s34); + _mm256_store_ps(outbuf + 23*64, s35); + } + } + else + { + for (int atom_id = 0; atom_id < winoNatomF32; atom_id++, + outbuf += winoAtomF32) + { + __m256 s00 = _mm256_set1_ps(0.f), s01 = s00, s02 = s00; + __m256 s10 = _mm256_set1_ps(0.f), s11 = s00, s12 = s00; + __m256 s20 = _mm256_set1_ps(0.f), s21 = s00, s22 = s00; + __m256 s30 = _mm256_set1_ps(0.f), s31 = s00, s32 = s00; + for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32, + wptr += winoKblock*winoAtomF32) { + __m256 w0 = _mm256_load_ps(wptr), w1 = _mm256_load_ps(wptr + 8); + __m256 w2 = _mm256_load_ps(wptr + 16), w3 = _mm256_load_ps(wptr + 24); + __m256 x0, x1, x2; + x0 = _mm256_load_ps(inwptr); + x1 = _mm256_load_ps(inwptr + 8); + x2 = _mm256_load_ps(inwptr + 16); + s00 = _mm256_fmadd_ps(w0, x0, s00); + s01 = _mm256_fmadd_ps(w0, x1, s01); + s02 = _mm256_fmadd_ps(w0, x2, s02); + s10 = _mm256_fmadd_ps(w1, x0, s10); + s11 = _mm256_fmadd_ps(w1, x1, s11); + s12 = _mm256_fmadd_ps(w1, x2, s12); + s20 = _mm256_fmadd_ps(w2, x0, s20); + s21 = _mm256_fmadd_ps(w2, x1, s21); + s22 = _mm256_fmadd_ps(w2, x2, s22); + s30 = _mm256_fmadd_ps(w3, x0, s30); + s31 = _mm256_fmadd_ps(w3, x1, s31); + s32 = _mm256_fmadd_ps(w3, x2, s32); + } + + _mm256_store_ps(outbuf, s00); + _mm256_store_ps(outbuf + 1*64, s01); + _mm256_store_ps(outbuf + 2*64, s02); + _mm256_store_ps(outbuf + 6*64, s10); + _mm256_store_ps(outbuf + 7*64, s11); + _mm256_store_ps(outbuf + 8*64, s12); + _mm256_store_ps(outbuf + 12*64, s20); + _mm256_store_ps(outbuf + 13*64, s21); + _mm256_store_ps(outbuf + 14*64, s22); + _mm256_store_ps(outbuf + 18*64, s30); + _mm256_store_ps(outbuf + 19*64, s31); + _mm256_store_ps(outbuf + 20*64, s32); + } + } + _mm256_zeroupper(); +} +static inline +void transpose8_ps(__m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3, __m256 &row4, __m256 &row5, __m256 &row6, __m256 &row7) +{ + __m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7; + __m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7; + __t0 = _mm256_unpacklo_ps(row0, row1); + __t1 = _mm256_unpackhi_ps(row0, row1); + __t2 = _mm256_unpacklo_ps(row2, row3); + __t3 = _mm256_unpackhi_ps(row2, row3); + __t4 = _mm256_unpacklo_ps(row4, row5); + __t5 = _mm256_unpackhi_ps(row4, row5); + __t6 = _mm256_unpacklo_ps(row6, row7); + __t7 = _mm256_unpackhi_ps(row6, row7); + __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0)); + __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2)); + __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0)); + __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2)); + __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0)); + __tt5 = 
_mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2)); + __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0)); + __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2)); + row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); + row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); + row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); + row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); + row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); + row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); + row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); + row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); +} + +/*Input transform*/ +void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, + float* outptr, int Cg, const int winoIblock, const int winoAtomF32) +{ + __m256 x00 = _mm256_loadu_ps(inptr); + __m256 x10 = _mm256_loadu_ps(inptr + inpstep); + __m256 x20 = _mm256_loadu_ps(inptr + inpstep*2); + __m256 x30 = _mm256_loadu_ps(inptr + inpstep*3); + __m256 x40 = _mm256_loadu_ps(inptr + inpstep*4); + __m256 x50 = _mm256_loadu_ps(inptr + inpstep*5); + __m256 x60 = _mm256_loadu_ps(inptr + inpstep*6); + __m256 x70 = _mm256_loadu_ps(inptr + inpstep*7); + + __m256 z00, z10, z20, z30, z40, z50, z60, z70; + + { + /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */ + /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */ + __m256 q5_25 = _mm256_set1_ps(5.25f), t00, t10; + t00 = _mm256_sub_ps(x40, x20); + t10 = _mm256_sub_ps(x30, x50); + + __m256 y00 = _mm256_fmadd_ps(t00, q5_25, _mm256_sub_ps(x00, x60)); + __m256 y70 = _mm256_fmadd_ps(t10, q5_25, _mm256_sub_ps(x70, x10)); + + /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */ + /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */ + __m256 qm4_25 = _mm256_set1_ps(-4.25f); + t00 = _mm256_fmadd_ps(x30, qm4_25, _mm256_add_ps(x10, x50)); + t10 = _mm256_fmadd_ps(x40, qm4_25, _mm256_add_ps(x20, x60)); + + __m256 y10 = _mm256_add_ps(t00, t10); + __m256 y20 = _mm256_sub_ps(t10, t00); + + /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */ + /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */ + __m256 q0_5 = _mm256_set1_ps(0.5f), q0_25 = _mm256_set1_ps(0.25f); + __m256 qm2_5 = _mm256_set1_ps(-2.5f), qm1_25 = _mm256_set1_ps(-1.25f); + t00 = _mm256_fmadd_ps(x10, q0_5, _mm256_add_ps(x50, x50)); + t10 = _mm256_fmadd_ps(x20, q0_25, x60); + t00 = _mm256_fmadd_ps(x30, qm2_5, t00); + t10 = _mm256_fmadd_ps(x40, qm1_25, t10); + + __m256 y30 = _mm256_add_ps(t00, t10); + __m256 y40 = _mm256_sub_ps(t10, t00); + + /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */ + /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */ + __m256 q4 = _mm256_set1_ps(4.f), qm5 = _mm256_set1_ps(-5.f); + t00 = _mm256_fmadd_ps(x50, q0_5, _mm256_add_ps(x10, x10)); + t10 = _mm256_fmadd_ps(x20, q4 , x60); + t00 = _mm256_fmadd_ps(x30, qm2_5, t00); + t10 = _mm256_fmadd_ps(x40, qm5 , t10); + + __m256 y50 = _mm256_add_ps(t00, t10); + __m256 y60 = _mm256_sub_ps(t10, t00); + + /* transpose 8x8 matrix in-place with some renumeration of the elements: */ + transpose8_ps(y00, y10, y20, y30, y40, y50, y60, y70); + + /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */ + /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */ + t00 = _mm256_sub_ps(y40, y20); + t10 = _mm256_sub_ps(y30, y50); + z00 = _mm256_fmadd_ps(t00, q5_25, _mm256_sub_ps(y00, y60)); + z70 = _mm256_fmadd_ps(t10, q5_25, _mm256_sub_ps(y70, y10)); + + /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */ + /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 
0.f]*Y */ + t00 = _mm256_fmadd_ps(y30, qm4_25, _mm256_add_ps(y10, y50)); + t10 = _mm256_fmadd_ps(y40, qm4_25, _mm256_add_ps(y20, y60)); + z10 = _mm256_add_ps(t00, t10); + z20 = _mm256_sub_ps(t10, t00); + + /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */ + /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */ + t00 = _mm256_fmadd_ps(y10, q0_5, _mm256_add_ps(y50, y50)); + t10 = _mm256_fmadd_ps(y20, q0_25, y60); + t00 = _mm256_fmadd_ps(y30, qm2_5, t00); + t10 = _mm256_fmadd_ps(y40, qm1_25, t10); + + z30 = _mm256_add_ps(t00, t10); + z40 = _mm256_sub_ps(t10, t00); + + /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */ + /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */ + t00 = _mm256_fmadd_ps(y50, q0_5, _mm256_add_ps(y10, y10)); + t10 = _mm256_fmadd_ps(y20, q4, y60); + t00 = _mm256_fmadd_ps(y30, qm2_5, t00); + t10 = _mm256_fmadd_ps(y40, qm5, t10); + + z50 = _mm256_add_ps(t00, t10); + z60 = _mm256_sub_ps(t10, t00); + } + + const int outstep = winoIblock*winoAtomF32*Cg; + + _mm256_storeu_ps(outptr, z00); + _mm256_storeu_ps(outptr + outstep, z10); + _mm256_storeu_ps(outptr + outstep*2, z20); + _mm256_storeu_ps(outptr + outstep*3, z30); + _mm256_storeu_ps(outptr + outstep*4, z40); + _mm256_storeu_ps(outptr + outstep*5, z50); + _mm256_storeu_ps(outptr + outstep*6, z60); + _mm256_storeu_ps(outptr + outstep*7, z70); + _mm256_zeroupper(); +} + +#define STORE6_ELE_FROM_16(ptr, z00, lowM, highM) \ + lowM = _mm256_castps256_ps128(z00); \ + highM = _mm256_extractf128_ps(z00, 1); \ + _mm_storeu_ps(ptr, lowM); \ + _mm_storel_epi64((__m128i*)(ptr + 4), _mm_castps_si128(highM)) + +/* Inverse Winograd 8x8 transform: + out = (A'*inp*A)', where + inp is input 8x8 FP32 matrix, + A' is + [1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, + 0.f, 1.f, -1.f, 2.f, -2.f, 0.5f, -0.5f, 0.f, + 0.f, 1.f, 1.f, 4.f, 4.f, 0.25f, 0.25f, 0.f, + 0.f, 1.f, -1.f, 8.f, -8.f, 0.125f, -0.125f, 0.f, + 0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f, + 0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f] +*/ +void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, + float* bpptr, int bpstep, float* outptr, int outstep, + float bias, float minval, float maxval, bool ifMinMaxAct) +{ + + __m256 x00 = _mm256_load_ps(inptr); + __m256 x10 = _mm256_load_ps(inptr + inpstep); + __m256 x20 = _mm256_load_ps(inptr + inpstep*2); + __m256 x30 = _mm256_load_ps(inptr + inpstep*3); + __m256 x40 = _mm256_load_ps(inptr + inpstep*4); + __m256 x50 = _mm256_load_ps(inptr + inpstep*5); + __m256 x60 = _mm256_load_ps(inptr + inpstep*6); + __m256 x70 = _mm256_load_ps(inptr + inpstep*7); + __m256 z00, z10, z20, z30, z40, z50; + + { + __m256 s12_0, s34_0, s56_0; + s12_0 = _mm256_add_ps(x10, x20); + s34_0 = _mm256_add_ps(x30, x40); + s56_0 = _mm256_add_ps(x50, x60); + + __m256 y00 = _mm256_add_ps(x00, _mm256_add_ps(s12_0, _mm256_add_ps(s34_0, s56_0))); + __m256 y20 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.25f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(4.0f), s12_0)); + __m256 y40 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/16), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(16.0f), s12_0)); + + s12_0 = _mm256_sub_ps(x10, x20); + s34_0 = _mm256_sub_ps(x30, x40); + s56_0 = _mm256_sub_ps(x50, x60); + __m256 y50 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/32), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(32.f), _mm256_add_ps(x70, s12_0))); + __m256 y10 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.5f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(2.f), s12_0)); + __m256 y30 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.125f), _mm256_fmadd_ps(s34_0, 
_mm256_set1_ps(8.f), s12_0)); + __m256 y60 = _mm256_set1_ps(0.f), y70 = y60; + + /* transpose 8x8 matrix in-place with some renumeration of the elements: */ + + transpose8_ps(y00, y10, y20, y30, y40, y50, y60, y70); + + s12_0 = _mm256_add_ps(y10, y20); + s34_0 = _mm256_add_ps(y30, y40); + s56_0 = _mm256_add_ps(y50, y60); + + z00 = _mm256_add_ps(y00, _mm256_add_ps(s12_0, _mm256_add_ps(s34_0, s56_0))); + z20 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.25f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(4.0f), s12_0)); + z40 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/16), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(16.0f), s12_0)); + + s12_0 = _mm256_sub_ps(y10, y20); + s34_0 = _mm256_sub_ps(y30, y40); + s56_0 = _mm256_sub_ps(y50, y60); + + z50 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/32), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(32.0f), _mm256_add_ps(y70, s12_0))); + z10 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.5f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(2.0f), s12_0)); + z30 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.125f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(8.0f), s12_0)); + + __m256 vbias = _mm256_set1_ps(bias); + z00 = _mm256_add_ps(vbias, z00); + z10 = _mm256_add_ps(vbias, z10); + z20 = _mm256_add_ps(vbias, z20); + z30 = _mm256_add_ps(vbias, z30); + z40 = _mm256_add_ps(vbias, z40); + z50 = _mm256_add_ps(vbias, z50); + } + + if (bpptr) + { + z00 = _mm256_add_ps(z00, _mm256_loadu_ps(bpptr)); + z10 = _mm256_add_ps(z10, _mm256_loadu_ps(bpptr + bpstep)); + z20 = _mm256_add_ps(z20, _mm256_loadu_ps(bpptr + bpstep*2)); + z30 = _mm256_add_ps(z30, _mm256_loadu_ps(bpptr + bpstep*3)); + z40 = _mm256_add_ps(z40, _mm256_loadu_ps(bpptr + bpstep*4)); + z50 = _mm256_add_ps(z50, _mm256_loadu_ps(bpptr + bpstep*5)); + } + + if (ifMinMaxAct) + { + __m256 vmax = _mm256_set1_ps(maxval); + __m256 vmin = _mm256_set1_ps(minval); + + z00 = _mm256_min_ps(_mm256_max_ps(z00, vmin), vmax); + z10 = _mm256_min_ps(_mm256_max_ps(z10, vmin), vmax); + z20 = _mm256_min_ps(_mm256_max_ps(z20, vmin), vmax); + z30 = _mm256_min_ps(_mm256_max_ps(z30, vmin), vmax); + z40 = _mm256_min_ps(_mm256_max_ps(z40, vmin), vmax); + z50 = _mm256_min_ps(_mm256_max_ps(z50, vmin), vmax); + } + + __m128 lowM, highM; + STORE6_ELE_FROM_16(outptr, z00, lowM, highM); + STORE6_ELE_FROM_16(outptr + outstep, z10, lowM, highM); + STORE6_ELE_FROM_16(outptr + outstep * 2, z20, lowM, highM); + STORE6_ELE_FROM_16(outptr + outstep * 3, z30, lowM, highM); + STORE6_ELE_FROM_16(outptr + outstep * 4, z40, lowM, highM); + STORE6_ELE_FROM_16(outptr + outstep * 5, z50, lowM, highM); + _mm256_zeroupper(); +} +#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +CV_CPU_OPTIMIZATION_NAMESPACE_END + +// NEON code work around. 
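// The AVX kernel above and the NEON kernel below implement the same inverse transform
// out = (A'*inp*A)', with the 6x8 matrix A' quoted in the comments. As a sanity check it can
// be compared against a plain scalar reference such as the sketch below; ref_AtXA_8x8 is a
// hypothetical helper (not part of this patch) and computes only the bare 8x8 -> 6x6
// transform, leaving out the bias, by-pass add and min/max activation that the real kernels
// fuse in.
static void ref_AtXA_8x8(const float X[8][8], float out[6][6])
{
    // A' (6x8) exactly as listed above winofunc_AtXA_8x8_f32.
    static const float At[6][8] = {
        {1.f, 1.f,  1.f,  1.f,   1.f,  1.f,      1.f,     0.f},
        {0.f, 1.f, -1.f,  2.f,  -2.f,  0.5f,    -0.5f,    0.f},
        {0.f, 1.f,  1.f,  4.f,   4.f,  0.25f,    0.25f,   0.f},
        {0.f, 1.f, -1.f,  8.f,  -8.f,  0.125f,  -0.125f,  0.f},
        {0.f, 1.f,  1.f, 16.f,  16.f,  1.f/16,   1.f/16,  0.f},
        {0.f, 1.f, -1.f, 32.f, -32.f,  1.f/32,  -1.f/32,  1.f}
    };
    float M[6][8];                        // M = A'*X
    for (int i = 0; i < 6; i++)
        for (int k = 0; k < 8; k++)
        {
            float s = 0.f;
            for (int j = 0; j < 8; j++)
                s += At[i][j] * X[j][k];
            M[i][k] = s;
        }
    for (int i = 0; i < 6; i++)           // out = (M*A)', where A = (A')^T
        for (int k = 0; k < 6; k++)
        {
            float s = 0.f;
            for (int j = 0; j < 8; j++)
                s += M[i][j] * At[k][j];  // A[j][k] == At[k][j]
            out[k][i] = s;                // store transposed, per the convention above
        }
}
// Feeding the same 8x8 tile to winofunc_AtXA_8x8_f32 with zero bias, bpptr == nullptr and
// ifMinMaxAct == false should reproduce these 6x6 values up to FMA rounding differences.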
+namespace opt_NEON +{ + +#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON && CV_NEON_AARCH64 +/* Accumulate */ +void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock, + const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32); + +/*Input transform*/ +void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, + float* outptr, int Cg, const int winoIblock, const int winoAtomF32); + +/*Output transform*/ +void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, + float* bpptr, int bpstep, float* outptr, int outstep, + float bias, float minval, float maxval, bool ifMinMaxAct); + +void winofunc_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock, + const int winoIblock, const int winoKblock, const int winoAtomF32, const int winoNatomF32) +{ + CV_Assert(winoIblock == 6 && winoKblock == 4 && winoAtomF32 == 4); + if (iblock > 3) + { + for (int atom_id = 0; atom_id < winoNatomF32; atom_id++, + outbuf += winoAtomF32) + { + float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00; + float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00; + float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00; + float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00; + for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32, + wptr += winoKblock*winoAtomF32) { + float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4); + float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12); + float32x4_t x0, x1; + x0 = vld1q_f32(inwptr); + x1 = vld1q_f32(inwptr + 4); + s00 = vfmaq_f32(s00, w0, x0); + s01 = vfmaq_f32(s01, w0, x1); + s10 = vfmaq_f32(s10, w1, x0); + s11 = vfmaq_f32(s11, w1, x1); + s20 = vfmaq_f32(s20, w2, x0); + s21 = vfmaq_f32(s21, w2, x1); + s30 = vfmaq_f32(s30, w3, x0); + s31 = vfmaq_f32(s31, w3, x1); + x0 = vld1q_f32(inwptr + 8); + x1 = vld1q_f32(inwptr + 12); + s02 = vfmaq_f32(s02, w0, x0); + s03 = vfmaq_f32(s03, w0, x1); + s12 = vfmaq_f32(s12, w1, x0); + s13 = vfmaq_f32(s13, w1, x1); + s22 = vfmaq_f32(s22, w2, x0); + s23 = vfmaq_f32(s23, w2, x1); + s32 = vfmaq_f32(s32, w3, x0); + s33 = vfmaq_f32(s33, w3, x1); + x0 = vld1q_f32(inwptr + 16); + x1 = vld1q_f32(inwptr + 20); + s04 = vfmaq_f32(s04, w0, x0); + s05 = vfmaq_f32(s05, w0, x1); + s14 = vfmaq_f32(s14, w1, x0); + s15 = vfmaq_f32(s15, w1, x1); + s24 = vfmaq_f32(s24, w2, x0); + s25 = vfmaq_f32(s25, w2, x1); + s34 = vfmaq_f32(s34, w3, x0); + s35 = vfmaq_f32(s35, w3, x1); + } + + vst1q_f32(outbuf, s00); + vst1q_f32(outbuf + 1*64, s01); + vst1q_f32(outbuf + 2*64, s02); + vst1q_f32(outbuf + 3*64, s03); + vst1q_f32(outbuf + 4*64, s04); + vst1q_f32(outbuf + 5*64, s05); + + vst1q_f32(outbuf + 6*64, s10); + vst1q_f32(outbuf + 7*64, s11); + vst1q_f32(outbuf + 8*64, s12); + vst1q_f32(outbuf + 9*64, s13); + vst1q_f32(outbuf + 10*64, s14); + vst1q_f32(outbuf + 11*64, s15); + + vst1q_f32(outbuf + 12*64, s20); + vst1q_f32(outbuf + 13*64, s21); + vst1q_f32(outbuf + 14*64, s22); + vst1q_f32(outbuf + 15*64, s23); + vst1q_f32(outbuf + 16*64, s24); + vst1q_f32(outbuf + 17*64, s25); + + vst1q_f32(outbuf + 18*64, s30); + vst1q_f32(outbuf + 19*64, s31); + vst1q_f32(outbuf + 20*64, s32); + vst1q_f32(outbuf + 21*64, s33); + vst1q_f32(outbuf + 22*64, s34); + vst1q_f32(outbuf + 23*64, s35); + } + } + else + { + for (int atom_id = 0; atom_id < winoNatomF32; atom_id++, + outbuf += winoAtomF32) + { + 
float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00; + float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00; + float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00; + float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00; + for (int c = 0; c < Cg; c++, inwptr += winoIblock*winoAtomF32, + wptr += winoKblock*winoAtomF32) { + float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4); + float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12); + float32x4_t x0, x1, x2; + x0 = vld1q_f32(inwptr); + x1 = vld1q_f32(inwptr + 4); + x2 = vld1q_f32(inwptr + 8); + s00 = vfmaq_f32(s00, w0, x0); + s01 = vfmaq_f32(s01, w0, x1); + s02 = vfmaq_f32(s02, w0, x2); + s10 = vfmaq_f32(s10, w1, x0); + s11 = vfmaq_f32(s11, w1, x1); + s12 = vfmaq_f32(s12, w1, x2); + s20 = vfmaq_f32(s20, w2, x0); + s21 = vfmaq_f32(s21, w2, x1); + s22 = vfmaq_f32(s22, w2, x2); + s30 = vfmaq_f32(s30, w3, x0); + s31 = vfmaq_f32(s31, w3, x1); + s32 = vfmaq_f32(s32, w3, x2); + } + + vst1q_f32(outbuf, s00); + vst1q_f32(outbuf + 1*64, s01); + vst1q_f32(outbuf + 2*64, s02); + vst1q_f32(outbuf + 6*64, s10); + vst1q_f32(outbuf + 7*64, s11); + vst1q_f32(outbuf + 8*64, s12); + vst1q_f32(outbuf + 12*64, s20); + vst1q_f32(outbuf + 13*64, s21); + vst1q_f32(outbuf + 14*64, s22); + vst1q_f32(outbuf + 18*64, s30); + vst1q_f32(outbuf + 19*64, s31); + vst1q_f32(outbuf + 20*64, s32); + } + } +} + +#define T4x4(a, b, c, d, tr0, tr1) \ + tr0 = vtrnq_f32(a, b); \ + tr1 = vtrnq_f32(c, d); \ + a = vcombine_f32(vget_low_f32(tr0.val[0]), vget_low_f32(tr1.val[0])); \ + b = vcombine_f32(vget_low_f32(tr0.val[1]), vget_low_f32(tr1.val[1])); \ + c = vcombine_f32(vget_high_f32(tr0.val[0]), vget_high_f32(tr1.val[0])); \ + d = vcombine_f32(vget_high_f32(tr0.val[1]), vget_high_f32(tr1.val[1])) + +/*Input transform*/ +void winofunc_BtXB_8x8_f32(const float* inptr, int inpstep, + float* outptr, int Cg, const int winoIblock, const int winoAtomF32) +{ + float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4); + float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4); + float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4); + float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4); + float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4); + float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4); + float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4); + float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4); + + float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71; + + { + /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */ + /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */ + float32x4_t q5_25 = vdupq_n_f32(5.25f), t00, t01, t10, t11; + t00 = vsubq_f32(x40, x20); + t01 = vsubq_f32(x41, x21); + t10 = vsubq_f32(x30, x50); + t11 = vsubq_f32(x31, x51); + float32x4_t y00 = vfmaq_f32(vsubq_f32(x00, x60), t00, q5_25); + float32x4_t y01 = vfmaq_f32(vsubq_f32(x01, x61), t01, q5_25); + float32x4_t y70 = vfmaq_f32(vsubq_f32(x70, x10), t10, q5_25); + float32x4_t y71 = vfmaq_f32(vsubq_f32(x71, x11), t11, q5_25); + + /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */ + /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */ + float32x4_t qm4_25 = vdupq_n_f32(-4.25f); + t00 = vfmaq_f32(vaddq_f32(x10, x50), x30, qm4_25); + t01 = vfmaq_f32(vaddq_f32(x11, x51), x31, 
qm4_25); + t10 = vfmaq_f32(vaddq_f32(x20, x60), x40, qm4_25); + t11 = vfmaq_f32(vaddq_f32(x21, x61), x41, qm4_25); + + float32x4_t y10 = vaddq_f32(t00, t10), y11 = vaddq_f32(t01, t11); + float32x4_t y20 = vsubq_f32(t10, t00), y21 = vsubq_f32(t11, t01); + + /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */ + /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */ + float32x4_t q0_5 = vdupq_n_f32(0.5f), q0_25 = vdupq_n_f32(0.25f); + float32x4_t qm2_5 = vdupq_n_f32(-2.5f), qm1_25 = vdupq_n_f32(-1.25f); + t00 = vfmaq_f32(vaddq_f32(x50, x50), x10, q0_5); + t01 = vfmaq_f32(vaddq_f32(x51, x51), x11, q0_5); + t10 = vfmaq_f32(x60, x20, q0_25); + t11 = vfmaq_f32(x61, x21, q0_25); + t00 = vfmaq_f32(t00, x30, qm2_5); + t01 = vfmaq_f32(t01, x31, qm2_5); + t10 = vfmaq_f32(t10, x40, qm1_25); + t11 = vfmaq_f32(t11, x41, qm1_25); + + float32x4_t y30 = vaddq_f32(t00, t10), y31 = vaddq_f32(t01, t11); + float32x4_t y40 = vsubq_f32(t10, t00), y41 = vsubq_f32(t11, t01); + + /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */ + /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */ + float32x4_t q4 = vdupq_n_f32(4.f), qm5 = vdupq_n_f32(-5.f); + t00 = vfmaq_f32(vaddq_f32(x10, x10), x50, q0_5); + t01 = vfmaq_f32(vaddq_f32(x11, x11), x51, q0_5); + t10 = vfmaq_f32(x60, x20, q4); + t11 = vfmaq_f32(x61, x21, q4); + t00 = vfmaq_f32(t00, x30, qm2_5); + t01 = vfmaq_f32(t01, x31, qm2_5); + t10 = vfmaq_f32(t10, x40, qm5); + t11 = vfmaq_f32(t11, x41, qm5); + + float32x4_t y50 = vaddq_f32(t00, t10), y51 = vaddq_f32(t01, t11); + float32x4_t y60 = vsubq_f32(t10, t00), y61 = vsubq_f32(t11, t01); + + /* transpose 8x8 matrix in-place with some renumeration of the elements: */ + /* Y: */ + /* y00 y01 */ + /* y10 y11 */ + /* ... */ + /* y70 y71 */ + /* Y': */ + /* y00 y40 */ + /* y10 y50 */ + /* y20 y60 */ + /* y30 y70 */ + /* y01 y41 */ + /* y11 y51 */ + /* y21 y61 */ + /* y31 y71 */ + /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */ + float32x4x2_t tr0, tr1; + + T4x4(y00, y10, y20, y30, tr0, tr1); + T4x4(y01, y11, y21, y31, tr0, tr1); + T4x4(y40, y50, y60, y70, tr0, tr1); + T4x4(y41, y51, y61, y71, tr0, tr1); + + /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */ + /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */ + t00 = vsubq_f32(y01, y20); + t01 = vsubq_f32(y41, y60); + t10 = vsubq_f32(y30, y11); + t11 = vsubq_f32(y70, y51); + z00 = vfmaq_f32(vsubq_f32(y00, y21), t00, q5_25); + z01 = vfmaq_f32(vsubq_f32(y40, y61), t01, q5_25); + z70 = vfmaq_f32(vsubq_f32(y31, y10), t10, q5_25); + z71 = vfmaq_f32(vsubq_f32(y71, y50), t11, q5_25); + + /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */ + /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */ + t00 = vfmaq_f32(vaddq_f32(y10, y11), y30, qm4_25); + t01 = vfmaq_f32(vaddq_f32(y50, y51), y70, qm4_25); + t10 = vfmaq_f32(vaddq_f32(y20, y21), y01, qm4_25); + t11 = vfmaq_f32(vaddq_f32(y60, y61), y41, qm4_25); + + z10 = vaddq_f32(t00, t10); z11 = vaddq_f32(t01, t11); + z20 = vsubq_f32(t10, t00); z21 = vsubq_f32(t11, t01); + + /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */ + /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */ + t00 = vfmaq_f32(vaddq_f32(y11, y11), y10, q0_5); + t01 = vfmaq_f32(vaddq_f32(y51, y51), y50, q0_5); + t10 = vfmaq_f32(y21, y20, q0_25); + t11 = vfmaq_f32(y61, y60, q0_25); + t00 = vfmaq_f32(t00, y30, qm2_5); + t01 = vfmaq_f32(t01, y70, qm2_5); + t10 = vfmaq_f32(t10, y01, qm1_25); + t11 = vfmaq_f32(t11, y41, qm1_25); + + z30 = 
vaddq_f32(t00, t10); z31 = vaddq_f32(t01, t11); + z40 = vsubq_f32(t10, t00); z41 = vsubq_f32(t11, t01); + + /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */ + /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */ + t00 = vfmaq_f32(vaddq_f32(y10, y10), y11, q0_5); + t01 = vfmaq_f32(vaddq_f32(y50, y50), y51, q0_5); + t10 = vfmaq_f32(y21, y20, q4); + t11 = vfmaq_f32(y61, y60, q4); + t00 = vfmaq_f32(t00, y30, qm2_5); + t01 = vfmaq_f32(t01, y70, qm2_5); + t10 = vfmaq_f32(t10, y01, qm5); + t11 = vfmaq_f32(t11, y41, qm5); + + z50 = vaddq_f32(t00, t10); z51 = vaddq_f32(t01, t11); + z60 = vsubq_f32(t10, t00); z61 = vsubq_f32(t11, t01); + } + + const int outstep = winoIblock*winoAtomF32*Cg; + + vst1q_f32(outptr, z00); + vst1q_f32(outptr + outstep, z01); + vst1q_f32(outptr + outstep*2, z10); + vst1q_f32(outptr + outstep*3, z11); + vst1q_f32(outptr + outstep*4, z20); + vst1q_f32(outptr + outstep*5, z21); + vst1q_f32(outptr + outstep*6, z30); + vst1q_f32(outptr + outstep*7, z31); + vst1q_f32(outptr + outstep*8, z40); + vst1q_f32(outptr + outstep*9, z41); + vst1q_f32(outptr + outstep*10, z50); + vst1q_f32(outptr + outstep*11, z51); + vst1q_f32(outptr + outstep*12, z60); + vst1q_f32(outptr + outstep*13, z61); + vst1q_f32(outptr + outstep*14, z70); + vst1q_f32(outptr + outstep*15, z71); +} + +/*Output transform*/ +void winofunc_AtXA_8x8_f32(const float* inptr, int inpstep, + float* bpptr, int bpstep, float* outptr, int outstep, + float bias, float minval, float maxval, bool ifMinMaxAct) +{ + float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4); + float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4); + float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4); + float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4); + float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4); + float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4); + float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4); + float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4); + float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51; + + { + float32x4_t s12_0, s12_1, s34_0, s34_1, s56_0, s56_1; + s12_0 = vaddq_f32(x10, x20); s12_1 = vaddq_f32(x11, x21); + s34_0 = vaddq_f32(x30, x40); s34_1 = vaddq_f32(x31, x41); + s56_0 = vaddq_f32(x50, x60); s56_1 = vaddq_f32(x51, x61); + + float32x4_t y00 = vaddq_f32(vaddq_f32(vaddq_f32(x00, s12_0), s34_0), s56_0); + float32x4_t y01 = vaddq_f32(vaddq_f32(vaddq_f32(x01, s12_1), s34_1), s56_1); + float32x4_t y20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f); + float32x4_t y21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f); + float32x4_t y40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16); + float32x4_t y41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16); + + s12_0 = vsubq_f32(x10, x20); s12_1 = vsubq_f32(x11, x21); + s34_0 = vsubq_f32(x30, x40); s34_1 = vsubq_f32(x31, x41); + s56_0 = vsubq_f32(x50, x60); s56_1 = vsubq_f32(x51, x61); + + float32x4_t y50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x70, s12_0), + s34_0, 32.f), s56_0, 1.f/32); + float32x4_t y51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x71, s12_1), + s34_1, 32.f), s56_1, 1.f/32); + float32x4_t y10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f); + float32x4_t y11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f); + 
float32x4_t y30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f); + float32x4_t y31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f); + float32x4_t y60 = vdupq_n_f32(0.f), y61 = y60, y70 = y60, y71 = y60; + + /* transpose 8x8 matrix in-place with some renumeration of the elements: */ + /* Y: */ + /* y00 y01 */ + /* y10 y11 */ + /* ... */ + /* y50 y51 */ + /* 0 0 */ + /* 0 0 */ + /* Y': */ + /* y00 y40 */ + /* y10 y50 */ + /* y20 y60 */ + /* y30 y70 */ + /* y01 y41 */ + /* y11 y51 */ + /* y21 y61 */ + /* y31 y71 */ + /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */ + float32x4x2_t tr0, tr1; + + T4x4(y00, y10, y20, y30, tr0, tr1); + T4x4(y01, y11, y21, y31, tr0, tr1); + T4x4(y40, y50, y60, y70, tr0, tr1); + T4x4(y41, y51, y61, y71, tr0, tr1); + + s12_0 = vaddq_f32(y10, y20); s12_1 = vaddq_f32(y50, y60); + s34_0 = vaddq_f32(y30, y01); s34_1 = vaddq_f32(y70, y41); + s56_0 = vaddq_f32(y11, y21); s56_1 = vaddq_f32(y51, y61); + + z00 = vaddq_f32(vaddq_f32(vaddq_f32(y00, s12_0), s34_0), s56_0); + z01 = vaddq_f32(vaddq_f32(vaddq_f32(y40, s12_1), s34_1), s56_1); + z20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f); + z21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f); + z40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16); + z41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16); + + s12_0 = vsubq_f32(y10, y20); s12_1 = vsubq_f32(y50, y60); + s34_0 = vsubq_f32(y30, y01); s34_1 = vsubq_f32(y70, y41); + s56_0 = vsubq_f32(y11, y21); s56_1 = vsubq_f32(y51, y61); + + z50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y31, s12_0), + s34_0, 32.f), s56_0, 1.f/32); + z51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y71, s12_1), + s34_1, 32.f), s56_1, 1.f/32); + z10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f); + z11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f); + z30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f); + z31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f); + float32x4_t vbias = vdupq_n_f32(bias); + + z00 = vaddq_f32(z00, vbias); + z01 = vaddq_f32(z01, vbias); + z10 = vaddq_f32(z10, vbias); + z11 = vaddq_f32(z11, vbias); + z20 = vaddq_f32(z20, vbias); + z21 = vaddq_f32(z21, vbias); + z30 = vaddq_f32(z30, vbias); + z31 = vaddq_f32(z31, vbias); + z40 = vaddq_f32(z40, vbias); + z41 = vaddq_f32(z41, vbias); + z50 = vaddq_f32(z50, vbias); + z51 = vaddq_f32(z51, vbias); + } + + if (bpptr) + { + float32x2_t zhalf = vdup_n_f32(0.f); + z00 = vaddq_f32(z00, vld1q_f32(bpptr)); + z01 = vaddq_f32(z01, vcombine_f32(vld1_f32(bpptr + 4), zhalf)); + z10 = vaddq_f32(z10, vld1q_f32(bpptr + bpstep)); + z11 = vaddq_f32(z11, vcombine_f32(vld1_f32(bpptr + bpstep + 4), zhalf)); + z20 = vaddq_f32(z20, vld1q_f32(bpptr + bpstep*2)); + z21 = vaddq_f32(z21, vcombine_f32(vld1_f32(bpptr + bpstep*2 + 4), zhalf)); + z30 = vaddq_f32(z30, vld1q_f32(bpptr + bpstep*3)); + z31 = vaddq_f32(z31, vcombine_f32(vld1_f32(bpptr + bpstep*3 + 4), zhalf)); + z40 = vaddq_f32(z40, vld1q_f32(bpptr + bpstep*4)); + z41 = vaddq_f32(z41, vcombine_f32(vld1_f32(bpptr + bpstep*4 + 4), zhalf)); + z50 = vaddq_f32(z50, vld1q_f32(bpptr + bpstep*5)); + z51 = vaddq_f32(z51, vcombine_f32(vld1_f32(bpptr + bpstep*5 + 4), zhalf)); + } + + if (ifMinMaxAct) + { + float32x4_t vmax = vdupq_n_f32(maxval); + float32x4_t vmin = vdupq_n_f32(minval); + + z00 = vminq_f32(vmaxq_f32(z00, vmin), vmax); + z01 = vminq_f32(vmaxq_f32(z01, vmin), vmax); + z10 = vminq_f32(vmaxq_f32(z10, vmin), vmax); + z11 = vminq_f32(vmaxq_f32(z11, vmin), 
vmax); + z20 = vminq_f32(vmaxq_f32(z20, vmin), vmax); + z21 = vminq_f32(vmaxq_f32(z21, vmin), vmax); + z30 = vminq_f32(vmaxq_f32(z30, vmin), vmax); + z31 = vminq_f32(vmaxq_f32(z31, vmin), vmax); + z40 = vminq_f32(vmaxq_f32(z40, vmin), vmax); + z41 = vminq_f32(vmaxq_f32(z41, vmin), vmax); + z50 = vminq_f32(vmaxq_f32(z50, vmin), vmax); + z51 = vminq_f32(vmaxq_f32(z51, vmin), vmax); + } + + vst1q_f32(outptr, z00); + vst1_f32(outptr + 4, vget_low_f32(z01)); + vst1q_f32(outptr + outstep, z10); + vst1_f32(outptr + outstep + 4, vget_low_f32(z11)); + vst1q_f32(outptr + outstep*2, z20); + vst1_f32(outptr + outstep*2 + 4, vget_low_f32(z21)); + vst1q_f32(outptr + outstep*3, z30); + vst1_f32(outptr + outstep*3 + 4, vget_low_f32(z31)); + vst1q_f32(outptr + outstep*4, z40); + vst1_f32(outptr + outstep*4 + 4, vget_low_f32(z41)); + vst1q_f32(outptr + outstep*5, z50); + vst1_f32(outptr + outstep*5 + 4, vget_low_f32(z51)); +} + +#endif +} + +}} // namespace diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp b/modules/dnn/src/layers/cpu_kernels/convolution.cpp similarity index 73% rename from modules/dnn/src/layers/fast_convolution/fast_convolution.cpp rename to modules/dnn/src/layers/cpu_kernels/convolution.cpp index 51abf8facc..0f0da11ec7 100644 --- a/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp +++ b/modules/dnn/src/layers/cpu_kernels/convolution.cpp @@ -10,11 +10,19 @@ */ #include "../../precomp.hpp" -#include "fast_convolution.hpp" -#include "fast_convolution.simd.hpp" +#include "convolution.hpp" + +#include "conv_block.simd.hpp" +#include "layers/cpu_kernels/conv_block.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content namespace cv { namespace dnn { enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment. + +void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen, + const int convMR, const int convNR); +void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR); + Ptr initFastConv( InputArray _weightsMat, float* srcBias, @@ -94,21 +102,15 @@ Ptr initFastConv( } } - conv->conv_type = ifRunDepthWise && conv_dim != CONV_3D ? _FX_CONV_TYPE_DEPTHWISE : + conv->conv_type = ifRunDepthWise && conv_dim != CONV_3D ? CONV_TYPE_DEPTHWISE : useWinograd && (conv_dim == CONV_2D && (conv->useSIMD128 || conv->useAVX2 || conv->useNEON) && Hk == 3 && Wk == 3 && dilation_h == 1 && dilation_w == 1 && stride_h == 1 && stride_w == 1) ? - _FX_CONV_TYPE_WINOGRAD3X3 : - (ifRunDepthWiseRemain ? _FX_CONV_TYPE_DEPTHWISE_REMAIN : _FX_CONV_TYPE_GENERIC); + CONV_TYPE_WINOGRAD3X3 : + (ifRunDepthWiseRemain ? CONV_TYPE_DEPTHWISE_REMAIN : CONV_TYPE_GENERIC); #if !(CV_NEON || CV_SIMD128 || CV_TRY_AVX2) - if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // Disabel Winograd when CV_NEON, CV_SIMD128 and CV_TRY_AVX2 are not available. - conv->conv_type = _FX_CONV_TYPE_GENERIC; -#endif - -#if CV_TRY_AVX2 - // Disabel Winograd when CV_TRY_AVX2 is true, but conv->useAVX2 is false. - if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3 && !conv->useAVX2) - conv->conv_type = _FX_CONV_TYPE_GENERIC; + if (conv->conv_type == CONV_TYPE_WINOGRAD3X3) // Disabel Winograd when CV_NEON, CV_SIMD128 and CV_TRY_AVX2 are not available. 
+ conv->conv_type = CONV_TYPE_GENERIC; #endif Mat weightsMat = _weightsMat.getMat(); @@ -116,7 +118,7 @@ Ptr initFastConv( const size_t wstep = weightsMat.step1(); float *srcWeights = (float *)weightsMat.data; - if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE || conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN) + if (conv->conv_type == CONV_TYPE_DEPTHWISE || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN) { // Handle the Conv1D, Conv2D and Conv3D depth-wise. // for depth-wise convolutions on NCHW data we just preserve the weights in KCHW layout, @@ -138,7 +140,7 @@ Ptr initFastConv( weightsBufPtr[c*padded_ksize + k] = srcWeights[c*wstep + k]; }}); } - else if(conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // winograd + else if(conv->conv_type == CONV_TYPE_WINOGRAD3X3) // winograd { static const float ktm[8][3] = { {1.0f, 0.0f, 0.0f}, @@ -156,24 +158,24 @@ Ptr initFastConv( // where W is the size of Winograd-transformed kernel (8x8), // ATOM_SIZE is number of lanes in SIMD register (4 for NEON and FP32), // KBLOCK is some platform-dependent constant dependent on the number of SIMD registers. - int ksize = _FX_WINO_KSIZE * _FX_WINO_KSIZE; + int ksize = CONV_WINO_KSIZE * CONV_WINO_KSIZE; int Cg = C/ngroups; int Kg = K/ngroups; - int Kg_nblocks = (Kg + _FX_WINO_KBLOCK - 1)/_FX_WINO_KBLOCK; - size_t nweights = ngroups*Kg_nblocks*Cg*_FX_WINO_KBLOCK*_FX_WINO_AREA; + int Kg_nblocks = (Kg + CONV_WINO_KBLOCK - 1)/CONV_WINO_KBLOCK; + size_t nweights = ngroups*Kg_nblocks*Cg*CONV_WINO_KBLOCK*CONV_WINO_AREA; conv->weightsWinoBuf.reserve(nweights + VEC_ALIGN); conv->weightsWinoBufPtr = alignPtr(conv->weightsWinoBuf.data(), VEC_ALIGN); float* wptrWino = conv->weightsWinoBufPtr; memset(wptrWino, 0, nweights * sizeof(wptrWino[0])); parallel_for_(Range(0, K), [&](const Range& r0){ - float kernelTm[_FX_WINO_AREA]; + float kernelTm[CONV_WINO_AREA]; for (int k = r0.start; k < r0.end; k++) { int g = k / Kg; int k_ = k - g*Kg; - int ki = k_ / _FX_WINO_KBLOCK; - int dk = k_ - ki*_FX_WINO_KBLOCK; + int ki = k_ / CONV_WINO_KBLOCK; + int dk = k_ - ki*CONV_WINO_KBLOCK; for (int c = 0; c < Cg; c++) { @@ -204,18 +206,18 @@ Ptr initFastConv( } // repack the data. - float* wptr = wptrWino + (g*Kg_nblocks + ki) * Cg *_FX_WINO_KBLOCK*_FX_WINO_AREA + - (c*_FX_WINO_KBLOCK + dk)*_FX_WINO_ATOM_F32; - for (int i = 0; i < _FX_WINO_NATOMS_F32; i++, - wptr += Cg * _FX_WINO_KBLOCK * _FX_WINO_ATOM_F32) + float* wptr = wptrWino + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA + + (c*CONV_WINO_KBLOCK + dk)*CONV_WINO_ATOM_F32; + for (int i = 0; i < CONV_WINO_NATOMS_F32; i++, + wptr += Cg * CONV_WINO_KBLOCK * CONV_WINO_ATOM_F32) { - CV_Assert(conv->weightsWinoBufPtr <= wptr && wptr + _FX_WINO_ATOM_F32 <= conv->weightsWinoBufPtr + nweights); - memcpy(wptr, kernelTm + i * _FX_WINO_ATOM_F32, _FX_WINO_ATOM_F32*sizeof (wptr[0])); + CV_Assert(conv->weightsWinoBufPtr <= wptr && wptr + CONV_WINO_ATOM_F32 <= conv->weightsWinoBufPtr + nweights); + memcpy(wptr, kernelTm + i * CONV_WINO_ATOM_F32, CONV_WINO_ATOM_F32*sizeof (wptr[0])); } } }}); } - else if (conv->conv_type == _FX_CONV_TYPE_GENERIC) + else if (conv->conv_type == CONV_TYPE_GENERIC) { // The weights are packed as // ngroups x (ceil((K/ngroups)/CONV_MR)*CONV_MR) x (Cg*Hk*Wk*Dk) x CONV_MR tensor @@ -372,7 +374,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co fusedAddMat = _output.getMat(); } - if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE) + if (conv->conv_type == CONV_TYPE_DEPTHWISE) { // Depthwise-Convolution layer should not be followed by Add layer. 
CV_Assert((conv_dim == CONV_1D || conv_dim == CONV_2D)); @@ -420,7 +422,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co else activ = nullptr; - if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // winograd + if (conv->conv_type == CONV_TYPE_WINOGRAD3X3) // winograd { CV_Assert(conv->weightsWinoBufPtr && input.dims == 4 && conv_dim == CONV_2D); if (runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct)) @@ -454,8 +456,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co int dilation_d = conv->dilation_d, dilation_h = conv->dilation_h, dilation_w = conv->dilation_w; int ksize = Dk*Hk*Wk; - bool fast_1x1 = ksize == 1 && stride_d == 1 && stride_w == 1 && stride_h == 1 && - pad_front == 0 && pad_top == 0 && pad_left == 0; + bool fast_1x1 = ksize == 1 && stride_d == 1 && stride_w == 1 && stride_h == 1; int DkHkWkCg = Dk*Hk*Wk*Cg; std::vector ofstab_(Hk*Wk*Dk*4, 0); @@ -504,14 +505,14 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co int MAX_STRIPES = (56 + CONV_NR - 1)/CONV_NR; // Friendly to L1 cache - const int K_BLOCK_SIZE = conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN ? 1 : 32; + const int K_BLOCK_SIZE = conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? 1 : 32; const int C_BLOCK_SIZE = 256; int Kg_nblocks = (Kg + CONV_MR-1)/CONV_MR, Kg_aligned = Kg_nblocks * CONV_MR; int stripes_per_sample = ((int)out_planesize + CONV_NR - 1) / CONV_NR; - if (stripes_per_sample < ntasks * 4 && conv->conv_type != _FX_CONV_TYPE_DEPTHWISE_REMAIN) + if (stripes_per_sample < ntasks * 4 && conv->conv_type != CONV_TYPE_DEPTHWISE_REMAIN) { MAX_STRIPES = 1; stripes_per_sample = 1; @@ -555,7 +556,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co int k0, k1; int zyx0, zyx_limit, zyx_block_limit = 0; - if (stripes_per_sample == 1 && conv->conv_type != _FX_CONV_TYPE_DEPTHWISE_REMAIN) + if (stripes_per_sample == 1 && conv->conv_type != CONV_TYPE_DEPTHWISE_REMAIN) { k0 = kzyx0 * CONV_MR; k1 = kzyx1 * CONV_MR; @@ -618,7 +619,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co } } } - else if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN) + else if (conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN) { CV_Assert(Cg == 1); const int HW0 = H0 * W0; @@ -928,7 +929,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co // spacial branch for depth-wise convolution implemented using generic convolution. // In this case, CONV_MR is 1, and CONV_NR is the same. 
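// With CONV_MR == 1 that micro-kernel collapses to convBlockMR1 (added near the end of this
// file): for one output channel it computes, for each j < outLen,
//     c[j] = bias + sum_{p < np} a[p] * b[p*CONV_NR + j],
// optionally accumulating into the existing c[j] when the preceding Add is fused, and
// optionally clamping the result to [minval, maxval].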
- if (conv->conv_type == _FX_CONV_TYPE_DEPTHWISE_REMAIN) + if (conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN) { size_t outofs = (n * ngroups + g) * out_planesize + zyx0; float *cptr0 = cbuf_task; @@ -947,12 +948,8 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co memcpy(cptr0, cptr, outLen * sizeof(cptr[0])); cptr = cptr0; } -#if CV_TRY_AVX2 - if (conv->useAVX2 && outLen > CONV_NR/3) - opt_AVX2::convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct); - else -#endif - convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen); + + convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR); if (ifBuffer) { @@ -980,7 +977,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co { const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR); -#if CV_TRY_AVX2 || CV_TRY_NEON +#if CV_TRY_AVX || CV_TRY_AVX2 || CV_NEON // The possible CONV_NR is 28, 24, 12, so the possible CONV_NR/3 is 9, 8, 4. bool runOpt = outLen > std::min(8, CONV_NR/3); #endif @@ -992,16 +989,21 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co { #if CV_TRY_AVX2 if (conv->useAVX2 && runOpt) - opt_AVX2::convBlock_AVX2(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0); + opt_AVX2::convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, CONV_MR, CONV_NR); else #endif -#if CV_TRY_NEON +#if CV_TRY_AVX + if (conv->useAVX && runOpt) + opt_AVX::convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, CONV_MR, CONV_NR); + else +#endif +#if CV_NEON if (conv->useNEON && runOpt) - opt_NEON::convBlock_NEON(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0); + opt_NEON::convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, CONV_MR, CONV_NR); else #endif // The possible outLen range is 24 or 8~1. 
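// Whichever branch is taken, the micro-kernel computes the same register-tiled product:
//     c[i*ldc + j] = (c0 == 0 ? 0 : c[i*ldc + j]) + sum_{p < c1-c0} wptr[p*CONV_MR + i] * inptr[p*CONV_NR + j]
// for i < CONV_MR and j < outLen; c0 == 0 marks the first K-block, so the previous contents
// of c are overwritten there, while later K-blocks accumulate into them.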
- convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, outLen); + convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR); } } } @@ -1087,4 +1089,466 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co } }); } + + +/****************************************************************************************\ + SIMD and no-SIMD code for convBlock +\****************************************************************************************/ + +static void convBlockMR1NoSIMD(int np, const float* a, const float* b, float *c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR) +{ + std::vector cbuffer(outLen, 0); + float* cbuf = cbuffer.data(); + for( int p = 0; p < np; p++ ) + { + float ai = a[p]; + for( int j = 0; j < outLen; j++ ) + cbuf[j] += b[convNR*p + j] * ai; + } + + if (init_c) + { + for(int j = 0; j < outLen; j++) + { + c[j] += cbuf[j] + bias; + if (ifMinMaxAct) + c[j] = std::min(std::max(c[j], minval), maxval); + } + } + else + { + for(int j = 0; j < outLen; j++) + { + c[j] = cbuf[j] + bias; + if (ifMinMaxAct) + c[j] = std::min(std::max(c[j], minval), maxval); + } + } +} + +#if CV_SIMD128 +static void convBlockMR1x28(int np, const float* a, const float* b, float *c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR) +{ + CV_Assert(convNR == 28); + v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0; + v_float32x4 c3 = c0, c4 = c0, c5 = c0; + v_float32x4 c6 = c0; + + for (int p = 0; p < np; p++, a++, b += convNR) + { + v_float32x4 a0 = v_setall_f32(a[0]); + v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8); + v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20); + v_float32x4 b6 = v_load(b + 24); + + c0 = v_fma(b0, a0, c0); + c1 = v_fma(b1, a0, c1); + c2 = v_fma(b2, a0, c2); + c3 = v_fma(b3, a0, c3); + c4 = v_fma(b4, a0, c4); + c5 = v_fma(b5, a0, c5); + c6 = v_fma(b6, a0, c6); + } + + if (init_c) + { + c0 += v_load(c); + c1 += v_load(c + 4); + c2 += v_load(c + 8); + c3 += v_load(c + 12); + c4 += v_load(c + 16); + c5 += v_load(c + 20); + c6 += v_load(c + 24); + } + + if (ifMinMaxAct) + { + v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval); + c0 = v_min(v_max(c0, vmin), vmax); + c1 = v_min(v_max(c1, vmin), vmax); + c2 = v_min(v_max(c2, vmin), vmax); + c3 = v_min(v_max(c3, vmin), vmax); + c4 = v_min(v_max(c4, vmin), vmax); + c5 = v_min(v_max(c5, vmin), vmax); + c6 = v_min(v_max(c6, vmin), vmax); + } + + v_store(c, c0); + v_store(c + 4, c1); + v_store(c + 8, c2); + v_store(c + 12, c3); + v_store(c + 16, c4); + v_store(c + 20, c5); + v_store(c + 24, c6); +} + +static void convBlockMR1x24(int np, const float* a, const float* b, float *c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR) +{ + CV_Assert(convNR == 24); + v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0; + v_float32x4 c3 = c0, c4 = c0, c5 = c0; + + for (int p = 0; p < np; p++, a++, b += convNR) + { + v_float32x4 a0 = v_setall_f32(a[0]); + v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8); + v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20); + + c0 = v_fma(b0, a0, c0); + c1 = v_fma(b1, a0, c1); + c2 = v_fma(b2, a0, c2); + c3 = v_fma(b3, a0, c3); + c4 = v_fma(b4, a0, c4); + c5 = v_fma(b5, a0, c5); + } + + if (init_c) + { + c0 += v_load(c); + c1 += v_load(c + 4); + c2 
+= v_load(c + 8); + c3 += v_load(c + 12); + c4 += v_load(c + 16); + c5 += v_load(c + 20); + } + + if (ifMinMaxAct) + { + v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval); + c0 = v_min(v_max(c0, vmin), vmax); + c1 = v_min(v_max(c1, vmin), vmax); + c2 = v_min(v_max(c2, vmin), vmax); + c3 = v_min(v_max(c3, vmin), vmax); + c4 = v_min(v_max(c4, vmin), vmax); + c5 = v_min(v_max(c5, vmin), vmax); + } + + v_store(c, c0); + v_store(c + 4, c1); + v_store(c + 8, c2); + v_store(c + 12, c3); + v_store(c + 16, c4); + v_store(c + 20, c5); +} + +static void convBlockMR1x12(int np, const float* a, const float* b, float *c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR) +{ + CV_Assert(convNR == 12); + v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0; + for (int p = 0; p < np; p++, a++, b += convNR) + { + v_float32x4 a0 = v_setall_f32(a[0]); + v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8); + + c0 = v_fma(b0, a0, c0); + c1 = v_fma(b1, a0, c1); + c2 = v_fma(b2, a0, c2); + } + + if (init_c) + { + c0 += v_load(c); + c1 += v_load(c + 4); + c2 += v_load(c + 8); + } + + if (ifMinMaxAct) + { + v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval); + c0 = v_min(v_max(c0, vmin), vmax); + c1 = v_min(v_max(c1, vmin), vmax); + c2 = v_min(v_max(c2, vmin), vmax); + } + + v_store(c, c0); + v_store(c + 4, c1); + v_store(c + 8, c2); +} +#endif + +void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR) +{ +#if CV_SIMD128 + // The outLen represents the valid output value in CONV_NR length. + // When outLen is very small, we use the no-SIMD branch. 
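+    // For example, with convNR == 24 the SIMD kernels below are taken only when outLen > 24/3 == 8;
+    // shorter tails fall back to convBlockMR1NoSIMD, which handles any outLen.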
+ const int convNRby3 = convNR/3; + if (outLen > convNRby3) + { + if (convNR == 28) + convBlockMR1x28(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR); + else if (convNR == 24) + convBlockMR1x24(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR); + else if (convNR == 12) + convBlockMR1x12(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR); + else + convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR); + } + else + convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR); +#else + convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR); +#endif +} + +#if CV_SIMD128 +static void convBlock4x24(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) +{ + v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0; + v_float32x4 c6 = v_setzero_f32(), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6; + v_float32x4 c12 = v_setzero_f32(), c13 = c12, c14 = c12, c15 = c12, c16 = c12, c17 = c12; + v_float32x4 c18 = v_setzero_f32(), c19 = c18, c20 = c18, c21 = c18, c22 = c18, c23 = c18; + + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + v_float32x4 a0 = v_setall_f32(a[0]); + v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8); + v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20); + + c0 = v_fma(b0, a0, c0); + c1 = v_fma(b1, a0, c1); + c2 = v_fma(b2, a0, c2); + c3 = v_fma(b3, a0, c3); + c4 = v_fma(b4, a0, c4); + c5 = v_fma(b5, a0, c5); + + a0 = v_setall_f32(a[1]); + c6 = v_fma(b0, a0, c6); + c7 = v_fma(b1, a0, c7); + c8 = v_fma(b2, a0, c8); + c9 = v_fma(b3, a0, c9); + c10 = v_fma(b4, a0, c10); + c11 = v_fma(b5, a0, c11); + + a0 = v_setall_f32(a[2]); + c12 = v_fma(b0, a0, c12); + c13 = v_fma(b1, a0, c13); + c14 = v_fma(b2, a0, c14); + c15 = v_fma(b3, a0, c15); + c16 = v_fma(b4, a0, c16); + c17 = v_fma(b5, a0, c17); + + a0 = v_setall_f32(a[3]); + c18 = v_fma(b0, a0, c18); + c19 = v_fma(b1, a0, c19); + c20 = v_fma(b2, a0, c20); + c21 = v_fma(b3, a0, c21); + c22 = v_fma(b4, a0, c22); + c23 = v_fma(b5, a0, c23); + } + + if (!init_c) + { + c0 += v_load(c); + c1 += v_load(c + 4); + c2 += v_load(c + 8); + c3 += v_load(c + 12); + c4 += v_load(c + 16); + c5 += v_load(c + 20); + + c6 += v_load(c + ldc); + c7 += v_load(c + ldc + 4); + c8 += v_load(c + ldc + 8); + c9 += v_load(c + ldc + 12); + c10 += v_load(c + ldc + 16); + c11 += v_load(c + ldc + 20); + + c12 += v_load(c + ldc*2); + c13 += v_load(c + ldc*2 + 4); + c14 += v_load(c + ldc*2 + 8); + c15 += v_load(c + ldc*2 + 12); + c16 += v_load(c + ldc*2 + 16); + c17 += v_load(c + ldc*2 + 20); + + c18 += v_load(c + ldc*3); + c19 += v_load(c + ldc*3 + 4); + c20 += v_load(c + ldc*3 + 8); + c21 += v_load(c + ldc*3 + 12); + c22 += v_load(c + ldc*3 + 16); + c23 += v_load(c + ldc*3 + 20); + } + + v_store(c, c0); + v_store(c + 4, c1); + v_store(c + 8, c2); + v_store(c + 12, c3); + v_store(c + 16, c4); + v_store(c + 20, c5); + + v_store(c + ldc, c6); + v_store(c + ldc + 4, c7); + v_store(c + ldc + 8, c8); + v_store(c + ldc + 12, c9); + v_store(c + ldc + 16, c10); + v_store(c + ldc + 20, c11); + + v_store(c + ldc * 2, c12); + v_store(c + ldc * 2 + 4, c13); + v_store(c + ldc * 2 + 8, c14); + v_store(c + ldc * 2 + 12, c15); + v_store(c + ldc * 2 + 16, c16); + v_store(c + ldc * 2 + 20, c17); + + v_store(c + ldc * 3, c18); + v_store(c + ldc * 3 + 4, c19); + v_store(c + ldc * 3 + 8, c20); + 
v_store(c + ldc * 3 + 12, c21); + v_store(c + ldc * 3 + 16, c22); + v_store(c + ldc * 3 + 20, c23); +} + +static void convBlock4x8(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) +{ + CV_Assert(convNR >= 4); + v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0; + v_float32x4 c4 = c0, c5 = c0, c6 = c0, c7 = c0; + + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + v_float32x4 a0 = v_setall_f32(a[0]); + v_float32x4 a1 = v_setall_f32(a[1]); + v_float32x4 a2 = v_setall_f32(a[2]); + v_float32x4 a3 = v_setall_f32(a[3]); + + v_float32x4 b0 = v_load(b), b1 = v_load(b + 4); + + c0 = v_fma(b0, a0, c0); + c1 = v_fma(b1, a0, c1); + + c2 = v_fma(b0, a1, c2); + c3 = v_fma(b1, a1, c3); + + c4 = v_fma(b0, a2, c4); + c5 = v_fma(b1, a2, c5); + + c6 = v_fma(b0, a3, c6); + c7 = v_fma(b1, a3, c7); + } + + if (!init_c) + { + c0 += v_load(c); + c1 += v_load(c + 4); + + c2 += v_load(c + ldc); + c3 += v_load(c + ldc + 4); + + c4 += v_load(c + ldc*2); + c5 += v_load(c + ldc*2 + 4); + + c6 += v_load(c + ldc*3); + c7 += v_load(c + ldc*3 + 4); + } + + v_store(c, c0); + v_store(c + 4, c1); + v_store(c + ldc, c2); + v_store(c + ldc + 4, c3); + v_store(c + ldc * 2, c4); + v_store(c + ldc * 2 + 4, c5); + v_store(c + ldc * 3, c6); + v_store(c + ldc * 3 + 4, c7); +} + +static void convBlock4x4(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) +{ + CV_Assert(convNR >= 4); + v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0; + + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + v_float32x4 a0 = v_setall_f32(a[0]); + v_float32x4 a1 = v_setall_f32(a[1]); + v_float32x4 a2 = v_setall_f32(a[2]); + v_float32x4 a3 = v_setall_f32(a[3]); + + v_float32x4 b0 = v_load(b); + + c0 = v_fma(b0, a0, c0); + c1 = v_fma(b0, a1, c1); + c2 = v_fma(b0, a2, c2); + c3 = v_fma(b0, a3, c3); + } + + if (!init_c) + { + c0 += v_load(c); + c1 += v_load(c + ldc); + c2 += v_load(c + ldc*2); + c3 += v_load(c + ldc*3); + } + + v_store(c, c0); + v_store(c + ldc, c1); + v_store(c + ldc * 2, c2); + v_store(c + ldc * 3, c3); +} +#endif + +static void convBlockNoSIMD(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen, + const int convMR, const int convNR) +{ + std::vector cbuffer(convMR * outLen, 0); + float* cbuf = cbuffer.data(); + for( int p = 0; p < np; p++ ) + { + for( int i = 0; i < convMR; i++ ) + { + float ai = a[convMR*p + i]; + for( int j = 0; j < outLen; j++ ) + cbuf[i * outLen+j] += b[convNR*p + j] * ai; + } + } + + if (!init_c) + { + for(int i = 0; i < convMR; i++) + { + for(int j = 0; j < outLen; j++) + c[i*ldc + j] += cbuf[i*outLen + j]; + } + } + else + { + for(int i = 0; i < convMR; i++) + { + for(int j = 0; j < outLen; j++) + c[i*ldc + j] = cbuf[i*outLen + j]; + } + } +} + +void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen, + const int convMR, const int convNR) +{ + // The possible outLen range is [24, 8~1]. 
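+    // Dispatch (with CV_SIMD128): outLen > 8 together with convNR == 24 uses convBlock4x24,
+    // outLen in (4, 8] uses convBlock4x8, outLen in (1, 4] uses convBlock4x4,
+    // and any other case (including outLen == 1) falls back to the scalar convBlockNoSIMD.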
+#if CV_SIMD128 + CV_Assert(convMR == 4); + if (outLen > 8 && convNR == 24) + { + convBlock4x24(np, a, b, c, ldc, init_c, convMR, convNR); + return; + } + + if (outLen <= 8 && outLen > 4) + { + convBlock4x8(np, a, b, c, ldc, init_c, convMR, convNR); + return; + } + + if (outLen <= 4 && outLen > 1) + { + convBlock4x4(np, a, b, c, ldc, init_c, convMR, convNR); + return; + } + convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen, convMR, convNR); +#else + convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen, convMR, convNR); +#endif +} + }} // namespace cv::dnn diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp b/modules/dnn/src/layers/cpu_kernels/convolution.hpp similarity index 69% rename from modules/dnn/src/layers/fast_convolution/fast_convolution.hpp rename to modules/dnn/src/layers/cpu_kernels/convolution.hpp index 7794078bb4..0a077bf800 100644 --- a/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp +++ b/modules/dnn/src/layers/cpu_kernels/convolution.hpp @@ -22,27 +22,29 @@ // Winograd Params enum { - _FX_WINO_STEP=6, - _FX_WINO_KSIZE=3, - _FX_WINO_SIZE=_FX_WINO_STEP+_FX_WINO_KSIZE-1, - _FX_WINO_AREA=_FX_WINO_SIZE*_FX_WINO_SIZE, + CONV_WINO_STEP=6, + CONV_WINO_KSIZE=3, + CONV_WINO_SIZE=CONV_WINO_STEP+CONV_WINO_KSIZE-1, // 8 + CONV_WINO_AREA=CONV_WINO_SIZE*CONV_WINO_SIZE, - _FX_WINO_KBLOCK = 4, + CONV_WINO_KBLOCK = 4, #if (CV_NEON && CV_NEON_AARCH64) || CV_TRY_AVX2 - _FX_WINO_IBLOCK = 6, + CONV_WINO_IBLOCK = 6, #else - _FX_WINO_IBLOCK = 3, + CONV_WINO_IBLOCK = 3, #endif #if CV_TRY_AVX2 - _FX_WINO_ATOM_F32 = 8, + CONV_WINO_ATOM_F32 = 8, #else - _FX_WINO_ATOM_F32 = 4, + CONV_WINO_ATOM_F32 = 4, #endif - _FX_WINO_NATOMS_F32 = _FX_WINO_AREA / _FX_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16. + CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16. }; -enum { _FX_CONV_TYPE_GENERIC=0, _FX_CONV_TYPE_DEPTHWISE=1, _FX_CONV_TYPE_WINOGRAD3X3=2, _FX_CONV_TYPE_DEPTHWISE_REMAIN=3 }; + +// NOTE that: CONV_TYPE_DEPTHWISE is for 3x3 depthwise conv, and others depthwise will be set as CONV_TYPE_DEPTHWISE_REMAIN. 
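+// For example, a 5x5 depthwise convolution is classified as CONV_TYPE_DEPTHWISE_REMAIN.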
+enum { CONV_TYPE_GENERIC=0, CONV_TYPE_DEPTHWISE=1, CONV_TYPE_WINOGRAD3X3=2, CONV_TYPE_DEPTHWISE_REMAIN=3 }; enum { CONV_1D = 0, CONV_2D = 1, CONV_3D = 2 }; #endif @@ -105,22 +107,6 @@ void runDepthwise(InputArray _input, OutputArray _output, const Ptr& c int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr& conv, int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct); -namespace opt_AVX2 -{ -#if CV_TRY_AVX2 -void convBlock_AVX2(int np, const float* a, const float* b, float* c, int ldc, bool init_c); - -void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c, const float minval, - const float maxval, bool ifMinMaxAct); - -void _fx_winograd_accum_f32(const float* inwptr, const float* wptr, float* outbuf, int Cg, int iblock); -void _fx_winograd_BtXB_8x8_f32(const float* inptr, int inpstep, float* outptr, int Cg); -void _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep, float* bpptr, int bpstep, float* outptr, int outstep, - float bias, float minval, float maxval, bool ifMinMaxAct); - -#endif -} // namespace opt_AVX2 - } // namespace dnn } // namespace cv diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp deleted file mode 100644 index c98fbe72bd..0000000000 --- a/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp +++ /dev/null @@ -1,499 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -#include "../../precomp.hpp" -#include "fast_convolution.hpp" - -namespace cv { -namespace dnn { -namespace opt_AVX2 -{ -#if CV_TRY_AVX2 -void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c, - const float minval, const float maxval, bool ifMinMaxAct) -{ -#if CONV_NR == 24 - __m256 c0 = _mm256_set1_ps(bias), c1 = c0, c2 = c0; - - for (int p = 0; p < np; p++, a++, b += CONV_NR) - { - __m256 a0 = _mm256_set1_ps(a[0]); - __m256 b0 = _mm256_loadu_ps(b), b1 = _mm256_loadu_ps(b + 8), b2 = _mm256_loadu_ps(b + 16); - - c0 = _mm256_fmadd_ps(b0, a0, c0); - c1 = _mm256_fmadd_ps(b1, a0, c1); - c2 = _mm256_fmadd_ps(b2, a0, c2); - } - - if (init_c) - { - c0 = _mm256_add_ps(_mm256_loadu_ps(c), c0); - c1 = _mm256_add_ps(_mm256_loadu_ps(c + 8), c1); - c2 = _mm256_add_ps(_mm256_loadu_ps(c + 16), c2); - } - - if (ifMinMaxAct) - { - __m256 vmax = _mm256_set1_ps(maxval); - __m256 vmin = _mm256_set1_ps(minval); - - c0 = _mm256_min_ps(_mm256_max_ps(c0, vmin), vmax); - c1 = _mm256_min_ps(_mm256_max_ps(c1, vmin), vmax); - c2 = _mm256_min_ps(_mm256_max_ps(c2, vmin), vmax); - } - - _mm256_storeu_ps(c, c0); - _mm256_storeu_ps(c + 8, c1); - _mm256_storeu_ps(c + 16, c2); - _mm256_zeroupper(); -#else -#error "unsupported CONV_NR in convBlockMR1." 
-#endif -} - -void convBlock_AVX2(int np, const float* a, const float* b, float* c, int ldc, bool init_c) -{ -#if CONV_MR == 4 && CONV_NR == 24 - __m256 c00 = _mm256_set1_ps(0.f), c01 = c00, c02 = c00; - __m256 c10 = c00, c11 = c00, c12 = c00; - __m256 c20 = c00, c21 = c00, c22 = c00; - __m256 c30 = c00, c31 = c00, c32 = c00; - - __m256 a0 = _mm256_setzero_ps(), a1 = _mm256_setzero_ps(); - __m256 b0 = _mm256_setzero_ps(), b1 = _mm256_setzero_ps(), b2 = _mm256_setzero_ps(); - - for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR) - { - a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]); - b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16); - - c00 = _mm256_fmadd_ps(b0, a0, c00); - c01 = _mm256_fmadd_ps(b1, a0, c01); - c02 = _mm256_fmadd_ps(b2, a0, c02); - - c10 = _mm256_fmadd_ps(b0, a1, c10); - c11 = _mm256_fmadd_ps(b1, a1, c11); - c12 = _mm256_fmadd_ps(b2, a1, c12); - - a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]); - - c20 = _mm256_fmadd_ps(b0, a0, c20); - c21 = _mm256_fmadd_ps(b1, a0, c21); - c22 = _mm256_fmadd_ps(b2, a0, c22); - - c30 = _mm256_fmadd_ps(b0, a1, c30); - c31 = _mm256_fmadd_ps(b1, a1, c31); - c32 = _mm256_fmadd_ps(b2, a1, c32); - } - - if (!init_c) - { - c00 = _mm256_add_ps(c00, _mm256_load_ps(c)); - c01 = _mm256_add_ps(c01, _mm256_load_ps(c + 8)); - c02 = _mm256_add_ps(c02, _mm256_load_ps(c + 16)); - - c10 = _mm256_add_ps(c10, _mm256_load_ps(c + ldc)); - c11 = _mm256_add_ps(c11, _mm256_load_ps(c + ldc + 8)); - c12 = _mm256_add_ps(c12, _mm256_load_ps(c + ldc + 16)); - - c20 = _mm256_add_ps(c20, _mm256_load_ps(c + ldc*2)); - c21 = _mm256_add_ps(c21, _mm256_load_ps(c + ldc*2 + 8)); - c22 = _mm256_add_ps(c22, _mm256_load_ps(c + ldc*2 + 16)); - - c30 = _mm256_add_ps(c30, _mm256_load_ps(c + ldc*3)); - c31 = _mm256_add_ps(c31, _mm256_load_ps(c + ldc*3 + 8)); - c32 = _mm256_add_ps(c32, _mm256_load_ps(c + ldc*3 + 16)); - } - - _mm256_storeu_ps(c, c00), _mm256_storeu_ps(c+8, c01), _mm256_storeu_ps(c+16, c02); - _mm256_storeu_ps(c + ldc, c10), _mm256_storeu_ps(c + ldc + 8, c11), _mm256_storeu_ps(c + ldc + 16, c12); - _mm256_storeu_ps(c + ldc*2, c20), _mm256_storeu_ps(c + ldc*2 + 8, c21), _mm256_storeu_ps(c + ldc*2 + 16, c22); - _mm256_storeu_ps(c + ldc*3, c30), _mm256_storeu_ps(c + ldc*3 + 8, c31), _mm256_storeu_ps(c + ldc*3 + 16, c32); - _mm256_zeroupper(); -#else -#error "unsupported CONV_MR and/or CONV_NR in convBlock_AVX2." 
-#endif -} - -void _fx_winograd_accum_f32(const float* inwptr, const float* wptr, - float* outbuf, int Cg, int iblock) -{ - CV_Assert(_FX_WINO_IBLOCK == 6 && _FX_WINO_KBLOCK == 4 && _FX_WINO_ATOM_F32 == 8); - if (iblock > 3) - { - for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++, - outbuf += _FX_WINO_ATOM_F32) - { - __m256 s00 = _mm256_set1_ps(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00; - __m256 s10 = _mm256_set1_ps(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00; - __m256 s20 = _mm256_set1_ps(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00; - __m256 s30 = _mm256_set1_ps(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00; - for (int c = 0; c < Cg; c++, inwptr += _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32, - wptr += _FX_WINO_KBLOCK*_FX_WINO_ATOM_F32) - { - __m256 w0 = _mm256_load_ps(wptr), w1 = _mm256_load_ps(wptr + 8); - __m256 w2 = _mm256_load_ps(wptr + 16), w3 = _mm256_load_ps(wptr + 24); - __m256 x0, x1; - x0 = _mm256_load_ps(inwptr); - x1 = _mm256_load_ps(inwptr + 8); - s00 = _mm256_fmadd_ps(w0, x0, s00); - s01 = _mm256_fmadd_ps(w0, x1, s01); - s10 = _mm256_fmadd_ps(w1, x0, s10); - s11 = _mm256_fmadd_ps(w1, x1, s11); - s20 = _mm256_fmadd_ps(w2, x0, s20); - s21 = _mm256_fmadd_ps(w2, x1, s21); - s30 = _mm256_fmadd_ps(w3, x0, s30); - s31 = _mm256_fmadd_ps(w3, x1, s31); - x0 = _mm256_load_ps(inwptr + 16); - x1 = _mm256_load_ps(inwptr + 24); - s02 = _mm256_fmadd_ps(w0, x0, s02); - s03 = _mm256_fmadd_ps(w0, x1, s03); - s12 = _mm256_fmadd_ps(w1, x0, s12); - s13 = _mm256_fmadd_ps(w1, x1, s13); - s22 = _mm256_fmadd_ps(w2, x0, s22); - s23 = _mm256_fmadd_ps(w2, x1, s23); - s32 = _mm256_fmadd_ps(w3, x0, s32); - s33 = _mm256_fmadd_ps(w3, x1, s33); - x0 = _mm256_load_ps(inwptr + 32); - x1 = _mm256_load_ps(inwptr + 40); - s04 = _mm256_fmadd_ps(w0, x0, s04); - s05 = _mm256_fmadd_ps(w0, x1, s05); - s14 = _mm256_fmadd_ps(w1, x0, s14); - s15 = _mm256_fmadd_ps(w1, x1, s15); - s24 = _mm256_fmadd_ps(w2, x0, s24); - s25 = _mm256_fmadd_ps(w2, x1, s25); - s34 = _mm256_fmadd_ps(w3, x0, s34); - s35 = _mm256_fmadd_ps(w3, x1, s35); - } - - _mm256_store_ps(outbuf, s00); - _mm256_store_ps(outbuf + 1*64, s01); - _mm256_store_ps(outbuf + 2*64, s02); - _mm256_store_ps(outbuf + 3*64, s03); - _mm256_store_ps(outbuf + 4*64, s04); - _mm256_store_ps(outbuf + 5*64, s05); - - _mm256_store_ps(outbuf + 6*64, s10); - _mm256_store_ps(outbuf + 7*64, s11); - _mm256_store_ps(outbuf + 8*64, s12); - _mm256_store_ps(outbuf + 9*64, s13); - _mm256_store_ps(outbuf + 10*64, s14); - _mm256_store_ps(outbuf + 11*64, s15); - - _mm256_store_ps(outbuf + 12*64, s20); - _mm256_store_ps(outbuf + 13*64, s21); - _mm256_store_ps(outbuf + 14*64, s22); - _mm256_store_ps(outbuf + 15*64, s23); - _mm256_store_ps(outbuf + 16*64, s24); - _mm256_store_ps(outbuf + 17*64, s25); - - _mm256_store_ps(outbuf + 18*64, s30); - _mm256_store_ps(outbuf + 19*64, s31); - _mm256_store_ps(outbuf + 20*64, s32); - _mm256_store_ps(outbuf + 21*64, s33); - _mm256_store_ps(outbuf + 22*64, s34); - _mm256_store_ps(outbuf + 23*64, s35); - } - } - else - { - for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++, - outbuf += _FX_WINO_ATOM_F32) - { - __m256 s00 = _mm256_set1_ps(0.f), s01 = s00, s02 = s00; - __m256 s10 = _mm256_set1_ps(0.f), s11 = s00, s12 = s00; - __m256 s20 = _mm256_set1_ps(0.f), s21 = s00, s22 = s00; - __m256 s30 = _mm256_set1_ps(0.f), s31 = s00, s32 = s00; - for (int c = 0; c < Cg; c++, inwptr += _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32, - wptr += _FX_WINO_KBLOCK*_FX_WINO_ATOM_F32) { - __m256 w0 = 
_mm256_load_ps(wptr), w1 = _mm256_load_ps(wptr + 8); - __m256 w2 = _mm256_load_ps(wptr + 16), w3 = _mm256_load_ps(wptr + 24); - __m256 x0, x1, x2; - x0 = _mm256_load_ps(inwptr); - x1 = _mm256_load_ps(inwptr + 8); - x2 = _mm256_load_ps(inwptr + 16); - s00 = _mm256_fmadd_ps(w0, x0, s00); - s01 = _mm256_fmadd_ps(w0, x1, s01); - s02 = _mm256_fmadd_ps(w0, x2, s02); - s10 = _mm256_fmadd_ps(w1, x0, s10); - s11 = _mm256_fmadd_ps(w1, x1, s11); - s12 = _mm256_fmadd_ps(w1, x2, s12); - s20 = _mm256_fmadd_ps(w2, x0, s20); - s21 = _mm256_fmadd_ps(w2, x1, s21); - s22 = _mm256_fmadd_ps(w2, x2, s22); - s30 = _mm256_fmadd_ps(w3, x0, s30); - s31 = _mm256_fmadd_ps(w3, x1, s31); - s32 = _mm256_fmadd_ps(w3, x2, s32); - } - - _mm256_store_ps(outbuf, s00); - _mm256_store_ps(outbuf + 1*64, s01); - _mm256_store_ps(outbuf + 2*64, s02); - _mm256_store_ps(outbuf + 6*64, s10); - _mm256_store_ps(outbuf + 7*64, s11); - _mm256_store_ps(outbuf + 8*64, s12); - _mm256_store_ps(outbuf + 12*64, s20); - _mm256_store_ps(outbuf + 13*64, s21); - _mm256_store_ps(outbuf + 14*64, s22); - _mm256_store_ps(outbuf + 18*64, s30); - _mm256_store_ps(outbuf + 19*64, s31); - _mm256_store_ps(outbuf + 20*64, s32); - } - } - _mm256_zeroupper(); -} -static inline -void transpose8_ps(__m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3, __m256 &row4, __m256 &row5, __m256 &row6, __m256 &row7) -{ - __m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7; - __m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7; - __t0 = _mm256_unpacklo_ps(row0, row1); - __t1 = _mm256_unpackhi_ps(row0, row1); - __t2 = _mm256_unpacklo_ps(row2, row3); - __t3 = _mm256_unpackhi_ps(row2, row3); - __t4 = _mm256_unpacklo_ps(row4, row5); - __t5 = _mm256_unpackhi_ps(row4, row5); - __t6 = _mm256_unpacklo_ps(row6, row7); - __t7 = _mm256_unpackhi_ps(row6, row7); - __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0)); - __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2)); - __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0)); - __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2)); - __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0)); - __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2)); - __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0)); - __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2)); - row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); - row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); - row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); - row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); - row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); - row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); - row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); - row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); -} - -/*Input transform*/ -void _fx_winograd_BtXB_8x8_f32(const float* inptr, int inpstep, float* outptr, int Cg) -{ - __m256 x00 = _mm256_loadu_ps(inptr); - __m256 x10 = _mm256_loadu_ps(inptr + inpstep); - __m256 x20 = _mm256_loadu_ps(inptr + inpstep*2); - __m256 x30 = _mm256_loadu_ps(inptr + inpstep*3); - __m256 x40 = _mm256_loadu_ps(inptr + inpstep*4); - __m256 x50 = _mm256_loadu_ps(inptr + inpstep*5); - __m256 x60 = _mm256_loadu_ps(inptr + inpstep*6); - __m256 x70 = _mm256_loadu_ps(inptr + inpstep*7); - - __m256 z00, z10, z20, z30, z40, z50, z60, z70; - - { - /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */ - /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */ - __m256 q5_25 = _mm256_set1_ps(5.25f), t00, t10; - t00 = _mm256_sub_ps(x40, x20); - t10 = _mm256_sub_ps(x30, x50); - - __m256 y00 = 
_mm256_fmadd_ps(t00, q5_25, _mm256_sub_ps(x00, x60)); - __m256 y70 = _mm256_fmadd_ps(t10, q5_25, _mm256_sub_ps(x70, x10)); - - /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */ - /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */ - __m256 qm4_25 = _mm256_set1_ps(-4.25f); - t00 = _mm256_fmadd_ps(x30, qm4_25, _mm256_add_ps(x10, x50)); - t10 = _mm256_fmadd_ps(x40, qm4_25, _mm256_add_ps(x20, x60)); - - __m256 y10 = _mm256_add_ps(t00, t10); - __m256 y20 = _mm256_sub_ps(t10, t00); - - /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */ - /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */ - __m256 q0_5 = _mm256_set1_ps(0.5f), q0_25 = _mm256_set1_ps(0.25f); - __m256 qm2_5 = _mm256_set1_ps(-2.5f), qm1_25 = _mm256_set1_ps(-1.25f); - t00 = _mm256_fmadd_ps(x10, q0_5, _mm256_add_ps(x50, x50)); - t10 = _mm256_fmadd_ps(x20, q0_25, x60); - t00 = _mm256_fmadd_ps(x30, qm2_5, t00); - t10 = _mm256_fmadd_ps(x40, qm1_25, t10); - - __m256 y30 = _mm256_add_ps(t00, t10); - __m256 y40 = _mm256_sub_ps(t10, t00); - - /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */ - /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */ - __m256 q4 = _mm256_set1_ps(4.f), qm5 = _mm256_set1_ps(-5.f); - t00 = _mm256_fmadd_ps(x50, q0_5, _mm256_add_ps(x10, x10)); - t10 = _mm256_fmadd_ps(x20, q4 , x60); - t00 = _mm256_fmadd_ps(x30, qm2_5, t00); - t10 = _mm256_fmadd_ps(x40, qm5 , t10); - - __m256 y50 = _mm256_add_ps(t00, t10); - __m256 y60 = _mm256_sub_ps(t10, t00); - - /* transpose 8x8 matrix in-place with some renumeration of the elements: */ - transpose8_ps(y00, y10, y20, y30, y40, y50, y60, y70); - - /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */ - /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */ - t00 = _mm256_sub_ps(y40, y20); - t10 = _mm256_sub_ps(y30, y50); - z00 = _mm256_fmadd_ps(t00, q5_25, _mm256_sub_ps(y00, y60)); - z70 = _mm256_fmadd_ps(t10, q5_25, _mm256_sub_ps(y70, y10)); - - /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */ - /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */ - t00 = _mm256_fmadd_ps(y30, qm4_25, _mm256_add_ps(y10, y50)); - t10 = _mm256_fmadd_ps(y40, qm4_25, _mm256_add_ps(y20, y60)); - z10 = _mm256_add_ps(t00, t10); - z20 = _mm256_sub_ps(t10, t00); - - /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */ - /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */ - t00 = _mm256_fmadd_ps(y10, q0_5, _mm256_add_ps(y50, y50)); - t10 = _mm256_fmadd_ps(y20, q0_25, y60); - t00 = _mm256_fmadd_ps(y30, qm2_5, t00); - t10 = _mm256_fmadd_ps(y40, qm1_25, t10); - - z30 = _mm256_add_ps(t00, t10); - z40 = _mm256_sub_ps(t10, t00); - - /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */ - /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */ - t00 = _mm256_fmadd_ps(y50, q0_5, _mm256_add_ps(y10, y10)); - t10 = _mm256_fmadd_ps(y20, q4, y60); - t00 = _mm256_fmadd_ps(y30, qm2_5, t00); - t10 = _mm256_fmadd_ps(y40, qm5, t10); - - z50 = _mm256_add_ps(t00, t10); - z60 = _mm256_sub_ps(t10, t00); - } - - const int outstep = _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32*Cg; - - _mm256_storeu_ps(outptr, z00); - _mm256_storeu_ps(outptr + outstep, z10); - _mm256_storeu_ps(outptr + outstep*2, z20); - _mm256_storeu_ps(outptr + outstep*3, z30); - _mm256_storeu_ps(outptr + outstep*4, z40); - _mm256_storeu_ps(outptr + outstep*5, z50); - _mm256_storeu_ps(outptr + outstep*6, z60); - _mm256_storeu_ps(outptr + outstep*7, z70); - _mm256_zeroupper(); -} - -#define STORE6_ELE_FROM_16(ptr, z00, lowM, highM) \ - 
lowM = _mm256_castps256_ps128(z00); \ - highM = _mm256_extractf128_ps(z00, 1); \ - _mm_storeu_ps(ptr, lowM); \ - _mm_storel_epi64((__m128i*)(ptr + 4), _mm_castps_si128(highM)) - -/* Inverse Winograd 8x8 transform: - out = (A'*inp*A)', where - inp is input 8x8 FP32 matrix, - A' is - [1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, - 0.f, 1.f, -1.f, 2.f, -2.f, 0.5f, -0.5f, 0.f, - 0.f, 1.f, 1.f, 4.f, 4.f, 0.25f, 0.25f, 0.f, - 0.f, 1.f, -1.f, 8.f, -8.f, 0.125f, -0.125f, 0.f, - 0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f, - 0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f] -*/ -void _fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep, - float* bpptr, int bpstep, float* outptr, int outstep, - float bias, float minval, float maxval, bool ifMinMaxAct) -{ - - __m256 x00 = _mm256_load_ps(inptr); - __m256 x10 = _mm256_load_ps(inptr + inpstep); - __m256 x20 = _mm256_load_ps(inptr + inpstep*2); - __m256 x30 = _mm256_load_ps(inptr + inpstep*3); - __m256 x40 = _mm256_load_ps(inptr + inpstep*4); - __m256 x50 = _mm256_load_ps(inptr + inpstep*5); - __m256 x60 = _mm256_load_ps(inptr + inpstep*6); - __m256 x70 = _mm256_load_ps(inptr + inpstep*7); - __m256 z00, z10, z20, z30, z40, z50; - - { - __m256 s12_0, s34_0, s56_0; - s12_0 = _mm256_add_ps(x10, x20); - s34_0 = _mm256_add_ps(x30, x40); - s56_0 = _mm256_add_ps(x50, x60); - - __m256 y00 = _mm256_add_ps(x00, _mm256_add_ps(s12_0, _mm256_add_ps(s34_0, s56_0))); - __m256 y20 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.25f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(4.0f), s12_0)); - __m256 y40 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/16), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(16.0f), s12_0)); - - s12_0 = _mm256_sub_ps(x10, x20); - s34_0 = _mm256_sub_ps(x30, x40); - s56_0 = _mm256_sub_ps(x50, x60); - __m256 y50 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/32), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(32.f), _mm256_add_ps(x70, s12_0))); - __m256 y10 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.5f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(2.f), s12_0)); - __m256 y30 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.125f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(8.f), s12_0)); - __m256 y60 = _mm256_set1_ps(0.f), y70 = y60; - - /* transpose 8x8 matrix in-place with some renumeration of the elements: */ - - transpose8_ps(y00, y10, y20, y30, y40, y50, y60, y70); - - s12_0 = _mm256_add_ps(y10, y20); - s34_0 = _mm256_add_ps(y30, y40); - s56_0 = _mm256_add_ps(y50, y60); - - z00 = _mm256_add_ps(y00, _mm256_add_ps(s12_0, _mm256_add_ps(s34_0, s56_0))); - z20 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.25f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(4.0f), s12_0)); - z40 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/16), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(16.0f), s12_0)); - - s12_0 = _mm256_sub_ps(y10, y20); - s34_0 = _mm256_sub_ps(y30, y40); - s56_0 = _mm256_sub_ps(y50, y60); - - z50 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(1.f/32), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(32.0f), _mm256_add_ps(y70, s12_0))); - z10 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.5f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(2.0f), s12_0)); - z30 = _mm256_fmadd_ps(s56_0, _mm256_set1_ps(0.125f), _mm256_fmadd_ps(s34_0, _mm256_set1_ps(8.0f), s12_0)); - - __m256 vbias = _mm256_set1_ps(bias); - z00 = _mm256_add_ps(vbias, z00); - z10 = _mm256_add_ps(vbias, z10); - z20 = _mm256_add_ps(vbias, z20); - z30 = _mm256_add_ps(vbias, z30); - z40 = _mm256_add_ps(vbias, z40); - z50 = _mm256_add_ps(vbias, z50); - } - - if (bpptr) - { - z00 = _mm256_add_ps(z00, _mm256_loadu_ps(bpptr)); - z10 = _mm256_add_ps(z10, _mm256_loadu_ps(bpptr + bpstep)); - 
z20 = _mm256_add_ps(z20, _mm256_loadu_ps(bpptr + bpstep*2)); - z30 = _mm256_add_ps(z30, _mm256_loadu_ps(bpptr + bpstep*3)); - z40 = _mm256_add_ps(z40, _mm256_loadu_ps(bpptr + bpstep*4)); - z50 = _mm256_add_ps(z50, _mm256_loadu_ps(bpptr + bpstep*5)); - } - - if (ifMinMaxAct) - { - __m256 vmax = _mm256_set1_ps(maxval); - __m256 vmin = _mm256_set1_ps(minval); - - z00 = _mm256_min_ps(_mm256_max_ps(z00, vmin), vmax); - z10 = _mm256_min_ps(_mm256_max_ps(z10, vmin), vmax); - z20 = _mm256_min_ps(_mm256_max_ps(z20, vmin), vmax); - z30 = _mm256_min_ps(_mm256_max_ps(z30, vmin), vmax); - z40 = _mm256_min_ps(_mm256_max_ps(z40, vmin), vmax); - z50 = _mm256_min_ps(_mm256_max_ps(z50, vmin), vmax); - } - - __m128 lowM, highM; - STORE6_ELE_FROM_16(outptr, z00, lowM, highM); - STORE6_ELE_FROM_16(outptr + outstep, z10, lowM, highM); - STORE6_ELE_FROM_16(outptr + outstep * 2, z20, lowM, highM); - STORE6_ELE_FROM_16(outptr + outstep * 3, z30, lowM, highM); - STORE6_ELE_FROM_16(outptr + outstep * 4, z40, lowM, highM); - STORE6_ELE_FROM_16(outptr + outstep * 5, z50, lowM, highM); - _mm256_zeroupper(); -} - -#endif -} // namespace opt_AVX2 -} // namespace dnn -} // namespace cv \ No newline at end of file diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp deleted file mode 100644 index e146c0974e..0000000000 --- a/modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp +++ /dev/null @@ -1,567 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -#ifndef OPENCV_FAST_CONVOLUTION_SIMD_HPP -#define OPENCV_FAST_CONVOLUTION_SIMD_HPP - -#include "opencv2/core/hal/intrin.hpp" -#include - -namespace cv { -namespace dnn { - -static void convBlockMR1NoSIMD(int np, const float* a, const float* b, float *c, const float bias, bool init_c, - const float minval, const float maxval, bool ifMinMaxAct, const int outLen) -{ - std::vector cbuffer(outLen, 0); - float* cbuf = cbuffer.data(); - for( int p = 0; p < np; p++ ) - { - float ai = a[p]; - for( int j = 0; j < outLen; j++ ) - cbuf[j] += b[CONV_NR*p + j] * ai; - } - - if (init_c) - { - for(int j = 0; j < outLen; j++) - { - c[j] += cbuf[j] + bias; - if (ifMinMaxAct) - c[j] = std::min(std::max(c[j], minval), maxval); - } - } - else - { - for(int j = 0; j < outLen; j++) - { - c[j] = cbuf[j] + bias; - if (ifMinMaxAct) - c[j] = std::min(std::max(c[j], minval), maxval); - } - } -} - -void convBlockMR1(int np, const float* a, const float* b, float *c, const float bias, bool init_c, - const float minval, const float maxval, bool ifMinMaxAct, const int outLen) -{ -#if CV_SIMD128 - // The outLen represents the valid output value in CONV_NR length. - // When outLen is very small, we use the no-SIMD branch. 
- const int CONV_NRby3 = CONV_NR/3; - if (outLen > CONV_NRby3) - { - v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0; // CONV_NR == 12 -#if CONV_NR == 28 || CONV_NR == 24 - v_float32x4 c3 = c0, c4 = c0, c5 = c0; -#endif -#if CONV_NR == 28 - v_float32x4 c6 = c0; -#endif - for (int p = 0; p < np; p++, a++, b += CONV_NR) - { - v_float32x4 a0 = v_setall_f32(a[0]); - v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8); -#if CONV_NR == 28 || CONV_NR == 24 - v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20); -#endif -#if CONV_NR == 28 - v_float32x4 b6 = v_load(b + 24); -#endif - - c0 = v_fma(b0, a0, c0); - c1 = v_fma(b1, a0, c1); - c2 = v_fma(b2, a0, c2); -#if CONV_NR == 28 || CONV_NR == 24 - c3 = v_fma(b3, a0, c3); - c4 = v_fma(b4, a0, c4); - c5 = v_fma(b5, a0, c5); -#endif -#if CONV_NR == 28 - c6 = v_fma(b6, a0, c6); -#endif - } - - if (init_c) - { - c0 += v_load(c); - c1 += v_load(c + 4); - c2 += v_load(c + 8); -#if CONV_NR == 28 || CONV_NR == 24 - c3 += v_load(c + 12); - c4 += v_load(c + 16); - c5 += v_load(c + 20); -#endif -#if CONV_NR == 28 - c6 += v_load(c + 24); -#endif - } - - if (ifMinMaxAct) - { - v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval); - c0 = v_min(v_max(c0, vmin), vmax); - c1 = v_min(v_max(c1, vmin), vmax); - c2 = v_min(v_max(c2, vmin), vmax); -#if CONV_NR == 28 || CONV_NR == 24 - c3 = v_min(v_max(c3, vmin), vmax); - c4 = v_min(v_max(c4, vmin), vmax); - c5 = v_min(v_max(c5, vmin), vmax); -#endif -#if CONV_NR == 28 - c6 = v_min(v_max(c6, vmin), vmax); -#endif - } - - v_store(c, c0); - v_store(c + 4, c1); - v_store(c + 8, c2); -#if CONV_NR == 28 || CONV_NR == 24 - v_store(c + 12, c3); - v_store(c + 16, c4); - v_store(c + 20, c5); -#endif -#if CONV_NR == 28 - v_store(c + 24, c6); -#endif - } - else - convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen); -#else - convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen); -#endif -} - -#if CV_SIMD128 -#if CONV_MR == 4 && CONV_NR == 24 -static void convBlock4x24(int np, const float* a, const float* b, float* c, int ldc, bool init_c) -{ - v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0; - v_float32x4 c6 = v_setzero_f32(), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6; - v_float32x4 c12 = v_setzero_f32(), c13 = c12, c14 = c12, c15 = c12, c16 = c12, c17 = c12; - v_float32x4 c18 = v_setzero_f32(), c19 = c18, c20 = c18, c21 = c18, c22 = c18, c23 = c18; - - for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR) - { - v_float32x4 a0 = v_setall_f32(a[0]); - v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8); - v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20); - - c0 = v_fma(b0, a0, c0); - c1 = v_fma(b1, a0, c1); - c2 = v_fma(b2, a0, c2); - c3 = v_fma(b3, a0, c3); - c4 = v_fma(b4, a0, c4); - c5 = v_fma(b5, a0, c5); - - a0 = v_setall_f32(a[1]); - c6 = v_fma(b0, a0, c6); - c7 = v_fma(b1, a0, c7); - c8 = v_fma(b2, a0, c8); - c9 = v_fma(b3, a0, c9); - c10 = v_fma(b4, a0, c10); - c11 = v_fma(b5, a0, c11); - - a0 = v_setall_f32(a[2]); - c12 = v_fma(b0, a0, c12); - c13 = v_fma(b1, a0, c13); - c14 = v_fma(b2, a0, c14); - c15 = v_fma(b3, a0, c15); - c16 = v_fma(b4, a0, c16); - c17 = v_fma(b5, a0, c17); - - a0 = v_setall_f32(a[3]); - c18 = v_fma(b0, a0, c18); - c19 = v_fma(b1, a0, c19); - c20 = v_fma(b2, a0, c20); - c21 = v_fma(b3, a0, c21); - c22 = v_fma(b4, a0, c22); - c23 = v_fma(b5, a0, c23); - } - - if (!init_c) - { - c0 += v_load(c); - c1 += v_load(c + 
4); - c2 += v_load(c + 8); - c3 += v_load(c + 12); - c4 += v_load(c + 16); - c5 += v_load(c + 20); - - c6 += v_load(c + ldc); - c7 += v_load(c + ldc + 4); - c8 += v_load(c + ldc + 8); - c9 += v_load(c + ldc + 12); - c10 += v_load(c + ldc + 16); - c11 += v_load(c + ldc + 20); - - c12 += v_load(c + ldc*2); - c13 += v_load(c + ldc*2 + 4); - c14 += v_load(c + ldc*2 + 8); - c15 += v_load(c + ldc*2 + 12); - c16 += v_load(c + ldc*2 + 16); - c17 += v_load(c + ldc*2 + 20); - - c18 += v_load(c + ldc*3); - c19 += v_load(c + ldc*3 + 4); - c20 += v_load(c + ldc*3 + 8); - c21 += v_load(c + ldc*3 + 12); - c22 += v_load(c + ldc*3 + 16); - c23 += v_load(c + ldc*3 + 20); - } - - v_store(c, c0); - v_store(c + 4, c1); - v_store(c + 8, c2); - v_store(c + 12, c3); - v_store(c + 16, c4); - v_store(c + 20, c5); - - v_store(c + ldc, c6); - v_store(c + ldc + 4, c7); - v_store(c + ldc + 8, c8); - v_store(c + ldc + 12, c9); - v_store(c + ldc + 16, c10); - v_store(c + ldc + 20, c11); - - v_store(c + ldc * 2, c12); - v_store(c + ldc * 2 + 4, c13); - v_store(c + ldc * 2 + 8, c14); - v_store(c + ldc * 2 + 12, c15); - v_store(c + ldc * 2 + 16, c16); - v_store(c + ldc * 2 + 20, c17); - - v_store(c + ldc * 3, c18); - v_store(c + ldc * 3 + 4, c19); - v_store(c + ldc * 3 + 8, c20); - v_store(c + ldc * 3 + 12, c21); - v_store(c + ldc * 3 + 16, c22); - v_store(c + ldc * 3 + 20, c23); -} -#endif - -static void convBlock4x8(int np, const float* a, const float* b, float* c, int ldc, bool init_c) -{ - CV_Assert(CONV_NR >= 4); - v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0; - v_float32x4 c4 = c0, c5 = c0, c6 = c0, c7 = c0; - - for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR) - { - v_float32x4 a0 = v_setall_f32(a[0]); - v_float32x4 a1 = v_setall_f32(a[1]); - v_float32x4 a2 = v_setall_f32(a[2]); - v_float32x4 a3 = v_setall_f32(a[3]); - - v_float32x4 b0 = v_load(b), b1 = v_load(b + 4); - - c0 = v_fma(b0, a0, c0); - c1 = v_fma(b1, a0, c1); - - c2 = v_fma(b0, a1, c2); - c3 = v_fma(b1, a1, c3); - - c4 = v_fma(b0, a2, c4); - c5 = v_fma(b1, a2, c5); - - c6 = v_fma(b0, a3, c6); - c7 = v_fma(b1, a3, c7); - } - - if (!init_c) - { - c0 += v_load(c); - c1 += v_load(c + 4); - - c2 += v_load(c + ldc); - c3 += v_load(c + ldc + 4); - - c4 += v_load(c + ldc*2); - c5 += v_load(c + ldc*2 + 4); - - c6 += v_load(c + ldc*3); - c7 += v_load(c + ldc*3 + 4); - } - - v_store(c, c0); - v_store(c + 4, c1); - v_store(c + ldc, c2); - v_store(c + ldc + 4, c3); - v_store(c + ldc * 2, c4); - v_store(c + ldc * 2 + 4, c5); - v_store(c + ldc * 3, c6); - v_store(c + ldc * 3 + 4, c7); -} - -static void convBlock4x4(int np, const float* a, const float* b, float* c, int ldc, bool init_c) -{ - CV_Assert(CONV_NR >= 4); - v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0; - - for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR) - { - v_float32x4 a0 = v_setall_f32(a[0]); - v_float32x4 a1 = v_setall_f32(a[1]); - v_float32x4 a2 = v_setall_f32(a[2]); - v_float32x4 a3 = v_setall_f32(a[3]); - - v_float32x4 b0 = v_load(b); - - c0 = v_fma(b0, a0, c0); - c1 = v_fma(b0, a1, c1); - c2 = v_fma(b0, a2, c2); - c3 = v_fma(b0, a3, c3); - } - - if (!init_c) - { - c0 += v_load(c); - c1 += v_load(c + ldc); - c2 += v_load(c + ldc*2); - c3 += v_load(c + ldc*3); - } - - v_store(c, c0); - v_store(c + ldc, c1); - v_store(c + ldc * 2, c2); - v_store(c + ldc * 3, c3); -} -#endif - -static void convBlockNoSIMD(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen) -{ - std::vector cbuffer(CONV_MR * outLen, 0); - float* cbuf = 
cbuffer.data(); - for( int p = 0; p < np; p++ ) - { - for( int i = 0; i < CONV_MR; i++ ) - { - float ai = a[CONV_MR*p + i]; - for( int j = 0; j < outLen; j++ ) - cbuf[i * outLen+j] += b[CONV_NR*p + j] * ai; - } - } - - if (!init_c) - { - for(int i = 0; i < CONV_MR; i++) - { - for(int j = 0; j < outLen; j++) - c[i*ldc + j] += cbuf[i*outLen + j]; - } - } - else - { - for(int i = 0; i < CONV_MR; i++) - { - for(int j = 0; j < outLen; j++) - c[i*ldc + j] = cbuf[i*outLen + j]; - } - } -} - -void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen) -{ - // The possible outLen range is [24, 8~1]. -#if CV_SIMD128 -#if CONV_MR == 4 && CONV_NR == 24 - const int CONV_NRby3 = CONV_NR/3; - if (outLen > CONV_NRby3) - { - convBlock4x24(np, a, b, c, ldc, init_c); - return; - } -#endif - - if (outLen <= 8 && outLen > 4) - { - convBlock4x8(np, a, b, c, ldc, init_c); - return; - } - - if (outLen <= 4 && outLen > 1) - { - convBlock4x4(np, a, b, c, ldc, init_c); - return; - } - convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen); -#else - convBlockNoSIMD(np, a, b, c, ldc, init_c, outLen); -#endif -} -} // namespace dnn - -namespace opt_NEON -{ -#if CV_TRY_NEON -void convBlock_NEON(int np, const float* a, const float* b, float* c, int ldc, bool init_c) -{ -#if CONV_MR == 4 && CONV_NR == 28 // AARCH64 - { - float32x4_t c00 = vdupq_n_f32(0.f), c01 = c00, c02 = c00, c03 = c00, c04 = c00, c05 = c00, c06 = c00; - float32x4_t c10 = vdupq_n_f32(0.f), c11 = c10, c12 = c10, c13 = c10, c14 = c10, c15 = c10, c16 = c10; - float32x4_t c20 = vdupq_n_f32(0.f), c21 = c20, c22 = c20, c23 = c20, c24 = c20, c25 = c20, c26 = c20; - float32x4_t c30 = vdupq_n_f32(0.f), c31 = c30, c32 = c30, c33 = c30, c34 = c30, c35 = c30, c36 = c30; - - for( int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR ) - { - float32x4_t a0 = vld1q_f32(a), b0, b1, b2; - b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8); - - c00 = vfmaq_laneq_f32(c00, b0, a0, 0); - c01 = vfmaq_laneq_f32(c01, b1, a0, 0); - c02 = vfmaq_laneq_f32(c02, b2, a0, 0); - c10 = vfmaq_laneq_f32(c10, b0, a0, 1); - c11 = vfmaq_laneq_f32(c11, b1, a0, 1); - c12 = vfmaq_laneq_f32(c12, b2, a0, 1); - c20 = vfmaq_laneq_f32(c20, b0, a0, 2); - c21 = vfmaq_laneq_f32(c21, b1, a0, 2); - c22 = vfmaq_laneq_f32(c22, b2, a0, 2); - c30 = vfmaq_laneq_f32(c30, b0, a0, 3); - c31 = vfmaq_laneq_f32(c31, b1, a0, 3); - c32 = vfmaq_laneq_f32(c32, b2, a0, 3); - - b0 = vld1q_f32(b + 12); b1 = vld1q_f32(b + 16); b2 = vld1q_f32(b + 20); - - c03 = vfmaq_laneq_f32(c03, b0, a0, 0); - c04 = vfmaq_laneq_f32(c04, b1, a0, 0); - c05 = vfmaq_laneq_f32(c05, b2, a0, 0); - c13 = vfmaq_laneq_f32(c13, b0, a0, 1); - c14 = vfmaq_laneq_f32(c14, b1, a0, 1); - c15 = vfmaq_laneq_f32(c15, b2, a0, 1); - c23 = vfmaq_laneq_f32(c23, b0, a0, 2); - c24 = vfmaq_laneq_f32(c24, b1, a0, 2); - c25 = vfmaq_laneq_f32(c25, b2, a0, 2); - c33 = vfmaq_laneq_f32(c33, b0, a0, 3); - c34 = vfmaq_laneq_f32(c34, b1, a0, 3); - c35 = vfmaq_laneq_f32(c35, b2, a0, 3); - - b0 = vld1q_f32(b + 24); - c06 = vfmaq_laneq_f32(c06, b0, a0, 0); - c16 = vfmaq_laneq_f32(c16, b0, a0, 1); - c26 = vfmaq_laneq_f32(c26, b0, a0, 2); - c36 = vfmaq_laneq_f32(c36, b0, a0, 3); - } - - if (!init_c) - { - c00 = vaddq_f32(c00, vld1q_f32(c)); - c01 = vaddq_f32(c01, vld1q_f32(c + 4)); - c02 = vaddq_f32(c02, vld1q_f32(c + 8)); - c03 = vaddq_f32(c03, vld1q_f32(c + 12)); - c04 = vaddq_f32(c04, vld1q_f32(c + 16)); - c05 = vaddq_f32(c05, vld1q_f32(c + 20)); - c06 = vaddq_f32(c06, vld1q_f32(c + 24)); - - c10 = vaddq_f32(c10, vld1q_f32(c 
+ ldc)); - c11 = vaddq_f32(c11, vld1q_f32(c + ldc + 4)); - c12 = vaddq_f32(c12, vld1q_f32(c + ldc + 8)); - c13 = vaddq_f32(c13, vld1q_f32(c + ldc + 12)); - c14 = vaddq_f32(c14, vld1q_f32(c + ldc + 16)); - c15 = vaddq_f32(c15, vld1q_f32(c + ldc + 20)); - c16 = vaddq_f32(c16, vld1q_f32(c + ldc + 24)); - - c20 = vaddq_f32(c20, vld1q_f32(c + ldc*2)); - c21 = vaddq_f32(c21, vld1q_f32(c + ldc*2 + 4)); - c22 = vaddq_f32(c22, vld1q_f32(c + ldc*2 + 8)); - c23 = vaddq_f32(c23, vld1q_f32(c + ldc*2 + 12)); - c24 = vaddq_f32(c24, vld1q_f32(c + ldc*2 + 16)); - c25 = vaddq_f32(c25, vld1q_f32(c + ldc*2 + 20)); - c26 = vaddq_f32(c26, vld1q_f32(c + ldc*2 + 24)); - - c30 = vaddq_f32(c30, vld1q_f32(c + ldc*3)); - c31 = vaddq_f32(c31, vld1q_f32(c + ldc*3 + 4)); - c32 = vaddq_f32(c32, vld1q_f32(c + ldc*3 + 8)); - c33 = vaddq_f32(c33, vld1q_f32(c + ldc*3 + 12)); - c34 = vaddq_f32(c34, vld1q_f32(c + ldc*3 + 16)); - c35 = vaddq_f32(c35, vld1q_f32(c + ldc*3 + 20)); - c36 = vaddq_f32(c36, vld1q_f32(c + ldc*3 + 24)); - } - - vst1q_f32(c, c00); vst1q_f32(c+4, c01); - vst1q_f32(c+8, c02); vst1q_f32(c+12, c03); - vst1q_f32(c+16, c04); vst1q_f32(c+20, c05); - vst1q_f32(c+24, c06); - - vst1q_f32(c+ldc, c10); vst1q_f32(c+ldc+4, c11); - vst1q_f32(c+ldc+8, c12); vst1q_f32(c+ldc+12, c13); - vst1q_f32(c+ldc+16, c14); vst1q_f32(c+ldc+20, c15); - vst1q_f32(c+ldc+24, c16); - - vst1q_f32(c+ldc*2, c20); vst1q_f32(c+ldc*2+4, c21); - vst1q_f32(c+ldc*2+8, c22); vst1q_f32(c+ldc*2+12, c23); - vst1q_f32(c+ldc*2+16, c24); vst1q_f32(c+ldc*2+20, c25); - vst1q_f32(c+ldc*2+24, c26); - - vst1q_f32(c+ldc*3, c30); vst1q_f32(c+ldc*3+4, c31); - vst1q_f32(c+ldc*3+8, c32); vst1q_f32(c+ldc*3+12, c33); - vst1q_f32(c+ldc*3+16, c34); vst1q_f32(c+ldc*3+20, c35); - vst1q_f32(c+ldc*3+24, c36); - } -#elif CONV_MR == 4 && CONV_NR == 12 // ARMv7 - { - float32x4_t c0 = vdupq_n_f32(0.f), c1 = c0, c2 = c0; - float32x4_t c3 = vdupq_n_f32(0.f), c4 = c3, c5 = c3; - float32x4_t c6 = vdupq_n_f32(0.f), c7 = c6, c8 = c6; - float32x4_t c9 = vdupq_n_f32(0.f), c10 = c9, c11 = c9; - - - float32x2_t a0 = vdup_n_f32(0.0f), a1 = a0; - float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f); - - for (int p = 0; p < np; p++, a += CONV_MR, b += CONV_NR) - { - a0 = vld1_f32(a), a1 = vld1_f32(a+2); - b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8); - - c0 = vmlaq_lane_f32(c0, b0, a0, 0); - c1 = vmlaq_lane_f32(c1, b1, a0, 0); - c2 = vmlaq_lane_f32(c2, b2, a0, 0); - - c3 = vmlaq_lane_f32(c3, b0, a0, 1); - c4 = vmlaq_lane_f32(c4, b1, a0, 1); - c5 = vmlaq_lane_f32(c5, b2, a0, 1); - - c6 = vmlaq_lane_f32(c6, b0, a1, 0); - c7 = vmlaq_lane_f32(c7, b1, a1, 0); - c8 = vmlaq_lane_f32(c8, b2, a1, 0); - - c9 = vmlaq_lane_f32(c9 , b0, a1, 1); - c10 = vmlaq_lane_f32(c10, b1, a1, 1); - c11 = vmlaq_lane_f32(c11, b2, a1, 1); - } - - if (!init_c) - { - c0 = vaddq_f32(c0, vld1q_f32(c)); - c1 = vaddq_f32(c1, vld1q_f32(c + 4)); - c2 = vaddq_f32(c2, vld1q_f32(c + 8)); - - c3 = vaddq_f32(c3, vld1q_f32(c + ldc)); - c4 = vaddq_f32(c4, vld1q_f32(c + ldc + 4)); - c5 = vaddq_f32(c5, vld1q_f32(c + ldc + 8)); - - c6 = vaddq_f32(c6, vld1q_f32(c + ldc * 2)); - c7 = vaddq_f32(c7, vld1q_f32(c + ldc * 2 + 4)); - c8 = vaddq_f32(c8, vld1q_f32(c + ldc * 2 + 8)); - - c9 = vaddq_f32(c9 , vld1q_f32(c + ldc * 3)); - c10 = vaddq_f32(c10, vld1q_f32(c + ldc * 3 + 4)); - c11 = vaddq_f32(c11, vld1q_f32(c + ldc * 3 + 8)); - } - - vst1q_f32(c, c0), vst1q_f32(c+4, c1), vst1q_f32(c+8, c2); - vst1q_f32(c + ldc, c3), vst1q_f32(c + ldc + 4, c4), vst1q_f32(c + ldc + 8, c5); - vst1q_f32(c + 
ldc*2, c6), vst1q_f32(c + ldc*2 + 4, c7), vst1q_f32(c + ldc*2 + 8, c8); - vst1q_f32(c + ldc*3, c9), vst1q_f32(c + ldc*3 + 4, c10), vst1q_f32(c + ldc*3 + 8, c11); - } -//#else -//#error "unsupported CONV_MR and/or CONV_NR in convBlock_NEON." -#endif -} -#endif -} // namespace opt_NEON - -} // namespace cv -#endif //OPENCV_FAST_CONVOLUTION_SIMD_HPP diff --git a/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp b/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp deleted file mode 100644 index b0ccfd0cd2..0000000000 --- a/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp +++ /dev/null @@ -1,1153 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv_Winograd.fx). -// Here is the original license: -/* - This file is a part of ficus language project. - See ficus/LICENSE for the licensing terms -*/ - -#include "../../precomp.hpp" -#include "fast_convolution.hpp" - -namespace cv { namespace dnn { - -#if CV_NEON || CV_SIMD128 || CV_TRY_AVX2 -enum { VEC_ALIGN = 32, DFT_TYPE = CV_32F }; // Memory alignment. - -static void -_fx_winograd_accum_f32(const float* inwptr, const float* wptr, - float* outbuf, int Cg, int iblock) - { -#if CV_NEON && CV_NEON_AARCH64 - CV_Assert(_FX_WINO_IBLOCK == 6 && _FX_WINO_KBLOCK == 4 && _FX_WINO_ATOM_F32 == 4); - if (iblock > 3) - { - for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++, - outbuf += _FX_WINO_ATOM_F32) - { - float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00, s03 = s00, s04 = s00, s05 = s00; - float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00, s13 = s00, s14 = s00, s15 = s00; - float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00, s23 = s00, s24 = s00, s25 = s00; - float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00, s33 = s00, s34 = s00, s35 = s00; - for (int c = 0; c < Cg; c++, inwptr += _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32, - wptr += _FX_WINO_KBLOCK*_FX_WINO_ATOM_F32) { - float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4); - float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12); - float32x4_t x0, x1; - x0 = vld1q_f32(inwptr); - x1 = vld1q_f32(inwptr + 4); - s00 = vfmaq_f32(s00, w0, x0); - s01 = vfmaq_f32(s01, w0, x1); - s10 = vfmaq_f32(s10, w1, x0); - s11 = vfmaq_f32(s11, w1, x1); - s20 = vfmaq_f32(s20, w2, x0); - s21 = vfmaq_f32(s21, w2, x1); - s30 = vfmaq_f32(s30, w3, x0); - s31 = vfmaq_f32(s31, w3, x1); - x0 = vld1q_f32(inwptr + 8); - x1 = vld1q_f32(inwptr + 12); - s02 = vfmaq_f32(s02, w0, x0); - s03 = vfmaq_f32(s03, w0, x1); - s12 = vfmaq_f32(s12, w1, x0); - s13 = vfmaq_f32(s13, w1, x1); - s22 = vfmaq_f32(s22, w2, x0); - s23 = vfmaq_f32(s23, w2, x1); - s32 = vfmaq_f32(s32, w3, x0); - s33 = vfmaq_f32(s33, w3, x1); - x0 = vld1q_f32(inwptr + 16); - x1 = vld1q_f32(inwptr + 20); - s04 = vfmaq_f32(s04, w0, x0); - s05 = vfmaq_f32(s05, w0, x1); - s14 = vfmaq_f32(s14, w1, x0); - s15 = vfmaq_f32(s15, w1, x1); - s24 = vfmaq_f32(s24, w2, x0); - s25 = vfmaq_f32(s25, w2, x1); - s34 = vfmaq_f32(s34, w3, x0); - s35 = vfmaq_f32(s35, w3, x1); - } - - vst1q_f32(outbuf, s00); - vst1q_f32(outbuf + 1*64, s01); - vst1q_f32(outbuf + 2*64, s02); - vst1q_f32(outbuf + 3*64, s03); - vst1q_f32(outbuf + 4*64, s04); - vst1q_f32(outbuf + 5*64, s05); - - vst1q_f32(outbuf + 6*64, s10); - vst1q_f32(outbuf + 7*64, s11); - vst1q_f32(outbuf + 8*64, 
s12); - vst1q_f32(outbuf + 9*64, s13); - vst1q_f32(outbuf + 10*64, s14); - vst1q_f32(outbuf + 11*64, s15); - - vst1q_f32(outbuf + 12*64, s20); - vst1q_f32(outbuf + 13*64, s21); - vst1q_f32(outbuf + 14*64, s22); - vst1q_f32(outbuf + 15*64, s23); - vst1q_f32(outbuf + 16*64, s24); - vst1q_f32(outbuf + 17*64, s25); - - vst1q_f32(outbuf + 18*64, s30); - vst1q_f32(outbuf + 19*64, s31); - vst1q_f32(outbuf + 20*64, s32); - vst1q_f32(outbuf + 21*64, s33); - vst1q_f32(outbuf + 22*64, s34); - vst1q_f32(outbuf + 23*64, s35); - } - } - else - { - for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++, - outbuf += _FX_WINO_ATOM_F32) - { - float32x4_t s00 = vdupq_n_f32(0.f), s01 = s00, s02 = s00; - float32x4_t s10 = vdupq_n_f32(0.f), s11 = s00, s12 = s00; - float32x4_t s20 = vdupq_n_f32(0.f), s21 = s00, s22 = s00; - float32x4_t s30 = vdupq_n_f32(0.f), s31 = s00, s32 = s00; - for (int c = 0; c < Cg; c++, inwptr += _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32, - wptr += _FX_WINO_KBLOCK*_FX_WINO_ATOM_F32) { - float32x4_t w0 = vld1q_f32(wptr), w1 = vld1q_f32(wptr + 4); - float32x4_t w2 = vld1q_f32(wptr + 8), w3 = vld1q_f32(wptr + 12); - float32x4_t x0, x1, x2; - x0 = vld1q_f32(inwptr); - x1 = vld1q_f32(inwptr + 4); - x2 = vld1q_f32(inwptr + 8); - s00 = vfmaq_f32(s00, w0, x0); - s01 = vfmaq_f32(s01, w0, x1); - s02 = vfmaq_f32(s02, w0, x2); - s10 = vfmaq_f32(s10, w1, x0); - s11 = vfmaq_f32(s11, w1, x1); - s12 = vfmaq_f32(s12, w1, x2); - s20 = vfmaq_f32(s20, w2, x0); - s21 = vfmaq_f32(s21, w2, x1); - s22 = vfmaq_f32(s22, w2, x2); - s30 = vfmaq_f32(s30, w3, x0); - s31 = vfmaq_f32(s31, w3, x1); - s32 = vfmaq_f32(s32, w3, x2); - } - - vst1q_f32(outbuf, s00); - vst1q_f32(outbuf + 1*64, s01); - vst1q_f32(outbuf + 2*64, s02); - vst1q_f32(outbuf + 6*64, s10); - vst1q_f32(outbuf + 7*64, s11); - vst1q_f32(outbuf + 8*64, s12); - vst1q_f32(outbuf + 12*64, s20); - vst1q_f32(outbuf + 13*64, s21); - vst1q_f32(outbuf + 14*64, s22); - vst1q_f32(outbuf + 18*64, s30); - vst1q_f32(outbuf + 19*64, s31); - vst1q_f32(outbuf + 20*64, s32); - } - } -#elif CV_SIMD128 - CV_Assert(_FX_WINO_IBLOCK == 3 && _FX_WINO_KBLOCK == 4 && _FX_WINO_ATOM_F32 == 4); - for (int atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; atom_id++, - outbuf += _FX_WINO_ATOM_F32) - { - v_float32x4 s00 = v_setzero_f32(), s01 = s00, s02 = s00; - v_float32x4 s10 = v_setzero_f32(), s11 = s00, s12 = s00; - v_float32x4 s20 = v_setzero_f32(), s21 = s00, s22 = s00; - v_float32x4 s30 = v_setzero_f32(), s31 = s00, s32 = s00; - - for (int c = 0; c < Cg; c++, inwptr += _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32, - wptr += _FX_WINO_KBLOCK*_FX_WINO_ATOM_F32) - { - v_float32x4 x0, x1, x2; - x0 = v_load(inwptr); - x1 = v_load(inwptr + 4); - x2 = v_load(inwptr + 8); - - v_float32x4 w0 = v_load(wptr); - s00 = v_fma(w0, x0, s00); - s01 = v_fma(w0, x1, s01); - s02 = v_fma(w0, x2, s02); - - w0 = v_load(wptr + 4); - s10 = v_fma(w0, x0, s10); - s11 = v_fma(w0, x1, s11); - s12 = v_fma(w0, x2, s12); - - w0 = v_load(wptr + 8); - s20 = v_fma(w0, x0, s20); - s21 = v_fma(w0, x1, s21); - s22 = v_fma(w0, x2, s22); - - w0 = v_load(wptr + 12); - s30 = v_fma(w0, x0, s30); - s31 = v_fma(w0, x1, s31); - s32 = v_fma(w0, x2, s32); - } - - v_store(outbuf, s00); - v_store(outbuf + 1*64, s01); - v_store(outbuf + 2*64, s02); - v_store(outbuf + 3*64, s10); - v_store(outbuf + 4*64, s11); - v_store(outbuf + 5*64, s12); - v_store(outbuf + 6*64, s20); - v_store(outbuf + 7*64, s21); - v_store(outbuf + 8*64, s22); - v_store(outbuf + 9*64, s30); - v_store(outbuf + 10*64, s31); - v_store(outbuf + 11*64, s32); - } -#else - for (int 
atom_id = 0; atom_id < _FX_WINO_NATOMS_F32; - atom_id++, outbuf += _FX_WINO_ATOM_F32) - { - float sumbuf[_FX_WINO_IBLOCK*_FX_WINO_KBLOCK*_FX_WINO_ATOM_F32]; - memset(sumbuf, 0, sizeof(sumbuf)); - for (int c = 0; c < Cg; c++, inwptr += _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32, - wptr += _FX_WINO_KBLOCK*_FX_WINO_ATOM_F32) - { - for (int i = 0; i < _FX_WINO_KBLOCK; i++) - { - for (int j = 0; j < _FX_WINO_IBLOCK; j++) - { - int i_ = i*_FX_WINO_ATOM_F32; - int j_ = j*_FX_WINO_ATOM_F32; - int ij_ = i_*_FX_WINO_IBLOCK + j_; - float s0 = inwptr[j_ + 0]*wptr[i_ + 0]; - float s1 = inwptr[j_ + 1]*wptr[i_ + 1]; - float s2 = inwptr[j_ + 2]*wptr[i_ + 2]; - float s3 = inwptr[j_ + 3]*wptr[i_ + 3]; - sumbuf[ij_ + 0] += s0; - sumbuf[ij_ + 1] += s1; - sumbuf[ij_ + 2] += s2; - sumbuf[ij_ + 3] += s3; - } - } - } - for (int ij = 0; ij < _FX_WINO_KBLOCK*_FX_WINO_IBLOCK; ij++) - { - int ij_ = ij*_FX_WINO_ATOM_F32; - int ij_out = ij*_FX_WINO_AREA; - outbuf[ij_out + 0] = sumbuf[ij_ + 0]; - outbuf[ij_out + 1] = sumbuf[ij_ + 1]; - outbuf[ij_out + 2] = sumbuf[ij_ + 2]; - outbuf[ij_out + 3] = sumbuf[ij_ + 3]; - } - } -#endif -} - -#if CV_NEON -#define T4x4(a, b, c, d, tr0, tr1) \ - tr0 = vtrnq_f32(a, b); \ - tr1 = vtrnq_f32(c, d); \ - a = vcombine_f32(vget_low_f32(tr0.val[0]), vget_low_f32(tr1.val[0])); \ - b = vcombine_f32(vget_low_f32(tr0.val[1]), vget_low_f32(tr1.val[1])); \ - c = vcombine_f32(vget_high_f32(tr0.val[0]), vget_high_f32(tr1.val[0])); \ - d = vcombine_f32(vget_high_f32(tr0.val[1]), vget_high_f32(tr1.val[1])) -#endif - -/*Input transform*/ -static void -_fx_winograd_BtXB_8x8_f32(const float* inptr, int inpstep, - float* outptr, int Cg) -{ -#if CV_NEON && CV_NEON_AARCH64 - float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4); - float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4); - float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4); - float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4); - float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4); - float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4); - float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4); - float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4); - - float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71; - - { - /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */ - /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */ - float32x4_t q5_25 = vdupq_n_f32(5.25f), t00, t01, t10, t11; - t00 = vsubq_f32(x40, x20); - t01 = vsubq_f32(x41, x21); - t10 = vsubq_f32(x30, x50); - t11 = vsubq_f32(x31, x51); - float32x4_t y00 = vfmaq_f32(vsubq_f32(x00, x60), t00, q5_25); - float32x4_t y01 = vfmaq_f32(vsubq_f32(x01, x61), t01, q5_25); - float32x4_t y70 = vfmaq_f32(vsubq_f32(x70, x10), t10, q5_25); - float32x4_t y71 = vfmaq_f32(vsubq_f32(x71, x11), t11, q5_25); - - /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */ - /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */ - float32x4_t qm4_25 = vdupq_n_f32(-4.25f); - t00 = vfmaq_f32(vaddq_f32(x10, x50), x30, qm4_25); - t01 = vfmaq_f32(vaddq_f32(x11, x51), x31, qm4_25); - t10 = vfmaq_f32(vaddq_f32(x20, x60), x40, qm4_25); - t11 = vfmaq_f32(vaddq_f32(x21, x61), x41, qm4_25); - - float32x4_t y10 = vaddq_f32(t00, t10), y11 = vaddq_f32(t01, t11); - float32x4_t y20 = vsubq_f32(t10, t00), y21 = vsubq_f32(t11, 
t01); - - /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */ - /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */ - float32x4_t q0_5 = vdupq_n_f32(0.5f), q0_25 = vdupq_n_f32(0.25f); - float32x4_t qm2_5 = vdupq_n_f32(-2.5f), qm1_25 = vdupq_n_f32(-1.25f); - t00 = vfmaq_f32(vaddq_f32(x50, x50), x10, q0_5); - t01 = vfmaq_f32(vaddq_f32(x51, x51), x11, q0_5); - t10 = vfmaq_f32(x60, x20, q0_25); - t11 = vfmaq_f32(x61, x21, q0_25); - t00 = vfmaq_f32(t00, x30, qm2_5); - t01 = vfmaq_f32(t01, x31, qm2_5); - t10 = vfmaq_f32(t10, x40, qm1_25); - t11 = vfmaq_f32(t11, x41, qm1_25); - - float32x4_t y30 = vaddq_f32(t00, t10), y31 = vaddq_f32(t01, t11); - float32x4_t y40 = vsubq_f32(t10, t00), y41 = vsubq_f32(t11, t01); - - /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */ - /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */ - float32x4_t q4 = vdupq_n_f32(4.f), qm5 = vdupq_n_f32(-5.f); - t00 = vfmaq_f32(vaddq_f32(x10, x10), x50, q0_5); - t01 = vfmaq_f32(vaddq_f32(x11, x11), x51, q0_5); - t10 = vfmaq_f32(x60, x20, q4); - t11 = vfmaq_f32(x61, x21, q4); - t00 = vfmaq_f32(t00, x30, qm2_5); - t01 = vfmaq_f32(t01, x31, qm2_5); - t10 = vfmaq_f32(t10, x40, qm5); - t11 = vfmaq_f32(t11, x41, qm5); - - float32x4_t y50 = vaddq_f32(t00, t10), y51 = vaddq_f32(t01, t11); - float32x4_t y60 = vsubq_f32(t10, t00), y61 = vsubq_f32(t11, t01); - - /* transpose 8x8 matrix in-place with some renumeration of the elements: */ - /* Y: */ - /* y00 y01 */ - /* y10 y11 */ - /* ... */ - /* y70 y71 */ - /* Y': */ - /* y00 y40 */ - /* y10 y50 */ - /* y20 y60 */ - /* y30 y70 */ - /* y01 y41 */ - /* y11 y51 */ - /* y21 y61 */ - /* y31 y71 */ - /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */ - float32x4x2_t tr0, tr1; - - T4x4(y00, y10, y20, y30, tr0, tr1); - T4x4(y01, y11, y21, y31, tr0, tr1); - T4x4(y40, y50, y60, y70, tr0, tr1); - T4x4(y41, y51, y61, y71, tr0, tr1); - - /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */ - /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */ - t00 = vsubq_f32(y01, y20); - t01 = vsubq_f32(y41, y60); - t10 = vsubq_f32(y30, y11); - t11 = vsubq_f32(y70, y51); - z00 = vfmaq_f32(vsubq_f32(y00, y21), t00, q5_25); - z01 = vfmaq_f32(vsubq_f32(y40, y61), t01, q5_25); - z70 = vfmaq_f32(vsubq_f32(y31, y10), t10, q5_25); - z71 = vfmaq_f32(vsubq_f32(y71, y50), t11, q5_25); - - /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */ - /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */ - t00 = vfmaq_f32(vaddq_f32(y10, y11), y30, qm4_25); - t01 = vfmaq_f32(vaddq_f32(y50, y51), y70, qm4_25); - t10 = vfmaq_f32(vaddq_f32(y20, y21), y01, qm4_25); - t11 = vfmaq_f32(vaddq_f32(y60, y61), y41, qm4_25); - - z10 = vaddq_f32(t00, t10); z11 = vaddq_f32(t01, t11); - z20 = vsubq_f32(t10, t00); z21 = vsubq_f32(t11, t01); - - /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */ - /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */ - t00 = vfmaq_f32(vaddq_f32(y11, y11), y10, q0_5); - t01 = vfmaq_f32(vaddq_f32(y51, y51), y50, q0_5); - t10 = vfmaq_f32(y21, y20, q0_25); - t11 = vfmaq_f32(y61, y60, q0_25); - t00 = vfmaq_f32(t00, y30, qm2_5); - t01 = vfmaq_f32(t01, y70, qm2_5); - t10 = vfmaq_f32(t10, y01, qm1_25); - t11 = vfmaq_f32(t11, y41, qm1_25); - - z30 = vaddq_f32(t00, t10); z31 = vaddq_f32(t01, t11); - z40 = vsubq_f32(t10, t00); z41 = vsubq_f32(t11, t01); - - /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */ - /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */ - t00 = 
vfmaq_f32(vaddq_f32(y10, y10), y11, q0_5); - t01 = vfmaq_f32(vaddq_f32(y50, y50), y51, q0_5); - t10 = vfmaq_f32(y21, y20, q4); - t11 = vfmaq_f32(y61, y60, q4); - t00 = vfmaq_f32(t00, y30, qm2_5); - t01 = vfmaq_f32(t01, y70, qm2_5); - t10 = vfmaq_f32(t10, y01, qm5); - t11 = vfmaq_f32(t11, y41, qm5); - - z50 = vaddq_f32(t00, t10); z51 = vaddq_f32(t01, t11); - z60 = vsubq_f32(t10, t00); z61 = vsubq_f32(t11, t01); - } - - const int outstep = _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32*Cg; - - vst1q_f32(outptr, z00); - vst1q_f32(outptr + outstep, z01); - vst1q_f32(outptr + outstep*2, z10); - vst1q_f32(outptr + outstep*3, z11); - vst1q_f32(outptr + outstep*4, z20); - vst1q_f32(outptr + outstep*5, z21); - vst1q_f32(outptr + outstep*6, z30); - vst1q_f32(outptr + outstep*7, z31); - vst1q_f32(outptr + outstep*8, z40); - vst1q_f32(outptr + outstep*9, z41); - vst1q_f32(outptr + outstep*10, z50); - vst1q_f32(outptr + outstep*11, z51); - vst1q_f32(outptr + outstep*12, z60); - vst1q_f32(outptr + outstep*13, z61); - vst1q_f32(outptr + outstep*14, z70); - vst1q_f32(outptr + outstep*15, z71); -#elif CV_SIMD128 - v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4); - v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4); - v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4); - v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4); - v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4); - v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4); - v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4); - v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4); - - v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71; - - { - /* Y[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*X */ - /* Y[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*X */ - v_float32x4 q5_25 = v_setall_f32(5.25f), t00, t01, t10, t11; - t00 = x40 - x20; - t01 = x41 - x21; - t10 = x30 - x50; - t11 = x31 - x51; - v_float32x4 y00 = v_fma(t00, q5_25, x00 - x60); - v_float32x4 y01 = v_fma(t01, q5_25, x01 - x61); - v_float32x4 y70 = v_fma(t10, q5_25, x70 - x10); - v_float32x4 y71 = v_fma(t11, q5_25, x71 - x11); - - /* Y[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*X */ - /* Y[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*X */ - v_float32x4 qm4_25 = v_setall_f32(-4.25f); - t00 = v_fma(x30, qm4_25, x10 + x50); - t01 = v_fma(x31, qm4_25, x11 + x51); - t10 = v_fma(x40, qm4_25, x20 + x60); - t11 = v_fma(x41, qm4_25, x21 + x61); - - v_float32x4 y10 = t00 + t10, y11 = t01 + t11; - v_float32x4 y20 = t10 - t00, y21 = t11 - t01; - - /* Y[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*X */ - /* Y[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*X */ - v_float32x4 q0_5 = v_setall_f32(0.5f), q0_25 = v_setall_f32(0.25f); - v_float32x4 qm2_5 = v_setall_f32(-2.5f), qm1_25 = v_setall_f32(-1.25f); - t00 = v_fma(x10, q0_5, x50 + x50); - t01 = v_fma(x11, q0_5, x51 + x51); - t10 = v_fma(x20, q0_25, x60); - t11 = v_fma(x21, q0_25, x61); - t00 = v_fma(x30, qm2_5, t00); - t01 = v_fma(x31, qm2_5, t01); - t10 = v_fma(x40, qm1_25, t10); - t11 = v_fma(x41, qm1_25, t11); - - v_float32x4 y30 = t00 + t10, y31 = t01 + t11; - v_float32x4 y40 = t10 - t00, y41 = t11 - t01; - - /* Y[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*X */ - /* Y[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*X */ - v_float32x4 q4 = v_setall_f32(4.f), 
qm5 = v_setall_f32(-5.f); - t00 = v_fma(x50, q0_5, x10 + x10); - t01 = v_fma(x51, q0_5, x11 + x11); - t10 = v_fma(x20, q4 , x60); - t11 = v_fma(x21, q4 , x61); - t00 = v_fma(x30, qm2_5, t00); - t01 = v_fma(x31, qm2_5, t01); - t10 = v_fma(x40, qm5 , t10); - t11 = v_fma(x41, qm5 , t11); - - v_float32x4 y50 = t00 + t10, y51 = t01 + t11; - v_float32x4 y60 = t10 - t00, y61 = t11 - t01; - - /* transpose 8x8 matrix in-place with some renumeration of the elements: */ - /* Y: */ - /* y00 y01 */ - /* y10 y11 */ - /* ... */ - /* y70 y71 */ - /* Y': */ - /* y00 y40 */ - /* y10 y50 */ - /* y20 y60 */ - /* y30 y70 */ - /* y01 y41 */ - /* y11 y51 */ - /* y21 y61 */ - /* y31 y71 */ - /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */ - - v_transpose4x4(y00, y10, y20, y30, y00, y10, y20, y30); - v_transpose4x4(y01, y11, y21, y31, y01, y11, y21, y31); - v_transpose4x4(y40, y50, y60, y70, y40, y50, y60, y70); - v_transpose4x4(y41, y51, y61, y71, y41, y51, y61, y71); - - /* Z[0] = [1.f, 0.f, -5.25f, 0.f, 5.25f, 0.f, -1.f, 0.f]*Y */ - /* Z[7] = [0.f, -1.f, 0.f, 5.25f, 0.f, -5.25f, 0.f, 1.f]*Y */ - t00 = y01 - y20; - t01 = y41 - y60; - t10 = y30 - y11; - t11 = y70 - y51; - z00 = v_fma(t00, q5_25, y00 - y21); - z01 = v_fma(t01, q5_25, y40 - y61); - z70 = v_fma(t10, q5_25, y31 - y10); - z71 = v_fma(t11, q5_25, y71 - y50); - - /* Z[1] = [0.f, 1.f, 1.f, -4.25f, -4.25f, 1.f, 1.f, 0.f]*Y */ - /* Z[2] = [0.f, -1.f, 1.f, 4.25f, -4.25f, -1.f, 1.f, 0.f]*Y */ - t00 = v_fma(y30, qm4_25, y10 + y11); - t01 = v_fma(y70, qm4_25, y50 + y51); - t10 = v_fma(y01, qm4_25, y20 + y21); - t11 = v_fma(y41, qm4_25, y60 + y61); - - z10 = t00 + t10; z11 = t01 + t11; - z20 = t10 - t00; z21 = t11 - t01; - - /* Z[3] = [0.f, 0.5f, 0.25f, -2.5f, -1.25f, 2.f, 1.f, 0.f]*Y */ - /* Z[4] = [0.f, -0.5f, 0.25f, 2.5f, -1.25f, -2.f, 1.f, 0.f]*Y */ - t00 = v_fma(y10, q0_5, y11 + y11); - t01 = v_fma(y50, q0_5, y51 + y51); - t10 = v_fma(y20, q0_25, y21); - t11 = v_fma(y60, q0_25, y61); - t00 = v_fma(y30, qm2_5, t00); - t01 = v_fma(y70, qm2_5, t01); - t10 = v_fma(y01, qm1_25, t10); - t11 = v_fma(y41, qm1_25, t11); - - z30 = t00 + t10; z31 = t01 + t11; - z40 = t10 - t00; z41 = t11 - t01; - - /* Z[5] = [0.f, 2.f, 4.f, -2.5f, -5.f, 0.5f, 1.f, 0.f]*Y */ - /* Z[6] = [0.f, -2.f, 4.f, 2.5f, -5.f, -0.5f, 1.f, 0.f]*Y */ - t00 = v_fma(y11, q0_5, y10 + y10); - t01 = v_fma(y51, q0_5, y50 + y50); - t10 = v_fma(y20, q4, y21); - t11 = v_fma(y60, q4, y61); - t00 = v_fma(y30, qm2_5, t00); - t01 = v_fma(y70, qm2_5, t01); - t10 = v_fma(y01, qm5, t10); - t11 = v_fma(y41, qm5, t11); - - z50 = t00 + t10; z51 = t01 + t11; - z60 = t10 - t00; z61 = t11 - t01; - } - - const int outstep = _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32*Cg; - - v_store(outptr, z00); - v_store(outptr + outstep, z01); - v_store(outptr + outstep*2, z10); - v_store(outptr + outstep*3, z11); - v_store(outptr + outstep*4, z20); - v_store(outptr + outstep*5, z21); - v_store(outptr + outstep*6, z30); - v_store(outptr + outstep*7, z31); - v_store(outptr + outstep*8, z40); - v_store(outptr + outstep*9, z41); - v_store(outptr + outstep*10, z50); - v_store(outptr + outstep*11, z51); - v_store(outptr + outstep*12, z60); - v_store(outptr + outstep*13, z61); - v_store(outptr + outstep*14, z70); - v_store(outptr + outstep*15, z71); -#else -#error "Only SIMD128, AVX2 and NEON are supported in Winograd." 
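/* For reference, a minimal scalar sketch of the 8x8 Winograd input transform
   computed above (V = Bt * d * Bt'), assuming the Bt rows listed in the
   Y[0]..Y[7] comments and ignoring the transposed/renumbered register layout
   the vectorized code uses internally. The name and the dense 8x8 output are
   illustrative only; the code above scatters its result with an outstep of
   _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32*Cg so that _fx_winograd_accum_f32 can
   consume it blockwise. */
static void winograd_BtXB_8x8_ref(const float* d, int dstep, float* V)
{
    static const float Bt[8][8] = {
        {1.f,  0.f,   -5.25f,  0.f,    5.25f,  0.f,    -1.f, 0.f},
        {0.f,  1.f,    1.f,   -4.25f, -4.25f,  1.f,     1.f, 0.f},
        {0.f, -1.f,    1.f,    4.25f, -4.25f, -1.f,     1.f, 0.f},
        {0.f,  0.5f,   0.25f, -2.5f,  -1.25f,  2.f,     1.f, 0.f},
        {0.f, -0.5f,   0.25f,  2.5f,  -1.25f, -2.f,     1.f, 0.f},
        {0.f,  2.f,    4.f,   -2.5f,  -5.f,    0.5f,    1.f, 0.f},
        {0.f, -2.f,    4.f,    2.5f,  -5.f,   -0.5f,    1.f, 0.f},
        {0.f, -1.f,    0.f,    5.25f,  0.f,   -5.25f,   0.f, 1.f}
    };
    float t[8][8];
    // pass 1 (rows): t = Bt * d
    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++)
        {
            float s = 0.f;
            for (int k = 0; k < 8; k++)
                s += Bt[i][k] * d[k*dstep + j];
            t[i][j] = s;
        }
    // pass 2 (columns): V = t * Bt^T
    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++)
        {
            float s = 0.f;
            for (int k = 0; k < 8; k++)
                s += t[i][k] * Bt[j][k];
            V[i*8 + j] = s;
        }
}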
-#endif -} - -/* Inverse Winograd 8x8 transform: - out = (A'*inp*A)', where - inp is input 8x8 FP32 matrix, - A' is - [1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, - 0.f, 1.f, -1.f, 2.f, -2.f, 0.5f, -0.5f, 0.f, - 0.f, 1.f, 1.f, 4.f, 4.f, 0.25f, 0.25f, 0.f, - 0.f, 1.f, -1.f, 8.f, -8.f, 0.125f, -0.125f, 0.f, - 0.f, 1.f, 1.f, 16.f, 16.f, 1.f/16, 1.f/16, 0.f, - 0.f, 1.f, -1.f, 32.f, -32.f, 1.f/32, -1.f/32, 1.f] - - inp is pre-loaded into xij registers, - out will be stored in zij, where (0<=i<=7 for x, 0<=i<=5 for z), 0<=j<=1. - - After the inverse transform is done, we add bias, - optionally add results from the earlier tensors (by-pass), - optionally apply activation function and then - store the final results. - - Note that both _FX_WINOGRAD_FWD_8x8() and - _FX_WINOGRAD_INV_8x8() produce tranposed output. - That is, after both forward and then inverse transformation, - we get non-transposed result. - Of course, for the correct work of Winograd-based convolution, - the Winograd-transformed weights should also be transposed. - init_conv() (see OpConv.fx) takes care of that. -*/ -static void -_fx_winograd_AtXA_8x8_f32(const float* inptr, int inpstep, - float* bpptr, int bpstep, float* outptr, int outstep, - float bias, float minval, float maxval, bool ifMinMaxAct) -{ -#if CV_NEON && CV_NEON_AARCH64 - float32x4_t x00 = vld1q_f32(inptr), x01 = vld1q_f32(inptr + 4); - float32x4_t x10 = vld1q_f32(inptr + inpstep), x11 = vld1q_f32(inptr + inpstep + 4); - float32x4_t x20 = vld1q_f32(inptr + inpstep*2), x21 = vld1q_f32(inptr + inpstep*2 + 4); - float32x4_t x30 = vld1q_f32(inptr + inpstep*3), x31 = vld1q_f32(inptr + inpstep*3 + 4); - float32x4_t x40 = vld1q_f32(inptr + inpstep*4), x41 = vld1q_f32(inptr + inpstep*4 + 4); - float32x4_t x50 = vld1q_f32(inptr + inpstep*5), x51 = vld1q_f32(inptr + inpstep*5 + 4); - float32x4_t x60 = vld1q_f32(inptr + inpstep*6), x61 = vld1q_f32(inptr + inpstep*6 + 4); - float32x4_t x70 = vld1q_f32(inptr + inpstep*7), x71 = vld1q_f32(inptr + inpstep*7 + 4); - float32x4_t z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51; - - { - float32x4_t s12_0, s12_1, s34_0, s34_1, s56_0, s56_1; - s12_0 = vaddq_f32(x10, x20); s12_1 = vaddq_f32(x11, x21); - s34_0 = vaddq_f32(x30, x40); s34_1 = vaddq_f32(x31, x41); - s56_0 = vaddq_f32(x50, x60); s56_1 = vaddq_f32(x51, x61); - - float32x4_t y00 = vaddq_f32(vaddq_f32(vaddq_f32(x00, s12_0), s34_0), s56_0); - float32x4_t y01 = vaddq_f32(vaddq_f32(vaddq_f32(x01, s12_1), s34_1), s56_1); - float32x4_t y20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f); - float32x4_t y21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f); - float32x4_t y40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16); - float32x4_t y41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16); - - s12_0 = vsubq_f32(x10, x20); s12_1 = vsubq_f32(x11, x21); - s34_0 = vsubq_f32(x30, x40); s34_1 = vsubq_f32(x31, x41); - s56_0 = vsubq_f32(x50, x60); s56_1 = vsubq_f32(x51, x61); - - float32x4_t y50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x70, s12_0), - s34_0, 32.f), s56_0, 1.f/32); - float32x4_t y51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(x71, s12_1), - s34_1, 32.f), s56_1, 1.f/32); - float32x4_t y10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f); - float32x4_t y11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f); - float32x4_t y30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f); - float32x4_t y31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f); - float32x4_t y60 = vdupq_n_f32(0.f), y61 = y60, 
y70 = y60, y71 = y60; - - /* transpose 8x8 matrix in-place with some renumeration of the elements: */ - /* Y: */ - /* y00 y01 */ - /* y10 y11 */ - /* ... */ - /* y50 y51 */ - /* 0 0 */ - /* 0 0 */ - /* Y': */ - /* y00 y40 */ - /* y10 y50 */ - /* y20 y60 */ - /* y30 y70 */ - /* y01 y41 */ - /* y11 y51 */ - /* y21 y61 */ - /* y31 y71 */ - /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */ - float32x4x2_t tr0, tr1; - - T4x4(y00, y10, y20, y30, tr0, tr1); - T4x4(y01, y11, y21, y31, tr0, tr1); - T4x4(y40, y50, y60, y70, tr0, tr1); - T4x4(y41, y51, y61, y71, tr0, tr1); - - s12_0 = vaddq_f32(y10, y20); s12_1 = vaddq_f32(y50, y60); - s34_0 = vaddq_f32(y30, y01); s34_1 = vaddq_f32(y70, y41); - s56_0 = vaddq_f32(y11, y21); s56_1 = vaddq_f32(y51, y61); - - z00 = vaddq_f32(vaddq_f32(vaddq_f32(y00, s12_0), s34_0), s56_0); - z01 = vaddq_f32(vaddq_f32(vaddq_f32(y40, s12_1), s34_1), s56_1); - z20 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 4.0f), s56_0, 0.25f); - z21 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 4.0f), s56_1, 0.25f); - z40 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 16.0f), s56_0, 1.f/16); - z41 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 16.0f), s56_1, 1.f/16); - - s12_0 = vsubq_f32(y10, y20); s12_1 = vsubq_f32(y50, y60); - s34_0 = vsubq_f32(y30, y01); s34_1 = vsubq_f32(y70, y41); - s56_0 = vsubq_f32(y11, y21); s56_1 = vsubq_f32(y51, y61); - - z50 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y31, s12_0), - s34_0, 32.f), s56_0, 1.f/32); - z51 = vfmaq_n_f32(vfmaq_n_f32(vaddq_f32(y71, s12_1), - s34_1, 32.f), s56_1, 1.f/32); - z10 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 2.0f), s56_0, 0.5f); - z11 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 2.0f), s56_1, 0.5f); - z30 = vfmaq_n_f32(vfmaq_n_f32(s12_0, s34_0, 8.0f), s56_0, 0.125f); - z31 = vfmaq_n_f32(vfmaq_n_f32(s12_1, s34_1, 8.0f), s56_1, 0.125f); - float32x4_t vbias = vdupq_n_f32(bias); - - z00 = vaddq_f32(z00, vbias); - z01 = vaddq_f32(z01, vbias); - z10 = vaddq_f32(z10, vbias); - z11 = vaddq_f32(z11, vbias); - z20 = vaddq_f32(z20, vbias); - z21 = vaddq_f32(z21, vbias); - z30 = vaddq_f32(z30, vbias); - z31 = vaddq_f32(z31, vbias); - z40 = vaddq_f32(z40, vbias); - z41 = vaddq_f32(z41, vbias); - z50 = vaddq_f32(z50, vbias); - z51 = vaddq_f32(z51, vbias); - } - - if (bpptr) - { - float32x2_t zhalf = vdup_n_f32(0.f); - z00 = vaddq_f32(z00, vld1q_f32(bpptr)); - z01 = vaddq_f32(z01, vcombine_f32(vld1_f32(bpptr + 4), zhalf)); - z10 = vaddq_f32(z10, vld1q_f32(bpptr + bpstep)); - z11 = vaddq_f32(z11, vcombine_f32(vld1_f32(bpptr + bpstep + 4), zhalf)); - z20 = vaddq_f32(z20, vld1q_f32(bpptr + bpstep*2)); - z21 = vaddq_f32(z21, vcombine_f32(vld1_f32(bpptr + bpstep*2 + 4), zhalf)); - z30 = vaddq_f32(z30, vld1q_f32(bpptr + bpstep*3)); - z31 = vaddq_f32(z31, vcombine_f32(vld1_f32(bpptr + bpstep*3 + 4), zhalf)); - z40 = vaddq_f32(z40, vld1q_f32(bpptr + bpstep*4)); - z41 = vaddq_f32(z41, vcombine_f32(vld1_f32(bpptr + bpstep*4 + 4), zhalf)); - z50 = vaddq_f32(z50, vld1q_f32(bpptr + bpstep*5)); - z51 = vaddq_f32(z51, vcombine_f32(vld1_f32(bpptr + bpstep*5 + 4), zhalf)); - } - - if (ifMinMaxAct) - { - float32x4_t vmax = vdupq_n_f32(maxval); - float32x4_t vmin = vdupq_n_f32(minval); - - z00 = vminq_f32(vmaxq_f32(z00, vmin), vmax); - z01 = vminq_f32(vmaxq_f32(z01, vmin), vmax); - z10 = vminq_f32(vmaxq_f32(z10, vmin), vmax); - z11 = vminq_f32(vmaxq_f32(z11, vmin), vmax); - z20 = vminq_f32(vmaxq_f32(z20, vmin), vmax); - z21 = vminq_f32(vmaxq_f32(z21, vmin), vmax); - z30 = vminq_f32(vmaxq_f32(z30, vmin), vmax); - z31 = vminq_f32(vmaxq_f32(z31, vmin), vmax); - z40 = 
vminq_f32(vmaxq_f32(z40, vmin), vmax); - z41 = vminq_f32(vmaxq_f32(z41, vmin), vmax); - z50 = vminq_f32(vmaxq_f32(z50, vmin), vmax); - z51 = vminq_f32(vmaxq_f32(z51, vmin), vmax); - } - - vst1q_f32(outptr, z00); - vst1_f32(outptr + 4, vget_low_f32(z01)); - vst1q_f32(outptr + outstep, z10); - vst1_f32(outptr + outstep + 4, vget_low_f32(z11)); - vst1q_f32(outptr + outstep*2, z20); - vst1_f32(outptr + outstep*2 + 4, vget_low_f32(z21)); - vst1q_f32(outptr + outstep*3, z30); - vst1_f32(outptr + outstep*3 + 4, vget_low_f32(z31)); - vst1q_f32(outptr + outstep*4, z40); - vst1_f32(outptr + outstep*4 + 4, vget_low_f32(z41)); - vst1q_f32(outptr + outstep*5, z50); - vst1_f32(outptr + outstep*5 + 4, vget_low_f32(z51)); -#elif CV_SIMD128 - v_float32x4 x00 = v_load(inptr), x01 = v_load(inptr + 4); - v_float32x4 x10 = v_load(inptr + inpstep), x11 = v_load(inptr + inpstep + 4); - v_float32x4 x20 = v_load(inptr + inpstep*2), x21 = v_load(inptr + inpstep*2 + 4); - v_float32x4 x30 = v_load(inptr + inpstep*3), x31 = v_load(inptr + inpstep*3 + 4); - v_float32x4 x40 = v_load(inptr + inpstep*4), x41 = v_load(inptr + inpstep*4 + 4); - v_float32x4 x50 = v_load(inptr + inpstep*5), x51 = v_load(inptr + inpstep*5 + 4); - v_float32x4 x60 = v_load(inptr + inpstep*6), x61 = v_load(inptr + inpstep*6 + 4); - v_float32x4 x70 = v_load(inptr + inpstep*7), x71 = v_load(inptr + inpstep*7 + 4); - v_float32x4 z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51; - - { - v_float32x4 s12_0, s12_1, s34_0, s34_1, s56_0, s56_1; - s12_0 = x10 + x20; s12_1 = x11 + x21; - s34_0 = x30 + x40; s34_1 = x31 + x41; - s56_0 = x50 + x60; s56_1 = x51 + x61; - - v_float32x4 y00 = x00 + s12_0 + s34_0 + s56_0; - v_float32x4 y01 = x01 + s12_1 + s34_1 + s56_1; - - v_float32x4 a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f); - v_float32x4 y20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - v_float32x4 y21 = v_fma(s56_1, a0 ,v_fma(s34_1, a1, s12_1) ); - - a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f); - v_float32x4 y40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - v_float32x4 y41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - s12_0 = x10 - x20; s12_1 = x11 - x21; - s34_0 = x30 - x40; s34_1 = x31 - x41; - s56_0 = x50 - x60; s56_1 = x51 - x61; - - a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.f); - v_float32x4 y50 = v_fma(s56_0, a0, v_fma(s34_0, a1, x70 + s12_0)); - v_float32x4 y51 = v_fma(s56_1, a0, v_fma(s34_1, a1, x71 + s12_1)); - - a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.f); - v_float32x4 y10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - v_float32x4 y11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.f); - v_float32x4 y30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - v_float32x4 y31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - v_float32x4 y60 = v_setall_f32(0.f), y61 = y60, y70 = y60, y71 = y60; - - /* transpose 8x8 matrix in-place with some renumeration of the elements: */ - /* Y: */ - /* y00 y01 */ - /* y10 y11 */ - /* ... 
*/ - /* y50 y51 */ - /* 0 0 */ - /* 0 0 */ - /* Y': */ - /* y00 y40 */ - /* y10 y50 */ - /* y20 y60 */ - /* y30 y70 */ - /* y01 y41 */ - /* y11 y51 */ - /* y21 y61 */ - /* y31 y71 */ - /* in other words, y40 <-> y01, y50 <-> y11, y60 <-> y21, y70 <-> y31 */ - - v_transpose4x4(y00, y10, y20, y30, y00, y10, y20, y30); - v_transpose4x4(y01, y11, y21, y31, y01, y11, y21, y31); - v_transpose4x4(y40, y50, y60, y70, y40, y50, y60, y70); - v_transpose4x4(y41, y51, y61, y71, y41, y51, y61, y71); - - s12_0 = y10 + y20; s12_1 = y50 + y60; - s34_0 = y30 + y01; s34_1 = y70 + y41; - s56_0 = y11 + y21; s56_1 = y51 + y61; - - z00 = y00 + s12_0 + s34_0 + s56_0; - z01 = y40 + s12_1 + s34_1 + s56_1; - - a0 = v_setall_f32(0.25f), a1 = v_setall_f32(4.0f); - z20 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - z21 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - a0 = v_setall_f32(1.f/16), a1 = v_setall_f32(16.0f); - z40 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - z41 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - s12_0 = y10 - y20; s12_1 = y50 - y60; - s34_0 = y30 - y01; s34_1 = y70 - y41; - s56_0 = y11 - y21; s56_1 = y51 - y61; - - a0 = v_setall_f32(1.f/32), a1 = v_setall_f32(32.0f); - z50 = v_fma(s56_0, a0, v_fma(s34_0, a1, y31 + s12_0)); - z51 = v_fma(s56_1, a0, v_fma(s34_1, a1, y71 + s12_1)); - - a0 = v_setall_f32(0.5f), a1 = v_setall_f32(2.0f); - z10 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - z11 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - a0 = v_setall_f32(0.125f), a1 = v_setall_f32(8.0f); - z30 = v_fma(s56_0, a0, v_fma(s34_0, a1, s12_0)); - z31 = v_fma(s56_1, a0, v_fma(s34_1, a1, s12_1)); - - v_float32x4 vbias = v_setall_f32(bias); - z00 += vbias; - z01 += vbias; - z10 += vbias; - z11 += vbias; - z20 += vbias; - z21 += vbias; - z30 += vbias; - z31 += vbias; - z40 += vbias; - z41 += vbias; - z50 += vbias; - z51 += vbias; - } - - if (bpptr) - { - z00 += v_load(bpptr); - z01 += v_load_low(bpptr + 4); - z10 += v_load(bpptr + bpstep); - z11 += v_load_low(bpptr + bpstep + 4); - z20 += v_load(bpptr + bpstep*2); - z21 += v_load_low(bpptr + bpstep*2 + 4); - z30 += v_load(bpptr + bpstep*3); - z31 += v_load_low(bpptr + bpstep*3 + 4); - z40 += v_load(bpptr + bpstep*4); - z41 += v_load_low(bpptr + bpstep*4 + 4); - z50 += v_load(bpptr + bpstep*5); - z51 += v_load_low(bpptr + bpstep*5 + 4); - } - - if (ifMinMaxAct) - { - v_float32x4 vmax = v_setall_f32(maxval); - v_float32x4 vmin = v_setall_f32(minval); - - z00 = v_min(v_max(z00, vmin), vmax); - z01 = v_min(v_max(z01, vmin), vmax); - z10 = v_min(v_max(z10, vmin), vmax); - z11 = v_min(v_max(z11, vmin), vmax); - z20 = v_min(v_max(z20, vmin), vmax); - z21 = v_min(v_max(z21, vmin), vmax); - z30 = v_min(v_max(z30, vmin), vmax); - z31 = v_min(v_max(z31, vmin), vmax); - z40 = v_min(v_max(z40, vmin), vmax); - z41 = v_min(v_max(z41, vmin), vmax); - z50 = v_min(v_max(z50, vmin), vmax); - z51 = v_min(v_max(z51, vmin), vmax); - } - - v_store(outptr, z00); - v_store_low(outptr + 4, z01); - v_store(outptr + outstep, z10); - v_store_low(outptr + outstep + 4, z11); - v_store(outptr + outstep*2, z20); - v_store_low(outptr + outstep*2 + 4, z21); - v_store(outptr + outstep*3, z30); - v_store_low(outptr + outstep*3 + 4, z31); - v_store(outptr + outstep*4, z40); - v_store_low(outptr + outstep*4 + 4, z41); - v_store(outptr + outstep*5, z50); - v_store_low(outptr + outstep*5 + 4, z51); -#else -#error "Only SIMD128, AVX2 and NEON are supported in Winograd." 
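/* Likewise, a minimal scalar sketch of the inverse transform above
   (Y = At * m * At', then bias, optional by-pass add and optional clamping),
   assuming the At rows given in the block comment before
   _fx_winograd_AtXA_8x8_f32 and ignoring the transposition bookkeeping it
   describes. The name and the dense layouts are illustrative; in the real
   code the caller (runWinograd63) stages partial output tiles through a
   temporary buffer. */
static void winograd_AtXA_8x8_ref(const float* m, int mstep,
                                  const float* bp, int bpstep,
                                  float* out, int outstep,
                                  float bias, float minval, float maxval,
                                  bool ifMinMaxAct)
{
    static const float At[6][8] = {
        {1.f, 1.f,  1.f,   1.f,   1.f,   1.f,      1.f,      0.f},
        {0.f, 1.f, -1.f,   2.f,  -2.f,   0.5f,    -0.5f,     0.f},
        {0.f, 1.f,  1.f,   4.f,   4.f,   0.25f,    0.25f,    0.f},
        {0.f, 1.f, -1.f,   8.f,  -8.f,   0.125f,  -0.125f,   0.f},
        {0.f, 1.f,  1.f,  16.f,  16.f,   1.f/16,   1.f/16,   0.f},
        {0.f, 1.f, -1.f,  32.f, -32.f,   1.f/32,  -1.f/32,   1.f}
    };
    float t[6][8];
    // pass 1 (rows): t = At * m
    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 8; j++)
        {
            float s = 0.f;
            for (int k = 0; k < 8; k++)
                s += At[i][k] * m[k*mstep + j];
            t[i][j] = s;
        }
    // pass 2 (columns): out = t * At^T, followed by the epilogue fused above
    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
        {
            float s = bias;
            for (int k = 0; k < 8; k++)
                s += t[i][k] * At[j][k];
            if (bp)
                s += bp[i*bpstep + j];
            if (ifMinMaxAct)
                s = s < minval ? minval : (s > maxval ? maxval : s);
            out[i*outstep + j] = s;
        }
}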
-#endif -} - -int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr& conv, - int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct) -{ - Mat input = _input.getMat(); - Mat output = _output.getMat(); - Mat fusedAddMat = _fusedAddMat.getMat(); - - MatShape inputShape = shape(input); - MatShape outputShape = shape(output); - CV_Assert(inputShape.size() == 4 && outputShape.size() == 4); - - int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3]; // [N, C, H, W] - int K = conv->K; - int H0 = outputShape[2], W0 = outputShape[3]; - - int pad_top = conv->pad_top; - int pad_left = conv->pad_left; - - int ngroups = conv->ngroups, Cg = C/ngroups, Kg = K/ngroups; - int Kg_nblocks = (Kg + _FX_WINO_KBLOCK - 1)/_FX_WINO_KBLOCK; - const size_t inp_planesize = (size_t)Hi*Wi; - const size_t out_planesize = (size_t)H0*W0; - - int blocks_per_row = (W0+_FX_WINO_STEP-1)/_FX_WINO_STEP; - int blocks_per_plane = ((H0+_FX_WINO_STEP-1)/_FX_WINO_STEP)*blocks_per_row; - int blocks_per_plane_aligned = ((blocks_per_plane + - _FX_WINO_IBLOCK-1)/_FX_WINO_IBLOCK)*_FX_WINO_IBLOCK; - - size_t totalbufsize = (size_t)N*C*blocks_per_plane_aligned*_FX_WINO_AREA; - - AutoBuffer _buf; - _buf.allocate(totalbufsize + VEC_ALIGN); - float* wbuf_all = alignPtr(_buf.data(), VEC_ALIGN); - - float* inp = input.ptr(); - float* out = output.ptr(); - - float* fusedAddPtr = fusedAddMat.empty() ? nullptr : fusedAddMat.ptr(); - - // Phase 1. compute forward Winograd transforms for all input blocks, - // all input planes, all samples in the batch. - // [TODO]: maybe, if there are too many input channels, it makes sense to - // transform only part of input channels at once and then compute the partial - // accumulated sums (i.e. update the output buffers several times, - // rather than compute them in one pass). - parallel_for_(Range(0, ntasks), [&](const Range& r0) { - for (int task_id = r0.start; task_id < r0.end; task_id++) - { - int nc0 = (N*C)*task_id/ntasks; - int nc1 = (N*C)*(task_id+1)/ntasks; - for(; nc0 < nc1; nc0++) - { - int n = nc0 / C; - int c = nc0 - n*C; - int g = c / Cg; - c -= g*Cg; - for (int block_id = 0; block_id < blocks_per_plane; block_id += _FX_WINO_IBLOCK) - { - for (int db = 0; db < _FX_WINO_IBLOCK; db++) - { - size_t inwofs = ((n*ngroups + g)*blocks_per_plane_aligned + - block_id)*Cg*_FX_WINO_AREA + - (c*_FX_WINO_IBLOCK + db)*_FX_WINO_ATOM_F32; - float* inwptr = (float*)wbuf_all + inwofs; - - if (block_id + db < blocks_per_plane) - { - int y0 = (block_id + db) / blocks_per_row; - int x0 = (block_id + db) - y0 * blocks_per_row; - y0 = y0*_FX_WINO_STEP - pad_top; - x0 = x0*_FX_WINO_STEP - pad_left; - bool partial = y0 < 0 || y0 + _FX_WINO_SIZE > Hi || - x0 < 0 || x0 + _FX_WINO_SIZE > Wi; - int dx1 = 0, dx2 = _FX_WINO_SIZE, dy1 = 0, dy2 = _FX_WINO_SIZE; - int inpstep = Wi; - - float inpbuf[_FX_WINO_AREA]; - float* inptr0 = (float*)inp + nc0*inp_planesize + y0*Wi + x0; - float* inptr = inptr0; - - if (partial) - { - memset(inpbuf, 0, sizeof(inpbuf)); - dy1 = -y0 > 0 ? -y0 : 0; - dy2 = Hi - y0 < _FX_WINO_SIZE ? Hi - y0 : _FX_WINO_SIZE; - - if (dy2 < dy1) {dy2 = dy1 = 0;} - dx1 = -x0 > 0 ? -x0 : 0; - dx2 = Wi - x0 < _FX_WINO_SIZE ? 
Wi - x0 : _FX_WINO_SIZE; - - if (dx2 < dx1) {dx2 = dx1 = 0;} - inptr0 -= y0*Wi + x0; - - if (dx1 < dx2 && dy1 < dy2) - { - for(int dy = dy1; dy < dy2; dy++) - memcpy(&inpbuf[dy*_FX_WINO_SIZE + dx1], - inptr0 + (y0+dy)*Wi + (x0+dx1), - (dx2-dx1)*sizeof(inpbuf[0])); - } - - inptr = inpbuf; - inpstep = _FX_WINO_SIZE; - } -#if CV_TRY_AVX2 - if (conv->useAVX2) - opt_AVX2::_fx_winograd_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg); - else -#endif - _fx_winograd_BtXB_8x8_f32(inptr, inpstep, inwptr, Cg); - } - else - { - for (int i = 0; i < _FX_WINO_NATOMS_F32; i++, inwptr += _FX_WINO_IBLOCK*_FX_WINO_ATOM_F32) - memset(inwptr, 0, _FX_WINO_ATOM_F32*sizeof(inwptr[0])); - } - } - } - } - }}); - - // Phase 2. compute elemwise-weighted sums of transformed blocks, - // apply inverse Winograd transforms to the sums, - // add bias, apply activation function if any and store the results. - parallel_for_(Range(0, ntasks), [&](const Range& r0) { - for (int task_id = r0.start; task_id < r0.end; task_id++) - { - size_t out_wbuf_size = _FX_WINO_AREA*_FX_WINO_KBLOCK*_FX_WINO_IBLOCK; - size_t outbuf_size = _FX_WINO_AREA; - AutoBuffer out_wbuf_, outbuf_; - out_wbuf_.allocate(out_wbuf_size + VEC_ALIGN); - float* out_wbuf = alignPtr(out_wbuf_.data(), VEC_ALIGN); - outbuf_.allocate(outbuf_size + VEC_ALIGN); - float* outbuf = alignPtr(outbuf_.data(), VEC_ALIGN); - - memset(out_wbuf, 0, out_wbuf_size * sizeof(float)); - memset(outbuf, 0, outbuf_size * sizeof(float)); - - int ngk0 = (int)(((int64_t)N*Kg_nblocks*ngroups)*task_id/ntasks); - int ngk1 = (int)(((int64_t)N*Kg_nblocks*ngroups)*(task_id+1)/ntasks); - - for(; ngk0 < ngk1; ngk0++) - { - int n = ngk0 / (Kg_nblocks*ngroups); - int gk0 = ngk0 % (Kg_nblocks*ngroups); - int g = gk0 / Kg_nblocks; - int k0 = (gk0 % Kg_nblocks)*_FX_WINO_KBLOCK; - int k1 = k0 + _FX_WINO_KBLOCK <= Kg ? k0 + _FX_WINO_KBLOCK : Kg; - - for (int block_id0 = 0; block_id0 < blocks_per_plane; block_id0 += _FX_WINO_IBLOCK) - { - int block_id1 = block_id0 + _FX_WINO_IBLOCK; - block_id1 = block_id1 < blocks_per_plane ? block_id1 : blocks_per_plane; - size_t inwofs = ((n*ngroups + g)*blocks_per_plane_aligned + block_id0)*Cg*_FX_WINO_AREA; - size_t wofs = (g*Kg_nblocks*_FX_WINO_KBLOCK + k0)*Cg*_FX_WINO_AREA; - - float* inwptr = wbuf_all + inwofs; - const float* wptr = conv->weightsWinoBufPtr + wofs; - -#if CV_TRY_AVX2 - if (conv->useAVX2) - opt_AVX2::_fx_winograd_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0); - else -#endif - _fx_winograd_accum_f32(inwptr, wptr, out_wbuf, Cg, block_id1 - block_id0); - for (int k = k0; k < k1; k++) - { - float biasv = conv->biasBuf[g*Kg + k]; - for (int block_id = block_id0; block_id < block_id1; block_id++) - { - int y0 = block_id / blocks_per_row; - int x0 = block_id - y0 * blocks_per_row; - y0 = y0*_FX_WINO_STEP; - x0 = x0*_FX_WINO_STEP; - int dy1 = H0 - y0; - if (dy1 > _FX_WINO_STEP) dy1 = _FX_WINO_STEP; - int dx1 = W0 - x0; - if (dx1 > _FX_WINO_STEP) dx1 = _FX_WINO_STEP; - assert(dx1 > 0 && dy1 > 0); - bool partial = activ || dy1 < _FX_WINO_STEP || dx1 < _FX_WINO_STEP; - size_t outofs = (n*K + g*Kg + k)*out_planesize + y0*W0 + x0; - int outstep = W0; - - float* outptr0 = (float*)out + outofs; - float* pbptr0 = fusedAddPtr ? 
fusedAddPtr + outofs : nullptr; - float *outptr = outptr0, *bpptr = pbptr0; - - if (partial) - { - outptr = outbuf; - outstep = _FX_WINO_SIZE; - if (pbptr0) - { - bpptr = outbuf; - for (int y = 0; y < dy1; y++) - memcpy(outbuf + y*_FX_WINO_SIZE, pbptr0 + y*W0, - dx1*sizeof(pbptr0[0])); - } - } -#if CV_TRY_AVX2 - if (conv->useAVX2) - opt_AVX2::_fx_winograd_AtXA_8x8_f32(out_wbuf + ((k - k0)*_FX_WINO_IBLOCK + (block_id - block_id0))*_FX_WINO_AREA, _FX_WINO_SIZE, - bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct); - else -#endif - _fx_winograd_AtXA_8x8_f32(out_wbuf + ((k - k0)*_FX_WINO_IBLOCK + (block_id - block_id0))*_FX_WINO_AREA, _FX_WINO_SIZE, - bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct); - if (partial) - { - if (activ) - activ->forwardSlice(outptr, outptr, _FX_WINO_SIZE*_FX_WINO_STEP, 0, g*Kg + k, g*Kg + k + 1); - for (int y = 0; y < dy1; y++) - memcpy(outptr0 + y*W0, outptr + y*_FX_WINO_SIZE,dx1*sizeof(outptr0[0])); - } - } - } - } - } - }}); - return 1; -} - -#else - -int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _output, const Ptr& conv, - int ntasks, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct) -{ - return 0; -} -#endif -}} // namespace cv::dnn diff --git a/modules/dnn/src/layers/layers_common.simd.hpp b/modules/dnn/src/layers/layers_common.simd.hpp index eb1735639e..4bae86911c 100644 --- a/modules/dnn/src/layers/layers_common.simd.hpp +++ b/modules/dnn/src/layers/layers_common.simd.hpp @@ -46,16 +46,6 @@ namespace cv { namespace dnn { CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN -void fastDepthwiseConv( const float* weights, - int kernel_h, int kernel_w, - int stride_h, int stride_w, - int dilation_h, int dilation_w, - int pad_t, int pad_l, - const float* bias, const float* relu, - const float* inptr, - int height, int width, - float* outptr, - int out_d, int outH, int outW ); void fastGEMM1T( const float* vec, const float* weights, size_t wstep, const float* bias, float* dst, int nvecs, int vecsize ); @@ -70,185 +60,6 @@ void fastGEMM( const float* aptr, size_t astep, const float* bptr, #define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) #endif -static inline void _mm256_load_deinterleave(const float* ptr, __m256& a, __m256& b) -{ - __m256 t0 = _mm256_loadu_ps(ptr); - __m256 t1 = _mm256_loadu_ps(ptr + 8); - - __m256 lo = _mm256_permute2f128_ps(t0, t1, 0+2*16); - __m256 hi = _mm256_permute2f128_ps(t0, t1, 1+3*16); - a = _mm256_shuffle_ps(lo, hi, 0x88); - b = _mm256_shuffle_ps(lo, hi, 0xdd); -} - -void fastDepthwiseConv( const float* wptr, - int kernel_h, int kernel_w, - int stride_h, int stride_w, - int dilation_h, int dilation_w, - int pad_t, int pad_l, - const float* biasptr, const float* relu, - const float* inptr_, - int height, int width, - float* outptr_, - int out_d, int outH, int outW ) -{ - const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], - w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], - w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; - int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); - float relu_coeff = relu ? 
relu[out_d] : 1.f, bias = biasptr[out_d]; - - for (int out_i = 0; out_i < outH; out_i++) - { - int in_i = out_i * stride_h - pad_t, out_j = 0; - const float* imgptr0 = inptr_ + in_i*width; - const float* imgptr1 = imgptr0 + dilation_h*width; - const float* imgptr2 = imgptr0 + (dilation_h*2)*width; - float out, w00 = w00_, w01 = w01_, w02 = w02_; - float w20 = w20_, w21 = w21_, w22 = w22_; - if (in_i < 0) - { - w00 = w01 = w02 = 0.f; - imgptr0 = imgptr1; - } - else if (in_i + dilation_h*(kernel_h-1) >= height) - { - w20 = w21 = w22 = 0.f; - imgptr2 = imgptr1; - } - float* outptr = outptr_ + out_i*outW; - if (pad_l > 0) - { - out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + - imgptr1[0]*w11 + imgptr1[dilation_w]*w12 + - imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias; - if (relu) - out = out > 0.f ? out : out*relu_coeff; - outptr[0] = out; - out_j = 1; - } - - if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) - { - const int VECSZ = 8; - __m256 vw00 = _mm256_set1_ps(w00), vw01 = _mm256_set1_ps(w01), vw02 = _mm256_set1_ps(w02), - vw10 = _mm256_set1_ps(w10), vw11 = _mm256_set1_ps(w11), vw12 = _mm256_set1_ps(w12), - vw20 = _mm256_set1_ps(w20), vw21 = _mm256_set1_ps(w21), vw22 = _mm256_set1_ps(w22); - __m256 z = _mm256_setzero_ps(), vbias = _mm256_set1_ps(bias), vrc = _mm256_set1_ps(relu_coeff); - - if( stride_w == 1 ) - for( ; out_j < outW1; out_j += VECSZ ) - { - if (out_j + VECSZ > outW1 && out_j > pad_l) - out_j = outW1 - VECSZ; - int in_j = out_j * stride_w - pad_l; - __m256 v00 = _mm256_loadu_ps(imgptr0 + in_j), - v01 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w), - v02 = _mm256_loadu_ps(imgptr0 + in_j + dilation_w*2), - v10 = _mm256_loadu_ps(imgptr1 + in_j), - v11 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w), - v12 = _mm256_loadu_ps(imgptr1 + in_j + dilation_w*2), - v20 = _mm256_loadu_ps(imgptr2 + in_j), - v21 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w), - v22 = _mm256_loadu_ps(imgptr2 + in_j + dilation_w*2); - - __m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias); - __m256 vout1 = _mm256_mul_ps(v01, vw01); - __m256 vout2 = _mm256_mul_ps(v02, vw02); - - vout0 = _mm256_fmadd_ps(v10, vw10, vout0); - vout1 = _mm256_fmadd_ps(v11, vw11, vout1); - vout2 = _mm256_fmadd_ps(v12, vw12, vout2); - - vout0 = _mm256_fmadd_ps(v20, vw20, vout0); - vout1 = _mm256_fmadd_ps(v21, vw21, vout1); - vout2 = _mm256_fmadd_ps(v22, vw22, vout2); - - vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2); - if (relu) - { - __m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ); - vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m); - } - _mm256_storeu_ps(outptr + out_j, vout0); - } - else - for( ; out_j < outW1; out_j += VECSZ ) - { - if (out_j + VECSZ > outW1 && out_j > pad_l) - out_j = outW1 - VECSZ; - int in_j = out_j * stride_w - pad_l; - __m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; - _mm256_load_deinterleave(imgptr0 + in_j, v00, v01); - _mm256_load_deinterleave(imgptr0 + in_j + 2, v02, unused); - _mm256_load_deinterleave(imgptr1 + in_j, v10, v11); - _mm256_load_deinterleave(imgptr1 + in_j + 2, v12, unused); - _mm256_load_deinterleave(imgptr2 + in_j, v20, v21); - _mm256_load_deinterleave(imgptr2 + in_j + 2, v22, unused); - - __m256 vout0 = _mm256_fmadd_ps(v00, vw00, vbias); - __m256 vout1 = _mm256_mul_ps(v01, vw01); - __m256 vout2 = _mm256_mul_ps(v02, vw02); - - vout0 = _mm256_fmadd_ps(v10, vw10, vout0); - vout1 = _mm256_fmadd_ps(v11, vw11, vout1); - vout2 = _mm256_fmadd_ps(v12, vw12, vout2); - - vout0 = _mm256_fmadd_ps(v20, vw20, vout0); - vout1 = _mm256_fmadd_ps(v21, vw21, vout1); 
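// Note on the accumulation pattern in this kernel: the nine 3x3 taps are
// spread over three independent accumulators (vout0/vout1/vout2, one per
// kernel column) to shorten the FMA dependency chains, and are only summed
// into a single vector right before the optional leaky-ReLU and the store.
// In this stride-2, dilation-1 branch the deinterleaving loads split even and
// odd input columns so that one iteration still produces 8 contiguous outputs.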
- vout2 = _mm256_fmadd_ps(v22, vw22, vout2); - - vout0 = _mm256_add_ps(_mm256_add_ps(vout0, vout1), vout2); - if (relu) - { - __m256 m = _mm256_cmp_ps(vout0, z, _CMP_GT_OQ); - vout0 = _mm256_blendv_ps(_mm256_mul_ps(vout0, vrc), vout0, m); - } - _mm256_storeu_ps(outptr + out_j, vout0); - } - } - - for (; out_j < outW1; out_j++) - { - int in_j = out_j * stride_w - pad_l; - out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + - imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 + - imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias; - if (relu) - out = out > 0.f ? out : out*relu_coeff; - outptr[out_j] = out; - } - - for (; out_j < outW; out_j++ ) - { - int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; - float s0 = 1.f, s1 = 1.f, s2 = 1.f; - if (in_j0 >= width) - { - in_j0 = 0; - s0 = 0.f; - } - if (in_j1 >= width) - { - in_j1 = 0; - s1 = 0.f; - } - if (in_j2 >= width) - { - in_j2 = 0; - s2 = 0.f; - } - out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + - imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 + - imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias; - if (relu) - out = out > 0.f ? out : out*relu_coeff; - outptr[out_j] = out; - } - } - _mm256_zeroupper(); -} - // Used to generate the mask used when calculating tails static const uint32_t tailMaskArray[15] = { 0, 0, 0, 0, 0, 0, 0, 0, @@ -654,382 +465,10 @@ void fastGEMM1T( const float* vec, const float* weights, } } -/* -Example for load_deinterleave: - input: ptr[16] = {1,2,3, ... ,14,15,16} - output: a = {1, 3, 5, 7, 9, 11, 13, 15} - output: b = {2, 4, 6, 8,10, 12, 14, 16} -*/ -static inline void vfloat32m2_load_deinterleave(const float* ptr, vfloat32m2_t& a, vfloat32m2_t& b, int vl) -{ - vuint64m4_t mask = vmv_v_x_u64m4(1,vl*2); - vuint32m4_t mask_re = vreinterpret_v_u64m4_u32m4(mask); - vbool8_t mask0 = vmseq_vx_u32m4_b8 (mask_re, 1, vl*2); - vbool8_t mask1 = vmseq_vx_u32m4_b8 (mask_re, 0, vl*2); - vfloat32m4_t tempa = vundefined_f32m4(), tempb = vundefined_f32m4(); - vfloat32m4_t vw = vle32_v_f32m4(ptr, vl*2); - tempa = vcompress_vm_f32m4(mask0, tempa, vw, vl*2); - tempb = vcompress_vm_f32m4(mask1, tempb, vw, vl*2); - /* The following instructions have not to be supported by the GNU toolchain. - So we temporarily use store and load instead. - // a = vlmul_trunc_v_f32m4_f32m2(tempa); - // b = vlmul_trunc_v_f32m4_f32m2(tempb); - */ - cv::AutoBuffer cvBuffer(sizeof(float)*vl*2); - float* buffer = (float*)cvBuffer.data(); - vse32_v_f32m4(buffer, tempa, vl); - a = vle32_v_f32m2(buffer, vl); - vse32_v_f32m4(buffer, tempb, vl); - b = vle32_v_f32m2(buffer, vl); -} - -void fastDepthwiseConv( const float* wptr, - int kernel_h, int kernel_w, - int stride_h, int stride_w, - int dilation_h, int dilation_w, - int pad_t, int pad_l, - const float* biasptr, const float* relu, - const float* inptr_, - int height, int width, - float* outptr_, - int out_d, int outH, int outW ) -{ - int vl; - const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], - w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], - w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; - int outW1 = std::min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); - float relu_coeff = relu ? 
relu[out_d] : 1.f, bias = biasptr[out_d]; - - for (int out_i = 0; out_i < outH; out_i++) - { - int in_i = out_i * stride_h - pad_t, out_j = 0; - const float* imgptr0 = inptr_ + in_i*width; - const float* imgptr1 = imgptr0 + dilation_h*width; - const float* imgptr2 = imgptr0 + (dilation_h*2)*width; - float out, w00 = w00_, w01 = w01_, w02 = w02_; - float w20 = w20_, w21 = w21_, w22 = w22_; - if (in_i < 0) - { - w00 = w01 = w02 = 0.f; - imgptr0 = imgptr1; - } - else if (in_i + dilation_h*(kernel_h-1) >= height) - { - w20 = w21 = w22 = 0.f; - imgptr2 = imgptr1; - } - float* outptr = outptr_ + out_i*outW; - if (pad_l > 0) - { - out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + - imgptr1[0]*w11 + imgptr1[dilation_w]*w12 + - imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias; - if (relu) - out = out > 0.f ? out : out*relu_coeff; - outptr[0] = out; - out_j = 1; - } - - if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) - { - int avl = outW1 - out_j; - if( stride_w == 1 ) - for( ; out_j < outW1; out_j += vl, avl -= vl) - { - vl = vsetvl_e32m2(avl); - int in_j = out_j * stride_w - pad_l; - vfloat32m2_t v00 = vle32_v_f32m2(imgptr0 + in_j, vl), - v01 = vle32_v_f32m2(imgptr0 + in_j + dilation_w, vl), - v02 = vle32_v_f32m2(imgptr0 + in_j + dilation_w*2, vl), - v10 = vle32_v_f32m2(imgptr1 + in_j, vl), - v11 = vle32_v_f32m2(imgptr1 + in_j + dilation_w, vl), - v12 = vle32_v_f32m2(imgptr1 + in_j + dilation_w*2, vl), - v20 = vle32_v_f32m2(imgptr2 + in_j, vl), - v21 = vle32_v_f32m2(imgptr2 + in_j + dilation_w, vl), - v22 = vle32_v_f32m2(imgptr2 + in_j + dilation_w*2, vl); - - vfloat32m2_t vout0 = vfmul_vf_f32m2(v00, w00, vl); - vfloat32m2_t vout1 = vfmul_vf_f32m2(v01, w01, vl); - vfloat32m2_t vout2 = vfmul_vf_f32m2(v02, w02, vl); - vout0 = vfadd_vf_f32m2(vout0, bias, vl); - - vout0 = vfmacc_vf_f32m2(vout0, w10, v10, vl); - vout1 = vfmacc_vf_f32m2(vout1, w11, v11, vl); - vout2 = vfmacc_vf_f32m2(vout2, w12, v12, vl); - - vout0 = vfmacc_vf_f32m2(vout0, w20, v20, vl); - vout1 = vfmacc_vf_f32m2(vout1, w21, v21, vl); - vout2 = vfmacc_vf_f32m2(vout2, w22, v22, vl); - - vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl); - if (relu) - { - vbool16_t m = vmfgt_vf_f32m2_b16(vout0, 0, vl); - vout0 = vmerge_vvm_f32m2(m, vfmul_vf_f32m2(vout0, relu_coeff, vl), vout0, vl); - } - vse32_v_f32m2(outptr + out_j, vout0, vl); - } - else //stride_w == 2 && dilation_w == 1 - for( ; out_j < outW1; out_j += vl, avl -= vl) - { - vl = vsetvl_e32m2(avl); - int in_j = out_j * stride_w - pad_l; - vfloat32m2_t v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; - vfloat32m2_load_deinterleave(imgptr0 + in_j, v00, v01, vl); - vfloat32m2_load_deinterleave(imgptr0 + in_j + 2, v02, unused, vl); - vfloat32m2_load_deinterleave(imgptr1 + in_j, v10, v11, vl); - vfloat32m2_load_deinterleave(imgptr1 + in_j + 2, v12, unused, vl); - vfloat32m2_load_deinterleave(imgptr2 + in_j, v20, v21, vl); - vfloat32m2_load_deinterleave(imgptr2 + in_j + 2, v22, unused, vl); - - vfloat32m2_t vout0 = vfmul_vf_f32m2(v00, w00, vl); - vfloat32m2_t vout1 = vfmul_vf_f32m2(v01, w01, vl); - vfloat32m2_t vout2 = vfmul_vf_f32m2(v02, w02, vl); - vout0 = vfadd_vf_f32m2(vout0, bias, vl); - - vout0 = vfmacc_vf_f32m2(vout0, w10, v10, vl); - vout1 = vfmacc_vf_f32m2(vout1, w11, v11, vl); - vout2 = vfmacc_vf_f32m2(vout2, w12, v12, vl); - - vout0 = vfmacc_vf_f32m2(vout0, w20, v20, vl); - vout1 = vfmacc_vf_f32m2(vout1, w21, v21, vl); - vout2 = vfmacc_vf_f32m2(vout2, w22, v22, vl); - - vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl); - if (relu) - { 
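// Branch-free per-channel leaky ReLU: the mask keeps vout0 where it is
// positive and substitutes vout0*relu_coeff where it is not, i.e.
// out = x > 0 ? x : slope*x with relu[out_d] as the slope (a slope of 0
// reproduces a plain ReLU). For example, with relu_coeff = 0.1f an
// accumulated value of -2.0f becomes -0.2f, while 3.0f passes through.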
- vbool16_t m = vmfgt_vf_f32m2_b16(vout0, 0, vl); - vout0 = vmerge_vvm_f32m2(m, vfmul_vf_f32m2(vout0, relu_coeff, vl), vout0, vl); - } - vse32_v_f32m2(outptr + out_j, vout0, vl); - } - } - - for (; out_j < outW1; out_j++) - { - int in_j = out_j * stride_w - pad_l; - out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + - imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 + - imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias; - if (relu) - out = out > 0.f ? out : out*relu_coeff; - outptr[out_j] = out; - } - - for (; out_j < outW; out_j++ ) - { - int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; - float s0 = 1.f, s1 = 1.f, s2 = 1.f; - if (in_j0 >= width) - { - in_j0 = 0; - s0 = 0.f; - } - if (in_j1 >= width) - { - in_j1 = 0; - s1 = 0.f; - } - if (in_j2 >= width) - { - in_j2 = 0; - s2 = 0.f; - } - out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + - imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 + - imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias; - if (relu) - out = out > 0.f ? out : out*relu_coeff; - outptr[out_j] = out; - } - } -} - #endif // CV_RVV #if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_LASX -static inline void _v256_load_deinterleave(const float* ptr, __m256& a, __m256& b) -{ - __m256 t0 = (__m256)__lasx_xvld(ptr, 0); - __m256 t1 = (__m256)__lasx_xvld(ptr, 8*4); - - __m256 lo = (__m256)__lasx_xvpermi_q(t0, t1, 2+0*16); - __m256 hi = (__m256)__lasx_xvpermi_q(t0, t1, 3+1*16); - - a = (__m256)__lasx_xvpermi_w(hi, lo, 0x88); - b = (__m256)__lasx_xvpermi_w(hi, lo, 0xdd); -} - -void fastDepthwiseConv( const float* wptr, - int kernel_h, int kernel_w, - int stride_h, int stride_w, - int dilation_h, int dilation_w, - int pad_t, int pad_l, - const float* biasptr, const float* relu, - const float* inptr_, - int height, int width, - float* outptr_, - int out_d, int outH, int outW ) -{ - const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], - w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], - w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; - int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w); - float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d]; - - for (int out_i = 0; out_i < outH; out_i++) - { - int in_i = out_i * stride_h - pad_t, out_j = 0; - const float* imgptr0 = inptr_ + in_i*width; - const float* imgptr1 = imgptr0 + dilation_h*width; - const float* imgptr2 = imgptr0 + (dilation_h*2)*width; - float out, w00 = w00_, w01 = w01_, w02 = w02_; - float w20 = w20_, w21 = w21_, w22 = w22_; - if (in_i < 0) - { - w00 = w01 = w02 = 0.f; - imgptr0 = imgptr1; - } - else if (in_i + dilation_h*(kernel_h-1) >= height) - { - w20 = w21 = w22 = 0.f; - imgptr2 = imgptr1; - } - float* outptr = outptr_ + out_i*outW; - if (pad_l > 0) - { - out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 + - imgptr1[0]*w11 + imgptr1[dilation_w]*w12 + - imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias; - if (relu) - out = out > 0.f ? 
out : out*relu_coeff; - outptr[0] = out; - out_j = 1; - } - - if (stride_w == 1 || (stride_w == 2 && dilation_w == 1)) - { - const int VECSZ = 8; - __m256 vw00 = _v256_setall_ps(w00), vw01 = _v256_setall_ps(w01), vw02 = _v256_setall_ps(w02), - vw10 = _v256_setall_ps(w10), vw11 = _v256_setall_ps(w11), vw12 = _v256_setall_ps(w12), - vw20 = _v256_setall_ps(w20), vw21 = _v256_setall_ps(w21), vw22 = _v256_setall_ps(w22); - __m256 z = (__m256)__lasx_xvxor_v((__m256i)vw00, (__m256i)vw00), - vbias = _v256_setall_ps(bias), vrc = _v256_setall_ps(relu_coeff); - - if( stride_w == 1 ) - for( ; out_j < outW1; out_j += VECSZ ) - { - if (out_j + VECSZ > outW1 && out_j > pad_l) - out_j = outW1 - VECSZ; - int in_j = out_j * stride_w - pad_l; - __m256 v00 = (__m256)__lasx_xvld(imgptr0 + in_j, 0), - v01 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w, 0), - v02 = (__m256)__lasx_xvld(imgptr0 + in_j + dilation_w*2, 0), - v10 = (__m256)__lasx_xvld(imgptr1 + in_j, 0), - v11 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w, 0), - v12 = (__m256)__lasx_xvld(imgptr1 + in_j + dilation_w*2, 0), - v20 = (__m256)__lasx_xvld(imgptr2 + in_j, 0), - v21 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w, 0), - v22 = (__m256)__lasx_xvld(imgptr2 + in_j + dilation_w*2, 0); - - __m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias); - __m256 vout1 = __lasx_xvfmul_s(v01, vw01); - __m256 vout2 = __lasx_xvfmul_s(v02, vw02); - - vout0 = __lasx_xvfmadd_s(v10, vw10, vout0); - vout1 = __lasx_xvfmadd_s(v11, vw11, vout1); - vout2 = __lasx_xvfmadd_s(v12, vw12, vout2); - - vout0 = __lasx_xvfmadd_s(v20, vw20, vout0); - vout1 = __lasx_xvfmadd_s(v21, vw21, vout1); - vout2 = __lasx_xvfmadd_s(v22, vw22, vout2); - - vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2); - if (relu) - { - __m256i m = __lasx_xvfcmp_clt_s(z, vout0); - vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m); - } - __lasx_xvst(vout0, outptr + out_j, 0); - } - else - for( ; out_j < outW1; out_j += VECSZ ) - { - if (out_j + VECSZ > outW1 && out_j > pad_l) - out_j = outW1 - VECSZ; - int in_j = out_j * stride_w - pad_l; - __m256 v00, v01, v02, v10, v11, v12, v20, v21, v22, unused; - _v256_load_deinterleave(imgptr0 + in_j, v00, v01); - _v256_load_deinterleave(imgptr0 + in_j + 2, v02, unused); - _v256_load_deinterleave(imgptr1 + in_j, v10, v11); - _v256_load_deinterleave(imgptr1 + in_j + 2, v12, unused); - _v256_load_deinterleave(imgptr2 + in_j, v20, v21); - _v256_load_deinterleave(imgptr2 + in_j + 2, v22, unused); - - __m256 vout0 = __lasx_xvfmadd_s(v00, vw00, vbias); - __m256 vout1 = __lasx_xvfmul_s(v01, vw01); - __m256 vout2 = __lasx_xvfmul_s(v02, vw02); - - vout0 = __lasx_xvfmadd_s(v10, vw10, vout0); - vout1 = __lasx_xvfmadd_s(v11, vw11, vout1); - vout2 = __lasx_xvfmadd_s(v12, vw12, vout2); - - vout0 = __lasx_xvfmadd_s(v20, vw20, vout0); - vout1 = __lasx_xvfmadd_s(v21, vw21, vout1); - vout2 = __lasx_xvfmadd_s(v22, vw22, vout2); - - vout0 = __lasx_xvfadd_s(__lasx_xvfadd_s(vout0, vout1), vout2); - if (relu) - { - __m256i m = __lasx_xvfcmp_clt_s(z, vout0); - vout0 = (__m256)__lasx_xvbitsel_v((__m256i)__lasx_xvfmul_s(vout0, vrc), (__m256i)vout0, m); - } - __lasx_xvst(vout0, outptr + out_j, 0); - } - } - - for (; out_j < outW1; out_j++) - { - int in_j = out_j * stride_w - pad_l; - out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 + - imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 + - imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + 
dilation_w*2]*w22 + bias; - if (relu) - out = out > 0.f ? out : out*relu_coeff; - outptr[out_j] = out; - } - - for (; out_j < outW; out_j++ ) - { - int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2; - float s0 = 1.f, s1 = 1.f, s2 = 1.f; - if (in_j0 >= width) - { - in_j0 = 0; - s0 = 0.f; - } - if (in_j1 >= width) - { - in_j1 = 0; - s1 = 0.f; - } - if (in_j2 >= width) - { - in_j2 = 0; - s2 = 0.f; - } - out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 + - imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 + - imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias; - if (relu) - out = out > 0.f ? out : out*relu_coeff; - outptr[out_j] = out; - } - } -} - // dst = vec * weights^t + bias void fastGEMM1T( const float* vec, const float* weights, size_t wstep, const float* bias,