diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp index 3233ab3c66..f9813fa053 100644 --- a/modules/dnn/include/opencv2/dnn/dnn.hpp +++ b/modules/dnn/include/opencv2/dnn/dnn.hpp @@ -106,6 +106,7 @@ CV__DNN_INLINE_NS_BEGIN DNN_TARGET_CUDA_FP16, DNN_TARGET_HDDL, DNN_TARGET_NPU, + DNN_TARGET_CPU_FP16, // Only the ARM platform is supported. Low precision computing, accelerate model inference. }; /** diff --git a/modules/dnn/src/dnn_common.hpp b/modules/dnn/src/dnn_common.hpp index 2561de4a9f..27947afea1 100644 --- a/modules/dnn/src/dnn_common.hpp +++ b/modules/dnn/src/dnn_common.hpp @@ -13,7 +13,7 @@ namespace cv { namespace dnn { CV__DNN_INLINE_NS_BEGIN #define IS_DNN_OPENCL_TARGET(id) (id == DNN_TARGET_OPENCL || id == DNN_TARGET_OPENCL_FP16) -#define IS_DNN_CPU_TARGET(id) (id == DNN_TARGET_CPU) // TODO: add DNN_TARGET_CPU_FP16 +#define IS_DNN_CPU_TARGET(id) (id == DNN_TARGET_CPU || id == DNN_TARGET_CPU_FP16) Mutex& getInitializationMutex(); void initializeLayerFactory(); diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index fc0120cdb8..da85deebaa 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -428,7 +428,6 @@ public: virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { BaseConvolutionLayerImpl::finalize(inputs_arr, outputs_arr); - std::vector inputs; inputs_arr.getMatVector(inputs); // prepare weightsMat where each row is aligned and has enough zero padding on the right to @@ -1405,7 +1404,8 @@ public: CV_Assert(outputs[0].size[1] % ngroups == 0); fastConvImpl = initFastConv(weightsMat, &biasvec[0], ngroups, K, C, kernel_size, strides, - dilations, pads_begin, pads_end, conv_dim, canUseWinograd); + dilations, pads_begin, pads_end, conv_dim, + preferableTarget == DNN_TARGET_CPU_FP16, canUseWinograd); } runFastConv(inputs[0], outputs[0], fastConvImpl, nstripes, activ, reluslope, fusedAdd); diff --git a/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp b/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp index 71b17dcc9b..27b0d4ba1f 100644 --- a/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp +++ b/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp @@ -8,7 +8,7 @@ namespace cv { namespace dnn { CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN -void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR); +void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, int width, const int convMR, const int convNR); #if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX @@ -17,7 +17,7 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i #define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) #endif -void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) +void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, int width, const int convMR, const int convNR) { CV_Assert(convMR == 4 && convNR == 24); __m256 c00 = _mm256_set1_ps(0.f), c01 = c00, c02 = c00; @@ -28,29 +28,72 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i __m256 a0 = _mm256_setzero_ps(), a1 = _mm256_setzero_ps(); __m256 b0 = _mm256_setzero_ps(), b1 = _mm256_setzero_ps(), b2 = _mm256_setzero_ps(); - for (int p = 0; p < np; p++, a += convMR, b += 
convNR) + if (width > 16) { - a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]); - b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16); + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]); + b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16); + + c00 = _mm256_fmadd_ps(b0, a0, c00); + c01 = _mm256_fmadd_ps(b1, a0, c01); + c02 = _mm256_fmadd_ps(b2, a0, c02); + + c10 = _mm256_fmadd_ps(b0, a1, c10); + c11 = _mm256_fmadd_ps(b1, a1, c11); + c12 = _mm256_fmadd_ps(b2, a1, c12); + + a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]); + + c20 = _mm256_fmadd_ps(b0, a0, c20); + c21 = _mm256_fmadd_ps(b1, a0, c21); + c22 = _mm256_fmadd_ps(b2, a0, c22); + + c30 = _mm256_fmadd_ps(b0, a1, c30); + c31 = _mm256_fmadd_ps(b1, a1, c31); + c32 = _mm256_fmadd_ps(b2, a1, c32); + } + } + else if (width > 8) + { + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]); + b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8); - c00 = _mm256_fmadd_ps(b0, a0, c00); - c01 = _mm256_fmadd_ps(b1, a0, c01); - c02 = _mm256_fmadd_ps(b2, a0, c02); + c00 = _mm256_fmadd_ps(b0, a0, c00); + c01 = _mm256_fmadd_ps(b1, a0, c01); - c10 = _mm256_fmadd_ps(b0, a1, c10); - c11 = _mm256_fmadd_ps(b1, a1, c11); - c12 = _mm256_fmadd_ps(b2, a1, c12); + c10 = _mm256_fmadd_ps(b0, a1, c10); + c11 = _mm256_fmadd_ps(b1, a1, c11); - a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]); + a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]); - c20 = _mm256_fmadd_ps(b0, a0, c20); - c21 = _mm256_fmadd_ps(b1, a0, c21); - c22 = _mm256_fmadd_ps(b2, a0, c22); + c20 = _mm256_fmadd_ps(b0, a0, c20); + c21 = _mm256_fmadd_ps(b1, a0, c21); - c30 = _mm256_fmadd_ps(b0, a1, c30); - c31 = _mm256_fmadd_ps(b1, a1, c31); - c32 = _mm256_fmadd_ps(b2, a1, c32); + c30 = _mm256_fmadd_ps(b0, a1, c30); + c31 = _mm256_fmadd_ps(b1, a1, c31); + } } + else + { + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]); + b0 = _mm256_load_ps(b); + + c00 = _mm256_fmadd_ps(b0, a0, c00); + c10 = _mm256_fmadd_ps(b0, a1, c10); + + a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]); + + c20 = _mm256_fmadd_ps(b0, a0, c20); + c30 = _mm256_fmadd_ps(b0, a1, c30); + } + } + + if (!init_c) { @@ -87,7 +130,7 @@ namespace opt_NEON { #if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON -void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) +void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, int width, const int convMR, const int convNR) { #if CV_NEON_AARCH64 if (convMR == 4 && convNR == 28) // AARCH64 @@ -97,44 +140,105 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i float32x4_t c20 = vdupq_n_f32(0.f), c21 = c20, c22 = c20, c23 = c20, c24 = c20, c25 = c20, c26 = c20; float32x4_t c30 = vdupq_n_f32(0.f), c31 = c30, c32 = c30, c33 = c30, c34 = c30, c35 = c30, c36 = c30; - for( int p = 0; p < np; p++, a += convMR, b += convNR ) + if (width > 16) + { + for( int p = 0; p < np; p++, a += convMR, b += convNR ) + { + float32x4_t a0 = vld1q_f32(a), b0, b1, b2; + b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8); + + c00 = vfmaq_laneq_f32(c00, b0, a0, 0); + c01 = vfmaq_laneq_f32(c01, b1, a0, 0); + c02 = vfmaq_laneq_f32(c02, b2, a0, 0); + c10 = vfmaq_laneq_f32(c10, b0, a0, 1); + c11 = vfmaq_laneq_f32(c11, b1, 
a0, 1); + c12 = vfmaq_laneq_f32(c12, b2, a0, 1); + c20 = vfmaq_laneq_f32(c20, b0, a0, 2); + c21 = vfmaq_laneq_f32(c21, b1, a0, 2); + c22 = vfmaq_laneq_f32(c22, b2, a0, 2); + c30 = vfmaq_laneq_f32(c30, b0, a0, 3); + c31 = vfmaq_laneq_f32(c31, b1, a0, 3); + c32 = vfmaq_laneq_f32(c32, b2, a0, 3); + + b0 = vld1q_f32(b + 12); b1 = vld1q_f32(b + 16); b2 = vld1q_f32(b + 20); + + c03 = vfmaq_laneq_f32(c03, b0, a0, 0); + c04 = vfmaq_laneq_f32(c04, b1, a0, 0); + c05 = vfmaq_laneq_f32(c05, b2, a0, 0); + c13 = vfmaq_laneq_f32(c13, b0, a0, 1); + c14 = vfmaq_laneq_f32(c14, b1, a0, 1); + c15 = vfmaq_laneq_f32(c15, b2, a0, 1); + c23 = vfmaq_laneq_f32(c23, b0, a0, 2); + c24 = vfmaq_laneq_f32(c24, b1, a0, 2); + c25 = vfmaq_laneq_f32(c25, b2, a0, 2); + c33 = vfmaq_laneq_f32(c33, b0, a0, 3); + c34 = vfmaq_laneq_f32(c34, b1, a0, 3); + c35 = vfmaq_laneq_f32(c35, b2, a0, 3); + + b0 = vld1q_f32(b + 24); + c06 = vfmaq_laneq_f32(c06, b0, a0, 0); + c16 = vfmaq_laneq_f32(c16, b0, a0, 1); + c26 = vfmaq_laneq_f32(c26, b0, a0, 2); + c36 = vfmaq_laneq_f32(c36, b0, a0, 3); + } + } + else if (width > 8) + { + for( int p = 0; p < np; p++, a += convMR, b += convNR ) + { + float32x4_t a0 = vld1q_f32(a), b0, b1, b2; + b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8); + + c00 = vfmaq_laneq_f32(c00, b0, a0, 0); + c01 = vfmaq_laneq_f32(c01, b1, a0, 0); + c02 = vfmaq_laneq_f32(c02, b2, a0, 0); + c10 = vfmaq_laneq_f32(c10, b0, a0, 1); + c11 = vfmaq_laneq_f32(c11, b1, a0, 1); + c12 = vfmaq_laneq_f32(c12, b2, a0, 1); + c20 = vfmaq_laneq_f32(c20, b0, a0, 2); + c21 = vfmaq_laneq_f32(c21, b1, a0, 2); + c22 = vfmaq_laneq_f32(c22, b2, a0, 2); + c30 = vfmaq_laneq_f32(c30, b0, a0, 3); + c31 = vfmaq_laneq_f32(c31, b1, a0, 3); + c32 = vfmaq_laneq_f32(c32, b2, a0, 3); + + b0 = vld1q_f32(b + 12); + + c03 = vfmaq_laneq_f32(c03, b0, a0, 0); + c13 = vfmaq_laneq_f32(c13, b0, a0, 1); + c23 = vfmaq_laneq_f32(c23, b0, a0, 2); + c33 = vfmaq_laneq_f32(c33, b0, a0, 3); + } + } + else if (width > 4) + { + for( int p = 0; p < np; p++, a += convMR, b += convNR ) + { + float32x4_t a0 = vld1q_f32(a), b0, b1; + b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); + + c00 = vfmaq_laneq_f32(c00, b0, a0, 0); + c01 = vfmaq_laneq_f32(c01, b1, a0, 0); + c10 = vfmaq_laneq_f32(c10, b0, a0, 1); + c11 = vfmaq_laneq_f32(c11, b1, a0, 1); + c20 = vfmaq_laneq_f32(c20, b0, a0, 2); + c21 = vfmaq_laneq_f32(c21, b1, a0, 2); + c30 = vfmaq_laneq_f32(c30, b0, a0, 3); + c31 = vfmaq_laneq_f32(c31, b1, a0, 3); + } + } + else { - float32x4_t a0 = vld1q_f32(a), b0, b1, b2; - b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8); - - c00 = vfmaq_laneq_f32(c00, b0, a0, 0); - c01 = vfmaq_laneq_f32(c01, b1, a0, 0); - c02 = vfmaq_laneq_f32(c02, b2, a0, 0); - c10 = vfmaq_laneq_f32(c10, b0, a0, 1); - c11 = vfmaq_laneq_f32(c11, b1, a0, 1); - c12 = vfmaq_laneq_f32(c12, b2, a0, 1); - c20 = vfmaq_laneq_f32(c20, b0, a0, 2); - c21 = vfmaq_laneq_f32(c21, b1, a0, 2); - c22 = vfmaq_laneq_f32(c22, b2, a0, 2); - c30 = vfmaq_laneq_f32(c30, b0, a0, 3); - c31 = vfmaq_laneq_f32(c31, b1, a0, 3); - c32 = vfmaq_laneq_f32(c32, b2, a0, 3); - - b0 = vld1q_f32(b + 12); b1 = vld1q_f32(b + 16); b2 = vld1q_f32(b + 20); - - c03 = vfmaq_laneq_f32(c03, b0, a0, 0); - c04 = vfmaq_laneq_f32(c04, b1, a0, 0); - c05 = vfmaq_laneq_f32(c05, b2, a0, 0); - c13 = vfmaq_laneq_f32(c13, b0, a0, 1); - c14 = vfmaq_laneq_f32(c14, b1, a0, 1); - c15 = vfmaq_laneq_f32(c15, b2, a0, 1); - c23 = vfmaq_laneq_f32(c23, b0, a0, 2); - c24 = vfmaq_laneq_f32(c24, b1, a0, 2); - c25 = vfmaq_laneq_f32(c25, b2, a0, 2); - c33 = 
vfmaq_laneq_f32(c33, b0, a0, 3); - c34 = vfmaq_laneq_f32(c34, b1, a0, 3); - c35 = vfmaq_laneq_f32(c35, b2, a0, 3); - - b0 = vld1q_f32(b + 24); - c06 = vfmaq_laneq_f32(c06, b0, a0, 0); - c16 = vfmaq_laneq_f32(c16, b0, a0, 1); - c26 = vfmaq_laneq_f32(c26, b0, a0, 2); - c36 = vfmaq_laneq_f32(c36, b0, a0, 3); + for( int p = 0; p < np; p++, a += convMR, b += convNR ) + { + float32x4_t a0 = vld1q_f32(a), b0; + b0 = vld1q_f32(b); + + c00 = vfmaq_laneq_f32(c00, b0, a0, 0); + c10 = vfmaq_laneq_f32(c10, b0, a0, 1); + c20 = vfmaq_laneq_f32(c20, b0, a0, 2); + c30 = vfmaq_laneq_f32(c30, b0, a0, 3); + } } if (!init_c) @@ -204,26 +308,62 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i float32x2_t a0 = vdup_n_f32(0.0f), a1 = a0; float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f); - for (int p = 0; p < np; p++, a += convMR, b += convNR) + if (width > 8) + { + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + a0 = vld1_f32(a), a1 = vld1_f32(a+2); + b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8); + + c0 = vmlaq_lane_f32(c0, b0, a0, 0); + c1 = vmlaq_lane_f32(c1, b1, a0, 0); + c2 = vmlaq_lane_f32(c2, b2, a0, 0); + + c3 = vmlaq_lane_f32(c3, b0, a0, 1); + c4 = vmlaq_lane_f32(c4, b1, a0, 1); + c5 = vmlaq_lane_f32(c5, b2, a0, 1); + + c6 = vmlaq_lane_f32(c6, b0, a1, 0); + c7 = vmlaq_lane_f32(c7, b1, a1, 0); + c8 = vmlaq_lane_f32(c8, b2, a1, 0); + + c9 = vmlaq_lane_f32(c9 , b0, a1, 1); + c10 = vmlaq_lane_f32(c10, b1, a1, 1); + c11 = vmlaq_lane_f32(c11, b2, a1, 1); + } + } + else if (width > 4) { - a0 = vld1_f32(a), a1 = vld1_f32(a+2); - b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8); + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + a0 = vld1_f32(a), a1 = vld1_f32(a+2); + b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4); - c0 = vmlaq_lane_f32(c0, b0, a0, 0); - c1 = vmlaq_lane_f32(c1, b1, a0, 0); - c2 = vmlaq_lane_f32(c2, b2, a0, 0); + c0 = vmlaq_lane_f32(c0, b0, a0, 0); + c1 = vmlaq_lane_f32(c1, b1, a0, 0); - c3 = vmlaq_lane_f32(c3, b0, a0, 1); - c4 = vmlaq_lane_f32(c4, b1, a0, 1); - c5 = vmlaq_lane_f32(c5, b2, a0, 1); + c3 = vmlaq_lane_f32(c3, b0, a0, 1); + c4 = vmlaq_lane_f32(c4, b1, a0, 1); - c6 = vmlaq_lane_f32(c6, b0, a1, 0); - c7 = vmlaq_lane_f32(c7, b1, a1, 0); - c8 = vmlaq_lane_f32(c8, b2, a1, 0); + c6 = vmlaq_lane_f32(c6, b0, a1, 0); + c7 = vmlaq_lane_f32(c7, b1, a1, 0); - c9 = vmlaq_lane_f32(c9 , b0, a1, 1); - c10 = vmlaq_lane_f32(c10, b1, a1, 1); - c11 = vmlaq_lane_f32(c11, b2, a1, 1); + c9 = vmlaq_lane_f32(c9 , b0, a1, 1); + c10 = vmlaq_lane_f32(c10, b1, a1, 1); + } + } + else + { + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + a0 = vld1_f32(a), a1 = vld1_f32(a+2); + b0 = vld1q_f32(b); + + c0 = vmlaq_lane_f32(c0, b0, a0, 0); + c3 = vmlaq_lane_f32(c3, b0, a0, 1); + c6 = vmlaq_lane_f32(c6, b0, a1, 0); + c9 = vmlaq_lane_f32(c9 , b0, a1, 1); + } } if (!init_c) @@ -254,6 +394,366 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i CV_Error(Error::StsNotImplemented, "Unsupported convMR and/or convNR in opt_NEON::convBlock"); } +void convBlockMR1_F32(int np, const float * a, const float * b, float *c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR) +{ + CV_Assert(convNR == 28); + float32x4_t c0 = vdupq_n_f32(bias), c1 = c0, c2 = c0; + float32x4_t c3 = c0, c4 = c0, c5 = c0, c6 = c0; + + if (width > 16) + { + for (int p = 0; p < np; p++, a++, b += convNR) + { + float32x4_t 
b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8); + float32x4_t b3 = vld1q_f32(b + 12), b4 = vld1q_f32(b + 16), b5 = vld1q_f32(b + 20); + float32x4_t b6 = vld1q_f32(b + 24); + + c0 = vmlaq_n_f32(c0, b0, a[0]); + c1 = vmlaq_n_f32(c1, b1, a[0]); + c2 = vmlaq_n_f32(c2, b2, a[0]); + c3 = vmlaq_n_f32(c3, b3, a[0]); + c4 = vmlaq_n_f32(c4, b4, a[0]); + c5 = vmlaq_n_f32(c5, b5, a[0]); + c6 = vmlaq_n_f32(c6, b6, a[0]); + } + } + else if (width > 8) + { + for (int p = 0; p < np; p++, a++, b += convNR) + { + float32x4_t b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8); + float32x4_t b3 = vld1q_f32(b + 12); + + c0 = vmlaq_n_f32(c0, b0, a[0]); + c1 = vmlaq_n_f32(c1, b1, a[0]); + c2 = vmlaq_n_f32(c2, b2, a[0]); + c3 = vmlaq_n_f32(c3, b3, a[0]); + } + } + else if (width > 4) + { + for (int p = 0; p < np; p++, a++, b += convNR) + { + float32x4_t b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4); + + c0 = vmlaq_n_f32(c0, b0, a[0]); + c1 = vmlaq_n_f32(c1, b1, a[0]); + } + } + else + { + for (int p = 0; p < np; p++, a++, b += convNR) + { + float32x4_t b0 = vld1q_f32(b); + c0 = vmlaq_n_f32(c0, b0, a[0]); + } + } + + if (init_c) + { + c0 += vld1q_f32(c); + c1 += vld1q_f32(c + 4); + c2 += vld1q_f32(c + 8); + c3 += vld1q_f32(c + 12); + c4 += vld1q_f32(c + 16); + c5 += vld1q_f32(c + 20); + c6 += vld1q_f32(c + 24); + } + + if (ifMinMaxAct) + { + float32x4_t v_minval = vdupq_n_f32(minval), v_maxval = vdupq_n_f32(maxval); + + c0 = vminq_f32(vmaxq_f32(c0, v_minval), v_maxval); + c1 = vminq_f32(vmaxq_f32(c1, v_minval), v_maxval); + c2 = vminq_f32(vmaxq_f32(c2, v_minval), v_maxval); + c3 = vminq_f32(vmaxq_f32(c3, v_minval), v_maxval); + c4 = vminq_f32(vmaxq_f32(c4, v_minval), v_maxval); + c5 = vminq_f32(vmaxq_f32(c5, v_minval), v_maxval); + c6 = vminq_f32(vmaxq_f32(c6, v_minval), v_maxval); + } + + vst1q_f32(c, c0); + vst1q_f32(c + 4, c1); + vst1q_f32(c + 8, c2); + vst1q_f32(c + 12, c3); + vst1q_f32(c + 16, c4); + vst1q_f32(c + 20, c5); + vst1q_f32(c + 24, c6); +} + +#if CV_NEON_AARCH64 && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +// Fix conflict between float16_t in arm_neon.h and float16_t in cvdef.h. +typedef __fp16 float16_t; + +#ifndef __ARM_FEATURE_FMA // Work around without FMA support. 
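+// Note: without FMA support the macro falls back to a separate multiply and add, so the
+// product is rounded before accumulation and results may differ slightly from a fused multiply-add.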
+#define vfmaq_f16(a, b, c) (a + b * c) +#endif +void convBlock_FP16(int np, const char * _a, const char * _b, char * _c, int ldc, bool init_c, int width, + const int convMR_fp16, const int convNR_fp16) +{ +#if 1 + const float16_t* a = (const float16_t*)_a; + const float16_t* b = (const float16_t*)_b; + float16_t* c = (float16_t*)_c; + + CV_Assert(convMR_fp16 == 8 && convNR_fp16 == 24); + + float16x8_t c00 = vdupq_n_f16(0), c01 = c00, c02 = c00; + float16x8_t c10 = c00, c11 = c00, c12 = c00; + float16x8_t c20 = c00, c21 = c00, c22 = c00; + float16x8_t c30 = c00, c31 = c00, c32 = c00; + float16x8_t c40 = c00, c41 = c00, c42 = c00; + float16x8_t c50 = c00, c51 = c00, c52 = c00; + float16x8_t c60 = c00, c61 = c00, c62 = c00; + float16x8_t c70 = c00, c71 = c00, c72 = c00; + + float16x8_t b0 = c00, b1 = c00, b2 = c00; + + if (width > 16) + { + for (int p = 0; p < np; p++, a += convMR_fp16, b += convNR_fp16) + { + float16x4_t a0 = vld1_f16(a), a1 = vld1_f16(a + 4); + b0 = vld1q_f16(b), b1 = vld1q_f16(b + 8), b2 = vld1q_f16(b + 16); + + c00 = vfmaq_lane_f16(c00, b0, a0, 0); + c01 = vfmaq_lane_f16(c01, b1, a0, 0); + c02 = vfmaq_lane_f16(c02, b2, a0, 0); + + c10 = vfmaq_lane_f16(c10, b0, a0, 1); + c11 = vfmaq_lane_f16(c11, b1, a0, 1); + c12 = vfmaq_lane_f16(c12, b2, a0, 1); + + c20 = vfmaq_lane_f16(c20, b0, a0, 2); + c21 = vfmaq_lane_f16(c21, b1, a0, 2); + c22 = vfmaq_lane_f16(c22, b2, a0, 2); + + c30 = vfmaq_lane_f16(c30, b0, a0, 3); + c31 = vfmaq_lane_f16(c31, b1, a0, 3); + c32 = vfmaq_lane_f16(c32, b2, a0, 3); + + c40 = vfmaq_lane_f16(c40, b0, a1, 0); + c41 = vfmaq_lane_f16(c41, b1, a1, 0); + c42 = vfmaq_lane_f16(c42, b2, a1, 0); + + c50 = vfmaq_lane_f16(c50, b0, a1, 1); + c51 = vfmaq_lane_f16(c51, b1, a1, 1); + c52 = vfmaq_lane_f16(c52, b2, a1, 1); + + c60 = vfmaq_lane_f16(c60, b0, a1, 2); + c61 = vfmaq_lane_f16(c61, b1, a1, 2); + c62 = vfmaq_lane_f16(c62, b2, a1, 2); + + c70 = vfmaq_lane_f16(c70, b0, a1, 3); + c71 = vfmaq_lane_f16(c71, b1, a1, 3); + c72 = vfmaq_lane_f16(c72, b2, a1, 3); + } + } + else if (width > 8) + { + for( int p = 0; p < np; p++, a += convMR_fp16, b += convNR_fp16) + { + float16x4_t a0 = vld1_f16(a), a1 = vld1_f16(a + 4); + float16x8_t b0 = vld1q_f16(b), b1 = vld1q_f16(b + 8); + + c00 = vfmaq_lane_f16(c00, b0, a0, 0); + c01 = vfmaq_lane_f16(c01, b1, a0, 0); + + c10 = vfmaq_lane_f16(c10, b0, a0, 1); + c11 = vfmaq_lane_f16(c11, b1, a0, 1); + + c20 = vfmaq_lane_f16(c20, b0, a0, 2); + c21 = vfmaq_lane_f16(c21, b1, a0, 2); + + c30 = vfmaq_lane_f16(c30, b0, a0, 3); + c31 = vfmaq_lane_f16(c31, b1, a0, 3); + + c40 = vfmaq_lane_f16(c40, b0, a1, 0); + c41 = vfmaq_lane_f16(c41, b1, a1, 0); + + c50 = vfmaq_lane_f16(c50, b0, a1, 1); + c51 = vfmaq_lane_f16(c51, b1, a1, 1); + + c60 = vfmaq_lane_f16(c60, b0, a1, 2); + c61 = vfmaq_lane_f16(c61, b1, a1, 2); + + c70 = vfmaq_lane_f16(c70, b0, a1, 3); + c71 = vfmaq_lane_f16(c71, b1, a1, 3); + } + } + else + { + for( int p = 0; p < np; p++, a += convMR_fp16, b += convNR_fp16) + { + float16x4_t a0 = vld1_f16(a), a1 = vld1_f16(a + 4); + float16x8_t b0 = vld1q_f16(b); + + c00 = vfmaq_lane_f16(c00, b0, a0, 0); + c10 = vfmaq_lane_f16(c10, b0, a0, 1); + c20 = vfmaq_lane_f16(c20, b0, a0, 2); + c30 = vfmaq_lane_f16(c30, b0, a0, 3); + c40 = vfmaq_lane_f16(c40, b0, a1, 0); + c50 = vfmaq_lane_f16(c50, b0, a1, 1); + c60 = vfmaq_lane_f16(c60, b0, a1, 2); + c70 = vfmaq_lane_f16(c70, b0, a1, 3); + } + } + + if (!init_c) + { +#undef _FX_UPDATE_CBUF_ROW +#define _FX_UPDATE_CBUF_ROW(row) \ + c##row##0 = c##row##0 + vld1q_f16(c + row*ldc); \ + c##row##1 = 
c##row##1 + vld1q_f16(c + row*ldc + 8); \ + c##row##2 = c##row##2 + vld1q_f16(c + row*ldc + 16) + + _FX_UPDATE_CBUF_ROW(0); + _FX_UPDATE_CBUF_ROW(1); + _FX_UPDATE_CBUF_ROW(2); + _FX_UPDATE_CBUF_ROW(3); + _FX_UPDATE_CBUF_ROW(4); + _FX_UPDATE_CBUF_ROW(5); + _FX_UPDATE_CBUF_ROW(6); + _FX_UPDATE_CBUF_ROW(7); + } + +#undef _FX_STORE_CBUF_ROW +#define _FX_STORE_CBUF_ROW(row) \ + vst1q_f16(c + row*ldc, c##row##0); \ + vst1q_f16(c + row*ldc + 8, c##row##1); \ + vst1q_f16(c + row*ldc + 16, c##row##2) + + _FX_STORE_CBUF_ROW(0); + _FX_STORE_CBUF_ROW(1); + _FX_STORE_CBUF_ROW(2); + _FX_STORE_CBUF_ROW(3); + _FX_STORE_CBUF_ROW(4); + _FX_STORE_CBUF_ROW(5); + _FX_STORE_CBUF_ROW(6); + _FX_STORE_CBUF_ROW(7); +#else + // reference only. + const float16_t* a = (const float16_t*)_a; + const float16_t* b = (const float16_t*)_b; + float16_t* c = (float16_t*)_c; + float cbuf[convMR_fp16*convNR_fp16]; + memset(cbuf, 0, sizeof(cbuf)); + + for( int p = 0; p < np; p++ ) + { + for( int i = 0; i < convMR_fp16; i++ ) + { + float ai = float(a[convMR_fp16*p + i]); + for( int j = 0; j < convNR_fp16; j++ ) + cbuf[i*convNR_fp16+j] += float(b[convNR_fp16*p + j]) * ai; + } + } + + if (!init_c) + { + for(int i = 0; i < convMR_fp16; i++) + { + for(int j = 0; j < convNR_fp16; j++) + c[i*ldc + j] = float16_t(float(c[i*ldc + j]) + cbuf[i*convNR_fp16 + j]); + } + } + else + { + for(int i = 0; i < convMR_fp16; i++) + { + for(int j = 0; j < convNR_fp16; j++) + c[i*ldc + j] = (float16_t)(cbuf[i*convNR_fp16 + j]); + } + } +#endif +} + +void convBlockMR1_FP16(int np, const char* _a, const char* _b, float *c, const float _bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR_FP16) +{ + CV_Assert(convNR_FP16 == 24); // CONV_NR_FP16 = 24 + const float16_t* a = (const float16_t*)_a; + const float16_t* b = (const float16_t*)_b; + + const float16_t bias = (float16_t)_bias; + + float16x8_t c0 = vdupq_n_f16(bias), c1 = c0, c2 = c0; + + if (width > 16) + { + for (int p = 0; p < np; p++, a++, b += convNR_FP16) + { + float16x8_t a0= vdupq_n_f16(a[0]); + float16x8_t b0 = vld1q_f16(b), b1 = vld1q_f16(b + 8), b2 = vld1q_f16(b + 16); + + c0 = vfmaq_f16(c0, a0, b0); + c1 = vfmaq_f16(c1, a0, b1); + c2 = vfmaq_f16(c2, a0, b2); + } + } + else if (width > 8) + { + for (int p = 0; p < np; p++, a++, b += convNR_FP16) + { + float16x8_t a0= vdupq_n_f16(a[0]); + float16x8_t b0 = vld1q_f16(b), b1 = vld1q_f16(b + 8); + + c0 = vfmaq_f16(c0, a0, b0); + c1 = vfmaq_f16(c1, a0, b1); + } + } + else + { + for (int p = 0; p < np; p++, a++, b += convNR_FP16) + { + float16x8_t a0= vdupq_n_f16(a[0]); + float16x8_t b0 = vld1q_f16(b); + + c0 = vfmaq_f16(c0, a0, b0); + } + } + + // convert FP 16 to FP 32. 
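+    // The accumulators stay in FP16 inside the loop; they are widened to FP32 here because
+    // the output buffer c and the min/max activation below operate on 32-bit floats.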
+ float32x4_t c00 = vcvt_f32_f16(vget_low_f16(c0)); + float32x4_t c01 = vcvt_f32_f16(vget_high_f16(c0)); + float32x4_t c10 = vcvt_f32_f16(vget_low_f16(c1)); + float32x4_t c11 = vcvt_f32_f16(vget_high_f16(c1)); + float32x4_t c20 = vcvt_f32_f16(vget_low_f16(c2)); + float32x4_t c21 = vcvt_f32_f16(vget_high_f16(c2)); + + if (init_c) + { + c00 += vld1q_f32(c); + c01 += vld1q_f32(c + 4); + c10 += vld1q_f32(c + 8); + c11 += vld1q_f32(c + 12); + c20 += vld1q_f32(c + 16); + c21 += vld1q_f32(c + 20); + } + + if (ifMinMaxAct) + { + float32x4_t v_minval = vdupq_n_f32(minval), v_maxval = vdupq_n_f32(maxval); + + c00 = vminq_f32(vmaxq_f32(c00, v_minval), v_maxval); + c01 = vminq_f32(vmaxq_f32(c01, v_minval), v_maxval); + c10 = vminq_f32(vmaxq_f32(c10, v_minval), v_maxval); + c11 = vminq_f32(vmaxq_f32(c11, v_minval), v_maxval); + c20 = vminq_f32(vmaxq_f32(c20, v_minval), v_maxval); + c21 = vminq_f32(vmaxq_f32(c21, v_minval), v_maxval); + } + + vst1q_f32(c, c00); + vst1q_f32(c + 4, c01); + vst1q_f32(c + 8, c10); + vst1q_f32(c + 12, c11); + vst1q_f32(c + 16, c20); + vst1q_f32(c + 20, c21); +} +#endif + #endif } }} // namespace cv::dnn diff --git a/modules/dnn/src/layers/cpu_kernels/convolution.cpp b/modules/dnn/src/layers/cpu_kernels/convolution.cpp index 60c13bfa87..4c9a2fd387 100644 --- a/modules/dnn/src/layers/cpu_kernels/convolution.cpp +++ b/modules/dnn/src/layers/cpu_kernels/convolution.cpp @@ -34,10 +34,10 @@ Ptr initFastConv( const std::vector& pads_begin, const std::vector& pads_end, int conv_dim, + const bool _useFP16, bool useWinograd) { Ptr conv = makePtr(); - CV_Assert(ngroups > 0 && K > 0 && C > 0 && K % ngroups == 0); // Weight shape, [K, C, Dk, Hk, Wk] for Conv3D, [K, C, Hk, Wk] for Conv2D, [K, C, Wk] for Conv1D. @@ -117,6 +117,13 @@ Ptr initFastConv( auto wShape = shape(weightsMat); const size_t wstep = weightsMat.step1(); + conv->useFP16 = false; +#ifdef CONV_ARM_FP16 + // TODO: add FP16 support for Winograd. + if (_useFP16 && (conv->conv_type == CONV_TYPE_GENERIC || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN)) + conv->useFP16 = true; +#endif + float *srcWeights = (float *)weightsMat.data; if (conv->conv_type == CONV_TYPE_DEPTHWISE || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN) { @@ -128,17 +135,38 @@ Ptr initFastConv( // TODO: simplify the following code with std::copy. // this code aims to let memory fit with vector size. 
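        // padded_ksize rounds each channel's kernel footprint up to a multiple of VEC_ALIGN so the
        // vectorized kernels can always load full vectors; the padding area is zero-initialized below.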
int padded_ksize = ((ksize + VEC_ALIGN-1) / VEC_ALIGN) * VEC_ALIGN; - int nweights = C*padded_ksize; - conv->weightsBuf.reserve(nweights + VEC_ALIGN); - conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN); - memset(conv->weightsBufPtr, 0, nweights*sizeof(conv->weightsBufPtr[0])); - auto weightsBufPtr = conv->weightsBufPtr; - parallel_for_(Range(0, C), [&](const Range& r0){ - for(int c = r0.start; c < r0.end; c++) + int nweights = C * padded_ksize; + +#ifdef CONV_ARM_FP16 + if (conv->useFP16) { - for (int k = 0; k < ksize; k++) - weightsBufPtr[c*padded_ksize + k] = srcWeights[c*wstep + k]; - }}); + conv->weightsBuf_FP16.resize(nweights + VEC_ALIGN); + conv->weightsBufPtr_FP16 = alignPtr(conv->weightsBuf_FP16.data(), VEC_ALIGN * sizeof(float16_t )); + memset(conv->weightsBufPtr_FP16, 0, nweights * sizeof(float16_t )); + auto weightsBufPtr_FP16 = conv->weightsBufPtr_FP16; + + parallel_for_(Range(0, C), [&](const Range& r0){ + for(int c = r0.start; c < r0.end; c++) + { + for (int k = 0; k < ksize; k++) + weightsBufPtr_FP16[c*padded_ksize + k] = (float16_t)srcWeights[c*wstep + k]; + }}); + } + else +#endif + { + conv->weightsBuf.resize(nweights + VEC_ALIGN); + conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN * sizeof(float )); + memset(conv->weightsBufPtr, 0, nweights*sizeof(float )); + auto weightsBufPtr = conv->weightsBufPtr; + + parallel_for_(Range(0, C), [&](const Range& r0){ + for(int c = r0.start; c < r0.end; c++) + { + for (int k = 0; k < ksize; k++) + weightsBufPtr[c*padded_ksize + k] = srcWeights[c*wstep + k]; + }}); + } } else if(conv->conv_type == CONV_TYPE_WINOGRAD3X3) // winograd { @@ -163,10 +191,25 @@ Ptr initFastConv( int Kg = K/ngroups; int Kg_nblocks = (Kg + CONV_WINO_KBLOCK - 1)/CONV_WINO_KBLOCK; size_t nweights = ngroups*Kg_nblocks*Cg*CONV_WINO_KBLOCK*CONV_WINO_AREA; - conv->weightsWinoBuf.reserve(nweights + VEC_ALIGN); - conv->weightsWinoBufPtr = alignPtr(conv->weightsWinoBuf.data(), VEC_ALIGN); - float* wptrWino = conv->weightsWinoBufPtr; - memset(wptrWino, 0, nweights * sizeof(wptrWino[0])); + + float* wptrWino = nullptr; +#ifdef CONV_ARM_FP16 + float16_t* wptrWino_FP16 = nullptr; + if (conv->useFP16) + { + conv->weightsWinoBuf_FP16.resize(nweights + VEC_ALIGN); + conv->weightsWinoBufPtr_FP16 = alignPtr(conv->weightsWinoBuf_FP16.data(), VEC_ALIGN); + wptrWino_FP16 = conv->weightsWinoBufPtr_FP16; + memset(wptrWino_FP16, 0, nweights * sizeof(wptrWino_FP16[0])); + } + else +#endif + { + conv->weightsWinoBuf.resize(nweights + VEC_ALIGN); + conv->weightsWinoBufPtr = alignPtr(conv->weightsWinoBuf.data(), VEC_ALIGN); + wptrWino = conv->weightsWinoBufPtr; + memset(wptrWino, 0, nweights * sizeof(wptrWino[0])); + } parallel_for_(Range(0, K), [&](const Range& r0){ float kernelTm[CONV_WINO_AREA]; @@ -206,57 +249,133 @@ Ptr initFastConv( } // repack the data. 
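                // Both the FP32 and FP16 branches below use the same blocked layout for the transformed
                // kernel; the FP16 branch simply converts each coefficient to float16_t and steps by the
                // F16 atom sizes instead of the F32 ones.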
- float* wptr = wptrWino + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA + - (c*CONV_WINO_KBLOCK + dk)*CONV_WINO_ATOM_F32; - for (int i = 0; i < CONV_WINO_NATOMS_F32; i++, - wptr += Cg * CONV_WINO_KBLOCK * CONV_WINO_ATOM_F32) +#ifdef CONV_ARM_FP16 + if (conv->useFP16) + { + float16_t* wptr = wptrWino_FP16 + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA + + (c*CONV_WINO_KBLOCK + dk)*CONV_WINO_ATOM_F16; + for (int i = 0; i < CONV_WINO_NATOMS_F16; i++, + wptr += Cg * CONV_WINO_KBLOCK * CONV_WINO_ATOM_F16) + { + CV_Assert(conv->weightsWinoBufPtr_FP16 <= wptr && wptr + CONV_WINO_ATOM_F16 <= conv->weightsWinoBufPtr_FP16 + nweights); + for (int j = 0; j < CONV_WINO_ATOM_F16; j++) + { + wptr[j] = (float16_t)kernelTm[i * CONV_WINO_ATOM_F16 + j]; + } + } + } + else +#endif { - CV_Assert(conv->weightsWinoBufPtr <= wptr && wptr + CONV_WINO_ATOM_F32 <= conv->weightsWinoBufPtr + nweights); - memcpy(wptr, kernelTm + i * CONV_WINO_ATOM_F32, CONV_WINO_ATOM_F32*sizeof (wptr[0])); + float* wptr = wptrWino + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA + + (c*CONV_WINO_KBLOCK + dk)*CONV_WINO_ATOM_F32; + for (int i = 0; i < CONV_WINO_NATOMS_F32; i++, + wptr += Cg * CONV_WINO_KBLOCK * CONV_WINO_ATOM_F32) + { + CV_Assert(conv->weightsWinoBufPtr <= wptr && wptr + CONV_WINO_ATOM_F32 <= conv->weightsWinoBufPtr + nweights); + memcpy(wptr, kernelTm + i * CONV_WINO_ATOM_F32, CONV_WINO_ATOM_F32*sizeof (wptr[0])); + } } } - }}); + } + }); } else if (conv->conv_type == CONV_TYPE_GENERIC) { // The weights are packed as // ngroups x (ceil((K/ngroups)/CONV_MR)*CONV_MR) x (Cg*Hk*Wk*Dk) x CONV_MR tensor int Kg = K/ngroups, Cg = max(C/ngroups, 1); - int numStripsMR = (Kg + CONV_MR - 1) / CONV_MR; - int Kg_aligned = numStripsMR * CONV_MR; int DkHkWkCg = Dk*Hk*Wk*Cg; + + int numStripsMR = (Kg + CONV_MR_FP32 - 1) / CONV_MR_FP32; + int Kg_aligned = numStripsMR * CONV_MR_FP32; size_t nweights = ngroups*Kg_aligned*DkHkWkCg; - conv->weightsBuf.reserve(nweights + VEC_ALIGN); - conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN); - float* weightsBufPtr = conv->weightsBufPtr; - memset(weightsBufPtr, 0, nweights*sizeof(weightsBufPtr[0])); + + float* weightsBufPtr = nullptr; + +#ifdef CONV_ARM_FP16 + int numStripsMR_FP16 = (Kg + CONV_MR_FP16 - 1) / CONV_MR_FP16; + int Kg_aligned_FP16 = numStripsMR_FP16 * CONV_MR_FP16; + size_t nweights_FP16 = ngroups * Kg_aligned_FP16 * DkHkWkCg; + + float16_t* weightsBufPtr_FP16 = nullptr; + if (conv->useFP16) + { + conv->weightsBuf_FP16.resize(nweights_FP16 + VEC_ALIGN); + conv->weightsBufPtr_FP16 = alignPtr(conv->weightsBuf_FP16.data(), VEC_ALIGN); + weightsBufPtr_FP16 = conv->weightsBufPtr_FP16; + memset(weightsBufPtr_FP16, 0, nweights_FP16*sizeof(weightsBufPtr_FP16[0])); + } + else +#endif + { + conv->weightsBuf.resize(nweights + VEC_ALIGN); + conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN); + weightsBufPtr = conv->weightsBufPtr; + memset(weightsBufPtr, 0, nweights*sizeof(weightsBufPtr[0])); + } // Pack the weight. 
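        // The generic-convolution weights are packed per group into strips of CONV_MR output channels:
        // for every (spatial offset, input channel) pair a strip holds CONV_MR consecutive weights
        // (one per output channel), zero-padded when Kg is not a multiple of CONV_MR; the FP16 branch
        // below packs with CONV_MR_FP16 and converts each weight to float16_t on the fly.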
- parallel_for_(Range(0, ngroups * numStripsMR), [&](const Range& r0){ - for (int gsi = r0.start; gsi < r0.end; gsi++) +#ifdef CONV_ARM_FP16 + if (conv->useFP16) { - int g = gsi / numStripsMR; - int si = gsi - g * numStripsMR; + parallel_for_(Range(0, ngroups * numStripsMR_FP16), [&](const Range& r0){ + for (int gsi = r0.start; gsi < r0.end; gsi++) + { + int g = gsi / numStripsMR_FP16; + int si = gsi - g * numStripsMR_FP16; - int startK = si * CONV_MR; - CV_Assert(startK < Kg_aligned); + int startK = si * CONV_MR_FP16; + CV_Assert(startK < Kg_aligned_FP16); - float* packed_wptr = weightsBufPtr + DkHkWkCg * (startK + g * Kg_aligned); - int dk = Kg - startK < CONV_MR ? Kg - startK : CONV_MR; // check if we need zero padding. + float16_t* packed_wptr = weightsBufPtr_FP16 + DkHkWkCg * (startK + g * Kg_aligned_FP16); + int dk = Kg - startK < CONV_MR_FP16 ? Kg - startK : CONV_MR_FP16; // check if we need zero padding. - int k_idx = g*Kg + startK; - for(int hwd = 0; hwd < Hk*Wk*Dk; hwd++) { - for(int c = 0; c < Cg; c++, packed_wptr += CONV_MR) + int k_idx = g*Kg + startK; + for(int hwd = 0; hwd < Hk*Wk*Dk; hwd++) { - const float* wptr = srcWeights + wstep * k_idx + c*Hk*Wk*Dk + hwd; - int k = 0; - for(; k < dk; k++, wptr += wstep) - packed_wptr[k] = *wptr; - for(; k < CONV_MR; k++) - packed_wptr[k] = 0.f; + for(int c = 0; c < Cg; c++, packed_wptr += CONV_MR_FP16) + { + const float* wptr = srcWeights + wstep * k_idx + c*Hk*Wk*Dk + hwd; + int k = 0; + for(; k < dk; k++, wptr += wstep) + packed_wptr[k] = (float16_t)(*wptr); + for(; k < CONV_MR_FP16; k++) + packed_wptr[k] = (float16_t)0.f; + } } - } - }}); + }}); + } + else +#endif + { + parallel_for_(Range(0, ngroups * numStripsMR), [&](const Range& r0){ + for (int gsi = r0.start; gsi < r0.end; gsi++) + { + int g = gsi / numStripsMR; + int si = gsi - g * numStripsMR; + + int startK = si * CONV_MR_FP32; + CV_Assert(startK < Kg_aligned); + + float* packed_wptr = weightsBufPtr + DkHkWkCg * (startK + g * Kg_aligned); + int dk = Kg - startK < CONV_MR_FP32 ? Kg - startK : CONV_MR_FP32; // check if we need zero padding. 
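+                    // dk is the number of real output channels left in this strip; the remaining
+                    // CONV_MR_FP32 - dk slots are filled with zeros in the packing loop below.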
+ + int k_idx = g*Kg + startK; + for(int hwd = 0; hwd < Hk*Wk*Dk; hwd++) + { + for(int c = 0; c < Cg; c++, packed_wptr += CONV_MR_FP32) + { + const float* wptr = srcWeights + wstep * k_idx + c*Hk*Wk*Dk + hwd; + int k = 0; + for(; k < dk; k++, wptr += wstep) + packed_wptr[k] = *wptr; + for(; k < CONV_MR_FP32; k++) + packed_wptr[k] = 0.f; + } + } + }}); + } } else CV_Error(CV_StsUnsupportedFormat, "Unknown convolution type."); @@ -275,75 +394,142 @@ Ptr initFastConv( return conv; } -static inline void packData8(float*& inpbuf, float*& inptrIn, int& in_w, int& x0, int& s0, const int* ofstab, - const int stride_w, const int ksize) +static inline void packData8(char*& inpbuf, float*& inptrIn, int& in_w, int& x0, int& s0, const int* ofstab, + const int stride_w, const int ksize, const int esz) { - float* inpbufC = inpbuf + s0; - float* inptrInC = inptrIn; + char * inpbufC = inpbuf + s0 * esz; + float* inptrInC = (float* )inptrIn; - if (stride_w == 1) - for (int k = 0; k < ksize; k++) +#ifdef CONV_ARM_FP16 + float16_t* inpbufC_FP16 = (float16_t *)inpbufC; + if (esz == sizeof(float16_t)) + { + if (stride_w == 1) { - int k1 = ofstab[k]; - float v0 = inptrInC[k1]; - float v1 = inptrInC[k1 + 1]; - float v2 = inptrInC[k1 + 2]; - float v3 = inptrInC[k1 + 3]; - float v4 = inptrInC[k1 + 4]; - float v5 = inptrInC[k1 + 5]; - float v6 = inptrInC[k1 + 6]; - float v7 = inptrInC[k1 + 7]; - - inpbufC[k*CONV_NR] = v0; - inpbufC[k*CONV_NR+1] = v1; - inpbufC[k*CONV_NR+2] = v2; - inpbufC[k*CONV_NR+3] = v3; - inpbufC[k*CONV_NR+4] = v4; - inpbufC[k*CONV_NR+5] = v5; - inpbufC[k*CONV_NR+6] = v6; - inpbufC[k*CONV_NR+7] = v7; + for (int k = 0; k < ksize; k++) + { + int k1 = ofstab[k]; + + float32x4_t v0 = vld1q_f32(inptrInC + k1); + float32x4_t v1 = vld1q_f32(inptrInC + k1 + 4); + vst1q_f16((__fp16*)inpbufC_FP16 + k * CONV_NR_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1))); + } } - else - for (int k = 0; k < ksize; k++) + else { - int k1 = ofstab[k]; - float v0 = inptrInC[k1]; - float v1 = inptrInC[k1 + stride_w]; - float v2 = inptrInC[k1 + 2*stride_w]; - float v3 = inptrInC[k1 + 3*stride_w]; - float v4 = inptrInC[k1 + 4*stride_w]; - float v5 = inptrInC[k1 + 5*stride_w]; - float v6 = inptrInC[k1 + 6*stride_w]; - float v7 = inptrInC[k1 + 7*stride_w]; - - inpbufC[k*CONV_NR] = v0; - inpbufC[k*CONV_NR+1] = v1; - inpbufC[k*CONV_NR+2] = v2; - inpbufC[k*CONV_NR+3] = v3; - inpbufC[k*CONV_NR+4] = v4; - inpbufC[k*CONV_NR+5] = v5; - inpbufC[k*CONV_NR+6] = v6; - inpbufC[k*CONV_NR+7] = v7; + for (int k = 0; k < ksize; k++) + { + int k1 = ofstab[k]; + float32x4_t v0, v1; + + v0[0] = inptrInC[k1]; + v0[1] = inptrInC[k1 + stride_w]; + v0[2] = inptrInC[k1 + 2*stride_w]; + v0[3] = inptrInC[k1 + 3*stride_w]; + v1[0] = inptrInC[k1 + 4*stride_w]; + v1[1] = inptrInC[k1 + 5*stride_w]; + v1[2] = inptrInC[k1 + 6*stride_w]; + v1[3] = inptrInC[k1 + 7*stride_w]; + + vst1q_f16((__fp16*)inpbufC_FP16 + k * CONV_NR_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1))); + } } + } + else // float 32 +#endif + { + CV_Assert(esz == sizeof(float )); + float* inpbufC_FP32 = (float* )inpbufC; + if (stride_w == 1) + for (int k = 0; k < ksize; k++) + { + int k1 = ofstab[k]; +#if CV_SIMD256 + vx_store(inpbufC_FP32 + k*CONV_NR, vx_load(inptrInC + k1)); +#elif CV_SIMD128 + v_float32x4 vv0 = v_load(inptrInC + k1); + v_float32x4 vv1 = v_load(inptrInC + k1 + 4); + v_store(inpbufC_FP32 + k*CONV_NR_FP32, vv0); + v_store(inpbufC_FP32 + k*CONV_NR_FP32 + 4, vv1); +#else + float v0 = inptrInC[k1]; + float v1 = inptrInC[k1 + 1]; + float v2 = inptrInC[k1 + 2]; + 
float v3 = inptrInC[k1 + 3]; + float v4 = inptrInC[k1 + 4]; + float v5 = inptrInC[k1 + 5]; + float v6 = inptrInC[k1 + 6]; + float v7 = inptrInC[k1 + 7]; + + inpbufC_FP32[k*CONV_NR_FP32] = v0; + inpbufC_FP32[k*CONV_NR_FP32+1] = v1; + inpbufC_FP32[k*CONV_NR_FP32+2] = v2; + inpbufC_FP32[k*CONV_NR_FP32+3] = v3; + inpbufC_FP32[k*CONV_NR_FP32+4] = v4; + inpbufC_FP32[k*CONV_NR_FP32+5] = v5; + inpbufC_FP32[k*CONV_NR_FP32+6] = v6; + inpbufC_FP32[k*CONV_NR_FP32+7] = v7; +#endif + } + else + for (int k = 0; k < ksize; k++) + { + int k1 = ofstab[k]; + float v0 = inptrInC[k1]; + float v1 = inptrInC[k1 + stride_w]; + float v2 = inptrInC[k1 + 2*stride_w]; + float v3 = inptrInC[k1 + 3*stride_w]; + float v4 = inptrInC[k1 + 4*stride_w]; + float v5 = inptrInC[k1 + 5*stride_w]; + float v6 = inptrInC[k1 + 6*stride_w]; + float v7 = inptrInC[k1 + 7*stride_w]; + + inpbufC_FP32[k*CONV_NR_FP32] = v0; + inpbufC_FP32[k*CONV_NR_FP32+1] = v1; + inpbufC_FP32[k*CONV_NR_FP32+2] = v2; + inpbufC_FP32[k*CONV_NR_FP32+3] = v3; + inpbufC_FP32[k*CONV_NR_FP32+4] = v4; + inpbufC_FP32[k*CONV_NR_FP32+5] = v5; + inpbufC_FP32[k*CONV_NR_FP32+6] = v6; + inpbufC_FP32[k*CONV_NR_FP32+7] = v7; + } + } x0+=7; s0+=7; inptrIn += 7*stride_w; in_w += 7*stride_w; } -static inline void packData2(float*& inpbuf, float*& inptrIn, int& in_w, int& x0, int& s0, const int* ofstab, - const int stride_w, const int ksize) +static inline void packData2(char *& inpbuf, float*& inptrIn, int& in_w, int& x0, int& s0, const int* ofstab, + const int stride_w, const int ksize, const int esz) { - float* inpbufC = inpbuf + s0; + char* inpbufC = inpbuf + s0 * esz; float* inptrInC = inptrIn; - for (int k = 0; k < ksize; k++) +#ifdef CONV_ARM_FP16 + float16_t* inpbufC_FP16 = (float16_t *)inpbufC; + if (esz == sizeof(float16_t)) + { + for (int k = 0; k < ksize; k++) + { + int k1 = ofstab[k]; + float v0 = inptrInC[k1]; + float v1 = inptrInC[k1 + stride_w]; + inpbufC_FP16[k*CONV_NR_FP16] = (float16_t)v0; + inpbufC_FP16[k*CONV_NR_FP16+1] = (float16_t)v1; + } + } else +#endif { - int k1 = ofstab[k]; - float v0 = inptrInC[k1]; - float v1 = inptrInC[k1 + stride_w]; - inpbufC[k*CONV_NR] = v0; - inpbufC[k*CONV_NR+1] = v1; + float * inpbufC_FP32 = (float *)inpbufC; + for (int k = 0; k < ksize; k++) + { + int k1 = ofstab[k]; + float v0 = inptrInC[k1]; + float v1 = inptrInC[k1 + stride_w]; + inpbufC_FP32[k*CONV_NR_FP32] = v0; + inpbufC_FP32[k*CONV_NR_FP32+1] = v1; + } } x0++; @@ -352,131 +538,683 @@ static inline void packData2(float*& inpbuf, float*& inptrIn, int& in_w, int& x0 in_w += stride_w; } -void runFastConv(InputArray _input, OutputArray _output, const Ptr& conv, int ntasks, - const Ptr& actLayer, const std::vector& reluslope, bool fusedAdd) +#ifdef CONV_ARM_FP16 +// Fast convert float 32 to float16 +static inline void _cvt32f16f( const float* src, float16_t* dst, int len) { - Mat input = _input.getMat(); - Mat output = _output.getMat(); - int conv_dim = conv->conv_dim; + int j = 0; + const int VECSZ = 4; + __fp16* dst_FP16 = (__fp16 *)dst; + if (len > VECSZ * 4) + { + const int VECSZ4 = 4 * VECSZ; + for( ; j + VECSZ4 < len; j += VECSZ4) + { - CV_Assert_N(input.dims == output.dims, - input.size[0] == output.size[0], - conv->C == input.size[1], - conv->K == output.size[1], - input.type() == output.type(), - input.isContinuous(), - output.isContinuous()); + float32x4_t v0 = vld1q_f32(src + j); + float32x4_t v1 = vld1q_f32(src + j + 4); + float32x4_t v2 = vld1q_f32(src + j + 8); + float32x4_t v3 = vld1q_f32(src + j + 12); - Mat fusedAddMat; - if (fusedAdd) - { - 
CV_Assert(conv->conv_dim != CONV_3D && "Conv3D does not support Conv+Add fusion optimization!"); - fusedAddMat = _output.getMat(); + vst1q_f16(dst_FP16 + j, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1))); + vst1q_f16(dst_FP16 + j + 8, vcombine_f16(vcvt_f16_f32(v2), vcvt_f16_f32(v3))); + } } - if (conv->conv_type == CONV_TYPE_DEPTHWISE) + for( ; j < len; j += VECSZ ) { - // Depthwise-Convolution layer should not be followed by Add layer. - CV_Assert((conv_dim == CONV_1D || conv_dim == CONV_2D)); - return runDepthwise(input, output, conv, actLayer.get(), reluslope, fusedAdd); - } - - MatShape inputShape = shape(input); - MatShape outputShape = shape(output); - - CV_Assert(inputShape.size() == outputShape.size()); + if( j > len - VECSZ ) + { + if( j == 0 ) + break; + j = len - VECSZ; + } - ActivationLayer* activ = nullptr; - float minval = -FLT_MAX, maxval = FLT_MAX; - bool ifMinMaxAct = false; + float16x4_t hv = vcvt_f16_f32(vld1q_f32(src + j)); + vst1_f16(dst_FP16 + j, hv); + } + for( ; j < len; j++ ) + dst[j] = float16_t(src[j]); +} +#endif - if (actLayer) +static inline void packInputData(char* inpbuf_task, float* inp, const int* ofstab, const int* dhwTab, int zyx0, int zyx_limit, + int ksize, int stride_d, int stride_h, int stride_w, int pad_front, int pad_top, int pad_left, + int Dk, int Hk, int Wk, int dilation_d, int dilation_h, int dilation_w, int Di, int Hi, int Wi, + int H0, int W0, int Cg, int stripesize, int inp_plane_ofs, int inp_planesize, int conv_dim, int conv_type, + const int CONV_NR, const int esz, bool fast_1x1, bool useFP16) +{ + for (int stripe = 0; zyx0 < zyx_limit; stripe++, zyx0 += CONV_NR) { - Ptr activ_relu = actLayer.dynamicCast(); - Ptr activ_relu6 = actLayer.dynamicCast(); - - if (!activ_relu.empty()) + char *inpbuf = inpbuf_task + stripe * stripesize * esz; + float *inptr = inp + inp_plane_ofs; + + /* + 1. pack the data. Copy the HkxWk CONV_NR-wide slices from + each feature plane of the input tensor to the input buffer. + */ + if (fast_1x1) { - if (activ_relu->negativeSlope == 0.0f) + int slice_len = zyx_limit - zyx0; + bool partial = slice_len < CONV_NR; + const int CONV_NR_esz = CONV_NR * esz; + // Superfast branch for 1x1 convolutions with sy=sx=1. + // in this case each feature plane can be safely treated + // as 1D array, and we just extract next portion + // of CONV_NR elements from each feature plane and + // put it together. + inptr += zyx0; + if (!partial) { - minval = 0.0f; - ifMinMaxAct = true; - activ = nullptr; + // Make special branch where memcpy() is called with a constant buffer size. + // Compilers will likely unroll this loop properly. 
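+                // In the FP16 path the same per-channel copy also converts the FP32 input to float16_t
+                // via _cvt32f16f; the FP32 path is a plain memcpy of CONV_NR elements.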
+#ifdef CONV_ARM_FP16 + if (useFP16) + { + for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz) + _cvt32f16f(inptr, (float16_t *)inpbuf, CONV_NR); + } + else +#endif + for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz) + memcpy(inpbuf, inptr, CONV_NR_esz); } - else // Leaky ReLU + else { - activ = actLayer.get(); +#ifdef CONV_ARM_FP16 + if (useFP16) + { + for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz) + { + _cvt32f16f(inptr, (float16_t *)inpbuf, slice_len); + memset(inpbuf + slice_len * esz, 0, (CONV_NR - slice_len) * esz); + } + } + else +#endif + for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz) + { + memcpy(inpbuf, inptr, slice_len * esz); + memset(inpbuf + slice_len * esz, 0, (CONV_NR - slice_len) * esz); + } } } - else if (!activ_relu6.empty()) + else if (conv_type == CONV_TYPE_DEPTHWISE_REMAIN) { - minval = activ_relu6->minValue; - maxval = activ_relu6->maxValue; + CV_Assert(Cg == 1); + const int HW0 = H0 * W0; + const int HWi = Hi * Wi; + int slice_len = std::min(zyx_limit - zyx0, CONV_NR); - ifMinMaxAct = true; - activ = nullptr; - } - else - activ = actLayer.get(); - } - else - activ = nullptr; + // here some non-continuous sub-row of the row will not be + // filled from the tensor; we need to make sure that the uncovered + // elements are explicitly set to 0's. the easiest way is to + // set all the elements to 0's before the loop. + memset(inpbuf, 0, stripesize * esz); - if (conv->conv_type == CONV_TYPE_WINOGRAD3X3) // winograd - { - CV_Assert(conv->weightsWinoBufPtr && input.dims == 4 && conv_dim == CONV_2D); - if (runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct)) - return; - } + int z0 = zyx0 / HW0, yx0 = zyx0 - z0 * HW0; + int y0 = yx0 / W0, x0 = yx0 - y0 * W0; - int N = inputShape[0], C = inputShape[1]; + if (conv_dim == CONV_1D) + { + for (int slice_i = 0; slice_i < slice_len; y0++, x0=0) + { + int delta = std::min(slice_len - slice_i, W0 - x0); + int x1 = x0 + delta; - // input shape: [N, C, D, H, W] for Conv3D, [N, C, H, W] for Conv2D, [N, C, W] for Conv1D. - int Di = conv_dim == CONV_3D ? inputShape[2] : 1; - int Hi = conv_dim == CONV_1D ? 1 : inputShape[inputShape.size() - 2]; - int Wi = inputShape[inputShape.size() - 1]; + int in_w = x0 * stride_w - pad_left; + float* inptrIn = inptr + in_w; - int ngroups = conv->ngroups; - int K = conv->K, Dk = conv->Dk, Hk = conv->Hk, Wk = conv->Wk; + int s0 = slice_i; - int D0 = conv_dim == CONV_3D ? outputShape[2] : 1; - int H0 = conv_dim == CONV_1D ? 
1 : outputShape[outputShape.size() - 2]; - int W0 = outputShape[outputShape.size() - 1]; + for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w) + { + // Pack 8 + if (x0 + 8 <= x1 && 0 <= in_w && + in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w) + { + packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz); + } + else if (x0 + 2 <= x1 && 0 <= in_w && + in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w) + { + packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz); + } + else + { + int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w); + int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w); + const float* inptrInC = inptrIn; +#ifdef CONV_ARM_FP16 + if (useFP16) + { + float16_t* inpbufC = (float16_t *)inpbuf + s0; + for (int w = w0; w < w1; w++) + { + int imgofs = w*dilation_w; + inpbufC[w*CONV_NR] = (float16_t)inptrInC[imgofs]; + } + } + else +#endif + { + float* inpbufC = (float *)inpbuf + s0; + for (int w = w0; w < w1; w++) + { + int imgofs = w*dilation_w; + inpbufC[w*CONV_NR] = inptrInC[imgofs]; + } + } + } + } + slice_i += delta; + } + } + else if (conv_dim == CONV_2D) + { + for (int slice_i = 0; slice_i < slice_len; y0++, x0=0) + { + int delta = std::min(slice_len - slice_i, W0 - x0); + int x1 = x0 + delta; - int Cg = C/ngroups, Kg = K/ngroups; + int in_h = y0 * stride_h - pad_top; + int in_w = x0 * stride_w - pad_left; - const size_t inp_planesize = (size_t)Di*Hi*Wi; - const size_t out_planesize = (size_t)D0*H0*W0; + float* inptrIn = inptr + in_h*Wi + in_w; - int pad_front = conv->pad_front; - int pad_top = conv->pad_top; - int pad_left = conv->pad_left; + bool ok_i = 0 <= in_h && in_h < Hi - (Hk-1)*dilation_h; + int h0 = std::max(0, (-in_h + dilation_h-1)/dilation_h); + int h1 = std::min(Hk, (Hi - in_h + dilation_h-1)/dilation_h); - int stride_d = conv->stride_d, stride_h = conv->stride_h, stride_w = conv->stride_w; - int dilation_d = conv->dilation_d, dilation_h = conv->dilation_h, dilation_w = conv->dilation_w; + int s0 = slice_i; + for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w) + { + // Pack 8 + if (ok_i && x0 + 8 <= x1 && 0 <= in_w && + in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w) + { + packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz); + } + else if (ok_i && x0 + 2 <= x1 && 0 <= in_w && + in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w) + { + packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz); + } + else + { + int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w); + int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w); - int ksize = Dk*Hk*Wk; - bool fast_1x1 = ksize == 1 && stride_d == 1 && stride_w == 1 && stride_h == 1 - && pad_front == 0 && pad_left == 0 && pad_top == 0; - int DkHkWkCg = Dk*Hk*Wk*Cg; + const float* inptrInC = inptrIn; +#ifdef CONV_ARM_FP16 + if (useFP16) + { + float16_t* inpbufC = (float16_t *)inpbuf + s0; - std::vector ofstab_(Hk*Wk*Dk*4, 0); - int* ofstab = ofstab_.data(); - int* dhwTab = ofstab + Hk*Wk*Dk; - int padded_ksize = ((ksize + VEC_ALIGN-1) / VEC_ALIGN) * VEC_ALIGN; + for (int h = h0; h < h1; h++) + { + for (int w = w0; w < w1; w++) + { + int imgofs = h*(dilation_h*Wi) + w*dilation_w; + inpbufC[(h*Wk + w)*CONV_NR] = (float16_t)inptrInC[imgofs]; + } + } + } + else +#endif + { + float* inpbufC = (float *)inpbuf + s0; - if (conv_dim == CONV_1D) - { - for( int w = 0; w < Wk; w++) - { - int dw = w*dilation_w; - dhwTab[w*3+2] = dw; - ofstab[w] = dw; - } - } - else if (conv_dim == CONV_2D) - { - for (int h = 0; h < Hk; h++) + for (int h = h0; h 
< h1; h++) + { + for (int w = w0; w < w1; w++) + { + int imgofs = h*(dilation_h*Wi) + w*dilation_w; + inpbufC[(h*Wk + w)*CONV_NR] = inptrInC[imgofs]; + } + } + } + } + } + slice_i += delta; + } + } + else if (conv_dim == CONV_3D) + { + for (int slice_i = 0; slice_i < slice_len; z0 += (y0+1)/H0, y0 = (y0+1)%H0, x0=0) + { + int delta = std::min(slice_len - slice_i, W0 - x0); + int x1 = x0 + delta; + + int in_d = z0 * stride_d - pad_front; + int in_h = y0 * stride_h - pad_top; + int in_w = x0 * stride_w - pad_left; + + float* inptrIn = inptr + in_d*HWi + in_h*Wi + in_w; + + int d0 = std::max(0, (-in_d + dilation_d - 1) / dilation_d); + int d1 = std::min(Dk, (Di - in_d + dilation_d - 1) / dilation_d); + + bool ok_i = 0 <= in_d && in_d < Di - (Dk-1)*dilation_d && + 0 <= in_h && in_h < Hi - (Hk-1)*dilation_h; + int h0 = std::max(0, (-in_h + dilation_h-1)/dilation_h); + int h1 = std::min(Hk, (Hi - in_h + dilation_h-1)/dilation_h); + + int s0 = slice_i; + for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w) + { + // Pack 8 + if (ok_i && x0 + 8 <= x1 && 0 <= in_w && + in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w) + { + packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz); + } + else if (ok_i && x0 + 2 <= x1 && 0 <= in_w && + in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w) + { + packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz); + } + else + { + int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w); + int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w); + const float* inptrInC = inptrIn; +#ifdef CONV_ARM_FP16 + if (useFP16) + { + float16_t* inpbufC = (float16_t* )inpbuf + s0; + + for ( int d = d0; d < d1; d++) + { + for (int h = h0; h < h1; h++) + { + for (int w = w0; w < w1; w++) + { + int imgofs = d*dilation_d*HWi + h*(dilation_h*Wi) + w*dilation_w; + inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = (float16_t)inptrInC[imgofs]; + } + } + } + } + else +#endif + { + float* inpbufC = (float* )inpbuf + s0; + + for ( int d = d0; d < d1; d++) + { + for (int h = h0; h < h1; h++) + { + for (int w = w0; w < w1; w++) + { + int imgofs = d*dilation_d*HWi + h*(dilation_h*Wi) + w*dilation_w; + inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = inptrInC[imgofs]; + } + } + } + } + } + } + slice_i += delta; + } + } + } + else + { + const int HW0 = H0 * W0; + const int HWi = Hi * Wi; + int z0_ = zyx0 / HW0, yx0 = zyx0 - z0_ * HW0; + int y0_ = yx0 / W0, x0_ = yx0 - y0_ * W0; + for (int k = 0; k < ksize; k++) + { + int dz = dhwTab[k * 3], dy = dhwTab[k * 3 + 1], dx = dhwTab[k * 3 + 2]; + int i = 0, z0 = z0_, y0 = y0_, x0 = x0_; + for (; i < CONV_NR;) + { + float* inpbuf_ki = (float* )inpbuf + k * CONV_NR * Cg + i; +#ifdef CONV_ARM_FP16 + float16_t * inpbuf_ki_FP16 = (float16_t *)inpbuf + k * CONV_NR * Cg + i; +#endif + + int zi = z0 * stride_d + dz - pad_front; + int yi = y0 * stride_h + dy - pad_top; + int xi = x0 * stride_w + dx - pad_left; + + if ((unsigned) zi < (unsigned) Di && (unsigned) yi < (unsigned) Hi && + (unsigned) xi < (unsigned) Wi) + { + const float *inptr_ki = inptr + zi * HWi + yi * Wi + xi; + if (i + 8 <= CONV_NR && x0 + 8 <= W0 && xi + stride_w * 8 <= Wi) + { + if (stride_w == 1) + { +#ifdef CONV_ARM_FP16 + if (useFP16) + { + for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize) + { + float32x4_t v0 = vld1q_f32(inptr_ki); + float32x4_t v1 = vld1q_f32(inptr_ki + 4); + + vst1q_f16((__fp16* )inpbuf_ki_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1))); + } + } + else +#endif + for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, 
inptr_ki += inp_planesize) + { + float t0 = inptr_ki[0], t1 = inptr_ki[1]; + float t2 = inptr_ki[2], t3 = inptr_ki[3]; + float t4 = inptr_ki[4], t5 = inptr_ki[5]; + float t6 = inptr_ki[6], t7 = inptr_ki[7]; + inpbuf_ki[0] = t0; + inpbuf_ki[1] = t1; + inpbuf_ki[2] = t2; + inpbuf_ki[3] = t3; + inpbuf_ki[4] = t4; + inpbuf_ki[5] = t5; + inpbuf_ki[6] = t6; + inpbuf_ki[7] = t7; + } + } + else if (stride_w == 2) + { +#ifdef CONV_ARM_FP16 + if (useFP16) + { + for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize) + { + float32x4_t v0, v1; + v0[0] = inptr_ki[0], v0[1] = inptr_ki[2]; + v0[2] = inptr_ki[4], v0[3] = inptr_ki[6]; + v1[0] = inptr_ki[8], v1[1] = inptr_ki[10]; + v1[2] = inptr_ki[12], v1[3] = inptr_ki[14]; + vst1q_f16((__fp16* )inpbuf_ki_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1))); + } + } + else +#endif + for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) + { + float t0 = inptr_ki[0], t1 = inptr_ki[2]; + float t2 = inptr_ki[4], t3 = inptr_ki[6]; + float t4 = inptr_ki[8], t5 = inptr_ki[10]; + float t6 = inptr_ki[12], t7 = inptr_ki[14]; + inpbuf_ki[0] = t0; + inpbuf_ki[1] = t1; + inpbuf_ki[2] = t2; + inpbuf_ki[3] = t3; + inpbuf_ki[4] = t4; + inpbuf_ki[5] = t5; + inpbuf_ki[6] = t6; + inpbuf_ki[7] = t7; + } + } + else + { +#ifdef CONV_ARM_FP16 + if (useFP16) + { + for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize) + { + float32x4_t v0, v1; + + v0[0] = inptr_ki[0], v0[1] = inptr_ki[stride_w]; + v0[2] = inptr_ki[stride_w * 2], v0[3] = inptr_ki[stride_w * 3]; + v1[0] = inptr_ki[stride_w * 4], v1[1] = inptr_ki[stride_w * 5]; + v1[2] = inptr_ki[stride_w * 6], v1[3] = inptr_ki[stride_w * 7]; + vst1q_f16((__fp16* )inpbuf_ki_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1))); + } + } + else +#endif + for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) + { + float t0 = inptr_ki[0], t1 = inptr_ki[stride_w]; + float t2 = inptr_ki[stride_w * 2], t3 = inptr_ki[stride_w * 3]; + float t4 = inptr_ki[stride_w * 4], t5 = inptr_ki[stride_w * 5]; + float t6 = inptr_ki[stride_w * 6], t7 = inptr_ki[stride_w * 7]; + inpbuf_ki[0] = t0; + inpbuf_ki[1] = t1; + inpbuf_ki[2] = t2; + inpbuf_ki[3] = t3; + inpbuf_ki[4] = t4; + inpbuf_ki[5] = t5; + inpbuf_ki[6] = t6; + inpbuf_ki[7] = t7; + } + } + i += 8; + x0 += 8; + } + else if (i + 4 <= CONV_NR && x0 + 4 <= W0 && xi + stride_w * 4 <= Wi) + { + if (stride_w == 1) + { +#ifdef CONV_ARM_FP16 + if (useFP16) + { + for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize) + { + float32x4_t v0 = vld1q_f32(inptr_ki); + vst1_f16((__fp16* )inpbuf_ki_FP16, vcvt_f16_f32(v0)); + } + } + else +#endif + for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) + { + float t0 = inptr_ki[0], t1 = inptr_ki[1]; + float t2 = inptr_ki[2], t3 = inptr_ki[3]; + inpbuf_ki[0] = t0; + inpbuf_ki[1] = t1; + inpbuf_ki[2] = t2; + inpbuf_ki[3] = t3; + } + } + else + { +#ifdef CONV_ARM_FP16 + if (useFP16) + { + for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize) + { + float32x4_t v0; + v0[0] = inptr_ki[0], v0[1] = inptr_ki[stride_w]; + v0[2] = inptr_ki[stride_w * 2], v0[3] = inptr_ki[stride_w * 3]; + vst1_f16((__fp16* )inpbuf_ki_FP16, vcvt_f16_f32(v0)); + } + } + else +#endif + for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) + { + float t0 = inptr_ki[0], t1 = inptr_ki[stride_w]; + float t2 = inptr_ki[stride_w * 2], t3 = inptr_ki[stride_w * 3]; + inpbuf_ki[0] = t0; + 
inpbuf_ki[1] = t1; + inpbuf_ki[2] = t2; + inpbuf_ki[3] = t3; + } + } + i += 4; + x0 += 4; + } + else + { +#ifdef CONV_ARM_FP16 + if (useFP16) + { + for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize) + inpbuf_ki_FP16[0] = (float16_t)(*inptr_ki); + } + else +#endif + for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) + *inpbuf_ki = *inptr_ki; + i++; + x0++; + } + } + else + { +#ifdef CONV_ARM_FP16 + if (useFP16) + { + for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR) + inpbuf_ki_FP16[0] = (float16_t)0.f; + } + else +#endif + for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR) + inpbuf_ki[0] = 0.f; + i++; + x0++; + } + + int mask = x0 >= W0; + y0 += mask; + x0 &= mask - 1; + + mask = y0 >= H0; // Only Conv 3D need jump at z0 dimension + if (mask && conv_dim != CONV_3D) + break; + + z0 += mask; + y0 &= mask - 1; + } + } + } + } +} + +void runFastConv(InputArray _input, OutputArray _output, const Ptr& conv, int ntasks, + const Ptr& actLayer, const std::vector& reluslope, bool fusedAdd) +{ + Mat input = _input.getMat(); + Mat output = _output.getMat(); + int conv_dim = conv->conv_dim; + + CV_Assert_N(input.dims == output.dims, + input.size[0] == output.size[0], + conv->C == input.size[1], + conv->K == output.size[1], + input.type() == output.type(), + input.isContinuous(), + output.isContinuous()); + + const bool useFP16 = conv->useFP16; + Mat fusedAddMat; + if (fusedAdd) + { + CV_Assert(conv->conv_dim != CONV_3D && "Conv3D does not support Conv+Add fusion optimization!"); + fusedAddMat = _output.getMat(); + } + + if (conv->conv_type == CONV_TYPE_DEPTHWISE) + { + // Depthwise-Convolution layer should not be followed by Add layer. + CV_Assert((conv_dim == CONV_1D || conv_dim == CONV_2D) && !useFP16); + return runDepthwise(input, output, conv, actLayer.get(), reluslope, fusedAdd); + } + + MatShape inputShape = shape(input); + MatShape outputShape = shape(output); + + CV_Assert(inputShape.size() == outputShape.size()); + + ActivationLayer* activ = nullptr; + float minval = -FLT_MAX, maxval = FLT_MAX; + bool ifMinMaxAct = false; + + if (actLayer) + { + Ptr activ_relu = actLayer.dynamicCast(); + Ptr activ_relu6 = actLayer.dynamicCast(); + + if (!activ_relu.empty()) + { + if (activ_relu->negativeSlope == 0.0f) + { + minval = 0.0f; + ifMinMaxAct = true; + activ = nullptr; + } + else // Leaky ReLU + { + activ = actLayer.get(); + } + } + else if (!activ_relu6.empty()) + { + minval = activ_relu6->minValue; + maxval = activ_relu6->maxValue; + + ifMinMaxAct = true; + activ = nullptr; + } + else + activ = actLayer.get(); + } + else + activ = nullptr; + + // TODO: support FP16 for winograd. + if (conv->conv_type == CONV_TYPE_WINOGRAD3X3) // winograd + { + CV_Assert(conv->weightsWinoBufPtr && input.dims == 4 && conv_dim == CONV_2D && !useFP16); + if (runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct)) + return; + } + + int N = inputShape[0], C = inputShape[1]; + + // input shape: [N, C, D, H, W] for Conv3D, [N, C, H, W] for Conv2D, [N, C, W] for Conv1D. + int Di = conv_dim == CONV_3D ? inputShape[2] : 1; + int Hi = conv_dim == CONV_1D ? 1 : inputShape[inputShape.size() - 2]; + int Wi = inputShape[inputShape.size() - 1]; + + int ngroups = conv->ngroups; + int K = conv->K, Dk = conv->Dk, Hk = conv->Hk, Wk = conv->Wk; + + int D0 = conv_dim == CONV_3D ? outputShape[2] : 1; + int H0 = conv_dim == CONV_1D ? 
1 : outputShape[outputShape.size() - 2]; + int W0 = outputShape[outputShape.size() - 1]; + + int Cg = C/ngroups, Kg = K/ngroups; + + const size_t inp_planesize = (size_t)Di*Hi*Wi; + const size_t out_planesize = (size_t)D0*H0*W0; + + int pad_front = conv->pad_front; + int pad_top = conv->pad_top; + int pad_left = conv->pad_left; + + int stride_d = conv->stride_d, stride_h = conv->stride_h, stride_w = conv->stride_w; + int dilation_d = conv->dilation_d, dilation_h = conv->dilation_h, dilation_w = conv->dilation_w; + + int ksize = Dk*Hk*Wk; + bool fast_1x1 = ksize == 1 && stride_d == 1 && stride_w == 1 && stride_h == 1 + && pad_front == 0 && pad_left == 0 && pad_top == 0; + int DkHkWkCg = Dk*Hk*Wk*Cg; + + std::vector ofstab_(Hk*Wk*Dk*4, 0); + int* ofstab = ofstab_.data(); + int* dhwTab = ofstab + Hk*Wk*Dk; + int padded_ksize = ((ksize + VEC_ALIGN-1) / VEC_ALIGN) * VEC_ALIGN; + + if (conv_dim == CONV_1D) + { + for( int w = 0; w < Wk; w++) + { + int dw = w*dilation_w; + dhwTab[w*3+2] = dw; + ofstab[w] = dw; + } + } + else if (conv_dim == CONV_2D) + { + for (int h = 0; h < Hk; h++) for( int w = 0; w < Wk; w++) { int k = h*Wk + w; @@ -503,45 +1241,132 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co } } + int CONV_NR = CONV_NR_FP32; + int CONV_MR = CONV_MR_FP32; + int esz = sizeof(float ); + +#ifdef CONV_ARM_FP16 + if (useFP16) + { + // works at FP 16. + CONV_NR = CONV_NR_FP16; + CONV_MR = CONV_MR_FP16; + esz = sizeof(float16_t); + } +#endif + int MAX_STRIPES = conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? 1 : (56 + CONV_NR - 1)/CONV_NR; // Friendly to L1 cache const int K_BLOCK_SIZE = conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? 1 : 32; const int C_BLOCK_SIZE = 256; - int Kg_nblocks = (Kg + CONV_MR-1)/CONV_MR, Kg_aligned = Kg_nblocks * CONV_MR; + int Kg_nblocks = (Kg + CONV_MR-1)/CONV_MR; + int Kg_aligned = Kg_nblocks * CONV_MR; - int stripes_per_sample = ((int)out_planesize + CONV_NR - 1) / CONV_NR; + int stripes_per_plane0 = ((int)out_planesize + CONV_NR - 1) / CONV_NR; + int stripes_per_plane = stripes_per_plane0; - if (stripes_per_sample < ntasks * 4 && conv->conv_type != CONV_TYPE_DEPTHWISE_REMAIN) + if (stripes_per_plane < ntasks * 4 || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN) { MAX_STRIPES = 1; - stripes_per_sample = 1; + stripes_per_plane = 1; } else Kg_nblocks = 1; - int Kstripes = Kg_nblocks*stripes_per_sample; - int nsubtasks = N*ngroups*Kstripes; + bool seperateIm2col = fast_1x1 || stripes_per_plane == 1; + + int Kstripes = Kg_nblocks * stripes_per_plane; + int nsubtasks = N * ngroups * Kstripes; + + size_t stripesize = alignSize(CONV_NR * ksize * Cg, VEC_ALIGN); + size_t cbufsize = alignSize(CONV_NR * K_BLOCK_SIZE * MAX_STRIPES, VEC_ALIGN); + + size_t taskbufsize = cbufsize * sizeof(float ); + + if (!seperateIm2col) + taskbufsize += MAX_STRIPES * stripesize * esz; - size_t stripesize = CONV_NR * ksize * Cg; - size_t taskbufsize = (stripesize + CONV_NR * K_BLOCK_SIZE) * MAX_STRIPES; - size_t totalbufsize = taskbufsize * ntasks; + size_t totalbufsize_base = taskbufsize * ntasks; + size_t totalbufsize = totalbufsize_base; + if (seperateIm2col) + totalbufsize += N * ngroups * stripes_per_plane0 * stripesize * esz; - AutoBuffer inpbuf_all_; - totalbufsize = alignSize(totalbufsize, VEC_ALIGN); - inpbuf_all_.allocate(totalbufsize + VEC_ALIGN); - float* inpbuf_all = alignPtr(inpbuf_all_.data(), (int)(VEC_ALIGN*sizeof(inpbuf_all_[0]))); + AutoBuffer inpbuf_all_; + char* inpbuf_all = nullptr; + + inpbuf_all_.allocate(totalbufsize + VEC_ALIGN * sizeof(float 
)); + inpbuf_all = alignPtr(inpbuf_all_.data(), (int)(VEC_ALIGN * sizeof(float ))); + char* inpbuf_all_0 = inpbuf_all + totalbufsize_base; float* inp = input.ptr(); float* out = output.ptr(); float* fusedAddPtr0 = fusedAddMat.empty() ? 0 : fusedAddMat.ptr(); + // In the case of 1x1 convolution we first reorder the whole input tensor. + // In general, im2row results in Hk*Wk-x unrolling factor + // (e.g. 3*3=9x unrolling for 3x3 convolution), thus for 1x1 convolution + // the reordered tensor will take as much space as the original tensor. + if (seperateIm2col) + { + // the optional phase 1. im2row + parallel_for_(Range(0, ntasks), [&](const Range& r0) { + for (int task_id = r0.start; task_id < r0.end; task_id++) + { + if (fast_1x1) + { + int nc0 = task_id*N*C/ntasks, nc1 = (task_id+1)*N*C/ntasks, dc = 0; + for (; nc0 < nc1; nc0 += dc) + { + int n = nc0/C, c0 = nc0 - n*C; + int g = c0 / Cg; + c0 -= g*Cg; + dc = Cg - c0 <= nc1 - nc0 ? Cg - c0 : nc1 - nc0; + + float * inptr_ = inp + (size_t)nc0*inp_planesize; + char* inpbuf_ = inpbuf_all_0 + ((n*ngroups + g)*stripes_per_plane0*stripesize + c0*CONV_NR)*esz; + + packInputData(inpbuf_, inptr_, ofstab, dhwTab, 0, out_planesize, ksize, stride_d, stride_h, + stride_w, pad_front, pad_top, pad_left, Dk, Hk, Wk, dilation_d, dilation_h, dilation_w, + Di, Hi, Wi, H0, W0, dc, stripesize, 0, inp_planesize, conv->conv_dim, + conv->conv_type, CONV_NR, esz, fast_1x1, useFP16); + } + } + else + { + const int allTasks = N * ngroups * stripes_per_plane0; + int ngs0 = task_id*allTasks/ntasks, ngs1 = (task_id+1)*allTasks/ntasks, ds = 0; + + for (; ngs0 < ngs1; ngs0 += ds) + { + int n = ngs0 / (ngroups * stripes_per_plane0), gs0 = ngs0 - n*ngroups*stripes_per_plane0; + int g = gs0 / stripes_per_plane0, s0 = gs0 - g*stripes_per_plane0; + + ds = stripes_per_plane0 - s0 <= ngs1 - ngs0 ? stripes_per_plane0 - s0 : ngs1 - ngs0; + + int zyx = s0 * CONV_NR; + int zyx_limit = (s0 + ds) * CONV_NR < out_planesize ? (s0 + ds) * CONV_NR : out_planesize; + + float * inptr_ = inp + (size_t)(n * ngroups + g) * Cg * inp_planesize; + char* inpbuf_ = inpbuf_all_0 + ((n * ngroups + g) * stripes_per_plane0 * stripesize + s0 * stripesize) * esz; + + packInputData(inpbuf_, inptr_, ofstab, dhwTab, zyx, zyx_limit, ksize, stride_d, stride_h, + stride_w, pad_front, pad_top, pad_left, Dk, Hk, Wk, dilation_d, dilation_h, dilation_w, + Di, Hi, Wi, H0, W0, Cg, stripesize, 0, inp_planesize, conv->conv_dim, + conv->conv_type, CONV_NR, esz, fast_1x1, useFP16); + } + } + } + }); + } + + // Compute parallel_for_(Range(0, ntasks), [&](const Range& r0) { for (int task_id = r0.start; task_id < r0.end; task_id++) { - float* inpbuf_task = &inpbuf_all[taskbufsize * task_id]; - float* cbuf_task = inpbuf_task + stripesize * MAX_STRIPES; + float * cbuf_task = (float *)(inpbuf_all + taskbufsize * task_id); + char * inpbuf_task = (char*)(cbuf_task + cbufsize); int ngs0 = (int)((size_t)nsubtasks * task_id / ntasks); int ngs1 = (int)((size_t)nsubtasks * (task_id+1) / ntasks); @@ -557,7 +1382,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co int k0, k1; int zyx0, zyx_limit, zyx_block_limit = 0; - if (stripes_per_sample == 1 && conv->conv_type != CONV_TYPE_DEPTHWISE_REMAIN) + if (stripes_per_plane == 1 || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN) { k0 = kzyx0 * CONV_MR; k1 = kzyx1 * CONV_MR; @@ -581,380 +1406,75 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co zyx_block_limit = zyx_block_limit < zyx_limit ? 
zyx_block_limit : zyx_limit; int nstripes = (zyx_block_limit - zyx0 + CONV_NR - 1) / CONV_NR; - int zyx0_saved = zyx0; CV_Assert(nstripes <= MAX_STRIPES); - for (int stripe = 0; zyx0 < zyx_block_limit; stripe++, zyx0 += CONV_NR) + if (!seperateIm2col) { - float *inpbuf = inpbuf_task + stripe * stripesize; - float *inptr = inp + inp_plane_ofs; - - /* - 1. pack the data. Copy the HkxWk CONV_NR-wide slices from - each feature plane of the input tensor to the input buffer. - */ - if (fast_1x1) - { - int slice_len = zyx_block_limit - zyx0; - bool partial = slice_len < CONV_NR; - // Superfast branch for 1x1 convolutions with sy=sx=1. - // in this case each feature plane can be safely treated - // as 1D array, and we just extract next portion - // of CONV_NR elements from each feature plane and - // put it together. - inptr += zyx0; - if (!partial) - { - // Make special branch where memcpy() is called with a constant buffer size. - // Compilers will likely unroll this loop properly. - for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR) - memcpy(inpbuf, inptr, CONV_NR * sizeof(inpbuf[0])); - } - else - { - for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR) - { - memcpy(inpbuf, inptr, slice_len * sizeof(inpbuf[0])); - memset(inpbuf + slice_len, 0, (CONV_NR - slice_len) * sizeof(inpbuf[0])); - } - } - } - else if (conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN) - { - CV_Assert(Cg == 1); - const int HW0 = H0 * W0; - const int HWi = Hi * Wi; - int slice_len = std::min(zyx_block_limit - zyx0, CONV_NR); - - // here some non-continuous sub-row of the row will not be - // filled from the tensor; we need to make sure that the uncovered - // elements are explicitly set to 0's. the easiest way is to - // set all the elements to 0's before the loop. 
- memset(inpbuf, 0, stripesize*sizeof(inpbuf[0])); - - int z0 = zyx0 / HW0, yx0 = zyx0 - z0 * HW0; - int y0 = yx0 / W0, x0 = yx0 - y0 * W0; - - if (conv_dim == CONV_1D) - { - for (int slice_i = 0; slice_i < slice_len; y0++, x0=0) - { - int delta = std::min(slice_len - slice_i, W0 - x0); - int x1 = x0 + delta; - - int in_w = x0 * stride_w - pad_left; - float* inptrIn = inptr + in_w; - - int s0 = slice_i; - - for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w) - { - // Pack 8 - if (x0 + 8 <= x1 && 0 <= in_w && - in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w) - { - packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize); - } - else if (x0 + 2 <= x1 && 0 <= in_w && - in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w) - { - packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize); - } - else - { - int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w); - int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w); - - float* inpbufC = inpbuf + s0; - float* inptrInC = inptrIn; - for (int w = w0; w < w1; w++) - { - int imgofs = w*dilation_w; - inpbufC[w*CONV_NR] = inptrInC[imgofs]; - } - } - } - slice_i += delta; - } - } - else if (conv_dim == CONV_2D) - { - for (int slice_i = 0; slice_i < slice_len; y0++, x0=0) - { - int delta = std::min(slice_len - slice_i, W0 - x0); - int x1 = x0 + delta; - - int in_h = y0 * stride_h - pad_top; - int in_w = x0 * stride_w - pad_left; - - float* inptrIn = inptr + in_h*Wi + in_w; - - bool ok_i = 0 <= in_h && in_h < Hi - (Hk-1)*dilation_h; - int h0 = std::max(0, (-in_h + dilation_h-1)/dilation_h); - int h1 = std::min(Hk, (Hi - in_h + dilation_h-1)/dilation_h); - - int s0 = slice_i; - for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w) - { - // Pack 8 - if (ok_i && x0 + 8 <= x1 && 0 <= in_w && - in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w) - { - packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize); - } - else if (ok_i && x0 + 2 <= x1 && 0 <= in_w && - in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w) - { - packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize); - } - else - { - int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w); - int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w); - - float* inpbufC = inpbuf + s0; - float* inptrInC = inptrIn; - - for (int h = h0; h < h1; h++) - { - for (int w = w0; w < w1; w++) - { - int imgofs = h*(dilation_h*Wi) + w*dilation_w; - inpbufC[(h*Wk + w)*CONV_NR] = inptrInC[imgofs]; - } - } - } - } - slice_i += delta; - } - } - else if (conv_dim == CONV_3D) - { - for (int slice_i = 0; slice_i < slice_len; z0 += (y0+1)/H0, y0 = (y0+1)%H0, x0=0) - { - int delta = std::min(slice_len - slice_i, W0 - x0); - int x1 = x0 + delta; - - int in_d = z0 * stride_d - pad_front; - int in_h = y0 * stride_h - pad_top; - int in_w = x0 * stride_w - pad_left; - - float* inptrIn = inptr + in_d*HWi + in_h*Wi + in_w; - - int d0 = std::max(0, (-in_d + dilation_d - 1) / dilation_d); - int d1 = std::min(Dk, (Di - in_d + dilation_d - 1) / dilation_d); - - bool ok_i = 0 <= in_d && in_d < Di - (Dk-1)*dilation_d && - 0 <= in_h && in_h < Hi - (Hk-1)*dilation_h; - int h0 = std::max(0, (-in_h + dilation_h-1)/dilation_h); - int h1 = std::min(Hk, (Hi - in_h + dilation_h-1)/dilation_h); - - int s0 = slice_i; - for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w) - { - // Pack 8 - if (ok_i && x0 + 8 <= x1 && 0 <= in_w && - in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w) - { - packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize); - } - else if (ok_i && x0 + 2 <= x1 && 0 <= 
in_w && - in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w) - { - packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize); - } - else - { - int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w); - int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w); - - float* inpbufC = inpbuf + s0; - float* inptrInC = inptrIn; - - for ( int d = d0; d < d1; d++) - { - for (int h = h0; h < h1; h++) - { - for (int w = w0; w < w1; w++) - { - int imgofs = d*dilation_d*HWi + h*(dilation_h*Wi) + w*dilation_w; - inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = inptrInC[imgofs]; - } - } - } - } - } - slice_i += delta; - } - } - } - else - { - const int HW0 = H0 * W0; - const int HWi = Hi * Wi; - int z0_ = zyx0 / HW0, yx0 = zyx0 - z0_ * HW0; - int y0_ = yx0 / W0, x0_ = yx0 - y0_ * W0; - for (int k = 0; k < ksize; k++) - { - int dz = dhwTab[k * 3], dy = dhwTab[k * 3 + 1], dx = dhwTab[k * 3 + 2]; - int i = 0, z0 = z0_, y0 = y0_, x0 = x0_; - for (; i < CONV_NR;) - { - float *inpbuf_ki = inpbuf + k * CONV_NR * Cg + i; - int zi = z0 * stride_d + dz - pad_front; - int yi = y0 * stride_h + dy - pad_top; - int xi = x0 * stride_w + dx - pad_left; - - if ((unsigned) zi < (unsigned) Di && (unsigned) yi < (unsigned) Hi && - (unsigned) xi < (unsigned) Wi) - { - const float *inptr_ki = inptr + zi * HWi + yi * Wi + xi; - if (i + 8 <= CONV_NR && x0 + 8 <= W0 && xi + stride_w * 8 <= Wi) - { - if (stride_w == 1) - { - for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) - { - float t0 = inptr_ki[0], t1 = inptr_ki[1]; - float t2 = inptr_ki[2], t3 = inptr_ki[3]; - float t4 = inptr_ki[4], t5 = inptr_ki[5]; - float t6 = inptr_ki[6], t7 = inptr_ki[7]; - inpbuf_ki[0] = t0; - inpbuf_ki[1] = t1; - inpbuf_ki[2] = t2; - inpbuf_ki[3] = t3; - inpbuf_ki[4] = t4; - inpbuf_ki[5] = t5; - inpbuf_ki[6] = t6; - inpbuf_ki[7] = t7; - } - } - else if (stride_w == 2) - { - for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) - { - float t0 = inptr_ki[0], t1 = inptr_ki[2]; - float t2 = inptr_ki[4], t3 = inptr_ki[6]; - float t4 = inptr_ki[8], t5 = inptr_ki[10]; - float t6 = inptr_ki[12], t7 = inptr_ki[14]; - inpbuf_ki[0] = t0; - inpbuf_ki[1] = t1; - inpbuf_ki[2] = t2; - inpbuf_ki[3] = t3; - inpbuf_ki[4] = t4; - inpbuf_ki[5] = t5; - inpbuf_ki[6] = t6; - inpbuf_ki[7] = t7; - } - } - else - { - for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) - { - float t0 = inptr_ki[0], t1 = inptr_ki[stride_w]; - float t2 = inptr_ki[stride_w * 2], t3 = inptr_ki[stride_w * 3]; - float t4 = inptr_ki[stride_w * 4], t5 = inptr_ki[stride_w * 5]; - float t6 = inptr_ki[stride_w * 6], t7 = inptr_ki[stride_w * 7]; - inpbuf_ki[0] = t0; - inpbuf_ki[1] = t1; - inpbuf_ki[2] = t2; - inpbuf_ki[3] = t3; - inpbuf_ki[4] = t4; - inpbuf_ki[5] = t5; - inpbuf_ki[6] = t6; - inpbuf_ki[7] = t7; - } - } - i += 8; - x0 += 8; - } - else if (i + 4 <= CONV_NR && x0 + 4 <= W0 && xi + stride_w * 4 <= Wi) - { - if (stride_w == 1) - { - for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) - { - float t0 = inptr_ki[0], t1 = inptr_ki[1]; - float t2 = inptr_ki[2], t3 = inptr_ki[3]; - inpbuf_ki[0] = t0; - inpbuf_ki[1] = t1; - inpbuf_ki[2] = t2; - inpbuf_ki[3] = t3; - } - } - else - { - for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) - { - float t0 = inptr_ki[0], t1 = inptr_ki[stride_w]; - float t2 = inptr_ki[stride_w * 2], t3 = inptr_ki[stride_w * 3]; - inpbuf_ki[0] = t0; - inpbuf_ki[1] = t1; - inpbuf_ki[2] = t2; - inpbuf_ki[3] = t3; - } - } - i += 4; - x0 += 4; - } - else - 
{ - for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) - *inpbuf_ki = *inptr_ki; - i++; - x0++; - } - } - else - { - for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR) - inpbuf_ki[0] = 0.f; - i++; - x0++; - } - - int mask = x0 >= W0; - y0 += mask; - x0 &= mask - 1; - - mask = y0 >= H0; - z0 += mask; - y0 &= mask - 1; - } - } - } + packInputData(inpbuf_task, inp, ofstab, dhwTab, zyx0, zyx_block_limit, ksize, stride_d, stride_h, + stride_w, pad_front, pad_top, pad_left, Dk, Hk, Wk, dilation_d, dilation_h, dilation_w, + Di, Hi, Wi, H0, W0, Cg, stripesize, inp_plane_ofs, inp_planesize, conv->conv_dim, + conv->conv_type, CONV_NR, esz, fast_1x1, useFP16); } - zyx0 = zyx0_saved; - - // spacial branch for depth-wise convolution implemented using generic convolution. - // In this case, CONV_MR is 1, and CONV_NR is the same. + char *weights = nullptr; +#ifdef CONV_ARM_FP16 + if (useFP16) + { + CV_Assert(!conv->weightsBuf_FP16.empty()); + weights = (char *)conv->weightsBufPtr_FP16; + } + else +#endif + { + CV_Assert(!conv->weightsBuf.empty()); + weights = (char *)conv->weightsBufPtr; + } + // optional branch, only for depth-wise convolution which was implemented by generic convolution. + // In this case, CONV_MR is 1, and CONV_NR remains the same. if (conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN) { + CV_Assert(weights); size_t outofs = (n * ngroups + g) * out_planesize + zyx0; float *cptr0 = cbuf_task; - float *weights = conv->weightsBufPtr + g * padded_ksize; + weights += g * padded_ksize * esz; + int out_width = zyx_block_limit - zyx0; float *outptr = out + outofs; const float biasVal = *(conv->biasBuf.data() + g); + const char *inptr_ = seperateIm2col ? inpbuf_all_0 + (ng*stripes_per_plane0 + zyx0/CONV_NR) * stripesize * esz: + inpbuf_task; + for (int stripe = 0; stripe < nstripes; stripe++) { - const float *inptr = inpbuf_task + stripe * stripesize; + const char *inptr = inptr_ + stripe * stripesize * esz; const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR); bool ifBuffer = outLen < CONV_NR; float *cptr = outptr + stripe * CONV_NR; if (ifBuffer) { - memcpy(cptr0, cptr, outLen * sizeof(cptr[0])); + memcpy(cptr0, cptr, outLen * sizeof(float )); cptr = cptr0; } - - convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR); +#if CV_NEON && CV_NEON_AARCH64 + if (conv->useNEON) + { +#ifdef CONV_ARM_FP16 + if (useFP16) + { + opt_NEON::convBlockMR1_FP16(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR); + } + else +#endif + opt_NEON::convBlockMR1_F32(DkHkWkCg, (const float *)weights, (const float *)inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR); + } + else +#endif + convBlockMR1(DkHkWkCg, (const float *)weights, (const float *)inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR); if (ifBuffer) { - memcpy(outptr + stripe * CONV_NR, cptr, outLen * sizeof(cptr[0])); + memcpy(outptr + stripe * CONV_NR, cptr, outLen * sizeof(float )); } } if (activ) @@ -962,7 +1482,9 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co continue; } - float *weights = conv->weightsBufPtr + g * Kg_aligned * DkHkWkCg; + CV_Assert(weights); + weights += g * Kg_aligned * DkHkWkCg * esz; + const float *biasptr = conv->biasBuf.data() + Kg * g; int ldc = nstripes * CONV_NR; @@ -974,109 +1496,196 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co for (int c0 = 0; c0 < DkHkWkCg; c0 += C_BLOCK_SIZE) 
{ int c1 = c0 + C_BLOCK_SIZE < DkHkWkCg ? c0 + C_BLOCK_SIZE : DkHkWkCg; - for (int stripe = 0; stripe < nstripes; stripe++) + const char *inptr = seperateIm2col ? inpbuf_all_0 + (ng*stripes_per_plane0 + zyx0/CONV_NR)*stripesize*esz: + inpbuf_task; + inptr += (c0 * CONV_NR) * esz; + for (int stripe = 0; stripe < nstripes; stripe++, inptr += stripesize * esz) { const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR); -#if CV_TRY_AVX || CV_TRY_AVX2 || CV_NEON - // The possible CONV_NR is 28, 24, 12, so the possible CONV_NR/3 is 9, 8, 4. - bool runOpt = outLen > std::min(8, CONV_NR/3); -#endif - float *wptr = weights + k0_block * DkHkWkCg + c0 * CONV_MR; - const float *inptr = inpbuf_task + stripe * stripesize + c0 * CONV_NR; + char *wptr = weights + (k0_block * DkHkWkCg + c0 * CONV_MR) * esz; float *cptr = cbuf_task + stripe * CONV_NR; + float16_t* cptr_f16 = (float16_t*)cbuf_task + stripe*CONV_NR; for (int k = k0_block; k < k1_block; k += CONV_MR, - wptr += DkHkWkCg * CONV_MR, cptr += CONV_MR * ldc) + wptr += DkHkWkCg * CONV_MR * esz, cptr += CONV_MR * ldc, cptr_f16 += CONV_MR * ldc) { #if CV_TRY_AVX2 - if (conv->useAVX2 && runOpt) - opt_AVX2::convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, CONV_MR, CONV_NR); + if (conv->useAVX2) + opt_AVX2::convBlock(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR); else #endif #if CV_TRY_AVX - if (conv->useAVX && runOpt) - opt_AVX::convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, CONV_MR, CONV_NR); + if (conv->useAVX) + opt_AVX::convBlock(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR); else #endif #if CV_NEON - if (conv->useNEON && runOpt) - opt_NEON::convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, CONV_MR, CONV_NR); + if (conv->useNEON) + { +#ifdef CONV_ARM_FP16 + if (useFP16) + { + opt_NEON::convBlock_FP16(c1 - c0, wptr, inptr, (char *)cptr_f16, ldc, c0 == 0, outLen, CONV_MR, CONV_NR); + } + else +#endif + opt_NEON::convBlock(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR); + } else #endif // The possible outLen range is 24 or 8~1. - convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR); + convBlock(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR); } } } size_t outofs = ((n * ngroups + g) * Kg + k0_block) * out_planesize + zyx0; const float *cptr = cbuf_task; - + const float16_t *cptr_fp16 = (const float16_t *)cbuf_task; float *outptr = out + outofs; const float *pbptr = fusedAddPtr0 ? fusedAddPtr0 + outofs : 0; for (int k = k0_block; k < k1_block; k++, - cptr += ldc, outptr += out_planesize, - pbptr += (pbptr ? out_planesize : 0)) { + cptr += ldc, cptr_fp16 += ldc, outptr += out_planesize, + pbptr += (pbptr ? 
out_planesize : 0)) + { float biasval = biasptr[k]; int j = 0; -#if CV_SIMD128 - v_float32x4 vbias = v_setall_f32(biasval); - v_float32x4 vmax = v_setall_f32(maxval); - v_float32x4 vmin = v_setall_f32(minval); - if (pbptr) +#ifdef CONV_ARM_FP16 + if (useFP16) { - for (; j + 7 < out_width; j += 8) + float32x4_t vbias = vdupq_n_f32(biasval); + float32x4_t vmax = vdupq_n_f32(maxval); + float32x4_t vmin = vdupq_n_f32(minval); + if (pbptr) + { + for (; j + 7 < out_width; j += 8) + { + float32x4_t v0 = vcvt_f32_f16(vld1_f16((const __fp16 *)cptr_fp16 + j)) + vbias; + float32x4_t v1 = vcvt_f32_f16(vld1_f16((const __fp16 *)cptr_fp16 + + j + 4)) + vbias; + + v0 += vld1q_f32(pbptr + j); + v1 += vld1q_f32(pbptr + j + 4); + + if (ifMinMaxAct) + { + v0 = vminq_f32(vmaxq_f32(v0, vmin), vmax); + v1 = vminq_f32(vmaxq_f32(v1, vmin), vmax); + } + + vst1q_f32(outptr + j, v0); + vst1q_f32(outptr + j + 4, v1); + } + } + else { - v_float32x4 v0 = v_add(v_load(cptr + j), vbias); - v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias); - v0 = v_add(v0, v_load(pbptr + j)); - v1 = v_add(v1, v_load(pbptr + j + 4)); + for (; j + 7 < out_width; j += 8) + { + float32x4_t v0 = vcvt_f32_f16(vld1_f16((const __fp16 *)cptr_fp16 + j)) + vbias; + float32x4_t v1 = vcvt_f32_f16(vld1_f16((const __fp16 *)cptr_fp16 + j + 4)) + vbias; + + if (ifMinMaxAct) + { + v0 = vminq_f32(vmaxq_f32(v0, vmin), vmax); + v1 = vminq_f32(vmaxq_f32(v1, vmin), vmax); + } + + vst1q_f32(outptr + j, v0); + vst1q_f32(outptr + j + 4, v1); + } + } - if (ifMinMaxAct) + if (pbptr) + { + for (; j < out_width; j++) { - v0 = v_min(v_max(v0, vmin), vmax); - v1 = v_min(v_max(v1, vmin), vmax); + float v = (float )cptr_fp16[j] + biasval; + v += pbptr[j]; + if (ifMinMaxAct) + v = std::min(std::max(v, minval), maxval); + outptr[j] = v; } + } + else + { + for (; j < out_width; j++) + { + float v = (float )cptr_fp16[j] + biasval; - v_store(outptr + j, v0); - v_store(outptr + j + 4, v1); + if (ifMinMaxAct) + v = std::min(std::max(v, minval), maxval); + outptr[j] = v; + } } } else +#endif { - for (; j + 7 < out_width; j += 8) - { - v_float32x4 v0 = v_add(v_load(cptr + j), vbias); - v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias); +#if CV_SIMD128 + v_float32x4 vbias = v_setall_f32(biasval); + v_float32x4 vmax = v_setall_f32(maxval); + v_float32x4 vmin = v_setall_f32(minval); - if (ifMinMaxAct) + if (pbptr) + { + for (; j + 7 < out_width; j += 8) { - v0 = v_min(v_max(v0, vmin), vmax); - v1 = v_min(v_max(v1, vmin), vmax); + v_float32x4 v0 = v_add(v_load(cptr + j), vbias); + v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias); + + v0 = v_add(v0, v_load(pbptr + j)); + v1 = v_add(v1, v_load(pbptr + j + 4)); + + if (ifMinMaxAct) + { + v0 = v_min(v_max(v0, vmin), vmax); + v1 = v_min(v_max(v1, vmin), vmax); + } + + v_store(outptr + j, v0); + v_store(outptr + j + 4, v1); } + } + else + { + for (; j + 7 < out_width; j += 8) + { + v_float32x4 v0 = v_add(v_load(cptr + j), vbias); + v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias); + + if (ifMinMaxAct) + { + v0 = v_min(v_max(v0, vmin), vmax); + v1 = v_min(v_max(v1, vmin), vmax); + } - v_store(outptr + j, v0); - v_store(outptr + j + 4, v1); + v_store(outptr + j, v0); + v_store(outptr + j + 4, v1); + } } - } #endif - if (pbptr) { - for (; j < out_width; j++) { - float v = cptr[j] + biasval; - v += pbptr[j]; - if (ifMinMaxAct) - v = std::min(std::max(v, minval), maxval); - outptr[j] = v; + if (pbptr) + { + for (; j < out_width; j++) + { + float v = cptr[j] + biasval; + v += pbptr[j]; + if (ifMinMaxAct) + v = std::min(std::max(v, 
minval), maxval); + outptr[j] = v; + } } - } else { - for (; j < out_width; j++) { - float v = cptr[j] + biasval; + else + { + for (; j < out_width; j++) + { + float v = cptr[j] + biasval; - if (ifMinMaxAct) - v = std::min(std::max(v, minval), maxval); - outptr[j] = v; + if (ifMinMaxAct) + v = std::min(std::max(v, minval), maxval); + outptr[j] = v; + } } } @@ -1095,7 +1704,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co SIMD and no-SIMD code for convBlock \****************************************************************************************/ -static void convBlockMR1NoSIMD(int np, const float* a, const float* b, float *c, const float bias, bool init_c, +static inline void convBlockMR1NoSIMD(int np, const float* a, const float* b, float *c, const float bias, bool init_c, const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR) { std::vector cbuffer(outLen, 0); @@ -1128,64 +1737,8 @@ static void convBlockMR1NoSIMD(int np, const float* a, const float* b, float *c, } #if CV_SIMD128 -static void convBlockMR1x28(int np, const float* a, const float* b, float *c, const float bias, bool init_c, - const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR) -{ - CV_Assert(convNR == 28); - v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0; - v_float32x4 c3 = c0, c4 = c0, c5 = c0; - v_float32x4 c6 = c0; - - for (int p = 0; p < np; p++, a++, b += convNR) - { - v_float32x4 a0 = v_setall_f32(a[0]); - v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8); - v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20); - v_float32x4 b6 = v_load(b + 24); - - c0 = v_fma(b0, a0, c0); - c1 = v_fma(b1, a0, c1); - c2 = v_fma(b2, a0, c2); - c3 = v_fma(b3, a0, c3); - c4 = v_fma(b4, a0, c4); - c5 = v_fma(b5, a0, c5); - c6 = v_fma(b6, a0, c6); - } - - if (init_c) - { - c0 = v_add(c0, v_load(c)); - c1 = v_add(c1, v_load(c + 4)); - c2 = v_add(c2, v_load(c + 8)); - c3 = v_add(c3, v_load(c + 12)); - c4 = v_add(c4, v_load(c + 16)); - c5 = v_add(c5, v_load(c + 20)); - c6 = v_add(c6, v_load(c + 24)); - } - - if (ifMinMaxAct) - { - v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval); - c0 = v_min(v_max(c0, vmin), vmax); - c1 = v_min(v_max(c1, vmin), vmax); - c2 = v_min(v_max(c2, vmin), vmax); - c3 = v_min(v_max(c3, vmin), vmax); - c4 = v_min(v_max(c4, vmin), vmax); - c5 = v_min(v_max(c5, vmin), vmax); - c6 = v_min(v_max(c6, vmin), vmax); - } - - v_store(c, c0); - v_store(c + 4, c1); - v_store(c + 8, c2); - v_store(c + 12, c3); - v_store(c + 16, c4); - v_store(c + 20, c5); - v_store(c + 24, c6); -} - -static void convBlockMR1x24(int np, const float* a, const float* b, float *c, const float bias, bool init_c, - const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR) +static inline void convBlockMR1x24(int np, const float* a, const float* b, float *c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int convNR) { CV_Assert(convNR == 24); v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0; @@ -1234,8 +1787,8 @@ static void convBlockMR1x24(int np, const float* a, const float* b, float *c, co v_store(c + 20, c5); } -static void convBlockMR1x12(int np, const float* a, const float* b, float *c, const float bias, bool init_c, - const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR) +static inline void convBlockMR1x12(int np, const float* a, const float* b, float 
*c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int convNR) { CV_Assert(convNR == 12); v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0; @@ -1279,12 +1832,10 @@ void convBlockMR1(int np, const float* a, const float* b, float *c, const float const int convNRby3 = convNR/3; if (outLen > convNRby3) { - if (convNR == 28) - convBlockMR1x28(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR); - else if (convNR == 24) - convBlockMR1x24(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR); + if (convNR == 24) + convBlockMR1x24(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, convNR); else if (convNR == 12) - convBlockMR1x12(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR); + convBlockMR1x12(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, convNR); else convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR); } @@ -1296,7 +1847,7 @@ void convBlockMR1(int np, const float* a, const float* b, float *c, const float } #if CV_SIMD128 -static void convBlock4x24(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) +static inline void convBlock4x24(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) { v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0; v_float32x4 c6 = v_setzero_f32(), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6; @@ -1401,7 +1952,7 @@ static void convBlock4x24(int np, const float* a, const float* b, float* c, int v_store(c + ldc * 3 + 20, c23); } -static void convBlock4x8(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) +static inline void convBlock4x8(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) { CV_Assert(convNR >= 4); v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0; @@ -1454,7 +2005,7 @@ static void convBlock4x8(int np, const float* a, const float* b, float* c, int l v_store(c + ldc * 3 + 4, c7); } -static void convBlock4x4(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) +static inline void convBlock4x4(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) { CV_Assert(convNR >= 4); v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0; @@ -1489,7 +2040,7 @@ static void convBlock4x4(int np, const float* a, const float* b, float* c, int l } #endif -static void convBlockNoSIMD(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen, +static inline void convBlockNoSIMD(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen, const int convMR, const int convNR) { std::vector cbuffer(convMR * outLen, 0); diff --git a/modules/dnn/src/layers/cpu_kernels/convolution.hpp b/modules/dnn/src/layers/cpu_kernels/convolution.hpp index 3d44c3189b..6fabc3da7c 100644 --- a/modules/dnn/src/layers/cpu_kernels/convolution.hpp +++ b/modules/dnn/src/layers/cpu_kernels/convolution.hpp @@ -10,14 +10,27 @@ #ifndef CONV_PRAM #define CONV_PRAM #if CV_NEON && CV_NEON_AARCH64 // 32 registers. -#define CONV_MR 4 -#define CONV_NR 28 +#define CONV_MR_FP32 4 +#define CONV_NR_FP32 28 + +// The FP16 can only be supported by ARM64 and with FP16 FMA supported. 
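For context, the FP16 kernels selected by these macros consume weights and im2row buffers stored as float16_t, converted from the FP32 model data via the NEON vcvt_f16_f32 path. Below is a minimal, self-contained sketch of that conversion under the same feature guards; the function and buffer names are illustrative, not the ones used by initFastConv.

    #if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    #include <arm_neon.h>
    #include <cstddef>

    // Convert an FP32 buffer to FP16, four lanes at a time, with a scalar tail.
    static void cvt_fp32_to_fp16(const float* src, __fp16* dst, size_t n)
    {
        size_t i = 0;
        for (; i + 4 <= n; i += 4)
            vst1_f16(dst + i, vcvt_f16_f32(vld1q_f32(src + i))); // float32x4 -> float16x4
        for (; i < n; i++)
            dst[i] = (__fp16)src[i];                             // scalar remainder
    }
    #endif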
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC // check FP16 FMA. +#define CONV_ARM_FP16 1 +#endif + +#ifdef CONV_ARM_FP16 +// Currently, only ARM 64 support FP16. +#define CONV_MR_FP16 8 +#define CONV_NR_FP16 24 +typedef __fp16 float16_t; // Fix conflict between float16_t in arm_neon.h and float16_t in cvdef.h. +#endif + #elif CV_NEON // 16 registers. -#define CONV_MR 4 -#define CONV_NR 12 +#define CONV_MR_FP32 4 +#define CONV_NR_FP32 12 #else // SIMD 128, AVX or AVX2 -#define CONV_MR 4 -#define CONV_NR 24 +#define CONV_MR_FP32 4 +#define CONV_NR_FP32 24 #endif // Winograd Params @@ -41,6 +54,10 @@ enum { #endif CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16. + + // FP 16 + CONV_WINO_ATOM_F16 = CONV_WINO_ATOM_F32 * 2, + CONV_WINO_NATOMS_F16 = CONV_WINO_AREA / CONV_WINO_ATOM_F16, }; // NOTE that: CONV_TYPE_DEPTHWISE is for 3x3 depthwise conv, and others depthwise will be set as CONV_TYPE_DEPTHWISE_REMAIN. @@ -64,8 +81,17 @@ struct FastConv std::vector weightsWinoBuf; // For Winograd F(6x6, 3x3). float* weightsWinoBufPtr; std::vector biasBuf; + +#if CV_NEON && CV_NEON_AARCH64 && CV_FP16 + std::vector weightsBuf_FP16; + float16_t* weightsBufPtr_FP16; + std::vector weightsWinoBuf_FP16; + float16_t* weightsWinoBufPtr_FP16; +#endif + int conv_type; int conv_dim; // Flag for conv1d, conv2d, or conv3d. + bool useFP16 = false; // Only ARMv8 is supported. #if CV_SIMD128 bool useSIMD128 = true; #else @@ -95,6 +121,7 @@ Ptr initFastConv( const std::vector& pads_begin, const std::vector& pads_end, int conv_dim, + const bool useFP16, bool useWinograd); // It contains different computing branches, like winograd, 1x1 conv. diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index ff53d4d114..02f56e7419 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -215,7 +215,7 @@ public: if (backendId == DNN_BACKEND_OPENCV) { if (kernel_size.size() == 3) - return preferableTarget == DNN_TARGET_CPU; + return IS_DNN_CPU_TARGET(preferableTarget); if (kernel_size.size() <= 2) return true; else diff --git a/modules/dnn/src/net_impl.cpp b/modules/dnn/src/net_impl.cpp index 775016b3b7..8024a05597 100644 --- a/modules/dnn/src/net_impl.cpp +++ b/modules/dnn/src/net_impl.cpp @@ -98,6 +98,7 @@ void Net::Impl::validateBackendAndTarget() CV_Assert(preferableBackend != DNN_BACKEND_OPENCV || preferableTarget == DNN_TARGET_CPU || + preferableTarget == DNN_TARGET_CPU_FP16 || preferableTarget == DNN_TARGET_OPENCL || preferableTarget == DNN_TARGET_OPENCL_FP16); CV_Assert(preferableBackend != DNN_BACKEND_HALIDE || @@ -972,7 +973,8 @@ void Net::Impl::forward(OutputArrayOfArrays outputBlobs, const String& outputNam } else if (outputBlobs.isMatVector()) { - if (preferableTarget != DNN_TARGET_CPU) + // The DNN_TARGET_CPU and DNN_TARGET_CPU_FP16 both use the CPU memory, do not need the copyToHost. + if (preferableTarget != DNN_TARGET_CPU && preferableTarget != DNN_TARGET_CPU_FP16) { for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i) { @@ -1336,7 +1338,7 @@ Mat Net::Impl::getBlob(const LayerPin& pin) const "the #%d was requested", ld.name.c_str(), ld.outputBlobs.size(), pin.oid)); } - if (preferableTarget != DNN_TARGET_CPU) + if (preferableTarget != DNN_TARGET_CPU && preferableTarget != DNN_TARGET_CPU_FP16) { CV_Assert(!ld.outputBlobsWrappers.empty() && !ld.outputBlobsWrappers[pin.oid].empty()); // Transfer data to CPU if it's require. 
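These CPU-target checks are what an application reaches once it opts into the new target; the selection itself is the usual backend/target pair. A small usage sketch (the model path and input size are placeholders):

    #include <opencv2/dnn.hpp>

    int main()
    {
        cv::dnn::Net net = cv::dnn::readNet("model.onnx");        // placeholder model file
        net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
        net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU_FP16);    // falls back to DNN_TARGET_CPU on non-ARMv8 builds
        cv::Mat img(224, 224, CV_8UC3, cv::Scalar::all(0));       // dummy input; real code would load an image
        net.setInput(cv::dnn::blobFromImage(img, 1.0, cv::Size(224, 224)));
        cv::Mat out = net.forward();
        return 0;
    }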
@@ -1552,7 +1554,7 @@ string Net::Impl::dump(bool forceAllocation) const prevNode = itBackend->second; } } - std::vector colors = { "#ffffb3", "#fccde5", "#8dd3c7", "#bebada", "#80b1d3", "#fdb462", "#ff4848", "#b35151", "#b266ff", "#b266ff", "#3cb371"}; + std::vector colors = { "#ffffb3", "#fccde5", "#8dd3c7", "#bebada", "#80b1d3", "#fdb462", "#ff4848", "#b35151", "#b266ff", "#b266ff", "#3cb371", "#ffcab3"}; string backend; switch (prefBackend) { @@ -1755,6 +1757,10 @@ string Net::Impl::dump(bool forceAllocation) const out << "NPU"; colorId = 9; break; + case DNN_TARGET_CPU_FP16: + out << "CPU_FP16"; + colorId = 10; + break; // don't use default: } CV_Assert(colorId < colors.size()); diff --git a/modules/dnn/src/net_impl_backend.cpp b/modules/dnn/src/net_impl_backend.cpp index ef816be66d..d29b6934a2 100644 --- a/modules/dnn/src/net_impl_backend.cpp +++ b/modules/dnn/src/net_impl_backend.cpp @@ -17,7 +17,8 @@ CV__DNN_INLINE_NS_BEGIN Ptr Net::Impl::wrap(Mat& host) { - if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_CPU) + if (preferableBackend == DNN_BACKEND_OPENCV && + (preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_CPU_FP16)) return Ptr(); MatShape shape(host.dims); @@ -104,7 +105,7 @@ void Net::Impl::initBackend(const std::vector& blobsToKeep_) CV_TRACE_FUNCTION(); if (preferableBackend == DNN_BACKEND_OPENCV) { - CV_Assert(preferableTarget == DNN_TARGET_CPU || IS_DNN_OPENCL_TARGET(preferableTarget)); + CV_Assert(preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_CPU_FP16 || IS_DNN_OPENCL_TARGET(preferableTarget)); } else if (preferableBackend == DNN_BACKEND_HALIDE) { @@ -232,6 +233,15 @@ void Net::Impl::setPreferableTarget(int targetId) preferableTarget = DNN_TARGET_OPENCL; #endif } + +#if !defined(__arm64__) || !__arm64__ + if (targetId == DNN_TARGET_CPU_FP16) + { + CV_LOG_WARNING(NULL, "DNN: fall back to DNN_TARGET_CPU. Only ARM v8 CPU is supported by DNN_TARGET_CPU_FP16."); + targetId = DNN_TARGET_CPU; + } +#endif + clear(); } } diff --git a/modules/dnn/src/registry.cpp b/modules/dnn/src/registry.cpp index f5c9e584c6..40630a93e4 100644 --- a/modules/dnn/src/registry.cpp +++ b/modules/dnn/src/registry.cpp @@ -61,6 +61,11 @@ private: } #endif + bool haveBackendCPU_FP16 = false; +#if defined(__arm64__) && __arm64__ + haveBackendCPU_FP16 = true; +#endif + if (haveBackendOpenVINO && openvino::checkTarget(DNN_TARGET_CPU)) { backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, DNN_TARGET_CPU)); @@ -104,6 +109,9 @@ private: backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)); + if (haveBackendCPU_FP16) + backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_CPU_FP16)); + #ifdef HAVE_VULKAN if (haveVulkan()) backends.push_back(std::make_pair(DNN_BACKEND_VKCOM, DNN_TARGET_VULKAN)); diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp index 7aa9c756fb..d8e69f3bbb 100644 --- a/modules/dnn/test/test_backends.cpp +++ b/modules/dnn/test/test_backends.cpp @@ -175,6 +175,8 @@ TEST_P(DNNTestNetwork, ENet) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16); + if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16); processNet("dnn/Enet-model-best.net", "", Size(512, 512), "l367_Deconvolution", target == DNN_TARGET_OPENCL ? 
"dnn/halide_scheduler_opencl_enet.yml" : "dnn/halide_scheduler_enet.yml", @@ -189,7 +191,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe) applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE); Mat sample = imread(findDataFile("dnn/street.png")); Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false); - float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 1.5e-2 : 0.0; + float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 1.5e-2 : 0.0; float iouDiff = (target == DNN_TARGET_MYRIAD) ? 0.063 : 0.0; float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.262 : FLT_MIN; processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt", @@ -225,7 +227,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe_Different_Width_Height) Mat sample = imread(findDataFile("dnn/street.png")); Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 560), Scalar(127.5, 127.5, 127.5), false); float scoreDiff = 0.0, iouDiff = 0.0; - if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) { scoreDiff = 0.029; iouDiff = 0.09; @@ -242,7 +244,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe_Different_Width_Height) TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow) { - applyTestTag(target == DNN_TARGET_CPU ? "" : CV_TEST_TAG_MEMORY_512MB); + applyTestTag((target == DNN_TARGET_CPU || target == DNN_TARGET_CPU_FP16) ? "" : CV_TEST_TAG_MEMORY_512MB); if (backend == DNN_BACKEND_HALIDE) applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE); @@ -250,7 +252,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow) Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false); float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.216 : 0.2; float scoreDiff = 0.0, iouDiff = 0.0; - if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) { scoreDiff = 0.095; iouDiff = 0.09; @@ -282,7 +284,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow_Different_Width_Height) Mat sample = imread(findDataFile("dnn/street.png")); Mat inp = blobFromImage(sample, 1.0f, Size(300, 560), Scalar(), false); float scoreDiff = 0.0, iouDiff = 0.0; - if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) { scoreDiff = 0.013; iouDiff = 0.06; @@ -306,7 +308,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow) Mat sample = imread(findDataFile("dnn/street.png")); Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false); float scoreDiff = 2e-5, iouDiff = 0.0; - if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) { scoreDiff = 0.013; iouDiff = 0.062; @@ -332,7 +334,7 @@ TEST_P(DNNTestNetwork, SSD_VGG16) Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false); float scoreDiff = 0.0, iouDiff = 0.0; - if (target == DNN_TARGET_OPENCL_FP16) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16) { scoreDiff = 0.04; } @@ -387,7 +389,7 @@ TEST_P(DNNTestNetwork, OpenPose_pose_mpi) // output range: [-0.001, 0.97] const float l1 = (target == DNN_TARGET_MYRIAD) ? 
0.02 : 0.0; - const float lInf = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16) ? 0.2 : 0.0; + const float lInf = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16) ? 0.2 : 0.0; processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi.prototxt", Size(46, 46), "", "", l1, lInf); expectNoFallbacksFromIE(net); @@ -461,7 +463,7 @@ TEST_P(DNNTestNetwork, Inception_v2_SSD_TensorFlow) Mat sample = imread(findDataFile("dnn/street.png")); Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false); float scoreDiff = 0.0, iouDiff = 0.0; - if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) { scoreDiff = 0.02; iouDiff = 0.1; @@ -483,7 +485,7 @@ TEST_P(DNNTestNetwork, DenseNet_121) applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE); // Reference output values are in range [-3.807, 4.605] float l1 = 0.0, lInf = 0.0; - if (target == DNN_TARGET_OPENCL_FP16) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16) { l1 = 2e-2; lInf = 9e-2; @@ -538,6 +540,11 @@ TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16) l1 = 0.3; lInf = 7.6; } + else if (target == DNN_TARGET_CPU_FP16) + { + l1 = 0.4; + lInf = 19.; + } #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) diff --git a/modules/dnn/test/test_caffe_importer.cpp b/modules/dnn/test/test_caffe_importer.cpp index 8059fc6888..809b959a21 100644 --- a/modules/dnn/test/test_caffe_importer.cpp +++ b/modules/dnn/test/test_caffe_importer.cpp @@ -153,7 +153,7 @@ TEST_P(Test_Caffe_nets, Axpy) } } float l1 = 1e-5, lInf = 1e-4; - if (target == DNN_TARGET_OPENCL_FP16) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16) { l1 = 2e-4; lInf = 1e-3; @@ -180,7 +180,7 @@ TEST_P(Reproducibility_AlexNet, Accuracy) #else applyTestTag(targetId == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); #endif - ASSERT_TRUE(ocl::useOpenCL() || targetId == DNN_TARGET_CPU); + ASSERT_TRUE(ocl::useOpenCL() || targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_CPU_FP16); bool readFromMemory = get<0>(GetParam()); Net net; @@ -214,7 +214,7 @@ TEST_P(Reproducibility_AlexNet, Accuracy) ASSERT_EQ(inLayerShapes[0][3], 227); const float l1 = 1e-5; - const float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 4e-3 : 1e-4; + const float lInf = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_CPU_FP16) ? 4e-3 : 1e-4; net.setPreferableBackend(DNN_BACKEND_OPENCV); net.setPreferableTarget(targetId); @@ -308,7 +308,7 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy) ASSERT_EQ(out.size[2], 100); float scores_diff = 1e-5, boxes_iou_diff = 1e-4; - if (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) + if (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD || targetId == DNN_TARGET_CPU_FP16) { scores_diff = 1.5e-2; boxes_iou_diff = 6.3e-2; @@ -375,7 +375,7 @@ TEST_P(Reproducibility_ResNet50, Accuracy) { Target targetId = GetParam(); applyTestTag(targetId == DNN_TARGET_CPU ? 
CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); - ASSERT_TRUE(ocl::useOpenCL() || targetId == DNN_TARGET_CPU); + ASSERT_TRUE(ocl::useOpenCL() || targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_CPU_FP16); Net net = readNetFromCaffe(findDataFile("dnn/ResNet-50-deploy.prototxt"), findDataFile("dnn/ResNet-50-model.caffemodel", false)); @@ -383,8 +383,8 @@ TEST_P(Reproducibility_ResNet50, Accuracy) net.setPreferableBackend(DNN_BACKEND_OPENCV); net.setPreferableTarget(targetId); - float l1 = (targetId == DNN_TARGET_OPENCL_FP16) ? 3e-5 : 1e-5; - float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 6e-3 : 1e-4; + float l1 = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_CPU_FP16) ? 3e-5 : 1e-5; + float lInf = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_CPU_FP16) ? 6e-3 : 1e-4; Mat input = blobFromImage(imread(_tf("googlenet_0.png")), 1.0f, Size(224,224), Scalar(), false); ASSERT_TRUE(!input.empty()); @@ -415,6 +415,8 @@ TEST_P(Reproducibility_SqueezeNet_v1_1, Accuracy) int targetId = GetParam(); if(targetId == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); + if(targetId == DNN_TARGET_CPU_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16); Net net = readNetFromCaffe(findDataFile("dnn/squeezenet_v1.1.prototxt"), findDataFile("dnn/squeezenet_v1.1.caffemodel", false)); net.setPreferableBackend(DNN_BACKEND_OPENCV); @@ -509,7 +511,7 @@ TEST_P(Test_Caffe_nets, Colorization) // Reference output values are in range [-29.1, 69.5] double l1 = 4e-4, lInf = 3e-3; - if (target == DNN_TARGET_OPENCL_FP16) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16) { l1 = 0.25; lInf = 5.3; @@ -566,7 +568,7 @@ TEST_P(Test_Caffe_nets, DenseNet_121) { l1 = 0.11; lInf = 0.5; } - else if (target == DNN_TARGET_CUDA_FP16) + else if (target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16) { l1 = 0.04; lInf = 0.2; } @@ -635,6 +637,8 @@ TEST_P(opencv_face_detector, Accuracy) if (targetId == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); + if (targetId == DNN_TARGET_CPU_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16); Net net = readNetFromCaffe(proto, model); Mat img = imread(findDataFile("gpu/lbpcascade/er.png")); @@ -665,6 +669,8 @@ TEST_P(opencv_face_detector, issue_15106) if (targetId == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); + if (targetId == DNN_TARGET_CPU_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16); Net net = readNetFromCaffe(proto, model); Mat img = imread(findDataFile("cv/shared/lena.png")); @@ -768,6 +774,8 @@ TEST_P(Test_Caffe_nets, FasterRCNN_zf) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD); if (target == DNN_TARGET_CUDA_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16); + if (target == DNN_TARGET_CPU_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16); static Mat ref = (Mat_(3, 7) << 0, 2, 0.90121, 120.407, 115.83, 570.586, 528.395, 0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762, 0, 12, 0.967198, 138.588, 206.843, 329.766, 553.176); @@ -783,7 +791,7 @@ TEST_P(Test_Caffe_nets, RFCN) ); float scoreDiff = default_l1, iouDiff = default_lInf; - if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) + if (backend == DNN_BACKEND_OPENCV && (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)) { scoreDiff = 4e-3; iouDiff = 8e-2; diff --git a/modules/dnn/test/test_common.hpp b/modules/dnn/test/test_common.hpp index e3c7a553f8..6262a0f7a4 100644 --- a/modules/dnn/test/test_common.hpp +++ b/modules/dnn/test/test_common.hpp @@ -21,6 
+21,7 @@ #define CV_TEST_TAG_DNN_SKIP_OPENCV_BACKEND "dnn_skip_opencv_backend" #define CV_TEST_TAG_DNN_SKIP_HALIDE "dnn_skip_halide" #define CV_TEST_TAG_DNN_SKIP_CPU "dnn_skip_cpu" +#define CV_TEST_TAG_DNN_SKIP_CPU_FP16 "dnn_skip_cpu_fp16" #define CV_TEST_TAG_DNN_SKIP_OPENCL "dnn_skip_ocl" #define CV_TEST_TAG_DNN_SKIP_OPENCL_FP16 "dnn_skip_ocl_fp16" #define CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER "dnn_skip_ie_nn_builder" @@ -164,7 +165,7 @@ public: static void getDefaultThresholds(int backend, int target, double* l1, double* lInf) { - if (target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) + if (target == DNN_TARGET_CPU_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) { *l1 = 4e-3; *lInf = 2e-2; diff --git a/modules/dnn/test/test_common.impl.hpp b/modules/dnn/test/test_common.impl.hpp index bad6a8d082..2a4ee34b02 100644 --- a/modules/dnn/test/test_common.impl.hpp +++ b/modules/dnn/test/test_common.impl.hpp @@ -49,6 +49,7 @@ void PrintTo(const cv::dnn::Target& v, std::ostream* os) case DNN_TARGET_CUDA: *os << "CUDA"; return; case DNN_TARGET_CUDA_FP16: *os << "CUDA_FP16"; return; case DNN_TARGET_NPU: *os << "NPU"; return; + case DNN_TARGET_CPU_FP16: *os << "CPU_FP16"; return; } // don't use "default:" to emit compiler warnings *os << "DNN_TARGET_UNKNOWN(" << (int)v << ")"; } @@ -439,7 +440,7 @@ void initDNNTests() registerGlobalSkipTag( CV_TEST_TAG_DNN_SKIP_OPENCV_BACKEND, - CV_TEST_TAG_DNN_SKIP_CPU, + CV_TEST_TAG_DNN_SKIP_CPU, CV_TEST_TAG_DNN_SKIP_CPU_FP16, CV_TEST_TAG_DNN_SKIP_OPENCL, CV_TEST_TAG_DNN_SKIP_OPENCL_FP16 ); #if defined(HAVE_HALIDE) diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp index 2160c81fad..2e6db5ef68 100644 --- a/modules/dnn/test/test_darknet_importer.cpp +++ b/modules/dnn/test/test_darknet_importer.cpp @@ -360,9 +360,9 @@ TEST_P(Test_Darknet_nets, YoloVoc) 1, 6, 0.667770f, 0.446555f, 0.453578f, 0.499986f, 0.519167f, // a car 1, 6, 0.844947f, 0.637058f, 0.460398f, 0.828508f, 0.66427f); // a car - double nmsThreshold = (target == DNN_TARGET_MYRIAD) ? 0.397 : 0.4; + double nmsThreshold = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.397 : 0.4; double scoreDiff = 8e-5, iouDiff = 3e-4; - if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) { scoreDiff = 1e-2; iouDiff = 0.018; @@ -451,7 +451,7 @@ TEST_P(Test_Darknet_nets, TinyYoloVoc) 1, 6, 0.928758f, 0.651024f, 0.463539f, 0.823784f, 0.654998f); // a car double scoreDiff = 8e-5, iouDiff = 3e-4; - if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) { scoreDiff = 8e-3; iouDiff = 0.018; @@ -636,7 +636,7 @@ TEST_P(Test_Darknet_nets, YOLOv3) Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_); double scoreDiff = 8e-5, iouDiff = 3e-4; - if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) { #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2022010000) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) @@ -725,8 +725,8 @@ TEST_P(Test_Darknet_nets, YOLOv4) }; Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_); - double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 
0.006 : 8e-5;
-    double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.042 : 3e-4;
+    double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.006 : 8e-5;
+    double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.042 : 3e-4;
     if (target == DNN_TARGET_CUDA_FP16)
     {
         scoreDiff = 0.008;
@@ -847,7 +847,7 @@ TEST_P(Test_Darknet_nets, YOLOv4_tiny)
     Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_);
     double scoreDiff = 0.012f;
-    double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.15 : 0.01f;
+    double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.15 : 0.01f;
     if (target == DNN_TARGET_CUDA_FP16)
         iouDiff = 0.02;
@@ -930,7 +930,7 @@ TEST_P(Test_Darknet_nets, YOLOv4x_mish)
     double scoreDiff = 8e-5;
     double iouDiff = 3e-4;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
     {
         scoreDiff = 0.006;
         iouDiff = 0.042;
@@ -1093,6 +1093,8 @@ TEST_P(Test_Darknet_layers, connected)
 {
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     testDarknetLayer("connected", true);
 }
diff --git a/modules/dnn/test/test_googlenet.cpp b/modules/dnn/test/test_googlenet.cpp
index e51dcd0988..f911ff029f 100644
--- a/modules/dnn/test/test_googlenet.cpp
+++ b/modules/dnn/test/test_googlenet.cpp
@@ -58,6 +58,8 @@ TEST_P(Reproducibility_GoogLeNet, Batching)
     const int targetId = GetParam();
     if (targetId == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (targetId == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt"),
                                findDataFile("dnn/bvlc_googlenet.caffemodel", false));
     net.setPreferableBackend(DNN_BACKEND_OPENCV);
@@ -89,6 +91,8 @@ TEST_P(Reproducibility_GoogLeNet, IntermediateBlobs)
     const int targetId = GetParam();
     if (targetId == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (targetId == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt"),
                                findDataFile("dnn/bvlc_googlenet.caffemodel", false));
     net.setPreferableBackend(DNN_BACKEND_OPENCV);
@@ -120,6 +124,8 @@ TEST_P(Reproducibility_GoogLeNet, SeveralCalls)
     const int targetId = GetParam();
     if (targetId == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (targetId == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt"),
                                findDataFile("dnn/bvlc_googlenet.caffemodel", false));
     net.setPreferableBackend(DNN_BACKEND_OPENCV);
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index a5652f16b7..763d94b99c 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -212,6 +212,8 @@ TEST_P(Test_Caffe_layers, InnerProduct)
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     testLayerUsingCaffeModels("layer_inner_product", true);
 }
@@ -378,7 +380,7 @@ TEST_P(Test_Caffe_layers, Eltwise)
 TEST_P(Test_Caffe_layers, PReLU)
 {
-    double lInf = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16) ? 0.021 : 0.0;
+    double lInf = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16) ? 0.021 : 0.0;
     testLayerUsingCaffeModels("layer_prelu", true, true, 0.0, lInf);
 }
@@ -2459,7 +2461,7 @@ TEST_P(ConvolutionActivationFusion, Accuracy)
     std::vector<int> expectedFusedLayers;
     if (backendId == DNN_BACKEND_OPENCV)
     {
-        if (targetId == DNN_TARGET_CPU)
+        if (targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_CPU_FP16)
             expectedFusedLayers.push_back(activId); // all activations are fused
         else if (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)
         {
@@ -2594,7 +2596,7 @@ TEST_P(ConvolutionEltwiseActivationFusion, Accuracy)
     std::vector<int> expectedFusedLayers;
     if (backendId == DNN_BACKEND_OPENCV)
     {
-        if (targetId == DNN_TARGET_CPU)
+        if (targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_CPU_FP16)
             expectedFusedLayers.push_back(activId); // activation is fused with eltwise layer
         else if (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)
         {
@@ -2683,7 +2685,7 @@ TEST_P(ConvolutionActivationEltwiseFusion, Accuracy)
     std::vector<int> expectedFusedLayers;
     if (backendId == DNN_BACKEND_OPENCV)
     {
-        if (targetId == DNN_TARGET_CPU)
+        if (targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_CPU_FP16)
             expectedFusedLayers.push_back(activId); // activation fused with convolution
         else if (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)
         {
diff --git a/modules/dnn/test/test_model.cpp b/modules/dnn/test/test_model.cpp
index 2d6c4c7ac1..bd03551ab8 100644
--- a/modules/dnn/test/test_model.cpp
+++ b/modules/dnn/test/test_model.cpp
@@ -332,7 +332,7 @@ TEST_P(Test_Model, DetectRegion)
     double confThreshold = 0.24;
     double nmsThreshold = (target == DNN_TARGET_MYRIAD) ? 0.397 : 0.4;
     double scoreDiff = 8e-5, iouDiff = 1e-5;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
     {
         scoreDiff = 1e-2;
         iouDiff = 1.6e-2;
@@ -392,7 +392,7 @@ TEST_P(Test_Model, DetectRegionWithNmsAcrossClasses)
     double confThreshold = 0.24;
     double nmsThreshold = (target == DNN_TARGET_MYRIAD) ? 0.15: 0.15;
     double scoreDiff = 8e-5, iouDiff = 1e-5;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
     {
         scoreDiff = 1e-2;
         iouDiff = 1.6e-2;
@@ -443,7 +443,7 @@ TEST_P(Test_Model, DetectionOutput)
     double scoreDiff = default_l1, iouDiff = 1e-5;
     float confThreshold = 0.8;
     double nmsThreshold = 0.0;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CUDA_FP16)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
     {
         if (backend == DNN_BACKEND_OPENCV)
             scoreDiff = 4e-3;
@@ -495,7 +495,7 @@ TEST_P(Test_Model, DetectionMobilenetSSD)
     Size size{300, 300};
     double scoreDiff = 1e-5, iouDiff = 1e-5;
-    if (target == DNN_TARGET_OPENCL_FP16)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)
     {
         scoreDiff = 1.7e-2;
         iouDiff = 6.91e-2;
@@ -522,6 +522,8 @@ TEST_P(Test_Model, Keypoints_pose)
 {
     if (target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
 #ifdef HAVE_INF_ENGINE
     if (target == DNN_TARGET_MYRIAD)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
@@ -569,7 +571,7 @@ TEST_P(Test_Model, Keypoints_face)
     // Ref. Range: [-1.1784188, 1.7758257]
     float norm = 1e-4;
-    if (target == DNN_TARGET_OPENCL_FP16)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)
         norm = 5e-3;
     if (target == DNN_TARGET_MYRIAD)
     {
@@ -605,7 +607,7 @@ TEST_P(Test_Model, Detection_normalized)
         scoreDiff = 3e-4;
         iouDiff = 0.018;
     }
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
     {
         scoreDiff = 5e-3;
         iouDiff = 0.09;
@@ -654,7 +656,7 @@ TEST_P(Test_Model, Segmentation)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
 #endif
-    if ((backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
+    if ((backend == DNN_BACKEND_OPENCV && (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16))
         || (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16))
     {
         norm = 2.0f; // l1 = 0.01 lInf = 2
@@ -741,6 +743,8 @@ TEST_P(Test_Model, TextDetectionByDB)
 {
     if (target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     std::string imgPath = _tf("text_det_test1.png");
     std::string weightPathDB = _tf("onnx/models/DB_TD500_resnet50.onnx", false);
@@ -801,7 +805,7 @@ TEST_P(Test_Model, TextDetectionByEAST)
     double eps_size = 5/*pixels*/;
     double eps_angle = 1;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         eps_center = 10;
         eps_size = 25;
diff --git a/modules/dnn/test/test_onnx_conformance.cpp b/modules/dnn/test/test_onnx_conformance.cpp
index 8f24fdf135..b238427dfb 100644
--- a/modules/dnn/test/test_onnx_conformance.cpp
+++ b/modules/dnn/test/test_onnx_conformance.cpp
@@ -957,7 +957,7 @@ public:
         backend = get<0>(get<1>(GetParam()));
         target = get<1>(get<1>(GetParam()));
-        if (target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+        if (target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
         {
             default_l1 = 7e-3;
             default_lInf = 2e-2;
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index a2fed56b68..12a5ad1957 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -2179,7 +2179,7 @@ TEST_P(Test_ONNX_nets, TinyYolov2)
     // output range: [-11; 8]
     double l1 = default_l1, lInf = default_lInf;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         l1 = 0.02;
         lInf = 0.2;
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index bcedcffe15..b795076f55 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -486,6 +486,11 @@ TEST_P(Test_TensorFlow_layers, slim_batch_norm)
         l1 = 0.005;
         lInf = 0.33;
     }
+    else if (target == DNN_TARGET_CPU_FP16)
+    {
+        l1 = 0.041;
+        lInf = 0.37;
+    }
     runTensorFlowNet("slim_batch_norm", false, l1, lInf);
 }
@@ -710,6 +715,9 @@ TEST_P(Test_TensorFlow_layers, matmul)
 {
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
+
     runTensorFlowNet("matmul");
     runTensorFlowNet("nhwc_transpose_reshape_matmul");
     // Reference output values are in range [-5.688, 4.484]
@@ -723,6 +731,8 @@ TEST_P(Test_TensorFlow_layers, batch_matmul)
 {
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     runTensorFlowNet("batch_matmul");
 }
@@ -730,6 +740,8 @@ TEST_P(Test_TensorFlow_layers, square)
 {
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     runTensorFlowNet("square");
 }
@@ -924,7 +936,7 @@ TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
     Mat out = net.forward();
     double scoreDiff = default_l1, iouDiff = default_lInf;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         scoreDiff = 0.01;
         iouDiff = 0.1;
@@ -971,7 +983,7 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
                                     0, 10, 0.93973452, 0.66561931, 0.37841269, 0.68074018, 0.42907384);
     double scoreDiff = default_l1, iouDiff = default_lInf;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         scoreDiff = 0.0097;
         iouDiff = 0.09;
@@ -1004,7 +1016,7 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD)
     Mat ref = blobFromNPY(findDataFile("dnn/tensorflow/ssd_mobilenet_v1_coco_2017_11_17.detection_out.npy"));
     float scoreDiff = 1.5e-5, iouDiff = 1e-3;
     float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.35 : 0.3;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         scoreDiff = 0.011;
         iouDiff = 0.012;
@@ -1053,6 +1065,8 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN_inception_v2_coco_2018_01_28)
     if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
+    if (target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     checkBackend();
@@ -1085,6 +1099,9 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN_inception_v2_coco_2018_01_28)
         if (target == DNN_TARGET_OPENCL_FP16)
             applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+        if (target == DNN_TARGET_CPU_FP16)
+            applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
+
         normAssertDetections(ref, out, name.c_str(), 0.3, scoresDiff, iouDiff);
     }
 }
@@ -1164,6 +1181,9 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN_resnet50_coco_2018_01_28)
         if (target == DNN_TARGET_OPENCL_FP16)
             applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+        if (target == DNN_TARGET_CPU_FP16)
+            applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
+
         normAssertDetections(ref, out, name.c_str(), 0.3, scoresDiff, iouDiff);
     }
 }
@@ -1191,7 +1211,7 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD_PPN)
     Mat out = net.forward();
     double scoreDiff = 1.1e-5, iouDiff = default_lInf;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         scoreDiff = 0.048;
         iouDiff = 0.058;
@@ -1230,7 +1250,7 @@ TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8)
                                     0, 1, 0.97203469, 0.67965847, 0.06876482, 0.73999709, 0.1513494,
                                     0, 1, 0.95097077, 0.51901293, 0.45863652, 0.5777427, 0.5347801);
     double scoreDiff = 3.4e-3, iouDiff = 1e-2;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         scoreDiff = 4e-3;
         iouDiff = 0.024;
@@ -1317,6 +1337,11 @@ TEST_P(Test_TensorFlow_nets, EAST_text_detection)
         lInf_scores = 0.1;
         l1_geometry = 0.3; lInf_geometry = 7;
     }
+    else if (target == DNN_TARGET_CPU_FP16)
+    {
+        lInf_scores = 0.1;
+        l1_geometry = 0.28; lInf_geometry = 5.94;
+    }
     else
     {
         l1_geometry = 1e-4, lInf_geometry = 4.3e-3;
@@ -1360,6 +1385,10 @@ TEST_P(Test_TensorFlow_layers, fp16_weights_fp16_pad_and_concat)
 TEST_P(Test_TensorFlow_layers, fp16_weights_fp16_padding_valid)
 {
     float l1 = 0.00078, lInf = 0.012;
+
+    if (target == DNN_TARGET_CPU_FP16)
+        l1 = 0.00083;
+
     runTensorFlowNet("fp16_padding_valid", false, l1, lInf);
 }
 TEST_P(Test_TensorFlow_layers, fp16_weights_fp16_max_pool_even)
@@ -1407,8 +1436,13 @@ TEST_P(Test_TensorFlow_layers, fp16_weights_fp16_max_pool_odd_valid)
 TEST_P(Test_TensorFlow_layers, fp16_padding_same)
 {
+    float l1 = 7e-4, lInf = 4e-3;
+
+    if (target == DNN_TARGET_CPU_FP16)
+        lInf = 5e-3;
+
     // Reference output values are in range [-3.504, -0.002]
-    runTensorFlowNet("fp16_padding_same", false, 7e-4, 4e-3);
+    runTensorFlowNet("fp16_padding_same", false, l1, lInf);
 }
 TEST_P(Test_TensorFlow_layers, defun)
@@ -1450,6 +1484,9 @@ TEST_P(Test_TensorFlow_layers, lstm)
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
+
     runTensorFlowNet("lstm", true);
     runTensorFlowNet("lstm", true, 0.0, 0.0, true);
 }
@@ -1771,8 +1808,8 @@ TEST_P(Test_TensorFlow_nets, Mask_RCNN)
     Mat outDetections = outs[0];
     Mat outMasks = outs[1];
-    double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.2 : 2e-5;
-    double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.018 : default_lInf;
+    double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.2 : 2e-5;
+    double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.018 : default_lInf;
     normAssertDetections(refDetections, outDetections, "", /*threshold for zero confidence*/1e-5, scoreDiff, iouDiff);
     // Output size of masks is NxCxHxW where
@@ -1805,7 +1842,7 @@ TEST_P(Test_TensorFlow_nets, Mask_RCNN)
     double inter = cv::countNonZero(masks & refMasks);
     double area = cv::countNonZero(masks | refMasks);
-    EXPECT_GE(inter / area, (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.98 : 0.99);
+    EXPECT_GE(inter / area, (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.98 : 0.99);
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
         expectNoFallbacks(net);
@@ -1815,6 +1852,7 @@ TEST_P(Test_TensorFlow_nets, EfficientDet)
 {
     if (target != DNN_TARGET_CPU)
     {
+        if (target == DNN_TARGET_CPU_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
         if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
         if (target == DNN_TARGET_OPENCL) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
         if (target == DNN_TARGET_MYRIAD) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp
index 2e18ac8c48..8510ec4e64 100644
--- a/modules/dnn/test/test_torch_importer.cpp
+++ b/modules/dnn/test/test_torch_importer.cpp
@@ -113,7 +113,7 @@ TEST_P(Test_Torch_layers, run_convolution)
 {
     // Output reference values are in range [23.4018, 72.0181]
     double l1 = default_l1, lInf = default_lInf;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         l1 = 0.08;
         lInf = 0.43;
@@ -132,6 +132,8 @@ TEST_P(Test_Torch_layers, run_pool_max)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
     if (target == DNN_TARGET_CUDA_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
+    if (target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     double l1 = 0.0, lInf = 0.0;
     runTorchNet("net_pool_max", "", true, false, true, l1, lInf);
 }
@@ -158,7 +160,7 @@ TEST_P(Test_Torch_layers, run_reshape_single_sample)
 {
     // Reference output values in range [14.4586, 18.4492].
     double l1 = default_l1, lInf = default_lInf;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         l1 = 0.033;
         lInf = 0.05;
@@ -175,6 +177,8 @@ TEST_P(Test_Torch_layers, run_linear)
 {
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     runTorchNet("net_linear_2d");
 }
@@ -186,7 +190,7 @@ TEST_P(Test_Torch_layers, run_concat)
 TEST_P(Test_Torch_layers, run_depth_concat)
 {
     double lInf = 0.0;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         lInf = 0.032;
     }
@@ -252,7 +256,7 @@ TEST_P(Test_Torch_layers, net_conv_gemm_lrn)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
 #endif
     double l1 = 0.0, lInf = 0.0;
-    if (target == DNN_TARGET_OPENCL_FP16)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)
     {
         l1 = 0.046;
         lInf = 0.023;
@@ -369,7 +373,7 @@ TEST_P(Test_Torch_nets, OpenFace_accuracy)
     // Reference output values are in range [-0.17212, 0.263492]
     // on Myriad problem layer: l4_Pooling - does not use pads_begin
     float l1 = 1e-5, lInf = 1e-3;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         l1 = 2e-3;
         lInf = 5e-3;
@@ -431,6 +435,8 @@ TEST_P(Test_Torch_nets, ENet_accuracy)
         throw SkipTestException("");
     if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
+    if (target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020010000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
@@ -562,6 +568,10 @@ TEST_P(Test_Torch_nets, FastNeuralStyle_accuracy)
     {
         normAssert(out, refBlob, "", 0.6, 25);
     }
+    else if (target == DNN_TARGET_CPU_FP16)
+    {
+        normAssert(out, refBlob, "", 0.62, 25);
+    }
     else
         normAssert(out, refBlob, "", 0.5, 1.1);
 }