diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp index 3233ab3c66..f9813fa053 100644 --- a/modules/dnn/include/opencv2/dnn/dnn.hpp +++ b/modules/dnn/include/opencv2/dnn/dnn.hpp @@ -106,6 +106,7 @@ CV__DNN_INLINE_NS_BEGIN DNN_TARGET_CUDA_FP16, DNN_TARGET_HDDL, DNN_TARGET_NPU, + DNN_TARGET_CPU_FP16, // Only the ARM platform is supported. Low precision computing, accelerate model inference. }; /** diff --git a/modules/dnn/src/dnn_common.hpp b/modules/dnn/src/dnn_common.hpp index 2561de4a9f..27947afea1 100644 --- a/modules/dnn/src/dnn_common.hpp +++ b/modules/dnn/src/dnn_common.hpp @@ -13,7 +13,7 @@ namespace cv { namespace dnn { CV__DNN_INLINE_NS_BEGIN #define IS_DNN_OPENCL_TARGET(id) (id == DNN_TARGET_OPENCL || id == DNN_TARGET_OPENCL_FP16) -#define IS_DNN_CPU_TARGET(id) (id == DNN_TARGET_CPU) // TODO: add DNN_TARGET_CPU_FP16 +#define IS_DNN_CPU_TARGET(id) (id == DNN_TARGET_CPU || id == DNN_TARGET_CPU_FP16) Mutex& getInitializationMutex(); void initializeLayerFactory(); diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index fc0120cdb8..da85deebaa 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -428,7 +428,6 @@ public: virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { BaseConvolutionLayerImpl::finalize(inputs_arr, outputs_arr); - std::vector inputs; inputs_arr.getMatVector(inputs); // prepare weightsMat where each row is aligned and has enough zero padding on the right to @@ -1405,7 +1404,8 @@ public: CV_Assert(outputs[0].size[1] % ngroups == 0); fastConvImpl = initFastConv(weightsMat, &biasvec[0], ngroups, K, C, kernel_size, strides, - dilations, pads_begin, pads_end, conv_dim, canUseWinograd); + dilations, pads_begin, pads_end, conv_dim, + preferableTarget == DNN_TARGET_CPU_FP16, canUseWinograd); } runFastConv(inputs[0], outputs[0], fastConvImpl, nstripes, activ, reluslope, fusedAdd); diff --git a/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp b/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp index 71b17dcc9b..27b0d4ba1f 100644 --- a/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp +++ b/modules/dnn/src/layers/cpu_kernels/conv_block.simd.hpp @@ -8,7 +8,7 @@ namespace cv { namespace dnn { CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN -void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR); +void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, int width, const int convMR, const int convNR); #if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX @@ -17,7 +17,7 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i #define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(c, _mm256_mul_ps(a, b)) #endif -void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) +void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, int width, const int convMR, const int convNR) { CV_Assert(convMR == 4 && convNR == 24); __m256 c00 = _mm256_set1_ps(0.f), c01 = c00, c02 = c00; @@ -28,29 +28,72 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i __m256 a0 = _mm256_setzero_ps(), a1 = _mm256_setzero_ps(); __m256 b0 = _mm256_setzero_ps(), b1 = _mm256_setzero_ps(), b2 = _mm256_setzero_ps(); - for (int p = 0; p < np; p++, a += convMR, b += 
convNR) + if (width > 16) { - a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]); - b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16); + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]); + b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16); + + c00 = _mm256_fmadd_ps(b0, a0, c00); + c01 = _mm256_fmadd_ps(b1, a0, c01); + c02 = _mm256_fmadd_ps(b2, a0, c02); + + c10 = _mm256_fmadd_ps(b0, a1, c10); + c11 = _mm256_fmadd_ps(b1, a1, c11); + c12 = _mm256_fmadd_ps(b2, a1, c12); + + a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]); + + c20 = _mm256_fmadd_ps(b0, a0, c20); + c21 = _mm256_fmadd_ps(b1, a0, c21); + c22 = _mm256_fmadd_ps(b2, a0, c22); + + c30 = _mm256_fmadd_ps(b0, a1, c30); + c31 = _mm256_fmadd_ps(b1, a1, c31); + c32 = _mm256_fmadd_ps(b2, a1, c32); + } + } + else if (width > 8) + { + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]); + b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8); - c00 = _mm256_fmadd_ps(b0, a0, c00); - c01 = _mm256_fmadd_ps(b1, a0, c01); - c02 = _mm256_fmadd_ps(b2, a0, c02); + c00 = _mm256_fmadd_ps(b0, a0, c00); + c01 = _mm256_fmadd_ps(b1, a0, c01); - c10 = _mm256_fmadd_ps(b0, a1, c10); - c11 = _mm256_fmadd_ps(b1, a1, c11); - c12 = _mm256_fmadd_ps(b2, a1, c12); + c10 = _mm256_fmadd_ps(b0, a1, c10); + c11 = _mm256_fmadd_ps(b1, a1, c11); - a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]); + a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]); - c20 = _mm256_fmadd_ps(b0, a0, c20); - c21 = _mm256_fmadd_ps(b1, a0, c21); - c22 = _mm256_fmadd_ps(b2, a0, c22); + c20 = _mm256_fmadd_ps(b0, a0, c20); + c21 = _mm256_fmadd_ps(b1, a0, c21); - c30 = _mm256_fmadd_ps(b0, a1, c30); - c31 = _mm256_fmadd_ps(b1, a1, c31); - c32 = _mm256_fmadd_ps(b2, a1, c32); + c30 = _mm256_fmadd_ps(b0, a1, c30); + c31 = _mm256_fmadd_ps(b1, a1, c31); + } } + else + { + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]); + b0 = _mm256_load_ps(b); + + c00 = _mm256_fmadd_ps(b0, a0, c00); + c10 = _mm256_fmadd_ps(b0, a1, c10); + + a0 = _mm256_set1_ps(a[2]), a1 = _mm256_set1_ps(a[3]); + + c20 = _mm256_fmadd_ps(b0, a0, c20); + c30 = _mm256_fmadd_ps(b0, a1, c30); + } + } + + if (!init_c) { @@ -87,7 +130,7 @@ namespace opt_NEON { #if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_NEON -void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) +void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool init_c, int width, const int convMR, const int convNR) { #if CV_NEON_AARCH64 if (convMR == 4 && convNR == 28) // AARCH64 @@ -97,44 +140,105 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i float32x4_t c20 = vdupq_n_f32(0.f), c21 = c20, c22 = c20, c23 = c20, c24 = c20, c25 = c20, c26 = c20; float32x4_t c30 = vdupq_n_f32(0.f), c31 = c30, c32 = c30, c33 = c30, c34 = c30, c35 = c30, c36 = c30; - for( int p = 0; p < np; p++, a += convMR, b += convNR ) + if (width > 16) + { + for( int p = 0; p < np; p++, a += convMR, b += convNR ) + { + float32x4_t a0 = vld1q_f32(a), b0, b1, b2; + b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8); + + c00 = vfmaq_laneq_f32(c00, b0, a0, 0); + c01 = vfmaq_laneq_f32(c01, b1, a0, 0); + c02 = vfmaq_laneq_f32(c02, b2, a0, 0); + c10 = vfmaq_laneq_f32(c10, b0, a0, 1); + c11 = vfmaq_laneq_f32(c11, b1, 
a0, 1); + c12 = vfmaq_laneq_f32(c12, b2, a0, 1); + c20 = vfmaq_laneq_f32(c20, b0, a0, 2); + c21 = vfmaq_laneq_f32(c21, b1, a0, 2); + c22 = vfmaq_laneq_f32(c22, b2, a0, 2); + c30 = vfmaq_laneq_f32(c30, b0, a0, 3); + c31 = vfmaq_laneq_f32(c31, b1, a0, 3); + c32 = vfmaq_laneq_f32(c32, b2, a0, 3); + + b0 = vld1q_f32(b + 12); b1 = vld1q_f32(b + 16); b2 = vld1q_f32(b + 20); + + c03 = vfmaq_laneq_f32(c03, b0, a0, 0); + c04 = vfmaq_laneq_f32(c04, b1, a0, 0); + c05 = vfmaq_laneq_f32(c05, b2, a0, 0); + c13 = vfmaq_laneq_f32(c13, b0, a0, 1); + c14 = vfmaq_laneq_f32(c14, b1, a0, 1); + c15 = vfmaq_laneq_f32(c15, b2, a0, 1); + c23 = vfmaq_laneq_f32(c23, b0, a0, 2); + c24 = vfmaq_laneq_f32(c24, b1, a0, 2); + c25 = vfmaq_laneq_f32(c25, b2, a0, 2); + c33 = vfmaq_laneq_f32(c33, b0, a0, 3); + c34 = vfmaq_laneq_f32(c34, b1, a0, 3); + c35 = vfmaq_laneq_f32(c35, b2, a0, 3); + + b0 = vld1q_f32(b + 24); + c06 = vfmaq_laneq_f32(c06, b0, a0, 0); + c16 = vfmaq_laneq_f32(c16, b0, a0, 1); + c26 = vfmaq_laneq_f32(c26, b0, a0, 2); + c36 = vfmaq_laneq_f32(c36, b0, a0, 3); + } + } + else if (width > 8) + { + for( int p = 0; p < np; p++, a += convMR, b += convNR ) + { + float32x4_t a0 = vld1q_f32(a), b0, b1, b2; + b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8); + + c00 = vfmaq_laneq_f32(c00, b0, a0, 0); + c01 = vfmaq_laneq_f32(c01, b1, a0, 0); + c02 = vfmaq_laneq_f32(c02, b2, a0, 0); + c10 = vfmaq_laneq_f32(c10, b0, a0, 1); + c11 = vfmaq_laneq_f32(c11, b1, a0, 1); + c12 = vfmaq_laneq_f32(c12, b2, a0, 1); + c20 = vfmaq_laneq_f32(c20, b0, a0, 2); + c21 = vfmaq_laneq_f32(c21, b1, a0, 2); + c22 = vfmaq_laneq_f32(c22, b2, a0, 2); + c30 = vfmaq_laneq_f32(c30, b0, a0, 3); + c31 = vfmaq_laneq_f32(c31, b1, a0, 3); + c32 = vfmaq_laneq_f32(c32, b2, a0, 3); + + b0 = vld1q_f32(b + 12); + + c03 = vfmaq_laneq_f32(c03, b0, a0, 0); + c13 = vfmaq_laneq_f32(c13, b0, a0, 1); + c23 = vfmaq_laneq_f32(c23, b0, a0, 2); + c33 = vfmaq_laneq_f32(c33, b0, a0, 3); + } + } + else if (width > 4) + { + for( int p = 0; p < np; p++, a += convMR, b += convNR ) + { + float32x4_t a0 = vld1q_f32(a), b0, b1; + b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); + + c00 = vfmaq_laneq_f32(c00, b0, a0, 0); + c01 = vfmaq_laneq_f32(c01, b1, a0, 0); + c10 = vfmaq_laneq_f32(c10, b0, a0, 1); + c11 = vfmaq_laneq_f32(c11, b1, a0, 1); + c20 = vfmaq_laneq_f32(c20, b0, a0, 2); + c21 = vfmaq_laneq_f32(c21, b1, a0, 2); + c30 = vfmaq_laneq_f32(c30, b0, a0, 3); + c31 = vfmaq_laneq_f32(c31, b1, a0, 3); + } + } + else { - float32x4_t a0 = vld1q_f32(a), b0, b1, b2; - b0 = vld1q_f32(b); b1 = vld1q_f32(b + 4); b2 = vld1q_f32(b + 8); - - c00 = vfmaq_laneq_f32(c00, b0, a0, 0); - c01 = vfmaq_laneq_f32(c01, b1, a0, 0); - c02 = vfmaq_laneq_f32(c02, b2, a0, 0); - c10 = vfmaq_laneq_f32(c10, b0, a0, 1); - c11 = vfmaq_laneq_f32(c11, b1, a0, 1); - c12 = vfmaq_laneq_f32(c12, b2, a0, 1); - c20 = vfmaq_laneq_f32(c20, b0, a0, 2); - c21 = vfmaq_laneq_f32(c21, b1, a0, 2); - c22 = vfmaq_laneq_f32(c22, b2, a0, 2); - c30 = vfmaq_laneq_f32(c30, b0, a0, 3); - c31 = vfmaq_laneq_f32(c31, b1, a0, 3); - c32 = vfmaq_laneq_f32(c32, b2, a0, 3); - - b0 = vld1q_f32(b + 12); b1 = vld1q_f32(b + 16); b2 = vld1q_f32(b + 20); - - c03 = vfmaq_laneq_f32(c03, b0, a0, 0); - c04 = vfmaq_laneq_f32(c04, b1, a0, 0); - c05 = vfmaq_laneq_f32(c05, b2, a0, 0); - c13 = vfmaq_laneq_f32(c13, b0, a0, 1); - c14 = vfmaq_laneq_f32(c14, b1, a0, 1); - c15 = vfmaq_laneq_f32(c15, b2, a0, 1); - c23 = vfmaq_laneq_f32(c23, b0, a0, 2); - c24 = vfmaq_laneq_f32(c24, b1, a0, 2); - c25 = vfmaq_laneq_f32(c25, b2, a0, 2); - c33 = 
vfmaq_laneq_f32(c33, b0, a0, 3); - c34 = vfmaq_laneq_f32(c34, b1, a0, 3); - c35 = vfmaq_laneq_f32(c35, b2, a0, 3); - - b0 = vld1q_f32(b + 24); - c06 = vfmaq_laneq_f32(c06, b0, a0, 0); - c16 = vfmaq_laneq_f32(c16, b0, a0, 1); - c26 = vfmaq_laneq_f32(c26, b0, a0, 2); - c36 = vfmaq_laneq_f32(c36, b0, a0, 3); + for( int p = 0; p < np; p++, a += convMR, b += convNR ) + { + float32x4_t a0 = vld1q_f32(a), b0; + b0 = vld1q_f32(b); + + c00 = vfmaq_laneq_f32(c00, b0, a0, 0); + c10 = vfmaq_laneq_f32(c10, b0, a0, 1); + c20 = vfmaq_laneq_f32(c20, b0, a0, 2); + c30 = vfmaq_laneq_f32(c30, b0, a0, 3); + } } if (!init_c) @@ -204,26 +308,62 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i float32x2_t a0 = vdup_n_f32(0.0f), a1 = a0; float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f); - for (int p = 0; p < np; p++, a += convMR, b += convNR) + if (width > 8) + { + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + a0 = vld1_f32(a), a1 = vld1_f32(a+2); + b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8); + + c0 = vmlaq_lane_f32(c0, b0, a0, 0); + c1 = vmlaq_lane_f32(c1, b1, a0, 0); + c2 = vmlaq_lane_f32(c2, b2, a0, 0); + + c3 = vmlaq_lane_f32(c3, b0, a0, 1); + c4 = vmlaq_lane_f32(c4, b1, a0, 1); + c5 = vmlaq_lane_f32(c5, b2, a0, 1); + + c6 = vmlaq_lane_f32(c6, b0, a1, 0); + c7 = vmlaq_lane_f32(c7, b1, a1, 0); + c8 = vmlaq_lane_f32(c8, b2, a1, 0); + + c9 = vmlaq_lane_f32(c9 , b0, a1, 1); + c10 = vmlaq_lane_f32(c10, b1, a1, 1); + c11 = vmlaq_lane_f32(c11, b2, a1, 1); + } + } + else if (width > 4) { - a0 = vld1_f32(a), a1 = vld1_f32(a+2); - b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8); + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + a0 = vld1_f32(a), a1 = vld1_f32(a+2); + b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4); - c0 = vmlaq_lane_f32(c0, b0, a0, 0); - c1 = vmlaq_lane_f32(c1, b1, a0, 0); - c2 = vmlaq_lane_f32(c2, b2, a0, 0); + c0 = vmlaq_lane_f32(c0, b0, a0, 0); + c1 = vmlaq_lane_f32(c1, b1, a0, 0); - c3 = vmlaq_lane_f32(c3, b0, a0, 1); - c4 = vmlaq_lane_f32(c4, b1, a0, 1); - c5 = vmlaq_lane_f32(c5, b2, a0, 1); + c3 = vmlaq_lane_f32(c3, b0, a0, 1); + c4 = vmlaq_lane_f32(c4, b1, a0, 1); - c6 = vmlaq_lane_f32(c6, b0, a1, 0); - c7 = vmlaq_lane_f32(c7, b1, a1, 0); - c8 = vmlaq_lane_f32(c8, b2, a1, 0); + c6 = vmlaq_lane_f32(c6, b0, a1, 0); + c7 = vmlaq_lane_f32(c7, b1, a1, 0); - c9 = vmlaq_lane_f32(c9 , b0, a1, 1); - c10 = vmlaq_lane_f32(c10, b1, a1, 1); - c11 = vmlaq_lane_f32(c11, b2, a1, 1); + c9 = vmlaq_lane_f32(c9 , b0, a1, 1); + c10 = vmlaq_lane_f32(c10, b1, a1, 1); + } + } + else + { + for (int p = 0; p < np; p++, a += convMR, b += convNR) + { + a0 = vld1_f32(a), a1 = vld1_f32(a+2); + b0 = vld1q_f32(b); + + c0 = vmlaq_lane_f32(c0, b0, a0, 0); + c3 = vmlaq_lane_f32(c3, b0, a0, 1); + c6 = vmlaq_lane_f32(c6, b0, a1, 0); + c9 = vmlaq_lane_f32(c9 , b0, a1, 1); + } } if (!init_c) @@ -254,6 +394,366 @@ void convBlock(int np, const float* a, const float* b, float* c, int ldc, bool i CV_Error(Error::StsNotImplemented, "Unsupported convMR and/or convNR in opt_NEON::convBlock"); } +void convBlockMR1_F32(int np, const float * a, const float * b, float *c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR) +{ + CV_Assert(convNR == 28); + float32x4_t c0 = vdupq_n_f32(bias), c1 = c0, c2 = c0; + float32x4_t c3 = c0, c4 = c0, c5 = c0, c6 = c0; + + if (width > 16) + { + for (int p = 0; p < np; p++, a++, b += convNR) + { + float32x4_t 
b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8); + float32x4_t b3 = vld1q_f32(b + 12), b4 = vld1q_f32(b + 16), b5 = vld1q_f32(b + 20); + float32x4_t b6 = vld1q_f32(b + 24); + + c0 = vmlaq_n_f32(c0, b0, a[0]); + c1 = vmlaq_n_f32(c1, b1, a[0]); + c2 = vmlaq_n_f32(c2, b2, a[0]); + c3 = vmlaq_n_f32(c3, b3, a[0]); + c4 = vmlaq_n_f32(c4, b4, a[0]); + c5 = vmlaq_n_f32(c5, b5, a[0]); + c6 = vmlaq_n_f32(c6, b6, a[0]); + } + } + else if (width > 8) + { + for (int p = 0; p < np; p++, a++, b += convNR) + { + float32x4_t b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8); + float32x4_t b3 = vld1q_f32(b + 12); + + c0 = vmlaq_n_f32(c0, b0, a[0]); + c1 = vmlaq_n_f32(c1, b1, a[0]); + c2 = vmlaq_n_f32(c2, b2, a[0]); + c3 = vmlaq_n_f32(c3, b3, a[0]); + } + } + else if (width > 4) + { + for (int p = 0; p < np; p++, a++, b += convNR) + { + float32x4_t b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4); + + c0 = vmlaq_n_f32(c0, b0, a[0]); + c1 = vmlaq_n_f32(c1, b1, a[0]); + } + } + else + { + for (int p = 0; p < np; p++, a++, b += convNR) + { + float32x4_t b0 = vld1q_f32(b); + c0 = vmlaq_n_f32(c0, b0, a[0]); + } + } + + if (init_c) + { + c0 += vld1q_f32(c); + c1 += vld1q_f32(c + 4); + c2 += vld1q_f32(c + 8); + c3 += vld1q_f32(c + 12); + c4 += vld1q_f32(c + 16); + c5 += vld1q_f32(c + 20); + c6 += vld1q_f32(c + 24); + } + + if (ifMinMaxAct) + { + float32x4_t v_minval = vdupq_n_f32(minval), v_maxval = vdupq_n_f32(maxval); + + c0 = vminq_f32(vmaxq_f32(c0, v_minval), v_maxval); + c1 = vminq_f32(vmaxq_f32(c1, v_minval), v_maxval); + c2 = vminq_f32(vmaxq_f32(c2, v_minval), v_maxval); + c3 = vminq_f32(vmaxq_f32(c3, v_minval), v_maxval); + c4 = vminq_f32(vmaxq_f32(c4, v_minval), v_maxval); + c5 = vminq_f32(vmaxq_f32(c5, v_minval), v_maxval); + c6 = vminq_f32(vmaxq_f32(c6, v_minval), v_maxval); + } + + vst1q_f32(c, c0); + vst1q_f32(c + 4, c1); + vst1q_f32(c + 8, c2); + vst1q_f32(c + 12, c3); + vst1q_f32(c + 16, c4); + vst1q_f32(c + 20, c5); + vst1q_f32(c + 24, c6); +} + +#if CV_NEON_AARCH64 && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +// Fix conflict between float16_t in arm_neon.h and float16_t in cvdef.h. +typedef __fp16 float16_t; + +#ifndef __ARM_FEATURE_FMA // Work around without FMA support. 
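+// Note: without FMA support the macro falls back to a separate multiply and add, so the
+// product is rounded before accumulation and results may differ slightly from a fused multiply-add.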
+#define vfmaq_f16(a, b, c) (a + b * c) +#endif +void convBlock_FP16(int np, const char * _a, const char * _b, char * _c, int ldc, bool init_c, int width, + const int convMR_fp16, const int convNR_fp16) +{ +#if 1 + const float16_t* a = (const float16_t*)_a; + const float16_t* b = (const float16_t*)_b; + float16_t* c = (float16_t*)_c; + + CV_Assert(convMR_fp16 == 8 && convNR_fp16 == 24); + + float16x8_t c00 = vdupq_n_f16(0), c01 = c00, c02 = c00; + float16x8_t c10 = c00, c11 = c00, c12 = c00; + float16x8_t c20 = c00, c21 = c00, c22 = c00; + float16x8_t c30 = c00, c31 = c00, c32 = c00; + float16x8_t c40 = c00, c41 = c00, c42 = c00; + float16x8_t c50 = c00, c51 = c00, c52 = c00; + float16x8_t c60 = c00, c61 = c00, c62 = c00; + float16x8_t c70 = c00, c71 = c00, c72 = c00; + + float16x8_t b0 = c00, b1 = c00, b2 = c00; + + if (width > 16) + { + for (int p = 0; p < np; p++, a += convMR_fp16, b += convNR_fp16) + { + float16x4_t a0 = vld1_f16(a), a1 = vld1_f16(a + 4); + b0 = vld1q_f16(b), b1 = vld1q_f16(b + 8), b2 = vld1q_f16(b + 16); + + c00 = vfmaq_lane_f16(c00, b0, a0, 0); + c01 = vfmaq_lane_f16(c01, b1, a0, 0); + c02 = vfmaq_lane_f16(c02, b2, a0, 0); + + c10 = vfmaq_lane_f16(c10, b0, a0, 1); + c11 = vfmaq_lane_f16(c11, b1, a0, 1); + c12 = vfmaq_lane_f16(c12, b2, a0, 1); + + c20 = vfmaq_lane_f16(c20, b0, a0, 2); + c21 = vfmaq_lane_f16(c21, b1, a0, 2); + c22 = vfmaq_lane_f16(c22, b2, a0, 2); + + c30 = vfmaq_lane_f16(c30, b0, a0, 3); + c31 = vfmaq_lane_f16(c31, b1, a0, 3); + c32 = vfmaq_lane_f16(c32, b2, a0, 3); + + c40 = vfmaq_lane_f16(c40, b0, a1, 0); + c41 = vfmaq_lane_f16(c41, b1, a1, 0); + c42 = vfmaq_lane_f16(c42, b2, a1, 0); + + c50 = vfmaq_lane_f16(c50, b0, a1, 1); + c51 = vfmaq_lane_f16(c51, b1, a1, 1); + c52 = vfmaq_lane_f16(c52, b2, a1, 1); + + c60 = vfmaq_lane_f16(c60, b0, a1, 2); + c61 = vfmaq_lane_f16(c61, b1, a1, 2); + c62 = vfmaq_lane_f16(c62, b2, a1, 2); + + c70 = vfmaq_lane_f16(c70, b0, a1, 3); + c71 = vfmaq_lane_f16(c71, b1, a1, 3); + c72 = vfmaq_lane_f16(c72, b2, a1, 3); + } + } + else if (width > 8) + { + for( int p = 0; p < np; p++, a += convMR_fp16, b += convNR_fp16) + { + float16x4_t a0 = vld1_f16(a), a1 = vld1_f16(a + 4); + float16x8_t b0 = vld1q_f16(b), b1 = vld1q_f16(b + 8); + + c00 = vfmaq_lane_f16(c00, b0, a0, 0); + c01 = vfmaq_lane_f16(c01, b1, a0, 0); + + c10 = vfmaq_lane_f16(c10, b0, a0, 1); + c11 = vfmaq_lane_f16(c11, b1, a0, 1); + + c20 = vfmaq_lane_f16(c20, b0, a0, 2); + c21 = vfmaq_lane_f16(c21, b1, a0, 2); + + c30 = vfmaq_lane_f16(c30, b0, a0, 3); + c31 = vfmaq_lane_f16(c31, b1, a0, 3); + + c40 = vfmaq_lane_f16(c40, b0, a1, 0); + c41 = vfmaq_lane_f16(c41, b1, a1, 0); + + c50 = vfmaq_lane_f16(c50, b0, a1, 1); + c51 = vfmaq_lane_f16(c51, b1, a1, 1); + + c60 = vfmaq_lane_f16(c60, b0, a1, 2); + c61 = vfmaq_lane_f16(c61, b1, a1, 2); + + c70 = vfmaq_lane_f16(c70, b0, a1, 3); + c71 = vfmaq_lane_f16(c71, b1, a1, 3); + } + } + else + { + for( int p = 0; p < np; p++, a += convMR_fp16, b += convNR_fp16) + { + float16x4_t a0 = vld1_f16(a), a1 = vld1_f16(a + 4); + float16x8_t b0 = vld1q_f16(b); + + c00 = vfmaq_lane_f16(c00, b0, a0, 0); + c10 = vfmaq_lane_f16(c10, b0, a0, 1); + c20 = vfmaq_lane_f16(c20, b0, a0, 2); + c30 = vfmaq_lane_f16(c30, b0, a0, 3); + c40 = vfmaq_lane_f16(c40, b0, a1, 0); + c50 = vfmaq_lane_f16(c50, b0, a1, 1); + c60 = vfmaq_lane_f16(c60, b0, a1, 2); + c70 = vfmaq_lane_f16(c70, b0, a1, 3); + } + } + + if (!init_c) + { +#undef _FX_UPDATE_CBUF_ROW +#define _FX_UPDATE_CBUF_ROW(row) \ + c##row##0 = c##row##0 + vld1q_f16(c + row*ldc); \ + c##row##1 = 
c##row##1 + vld1q_f16(c + row*ldc + 8); \ + c##row##2 = c##row##2 + vld1q_f16(c + row*ldc + 16) + + _FX_UPDATE_CBUF_ROW(0); + _FX_UPDATE_CBUF_ROW(1); + _FX_UPDATE_CBUF_ROW(2); + _FX_UPDATE_CBUF_ROW(3); + _FX_UPDATE_CBUF_ROW(4); + _FX_UPDATE_CBUF_ROW(5); + _FX_UPDATE_CBUF_ROW(6); + _FX_UPDATE_CBUF_ROW(7); + } + +#undef _FX_STORE_CBUF_ROW +#define _FX_STORE_CBUF_ROW(row) \ + vst1q_f16(c + row*ldc, c##row##0); \ + vst1q_f16(c + row*ldc + 8, c##row##1); \ + vst1q_f16(c + row*ldc + 16, c##row##2) + + _FX_STORE_CBUF_ROW(0); + _FX_STORE_CBUF_ROW(1); + _FX_STORE_CBUF_ROW(2); + _FX_STORE_CBUF_ROW(3); + _FX_STORE_CBUF_ROW(4); + _FX_STORE_CBUF_ROW(5); + _FX_STORE_CBUF_ROW(6); + _FX_STORE_CBUF_ROW(7); +#else + // reference only. + const float16_t* a = (const float16_t*)_a; + const float16_t* b = (const float16_t*)_b; + float16_t* c = (float16_t*)_c; + float cbuf[convMR_fp16*convNR_fp16]; + memset(cbuf, 0, sizeof(cbuf)); + + for( int p = 0; p < np; p++ ) + { + for( int i = 0; i < convMR_fp16; i++ ) + { + float ai = float(a[convMR_fp16*p + i]); + for( int j = 0; j < convNR_fp16; j++ ) + cbuf[i*convNR_fp16+j] += float(b[convNR_fp16*p + j]) * ai; + } + } + + if (!init_c) + { + for(int i = 0; i < convMR_fp16; i++) + { + for(int j = 0; j < convNR_fp16; j++) + c[i*ldc + j] = float16_t(float(c[i*ldc + j]) + cbuf[i*convNR_fp16 + j]); + } + } + else + { + for(int i = 0; i < convMR_fp16; i++) + { + for(int j = 0; j < convNR_fp16; j++) + c[i*ldc + j] = (float16_t)(cbuf[i*convNR_fp16 + j]); + } + } +#endif +} + +void convBlockMR1_FP16(int np, const char* _a, const char* _b, float *c, const float _bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int width, const int convNR_FP16) +{ + CV_Assert(convNR_FP16 == 24); // CONV_NR_FP16 = 24 + const float16_t* a = (const float16_t*)_a; + const float16_t* b = (const float16_t*)_b; + + const float16_t bias = (float16_t)_bias; + + float16x8_t c0 = vdupq_n_f16(bias), c1 = c0, c2 = c0; + + if (width > 16) + { + for (int p = 0; p < np; p++, a++, b += convNR_FP16) + { + float16x8_t a0= vdupq_n_f16(a[0]); + float16x8_t b0 = vld1q_f16(b), b1 = vld1q_f16(b + 8), b2 = vld1q_f16(b + 16); + + c0 = vfmaq_f16(c0, a0, b0); + c1 = vfmaq_f16(c1, a0, b1); + c2 = vfmaq_f16(c2, a0, b2); + } + } + else if (width > 8) + { + for (int p = 0; p < np; p++, a++, b += convNR_FP16) + { + float16x8_t a0= vdupq_n_f16(a[0]); + float16x8_t b0 = vld1q_f16(b), b1 = vld1q_f16(b + 8); + + c0 = vfmaq_f16(c0, a0, b0); + c1 = vfmaq_f16(c1, a0, b1); + } + } + else + { + for (int p = 0; p < np; p++, a++, b += convNR_FP16) + { + float16x8_t a0= vdupq_n_f16(a[0]); + float16x8_t b0 = vld1q_f16(b); + + c0 = vfmaq_f16(c0, a0, b0); + } + } + + // convert FP 16 to FP 32. 
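+    // The accumulators stay in FP16 inside the loop; they are widened to FP32 here because
+    // the output buffer c and the min/max activation below operate on 32-bit floats.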
+ float32x4_t c00 = vcvt_f32_f16(vget_low_f16(c0)); + float32x4_t c01 = vcvt_f32_f16(vget_high_f16(c0)); + float32x4_t c10 = vcvt_f32_f16(vget_low_f16(c1)); + float32x4_t c11 = vcvt_f32_f16(vget_high_f16(c1)); + float32x4_t c20 = vcvt_f32_f16(vget_low_f16(c2)); + float32x4_t c21 = vcvt_f32_f16(vget_high_f16(c2)); + + if (init_c) + { + c00 += vld1q_f32(c); + c01 += vld1q_f32(c + 4); + c10 += vld1q_f32(c + 8); + c11 += vld1q_f32(c + 12); + c20 += vld1q_f32(c + 16); + c21 += vld1q_f32(c + 20); + } + + if (ifMinMaxAct) + { + float32x4_t v_minval = vdupq_n_f32(minval), v_maxval = vdupq_n_f32(maxval); + + c00 = vminq_f32(vmaxq_f32(c00, v_minval), v_maxval); + c01 = vminq_f32(vmaxq_f32(c01, v_minval), v_maxval); + c10 = vminq_f32(vmaxq_f32(c10, v_minval), v_maxval); + c11 = vminq_f32(vmaxq_f32(c11, v_minval), v_maxval); + c20 = vminq_f32(vmaxq_f32(c20, v_minval), v_maxval); + c21 = vminq_f32(vmaxq_f32(c21, v_minval), v_maxval); + } + + vst1q_f32(c, c00); + vst1q_f32(c + 4, c01); + vst1q_f32(c + 8, c10); + vst1q_f32(c + 12, c11); + vst1q_f32(c + 16, c20); + vst1q_f32(c + 20, c21); +} +#endif + #endif } }} // namespace cv::dnn diff --git a/modules/dnn/src/layers/cpu_kernels/convolution.cpp b/modules/dnn/src/layers/cpu_kernels/convolution.cpp index 60c13bfa87..4c9a2fd387 100644 --- a/modules/dnn/src/layers/cpu_kernels/convolution.cpp +++ b/modules/dnn/src/layers/cpu_kernels/convolution.cpp @@ -34,10 +34,10 @@ Ptr initFastConv( const std::vector& pads_begin, const std::vector& pads_end, int conv_dim, + const bool _useFP16, bool useWinograd) { Ptr conv = makePtr(); - CV_Assert(ngroups > 0 && K > 0 && C > 0 && K % ngroups == 0); // Weight shape, [K, C, Dk, Hk, Wk] for Conv3D, [K, C, Hk, Wk] for Conv2D, [K, C, Wk] for Conv1D. @@ -117,6 +117,13 @@ Ptr initFastConv( auto wShape = shape(weightsMat); const size_t wstep = weightsMat.step1(); + conv->useFP16 = false; +#ifdef CONV_ARM_FP16 + // TODO: add FP16 support for Winograd. + if (_useFP16 && (conv->conv_type == CONV_TYPE_GENERIC || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN)) + conv->useFP16 = true; +#endif + float *srcWeights = (float *)weightsMat.data; if (conv->conv_type == CONV_TYPE_DEPTHWISE || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN) { @@ -128,17 +135,38 @@ Ptr initFastConv( // TODO: simplify the following code with std::copy. // this code aims to let memory fit with vector size. 
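        // padded_ksize rounds each channel's kernel footprint up to a multiple of VEC_ALIGN so the
        // vectorized kernels can always load full vectors; the padding area is zero-initialized below.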
int padded_ksize = ((ksize + VEC_ALIGN-1) / VEC_ALIGN) * VEC_ALIGN; - int nweights = C*padded_ksize; - conv->weightsBuf.reserve(nweights + VEC_ALIGN); - conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN); - memset(conv->weightsBufPtr, 0, nweights*sizeof(conv->weightsBufPtr[0])); - auto weightsBufPtr = conv->weightsBufPtr; - parallel_for_(Range(0, C), [&](const Range& r0){ - for(int c = r0.start; c < r0.end; c++) + int nweights = C * padded_ksize; + +#ifdef CONV_ARM_FP16 + if (conv->useFP16) { - for (int k = 0; k < ksize; k++) - weightsBufPtr[c*padded_ksize + k] = srcWeights[c*wstep + k]; - }}); + conv->weightsBuf_FP16.resize(nweights + VEC_ALIGN); + conv->weightsBufPtr_FP16 = alignPtr(conv->weightsBuf_FP16.data(), VEC_ALIGN * sizeof(float16_t )); + memset(conv->weightsBufPtr_FP16, 0, nweights * sizeof(float16_t )); + auto weightsBufPtr_FP16 = conv->weightsBufPtr_FP16; + + parallel_for_(Range(0, C), [&](const Range& r0){ + for(int c = r0.start; c < r0.end; c++) + { + for (int k = 0; k < ksize; k++) + weightsBufPtr_FP16[c*padded_ksize + k] = (float16_t)srcWeights[c*wstep + k]; + }}); + } + else +#endif + { + conv->weightsBuf.resize(nweights + VEC_ALIGN); + conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN * sizeof(float )); + memset(conv->weightsBufPtr, 0, nweights*sizeof(float )); + auto weightsBufPtr = conv->weightsBufPtr; + + parallel_for_(Range(0, C), [&](const Range& r0){ + for(int c = r0.start; c < r0.end; c++) + { + for (int k = 0; k < ksize; k++) + weightsBufPtr[c*padded_ksize + k] = srcWeights[c*wstep + k]; + }}); + } } else if(conv->conv_type == CONV_TYPE_WINOGRAD3X3) // winograd { @@ -163,10 +191,25 @@ Ptr initFastConv( int Kg = K/ngroups; int Kg_nblocks = (Kg + CONV_WINO_KBLOCK - 1)/CONV_WINO_KBLOCK; size_t nweights = ngroups*Kg_nblocks*Cg*CONV_WINO_KBLOCK*CONV_WINO_AREA; - conv->weightsWinoBuf.reserve(nweights + VEC_ALIGN); - conv->weightsWinoBufPtr = alignPtr(conv->weightsWinoBuf.data(), VEC_ALIGN); - float* wptrWino = conv->weightsWinoBufPtr; - memset(wptrWino, 0, nweights * sizeof(wptrWino[0])); + + float* wptrWino = nullptr; +#ifdef CONV_ARM_FP16 + float16_t* wptrWino_FP16 = nullptr; + if (conv->useFP16) + { + conv->weightsWinoBuf_FP16.resize(nweights + VEC_ALIGN); + conv->weightsWinoBufPtr_FP16 = alignPtr(conv->weightsWinoBuf_FP16.data(), VEC_ALIGN); + wptrWino_FP16 = conv->weightsWinoBufPtr_FP16; + memset(wptrWino_FP16, 0, nweights * sizeof(wptrWino_FP16[0])); + } + else +#endif + { + conv->weightsWinoBuf.resize(nweights + VEC_ALIGN); + conv->weightsWinoBufPtr = alignPtr(conv->weightsWinoBuf.data(), VEC_ALIGN); + wptrWino = conv->weightsWinoBufPtr; + memset(wptrWino, 0, nweights * sizeof(wptrWino[0])); + } parallel_for_(Range(0, K), [&](const Range& r0){ float kernelTm[CONV_WINO_AREA]; @@ -206,57 +249,133 @@ Ptr initFastConv( } // repack the data. 
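                // Both the FP32 and FP16 branches below use the same blocked layout for the transformed
                // kernel; the FP16 branch simply converts each coefficient to float16_t and steps by the
                // F16 atom sizes instead of the F32 ones.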
- float* wptr = wptrWino + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA + - (c*CONV_WINO_KBLOCK + dk)*CONV_WINO_ATOM_F32; - for (int i = 0; i < CONV_WINO_NATOMS_F32; i++, - wptr += Cg * CONV_WINO_KBLOCK * CONV_WINO_ATOM_F32) +#ifdef CONV_ARM_FP16 + if (conv->useFP16) + { + float16_t* wptr = wptrWino_FP16 + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA + + (c*CONV_WINO_KBLOCK + dk)*CONV_WINO_ATOM_F16; + for (int i = 0; i < CONV_WINO_NATOMS_F16; i++, + wptr += Cg * CONV_WINO_KBLOCK * CONV_WINO_ATOM_F16) + { + CV_Assert(conv->weightsWinoBufPtr_FP16 <= wptr && wptr + CONV_WINO_ATOM_F16 <= conv->weightsWinoBufPtr_FP16 + nweights); + for (int j = 0; j < CONV_WINO_ATOM_F16; j++) + { + wptr[j] = (float16_t)kernelTm[i * CONV_WINO_ATOM_F16 + j]; + } + } + } + else +#endif { - CV_Assert(conv->weightsWinoBufPtr <= wptr && wptr + CONV_WINO_ATOM_F32 <= conv->weightsWinoBufPtr + nweights); - memcpy(wptr, kernelTm + i * CONV_WINO_ATOM_F32, CONV_WINO_ATOM_F32*sizeof (wptr[0])); + float* wptr = wptrWino + (g*Kg_nblocks + ki) * Cg *CONV_WINO_KBLOCK*CONV_WINO_AREA + + (c*CONV_WINO_KBLOCK + dk)*CONV_WINO_ATOM_F32; + for (int i = 0; i < CONV_WINO_NATOMS_F32; i++, + wptr += Cg * CONV_WINO_KBLOCK * CONV_WINO_ATOM_F32) + { + CV_Assert(conv->weightsWinoBufPtr <= wptr && wptr + CONV_WINO_ATOM_F32 <= conv->weightsWinoBufPtr + nweights); + memcpy(wptr, kernelTm + i * CONV_WINO_ATOM_F32, CONV_WINO_ATOM_F32*sizeof (wptr[0])); + } } } - }}); + } + }); } else if (conv->conv_type == CONV_TYPE_GENERIC) { // The weights are packed as // ngroups x (ceil((K/ngroups)/CONV_MR)*CONV_MR) x (Cg*Hk*Wk*Dk) x CONV_MR tensor int Kg = K/ngroups, Cg = max(C/ngroups, 1); - int numStripsMR = (Kg + CONV_MR - 1) / CONV_MR; - int Kg_aligned = numStripsMR * CONV_MR; int DkHkWkCg = Dk*Hk*Wk*Cg; + + int numStripsMR = (Kg + CONV_MR_FP32 - 1) / CONV_MR_FP32; + int Kg_aligned = numStripsMR * CONV_MR_FP32; size_t nweights = ngroups*Kg_aligned*DkHkWkCg; - conv->weightsBuf.reserve(nweights + VEC_ALIGN); - conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN); - float* weightsBufPtr = conv->weightsBufPtr; - memset(weightsBufPtr, 0, nweights*sizeof(weightsBufPtr[0])); + + float* weightsBufPtr = nullptr; + +#ifdef CONV_ARM_FP16 + int numStripsMR_FP16 = (Kg + CONV_MR_FP16 - 1) / CONV_MR_FP16; + int Kg_aligned_FP16 = numStripsMR_FP16 * CONV_MR_FP16; + size_t nweights_FP16 = ngroups * Kg_aligned_FP16 * DkHkWkCg; + + float16_t* weightsBufPtr_FP16 = nullptr; + if (conv->useFP16) + { + conv->weightsBuf_FP16.resize(nweights_FP16 + VEC_ALIGN); + conv->weightsBufPtr_FP16 = alignPtr(conv->weightsBuf_FP16.data(), VEC_ALIGN); + weightsBufPtr_FP16 = conv->weightsBufPtr_FP16; + memset(weightsBufPtr_FP16, 0, nweights_FP16*sizeof(weightsBufPtr_FP16[0])); + } + else +#endif + { + conv->weightsBuf.resize(nweights + VEC_ALIGN); + conv->weightsBufPtr = alignPtr(conv->weightsBuf.data(), VEC_ALIGN); + weightsBufPtr = conv->weightsBufPtr; + memset(weightsBufPtr, 0, nweights*sizeof(weightsBufPtr[0])); + } // Pack the weight. 
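        // The generic-convolution weights are packed per group into strips of CONV_MR output channels:
        // for every (spatial offset, input channel) pair a strip holds CONV_MR consecutive weights
        // (one per output channel), zero-padded when Kg is not a multiple of CONV_MR; the FP16 branch
        // below packs with CONV_MR_FP16 and converts each weight to float16_t on the fly.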
- parallel_for_(Range(0, ngroups * numStripsMR), [&](const Range& r0){ - for (int gsi = r0.start; gsi < r0.end; gsi++) +#ifdef CONV_ARM_FP16 + if (conv->useFP16) { - int g = gsi / numStripsMR; - int si = gsi - g * numStripsMR; + parallel_for_(Range(0, ngroups * numStripsMR_FP16), [&](const Range& r0){ + for (int gsi = r0.start; gsi < r0.end; gsi++) + { + int g = gsi / numStripsMR_FP16; + int si = gsi - g * numStripsMR_FP16; - int startK = si * CONV_MR; - CV_Assert(startK < Kg_aligned); + int startK = si * CONV_MR_FP16; + CV_Assert(startK < Kg_aligned_FP16); - float* packed_wptr = weightsBufPtr + DkHkWkCg * (startK + g * Kg_aligned); - int dk = Kg - startK < CONV_MR ? Kg - startK : CONV_MR; // check if we need zero padding. + float16_t* packed_wptr = weightsBufPtr_FP16 + DkHkWkCg * (startK + g * Kg_aligned_FP16); + int dk = Kg - startK < CONV_MR_FP16 ? Kg - startK : CONV_MR_FP16; // check if we need zero padding. - int k_idx = g*Kg + startK; - for(int hwd = 0; hwd < Hk*Wk*Dk; hwd++) { - for(int c = 0; c < Cg; c++, packed_wptr += CONV_MR) + int k_idx = g*Kg + startK; + for(int hwd = 0; hwd < Hk*Wk*Dk; hwd++) { - const float* wptr = srcWeights + wstep * k_idx + c*Hk*Wk*Dk + hwd; - int k = 0; - for(; k < dk; k++, wptr += wstep) - packed_wptr[k] = *wptr; - for(; k < CONV_MR; k++) - packed_wptr[k] = 0.f; + for(int c = 0; c < Cg; c++, packed_wptr += CONV_MR_FP16) + { + const float* wptr = srcWeights + wstep * k_idx + c*Hk*Wk*Dk + hwd; + int k = 0; + for(; k < dk; k++, wptr += wstep) + packed_wptr[k] = (float16_t)(*wptr); + for(; k < CONV_MR_FP16; k++) + packed_wptr[k] = (float16_t)0.f; + } } - } - }}); + }}); + } + else +#endif + { + parallel_for_(Range(0, ngroups * numStripsMR), [&](const Range& r0){ + for (int gsi = r0.start; gsi < r0.end; gsi++) + { + int g = gsi / numStripsMR; + int si = gsi - g * numStripsMR; + + int startK = si * CONV_MR_FP32; + CV_Assert(startK < Kg_aligned); + + float* packed_wptr = weightsBufPtr + DkHkWkCg * (startK + g * Kg_aligned); + int dk = Kg - startK < CONV_MR_FP32 ? Kg - startK : CONV_MR_FP32; // check if we need zero padding. 
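+                    // dk is the number of real output channels left in this strip; the remaining
+                    // CONV_MR_FP32 - dk slots are filled with zeros in the packing loop below.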
+ + int k_idx = g*Kg + startK; + for(int hwd = 0; hwd < Hk*Wk*Dk; hwd++) + { + for(int c = 0; c < Cg; c++, packed_wptr += CONV_MR_FP32) + { + const float* wptr = srcWeights + wstep * k_idx + c*Hk*Wk*Dk + hwd; + int k = 0; + for(; k < dk; k++, wptr += wstep) + packed_wptr[k] = *wptr; + for(; k < CONV_MR_FP32; k++) + packed_wptr[k] = 0.f; + } + } + }}); + } } else CV_Error(CV_StsUnsupportedFormat, "Unknown convolution type."); @@ -275,75 +394,142 @@ Ptr initFastConv( return conv; } -static inline void packData8(float*& inpbuf, float*& inptrIn, int& in_w, int& x0, int& s0, const int* ofstab, - const int stride_w, const int ksize) +static inline void packData8(char*& inpbuf, float*& inptrIn, int& in_w, int& x0, int& s0, const int* ofstab, + const int stride_w, const int ksize, const int esz) { - float* inpbufC = inpbuf + s0; - float* inptrInC = inptrIn; + char * inpbufC = inpbuf + s0 * esz; + float* inptrInC = (float* )inptrIn; - if (stride_w == 1) - for (int k = 0; k < ksize; k++) +#ifdef CONV_ARM_FP16 + float16_t* inpbufC_FP16 = (float16_t *)inpbufC; + if (esz == sizeof(float16_t)) + { + if (stride_w == 1) { - int k1 = ofstab[k]; - float v0 = inptrInC[k1]; - float v1 = inptrInC[k1 + 1]; - float v2 = inptrInC[k1 + 2]; - float v3 = inptrInC[k1 + 3]; - float v4 = inptrInC[k1 + 4]; - float v5 = inptrInC[k1 + 5]; - float v6 = inptrInC[k1 + 6]; - float v7 = inptrInC[k1 + 7]; - - inpbufC[k*CONV_NR] = v0; - inpbufC[k*CONV_NR+1] = v1; - inpbufC[k*CONV_NR+2] = v2; - inpbufC[k*CONV_NR+3] = v3; - inpbufC[k*CONV_NR+4] = v4; - inpbufC[k*CONV_NR+5] = v5; - inpbufC[k*CONV_NR+6] = v6; - inpbufC[k*CONV_NR+7] = v7; + for (int k = 0; k < ksize; k++) + { + int k1 = ofstab[k]; + + float32x4_t v0 = vld1q_f32(inptrInC + k1); + float32x4_t v1 = vld1q_f32(inptrInC + k1 + 4); + vst1q_f16((__fp16*)inpbufC_FP16 + k * CONV_NR_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1))); + } } - else - for (int k = 0; k < ksize; k++) + else { - int k1 = ofstab[k]; - float v0 = inptrInC[k1]; - float v1 = inptrInC[k1 + stride_w]; - float v2 = inptrInC[k1 + 2*stride_w]; - float v3 = inptrInC[k1 + 3*stride_w]; - float v4 = inptrInC[k1 + 4*stride_w]; - float v5 = inptrInC[k1 + 5*stride_w]; - float v6 = inptrInC[k1 + 6*stride_w]; - float v7 = inptrInC[k1 + 7*stride_w]; - - inpbufC[k*CONV_NR] = v0; - inpbufC[k*CONV_NR+1] = v1; - inpbufC[k*CONV_NR+2] = v2; - inpbufC[k*CONV_NR+3] = v3; - inpbufC[k*CONV_NR+4] = v4; - inpbufC[k*CONV_NR+5] = v5; - inpbufC[k*CONV_NR+6] = v6; - inpbufC[k*CONV_NR+7] = v7; + for (int k = 0; k < ksize; k++) + { + int k1 = ofstab[k]; + float32x4_t v0, v1; + + v0[0] = inptrInC[k1]; + v0[1] = inptrInC[k1 + stride_w]; + v0[2] = inptrInC[k1 + 2*stride_w]; + v0[3] = inptrInC[k1 + 3*stride_w]; + v1[0] = inptrInC[k1 + 4*stride_w]; + v1[1] = inptrInC[k1 + 5*stride_w]; + v1[2] = inptrInC[k1 + 6*stride_w]; + v1[3] = inptrInC[k1 + 7*stride_w]; + + vst1q_f16((__fp16*)inpbufC_FP16 + k * CONV_NR_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1))); + } } + } + else // float 32 +#endif + { + CV_Assert(esz == sizeof(float )); + float* inpbufC_FP32 = (float* )inpbufC; + if (stride_w == 1) + for (int k = 0; k < ksize; k++) + { + int k1 = ofstab[k]; +#if CV_SIMD256 + vx_store(inpbufC_FP32 + k*CONV_NR, vx_load(inptrInC + k1)); +#elif CV_SIMD128 + v_float32x4 vv0 = v_load(inptrInC + k1); + v_float32x4 vv1 = v_load(inptrInC + k1 + 4); + v_store(inpbufC_FP32 + k*CONV_NR_FP32, vv0); + v_store(inpbufC_FP32 + k*CONV_NR_FP32 + 4, vv1); +#else + float v0 = inptrInC[k1]; + float v1 = inptrInC[k1 + 1]; + float v2 = inptrInC[k1 + 2]; + 
float v3 = inptrInC[k1 + 3]; + float v4 = inptrInC[k1 + 4]; + float v5 = inptrInC[k1 + 5]; + float v6 = inptrInC[k1 + 6]; + float v7 = inptrInC[k1 + 7]; + + inpbufC_FP32[k*CONV_NR_FP32] = v0; + inpbufC_FP32[k*CONV_NR_FP32+1] = v1; + inpbufC_FP32[k*CONV_NR_FP32+2] = v2; + inpbufC_FP32[k*CONV_NR_FP32+3] = v3; + inpbufC_FP32[k*CONV_NR_FP32+4] = v4; + inpbufC_FP32[k*CONV_NR_FP32+5] = v5; + inpbufC_FP32[k*CONV_NR_FP32+6] = v6; + inpbufC_FP32[k*CONV_NR_FP32+7] = v7; +#endif + } + else + for (int k = 0; k < ksize; k++) + { + int k1 = ofstab[k]; + float v0 = inptrInC[k1]; + float v1 = inptrInC[k1 + stride_w]; + float v2 = inptrInC[k1 + 2*stride_w]; + float v3 = inptrInC[k1 + 3*stride_w]; + float v4 = inptrInC[k1 + 4*stride_w]; + float v5 = inptrInC[k1 + 5*stride_w]; + float v6 = inptrInC[k1 + 6*stride_w]; + float v7 = inptrInC[k1 + 7*stride_w]; + + inpbufC_FP32[k*CONV_NR_FP32] = v0; + inpbufC_FP32[k*CONV_NR_FP32+1] = v1; + inpbufC_FP32[k*CONV_NR_FP32+2] = v2; + inpbufC_FP32[k*CONV_NR_FP32+3] = v3; + inpbufC_FP32[k*CONV_NR_FP32+4] = v4; + inpbufC_FP32[k*CONV_NR_FP32+5] = v5; + inpbufC_FP32[k*CONV_NR_FP32+6] = v6; + inpbufC_FP32[k*CONV_NR_FP32+7] = v7; + } + } x0+=7; s0+=7; inptrIn += 7*stride_w; in_w += 7*stride_w; } -static inline void packData2(float*& inpbuf, float*& inptrIn, int& in_w, int& x0, int& s0, const int* ofstab, - const int stride_w, const int ksize) +static inline void packData2(char *& inpbuf, float*& inptrIn, int& in_w, int& x0, int& s0, const int* ofstab, + const int stride_w, const int ksize, const int esz) { - float* inpbufC = inpbuf + s0; + char* inpbufC = inpbuf + s0 * esz; float* inptrInC = inptrIn; - for (int k = 0; k < ksize; k++) +#ifdef CONV_ARM_FP16 + float16_t* inpbufC_FP16 = (float16_t *)inpbufC; + if (esz == sizeof(float16_t)) + { + for (int k = 0; k < ksize; k++) + { + int k1 = ofstab[k]; + float v0 = inptrInC[k1]; + float v1 = inptrInC[k1 + stride_w]; + inpbufC_FP16[k*CONV_NR_FP16] = (float16_t)v0; + inpbufC_FP16[k*CONV_NR_FP16+1] = (float16_t)v1; + } + } else +#endif { - int k1 = ofstab[k]; - float v0 = inptrInC[k1]; - float v1 = inptrInC[k1 + stride_w]; - inpbufC[k*CONV_NR] = v0; - inpbufC[k*CONV_NR+1] = v1; + float * inpbufC_FP32 = (float *)inpbufC; + for (int k = 0; k < ksize; k++) + { + int k1 = ofstab[k]; + float v0 = inptrInC[k1]; + float v1 = inptrInC[k1 + stride_w]; + inpbufC_FP32[k*CONV_NR_FP32] = v0; + inpbufC_FP32[k*CONV_NR_FP32+1] = v1; + } } x0++; @@ -352,131 +538,683 @@ static inline void packData2(float*& inpbuf, float*& inptrIn, int& in_w, int& x0 in_w += stride_w; } -void runFastConv(InputArray _input, OutputArray _output, const Ptr& conv, int ntasks, - const Ptr& actLayer, const std::vector& reluslope, bool fusedAdd) +#ifdef CONV_ARM_FP16 +// Fast convert float 32 to float16 +static inline void _cvt32f16f( const float* src, float16_t* dst, int len) { - Mat input = _input.getMat(); - Mat output = _output.getMat(); - int conv_dim = conv->conv_dim; + int j = 0; + const int VECSZ = 4; + __fp16* dst_FP16 = (__fp16 *)dst; + if (len > VECSZ * 4) + { + const int VECSZ4 = 4 * VECSZ; + for( ; j + VECSZ4 < len; j += VECSZ4) + { - CV_Assert_N(input.dims == output.dims, - input.size[0] == output.size[0], - conv->C == input.size[1], - conv->K == output.size[1], - input.type() == output.type(), - input.isContinuous(), - output.isContinuous()); + float32x4_t v0 = vld1q_f32(src + j); + float32x4_t v1 = vld1q_f32(src + j + 4); + float32x4_t v2 = vld1q_f32(src + j + 8); + float32x4_t v3 = vld1q_f32(src + j + 12); - Mat fusedAddMat; - if (fusedAdd) - { - 
CV_Assert(conv->conv_dim != CONV_3D && "Conv3D does not support Conv+Add fusion optimization!"); - fusedAddMat = _output.getMat(); + vst1q_f16(dst_FP16 + j, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1))); + vst1q_f16(dst_FP16 + j + 8, vcombine_f16(vcvt_f16_f32(v2), vcvt_f16_f32(v3))); + } } - if (conv->conv_type == CONV_TYPE_DEPTHWISE) + for( ; j < len; j += VECSZ ) { - // Depthwise-Convolution layer should not be followed by Add layer. - CV_Assert((conv_dim == CONV_1D || conv_dim == CONV_2D)); - return runDepthwise(input, output, conv, actLayer.get(), reluslope, fusedAdd); - } - - MatShape inputShape = shape(input); - MatShape outputShape = shape(output); - - CV_Assert(inputShape.size() == outputShape.size()); + if( j > len - VECSZ ) + { + if( j == 0 ) + break; + j = len - VECSZ; + } - ActivationLayer* activ = nullptr; - float minval = -FLT_MAX, maxval = FLT_MAX; - bool ifMinMaxAct = false; + float16x4_t hv = vcvt_f16_f32(vld1q_f32(src + j)); + vst1_f16(dst_FP16 + j, hv); + } + for( ; j < len; j++ ) + dst[j] = float16_t(src[j]); +} +#endif - if (actLayer) +static inline void packInputData(char* inpbuf_task, float* inp, const int* ofstab, const int* dhwTab, int zyx0, int zyx_limit, + int ksize, int stride_d, int stride_h, int stride_w, int pad_front, int pad_top, int pad_left, + int Dk, int Hk, int Wk, int dilation_d, int dilation_h, int dilation_w, int Di, int Hi, int Wi, + int H0, int W0, int Cg, int stripesize, int inp_plane_ofs, int inp_planesize, int conv_dim, int conv_type, + const int CONV_NR, const int esz, bool fast_1x1, bool useFP16) +{ + for (int stripe = 0; zyx0 < zyx_limit; stripe++, zyx0 += CONV_NR) { - Ptr activ_relu = actLayer.dynamicCast(); - Ptr activ_relu6 = actLayer.dynamicCast(); - - if (!activ_relu.empty()) + char *inpbuf = inpbuf_task + stripe * stripesize * esz; + float *inptr = inp + inp_plane_ofs; + + /* + 1. pack the data. Copy the HkxWk CONV_NR-wide slices from + each feature plane of the input tensor to the input buffer. + */ + if (fast_1x1) { - if (activ_relu->negativeSlope == 0.0f) + int slice_len = zyx_limit - zyx0; + bool partial = slice_len < CONV_NR; + const int CONV_NR_esz = CONV_NR * esz; + // Superfast branch for 1x1 convolutions with sy=sx=1. + // in this case each feature plane can be safely treated + // as 1D array, and we just extract next portion + // of CONV_NR elements from each feature plane and + // put it together. + inptr += zyx0; + if (!partial) { - minval = 0.0f; - ifMinMaxAct = true; - activ = nullptr; + // Make special branch where memcpy() is called with a constant buffer size. + // Compilers will likely unroll this loop properly. 
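+                // In the FP16 path the same per-channel copy also converts the FP32 input to float16_t
+                // via _cvt32f16f; the FP32 path is a plain memcpy of CONV_NR elements.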
+#ifdef CONV_ARM_FP16 + if (useFP16) + { + for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz) + _cvt32f16f(inptr, (float16_t *)inpbuf, CONV_NR); + } + else +#endif + for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz) + memcpy(inpbuf, inptr, CONV_NR_esz); } - else // Leaky ReLU + else { - activ = actLayer.get(); +#ifdef CONV_ARM_FP16 + if (useFP16) + { + for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz) + { + _cvt32f16f(inptr, (float16_t *)inpbuf, slice_len); + memset(inpbuf + slice_len * esz, 0, (CONV_NR - slice_len) * esz); + } + } + else +#endif + for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR_esz) + { + memcpy(inpbuf, inptr, slice_len * esz); + memset(inpbuf + slice_len * esz, 0, (CONV_NR - slice_len) * esz); + } } } - else if (!activ_relu6.empty()) + else if (conv_type == CONV_TYPE_DEPTHWISE_REMAIN) { - minval = activ_relu6->minValue; - maxval = activ_relu6->maxValue; + CV_Assert(Cg == 1); + const int HW0 = H0 * W0; + const int HWi = Hi * Wi; + int slice_len = std::min(zyx_limit - zyx0, CONV_NR); - ifMinMaxAct = true; - activ = nullptr; - } - else - activ = actLayer.get(); - } - else - activ = nullptr; + // here some non-continuous sub-row of the row will not be + // filled from the tensor; we need to make sure that the uncovered + // elements are explicitly set to 0's. the easiest way is to + // set all the elements to 0's before the loop. + memset(inpbuf, 0, stripesize * esz); - if (conv->conv_type == CONV_TYPE_WINOGRAD3X3) // winograd - { - CV_Assert(conv->weightsWinoBufPtr && input.dims == 4 && conv_dim == CONV_2D); - if (runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct)) - return; - } + int z0 = zyx0 / HW0, yx0 = zyx0 - z0 * HW0; + int y0 = yx0 / W0, x0 = yx0 - y0 * W0; - int N = inputShape[0], C = inputShape[1]; + if (conv_dim == CONV_1D) + { + for (int slice_i = 0; slice_i < slice_len; y0++, x0=0) + { + int delta = std::min(slice_len - slice_i, W0 - x0); + int x1 = x0 + delta; - // input shape: [N, C, D, H, W] for Conv3D, [N, C, H, W] for Conv2D, [N, C, W] for Conv1D. - int Di = conv_dim == CONV_3D ? inputShape[2] : 1; - int Hi = conv_dim == CONV_1D ? 1 : inputShape[inputShape.size() - 2]; - int Wi = inputShape[inputShape.size() - 1]; + int in_w = x0 * stride_w - pad_left; + float* inptrIn = inptr + in_w; - int ngroups = conv->ngroups; - int K = conv->K, Dk = conv->Dk, Hk = conv->Hk, Wk = conv->Wk; + int s0 = slice_i; - int D0 = conv_dim == CONV_3D ? outputShape[2] : 1; - int H0 = conv_dim == CONV_1D ? 
1 : outputShape[outputShape.size() - 2]; - int W0 = outputShape[outputShape.size() - 1]; + for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w) + { + // Pack 8 + if (x0 + 8 <= x1 && 0 <= in_w && + in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w) + { + packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz); + } + else if (x0 + 2 <= x1 && 0 <= in_w && + in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w) + { + packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz); + } + else + { + int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w); + int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w); + const float* inptrInC = inptrIn; +#ifdef CONV_ARM_FP16 + if (useFP16) + { + float16_t* inpbufC = (float16_t *)inpbuf + s0; + for (int w = w0; w < w1; w++) + { + int imgofs = w*dilation_w; + inpbufC[w*CONV_NR] = (float16_t)inptrInC[imgofs]; + } + } + else +#endif + { + float* inpbufC = (float *)inpbuf + s0; + for (int w = w0; w < w1; w++) + { + int imgofs = w*dilation_w; + inpbufC[w*CONV_NR] = inptrInC[imgofs]; + } + } + } + } + slice_i += delta; + } + } + else if (conv_dim == CONV_2D) + { + for (int slice_i = 0; slice_i < slice_len; y0++, x0=0) + { + int delta = std::min(slice_len - slice_i, W0 - x0); + int x1 = x0 + delta; - int Cg = C/ngroups, Kg = K/ngroups; + int in_h = y0 * stride_h - pad_top; + int in_w = x0 * stride_w - pad_left; - const size_t inp_planesize = (size_t)Di*Hi*Wi; - const size_t out_planesize = (size_t)D0*H0*W0; + float* inptrIn = inptr + in_h*Wi + in_w; - int pad_front = conv->pad_front; - int pad_top = conv->pad_top; - int pad_left = conv->pad_left; + bool ok_i = 0 <= in_h && in_h < Hi - (Hk-1)*dilation_h; + int h0 = std::max(0, (-in_h + dilation_h-1)/dilation_h); + int h1 = std::min(Hk, (Hi - in_h + dilation_h-1)/dilation_h); - int stride_d = conv->stride_d, stride_h = conv->stride_h, stride_w = conv->stride_w; - int dilation_d = conv->dilation_d, dilation_h = conv->dilation_h, dilation_w = conv->dilation_w; + int s0 = slice_i; + for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w) + { + // Pack 8 + if (ok_i && x0 + 8 <= x1 && 0 <= in_w && + in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w) + { + packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz); + } + else if (ok_i && x0 + 2 <= x1 && 0 <= in_w && + in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w) + { + packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz); + } + else + { + int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w); + int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w); - int ksize = Dk*Hk*Wk; - bool fast_1x1 = ksize == 1 && stride_d == 1 && stride_w == 1 && stride_h == 1 - && pad_front == 0 && pad_left == 0 && pad_top == 0; - int DkHkWkCg = Dk*Hk*Wk*Cg; + const float* inptrInC = inptrIn; +#ifdef CONV_ARM_FP16 + if (useFP16) + { + float16_t* inpbufC = (float16_t *)inpbuf + s0; - std::vector ofstab_(Hk*Wk*Dk*4, 0); - int* ofstab = ofstab_.data(); - int* dhwTab = ofstab + Hk*Wk*Dk; - int padded_ksize = ((ksize + VEC_ALIGN-1) / VEC_ALIGN) * VEC_ALIGN; + for (int h = h0; h < h1; h++) + { + for (int w = w0; w < w1; w++) + { + int imgofs = h*(dilation_h*Wi) + w*dilation_w; + inpbufC[(h*Wk + w)*CONV_NR] = (float16_t)inptrInC[imgofs]; + } + } + } + else +#endif + { + float* inpbufC = (float *)inpbuf + s0; - if (conv_dim == CONV_1D) - { - for( int w = 0; w < Wk; w++) - { - int dw = w*dilation_w; - dhwTab[w*3+2] = dw; - ofstab[w] = dw; - } - } - else if (conv_dim == CONV_2D) - { - for (int h = 0; h < Hk; h++) + for (int h = h0; h 
< h1; h++) + { + for (int w = w0; w < w1; w++) + { + int imgofs = h*(dilation_h*Wi) + w*dilation_w; + inpbufC[(h*Wk + w)*CONV_NR] = inptrInC[imgofs]; + } + } + } + } + } + slice_i += delta; + } + } + else if (conv_dim == CONV_3D) + { + for (int slice_i = 0; slice_i < slice_len; z0 += (y0+1)/H0, y0 = (y0+1)%H0, x0=0) + { + int delta = std::min(slice_len - slice_i, W0 - x0); + int x1 = x0 + delta; + + int in_d = z0 * stride_d - pad_front; + int in_h = y0 * stride_h - pad_top; + int in_w = x0 * stride_w - pad_left; + + float* inptrIn = inptr + in_d*HWi + in_h*Wi + in_w; + + int d0 = std::max(0, (-in_d + dilation_d - 1) / dilation_d); + int d1 = std::min(Dk, (Di - in_d + dilation_d - 1) / dilation_d); + + bool ok_i = 0 <= in_d && in_d < Di - (Dk-1)*dilation_d && + 0 <= in_h && in_h < Hi - (Hk-1)*dilation_h; + int h0 = std::max(0, (-in_h + dilation_h-1)/dilation_h); + int h1 = std::min(Hk, (Hi - in_h + dilation_h-1)/dilation_h); + + int s0 = slice_i; + for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w) + { + // Pack 8 + if (ok_i && x0 + 8 <= x1 && 0 <= in_w && + in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w) + { + packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz); + } + else if (ok_i && x0 + 2 <= x1 && 0 <= in_w && + in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w) + { + packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize, esz); + } + else + { + int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w); + int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w); + const float* inptrInC = inptrIn; +#ifdef CONV_ARM_FP16 + if (useFP16) + { + float16_t* inpbufC = (float16_t* )inpbuf + s0; + + for ( int d = d0; d < d1; d++) + { + for (int h = h0; h < h1; h++) + { + for (int w = w0; w < w1; w++) + { + int imgofs = d*dilation_d*HWi + h*(dilation_h*Wi) + w*dilation_w; + inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = (float16_t)inptrInC[imgofs]; + } + } + } + } + else +#endif + { + float* inpbufC = (float* )inpbuf + s0; + + for ( int d = d0; d < d1; d++) + { + for (int h = h0; h < h1; h++) + { + for (int w = w0; w < w1; w++) + { + int imgofs = d*dilation_d*HWi + h*(dilation_h*Wi) + w*dilation_w; + inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = inptrInC[imgofs]; + } + } + } + } + } + } + slice_i += delta; + } + } + } + else + { + const int HW0 = H0 * W0; + const int HWi = Hi * Wi; + int z0_ = zyx0 / HW0, yx0 = zyx0 - z0_ * HW0; + int y0_ = yx0 / W0, x0_ = yx0 - y0_ * W0; + for (int k = 0; k < ksize; k++) + { + int dz = dhwTab[k * 3], dy = dhwTab[k * 3 + 1], dx = dhwTab[k * 3 + 2]; + int i = 0, z0 = z0_, y0 = y0_, x0 = x0_; + for (; i < CONV_NR;) + { + float* inpbuf_ki = (float* )inpbuf + k * CONV_NR * Cg + i; +#ifdef CONV_ARM_FP16 + float16_t * inpbuf_ki_FP16 = (float16_t *)inpbuf + k * CONV_NR * Cg + i; +#endif + + int zi = z0 * stride_d + dz - pad_front; + int yi = y0 * stride_h + dy - pad_top; + int xi = x0 * stride_w + dx - pad_left; + + if ((unsigned) zi < (unsigned) Di && (unsigned) yi < (unsigned) Hi && + (unsigned) xi < (unsigned) Wi) + { + const float *inptr_ki = inptr + zi * HWi + yi * Wi + xi; + if (i + 8 <= CONV_NR && x0 + 8 <= W0 && xi + stride_w * 8 <= Wi) + { + if (stride_w == 1) + { +#ifdef CONV_ARM_FP16 + if (useFP16) + { + for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize) + { + float32x4_t v0 = vld1q_f32(inptr_ki); + float32x4_t v1 = vld1q_f32(inptr_ki + 4); + + vst1q_f16((__fp16* )inpbuf_ki_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1))); + } + } + else +#endif + for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, 
inptr_ki += inp_planesize) + { + float t0 = inptr_ki[0], t1 = inptr_ki[1]; + float t2 = inptr_ki[2], t3 = inptr_ki[3]; + float t4 = inptr_ki[4], t5 = inptr_ki[5]; + float t6 = inptr_ki[6], t7 = inptr_ki[7]; + inpbuf_ki[0] = t0; + inpbuf_ki[1] = t1; + inpbuf_ki[2] = t2; + inpbuf_ki[3] = t3; + inpbuf_ki[4] = t4; + inpbuf_ki[5] = t5; + inpbuf_ki[6] = t6; + inpbuf_ki[7] = t7; + } + } + else if (stride_w == 2) + { +#ifdef CONV_ARM_FP16 + if (useFP16) + { + for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize) + { + float32x4_t v0, v1; + v0[0] = inptr_ki[0], v0[1] = inptr_ki[2]; + v0[2] = inptr_ki[4], v0[3] = inptr_ki[6]; + v1[0] = inptr_ki[8], v1[1] = inptr_ki[10]; + v1[2] = inptr_ki[12], v1[3] = inptr_ki[14]; + vst1q_f16((__fp16* )inpbuf_ki_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1))); + } + } + else +#endif + for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) + { + float t0 = inptr_ki[0], t1 = inptr_ki[2]; + float t2 = inptr_ki[4], t3 = inptr_ki[6]; + float t4 = inptr_ki[8], t5 = inptr_ki[10]; + float t6 = inptr_ki[12], t7 = inptr_ki[14]; + inpbuf_ki[0] = t0; + inpbuf_ki[1] = t1; + inpbuf_ki[2] = t2; + inpbuf_ki[3] = t3; + inpbuf_ki[4] = t4; + inpbuf_ki[5] = t5; + inpbuf_ki[6] = t6; + inpbuf_ki[7] = t7; + } + } + else + { +#ifdef CONV_ARM_FP16 + if (useFP16) + { + for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize) + { + float32x4_t v0, v1; + + v0[0] = inptr_ki[0], v0[1] = inptr_ki[stride_w]; + v0[2] = inptr_ki[stride_w * 2], v0[3] = inptr_ki[stride_w * 3]; + v1[0] = inptr_ki[stride_w * 4], v1[1] = inptr_ki[stride_w * 5]; + v1[2] = inptr_ki[stride_w * 6], v1[3] = inptr_ki[stride_w * 7]; + vst1q_f16((__fp16* )inpbuf_ki_FP16, vcombine_f16(vcvt_f16_f32(v0), vcvt_f16_f32(v1))); + } + } + else +#endif + for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) + { + float t0 = inptr_ki[0], t1 = inptr_ki[stride_w]; + float t2 = inptr_ki[stride_w * 2], t3 = inptr_ki[stride_w * 3]; + float t4 = inptr_ki[stride_w * 4], t5 = inptr_ki[stride_w * 5]; + float t6 = inptr_ki[stride_w * 6], t7 = inptr_ki[stride_w * 7]; + inpbuf_ki[0] = t0; + inpbuf_ki[1] = t1; + inpbuf_ki[2] = t2; + inpbuf_ki[3] = t3; + inpbuf_ki[4] = t4; + inpbuf_ki[5] = t5; + inpbuf_ki[6] = t6; + inpbuf_ki[7] = t7; + } + } + i += 8; + x0 += 8; + } + else if (i + 4 <= CONV_NR && x0 + 4 <= W0 && xi + stride_w * 4 <= Wi) + { + if (stride_w == 1) + { +#ifdef CONV_ARM_FP16 + if (useFP16) + { + for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize) + { + float32x4_t v0 = vld1q_f32(inptr_ki); + vst1_f16((__fp16* )inpbuf_ki_FP16, vcvt_f16_f32(v0)); + } + } + else +#endif + for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) + { + float t0 = inptr_ki[0], t1 = inptr_ki[1]; + float t2 = inptr_ki[2], t3 = inptr_ki[3]; + inpbuf_ki[0] = t0; + inpbuf_ki[1] = t1; + inpbuf_ki[2] = t2; + inpbuf_ki[3] = t3; + } + } + else + { +#ifdef CONV_ARM_FP16 + if (useFP16) + { + for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize) + { + float32x4_t v0; + v0[0] = inptr_ki[0], v0[1] = inptr_ki[stride_w]; + v0[2] = inptr_ki[stride_w * 2], v0[3] = inptr_ki[stride_w * 3]; + vst1_f16((__fp16* )inpbuf_ki_FP16, vcvt_f16_f32(v0)); + } + } + else +#endif + for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) + { + float t0 = inptr_ki[0], t1 = inptr_ki[stride_w]; + float t2 = inptr_ki[stride_w * 2], t3 = inptr_ki[stride_w * 3]; + inpbuf_ki[0] = t0; + 
inpbuf_ki[1] = t1; + inpbuf_ki[2] = t2; + inpbuf_ki[3] = t3; + } + } + i += 4; + x0 += 4; + } + else + { +#ifdef CONV_ARM_FP16 + if (useFP16) + { + for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR, inptr_ki += inp_planesize) + inpbuf_ki_FP16[0] = (float16_t)(*inptr_ki); + } + else +#endif + for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) + *inpbuf_ki = *inptr_ki; + i++; + x0++; + } + } + else + { +#ifdef CONV_ARM_FP16 + if (useFP16) + { + for (int c = 0; c < Cg; c++, inpbuf_ki_FP16 += CONV_NR) + inpbuf_ki_FP16[0] = (float16_t)0.f; + } + else +#endif + for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR) + inpbuf_ki[0] = 0.f; + i++; + x0++; + } + + int mask = x0 >= W0; + y0 += mask; + x0 &= mask - 1; + + mask = y0 >= H0; // Only Conv 3D need jump at z0 dimension + if (mask && conv_dim != CONV_3D) + break; + + z0 += mask; + y0 &= mask - 1; + } + } + } + } +} + +void runFastConv(InputArray _input, OutputArray _output, const Ptr& conv, int ntasks, + const Ptr& actLayer, const std::vector& reluslope, bool fusedAdd) +{ + Mat input = _input.getMat(); + Mat output = _output.getMat(); + int conv_dim = conv->conv_dim; + + CV_Assert_N(input.dims == output.dims, + input.size[0] == output.size[0], + conv->C == input.size[1], + conv->K == output.size[1], + input.type() == output.type(), + input.isContinuous(), + output.isContinuous()); + + const bool useFP16 = conv->useFP16; + Mat fusedAddMat; + if (fusedAdd) + { + CV_Assert(conv->conv_dim != CONV_3D && "Conv3D does not support Conv+Add fusion optimization!"); + fusedAddMat = _output.getMat(); + } + + if (conv->conv_type == CONV_TYPE_DEPTHWISE) + { + // Depthwise-Convolution layer should not be followed by Add layer. + CV_Assert((conv_dim == CONV_1D || conv_dim == CONV_2D) && !useFP16); + return runDepthwise(input, output, conv, actLayer.get(), reluslope, fusedAdd); + } + + MatShape inputShape = shape(input); + MatShape outputShape = shape(output); + + CV_Assert(inputShape.size() == outputShape.size()); + + ActivationLayer* activ = nullptr; + float minval = -FLT_MAX, maxval = FLT_MAX; + bool ifMinMaxAct = false; + + if (actLayer) + { + Ptr activ_relu = actLayer.dynamicCast(); + Ptr activ_relu6 = actLayer.dynamicCast(); + + if (!activ_relu.empty()) + { + if (activ_relu->negativeSlope == 0.0f) + { + minval = 0.0f; + ifMinMaxAct = true; + activ = nullptr; + } + else // Leaky ReLU + { + activ = actLayer.get(); + } + } + else if (!activ_relu6.empty()) + { + minval = activ_relu6->minValue; + maxval = activ_relu6->maxValue; + + ifMinMaxAct = true; + activ = nullptr; + } + else + activ = actLayer.get(); + } + else + activ = nullptr; + + // TODO: support FP16 for winograd. + if (conv->conv_type == CONV_TYPE_WINOGRAD3X3) // winograd + { + CV_Assert(conv->weightsWinoBufPtr && input.dims == 4 && conv_dim == CONV_2D && !useFP16); + if (runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct)) + return; + } + + int N = inputShape[0], C = inputShape[1]; + + // input shape: [N, C, D, H, W] for Conv3D, [N, C, H, W] for Conv2D, [N, C, W] for Conv1D. + int Di = conv_dim == CONV_3D ? inputShape[2] : 1; + int Hi = conv_dim == CONV_1D ? 1 : inputShape[inputShape.size() - 2]; + int Wi = inputShape[inputShape.size() - 1]; + + int ngroups = conv->ngroups; + int K = conv->K, Dk = conv->Dk, Hk = conv->Hk, Wk = conv->Wk; + + int D0 = conv_dim == CONV_3D ? outputShape[2] : 1; + int H0 = conv_dim == CONV_1D ? 
1 : outputShape[outputShape.size() - 2]; + int W0 = outputShape[outputShape.size() - 1]; + + int Cg = C/ngroups, Kg = K/ngroups; + + const size_t inp_planesize = (size_t)Di*Hi*Wi; + const size_t out_planesize = (size_t)D0*H0*W0; + + int pad_front = conv->pad_front; + int pad_top = conv->pad_top; + int pad_left = conv->pad_left; + + int stride_d = conv->stride_d, stride_h = conv->stride_h, stride_w = conv->stride_w; + int dilation_d = conv->dilation_d, dilation_h = conv->dilation_h, dilation_w = conv->dilation_w; + + int ksize = Dk*Hk*Wk; + bool fast_1x1 = ksize == 1 && stride_d == 1 && stride_w == 1 && stride_h == 1 + && pad_front == 0 && pad_left == 0 && pad_top == 0; + int DkHkWkCg = Dk*Hk*Wk*Cg; + + std::vector ofstab_(Hk*Wk*Dk*4, 0); + int* ofstab = ofstab_.data(); + int* dhwTab = ofstab + Hk*Wk*Dk; + int padded_ksize = ((ksize + VEC_ALIGN-1) / VEC_ALIGN) * VEC_ALIGN; + + if (conv_dim == CONV_1D) + { + for( int w = 0; w < Wk; w++) + { + int dw = w*dilation_w; + dhwTab[w*3+2] = dw; + ofstab[w] = dw; + } + } + else if (conv_dim == CONV_2D) + { + for (int h = 0; h < Hk; h++) for( int w = 0; w < Wk; w++) { int k = h*Wk + w; @@ -503,45 +1241,132 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co } } + int CONV_NR = CONV_NR_FP32; + int CONV_MR = CONV_MR_FP32; + int esz = sizeof(float ); + +#ifdef CONV_ARM_FP16 + if (useFP16) + { + // works at FP 16. + CONV_NR = CONV_NR_FP16; + CONV_MR = CONV_MR_FP16; + esz = sizeof(float16_t); + } +#endif + int MAX_STRIPES = conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? 1 : (56 + CONV_NR - 1)/CONV_NR; // Friendly to L1 cache const int K_BLOCK_SIZE = conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN ? 1 : 32; const int C_BLOCK_SIZE = 256; - int Kg_nblocks = (Kg + CONV_MR-1)/CONV_MR, Kg_aligned = Kg_nblocks * CONV_MR; + int Kg_nblocks = (Kg + CONV_MR-1)/CONV_MR; + int Kg_aligned = Kg_nblocks * CONV_MR; - int stripes_per_sample = ((int)out_planesize + CONV_NR - 1) / CONV_NR; + int stripes_per_plane0 = ((int)out_planesize + CONV_NR - 1) / CONV_NR; + int stripes_per_plane = stripes_per_plane0; - if (stripes_per_sample < ntasks * 4 && conv->conv_type != CONV_TYPE_DEPTHWISE_REMAIN) + if (stripes_per_plane < ntasks * 4 || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN) { MAX_STRIPES = 1; - stripes_per_sample = 1; + stripes_per_plane = 1; } else Kg_nblocks = 1; - int Kstripes = Kg_nblocks*stripes_per_sample; - int nsubtasks = N*ngroups*Kstripes; + bool seperateIm2col = fast_1x1 || stripes_per_plane == 1; + + int Kstripes = Kg_nblocks * stripes_per_plane; + int nsubtasks = N * ngroups * Kstripes; + + size_t stripesize = alignSize(CONV_NR * ksize * Cg, VEC_ALIGN); + size_t cbufsize = alignSize(CONV_NR * K_BLOCK_SIZE * MAX_STRIPES, VEC_ALIGN); + + size_t taskbufsize = cbufsize * sizeof(float ); + + if (!seperateIm2col) + taskbufsize += MAX_STRIPES * stripesize * esz; - size_t stripesize = CONV_NR * ksize * Cg; - size_t taskbufsize = (stripesize + CONV_NR * K_BLOCK_SIZE) * MAX_STRIPES; - size_t totalbufsize = taskbufsize * ntasks; + size_t totalbufsize_base = taskbufsize * ntasks; + size_t totalbufsize = totalbufsize_base; + if (seperateIm2col) + totalbufsize += N * ngroups * stripes_per_plane0 * stripesize * esz; - AutoBuffer inpbuf_all_; - totalbufsize = alignSize(totalbufsize, VEC_ALIGN); - inpbuf_all_.allocate(totalbufsize + VEC_ALIGN); - float* inpbuf_all = alignPtr(inpbuf_all_.data(), (int)(VEC_ALIGN*sizeof(inpbuf_all_[0]))); + AutoBuffer inpbuf_all_; + char* inpbuf_all = nullptr; + + inpbuf_all_.allocate(totalbufsize + VEC_ALIGN * sizeof(float 
)); + inpbuf_all = alignPtr(inpbuf_all_.data(), (int)(VEC_ALIGN * sizeof(float ))); + char* inpbuf_all_0 = inpbuf_all + totalbufsize_base; float* inp = input.ptr(); float* out = output.ptr(); float* fusedAddPtr0 = fusedAddMat.empty() ? 0 : fusedAddMat.ptr(); + // In the case of 1x1 convolution we first reorder the whole input tensor. + // In general, im2row results in Hk*Wk-x unrolling factor + // (e.g. 3*3=9x unrolling for 3x3 convolution), thus for 1x1 convolution + // the reordered tensor will take as much space as the original tensor. + if (seperateIm2col) + { + // the optional phase 1. im2row + parallel_for_(Range(0, ntasks), [&](const Range& r0) { + for (int task_id = r0.start; task_id < r0.end; task_id++) + { + if (fast_1x1) + { + int nc0 = task_id*N*C/ntasks, nc1 = (task_id+1)*N*C/ntasks, dc = 0; + for (; nc0 < nc1; nc0 += dc) + { + int n = nc0/C, c0 = nc0 - n*C; + int g = c0 / Cg; + c0 -= g*Cg; + dc = Cg - c0 <= nc1 - nc0 ? Cg - c0 : nc1 - nc0; + + float * inptr_ = inp + (size_t)nc0*inp_planesize; + char* inpbuf_ = inpbuf_all_0 + ((n*ngroups + g)*stripes_per_plane0*stripesize + c0*CONV_NR)*esz; + + packInputData(inpbuf_, inptr_, ofstab, dhwTab, 0, out_planesize, ksize, stride_d, stride_h, + stride_w, pad_front, pad_top, pad_left, Dk, Hk, Wk, dilation_d, dilation_h, dilation_w, + Di, Hi, Wi, H0, W0, dc, stripesize, 0, inp_planesize, conv->conv_dim, + conv->conv_type, CONV_NR, esz, fast_1x1, useFP16); + } + } + else + { + const int allTasks = N * ngroups * stripes_per_plane0; + int ngs0 = task_id*allTasks/ntasks, ngs1 = (task_id+1)*allTasks/ntasks, ds = 0; + + for (; ngs0 < ngs1; ngs0 += ds) + { + int n = ngs0 / (ngroups * stripes_per_plane0), gs0 = ngs0 - n*ngroups*stripes_per_plane0; + int g = gs0 / stripes_per_plane0, s0 = gs0 - g*stripes_per_plane0; + + ds = stripes_per_plane0 - s0 <= ngs1 - ngs0 ? stripes_per_plane0 - s0 : ngs1 - ngs0; + + int zyx = s0 * CONV_NR; + int zyx_limit = (s0 + ds) * CONV_NR < out_planesize ? (s0 + ds) * CONV_NR : out_planesize; + + float * inptr_ = inp + (size_t)(n * ngroups + g) * Cg * inp_planesize; + char* inpbuf_ = inpbuf_all_0 + ((n * ngroups + g) * stripes_per_plane0 * stripesize + s0 * stripesize) * esz; + + packInputData(inpbuf_, inptr_, ofstab, dhwTab, zyx, zyx_limit, ksize, stride_d, stride_h, + stride_w, pad_front, pad_top, pad_left, Dk, Hk, Wk, dilation_d, dilation_h, dilation_w, + Di, Hi, Wi, H0, W0, Cg, stripesize, 0, inp_planesize, conv->conv_dim, + conv->conv_type, CONV_NR, esz, fast_1x1, useFP16); + } + } + } + }); + } + + // Compute parallel_for_(Range(0, ntasks), [&](const Range& r0) { for (int task_id = r0.start; task_id < r0.end; task_id++) { - float* inpbuf_task = &inpbuf_all[taskbufsize * task_id]; - float* cbuf_task = inpbuf_task + stripesize * MAX_STRIPES; + float * cbuf_task = (float *)(inpbuf_all + taskbufsize * task_id); + char * inpbuf_task = (char*)(cbuf_task + cbufsize); int ngs0 = (int)((size_t)nsubtasks * task_id / ntasks); int ngs1 = (int)((size_t)nsubtasks * (task_id+1) / ntasks); @@ -557,7 +1382,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co int k0, k1; int zyx0, zyx_limit, zyx_block_limit = 0; - if (stripes_per_sample == 1 && conv->conv_type != CONV_TYPE_DEPTHWISE_REMAIN) + if (stripes_per_plane == 1 || conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN) { k0 = kzyx0 * CONV_MR; k1 = kzyx1 * CONV_MR; @@ -581,380 +1406,75 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co zyx_block_limit = zyx_block_limit < zyx_limit ? 
zyx_block_limit : zyx_limit; int nstripes = (zyx_block_limit - zyx0 + CONV_NR - 1) / CONV_NR; - int zyx0_saved = zyx0; CV_Assert(nstripes <= MAX_STRIPES); - for (int stripe = 0; zyx0 < zyx_block_limit; stripe++, zyx0 += CONV_NR) + if (!seperateIm2col) { - float *inpbuf = inpbuf_task + stripe * stripesize; - float *inptr = inp + inp_plane_ofs; - - /* - 1. pack the data. Copy the HkxWk CONV_NR-wide slices from - each feature plane of the input tensor to the input buffer. - */ - if (fast_1x1) - { - int slice_len = zyx_block_limit - zyx0; - bool partial = slice_len < CONV_NR; - // Superfast branch for 1x1 convolutions with sy=sx=1. - // in this case each feature plane can be safely treated - // as 1D array, and we just extract next portion - // of CONV_NR elements from each feature plane and - // put it together. - inptr += zyx0; - if (!partial) - { - // Make special branch where memcpy() is called with a constant buffer size. - // Compilers will likely unroll this loop properly. - for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR) - memcpy(inpbuf, inptr, CONV_NR * sizeof(inpbuf[0])); - } - else - { - for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += CONV_NR) - { - memcpy(inpbuf, inptr, slice_len * sizeof(inpbuf[0])); - memset(inpbuf + slice_len, 0, (CONV_NR - slice_len) * sizeof(inpbuf[0])); - } - } - } - else if (conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN) - { - CV_Assert(Cg == 1); - const int HW0 = H0 * W0; - const int HWi = Hi * Wi; - int slice_len = std::min(zyx_block_limit - zyx0, CONV_NR); - - // here some non-continuous sub-row of the row will not be - // filled from the tensor; we need to make sure that the uncovered - // elements are explicitly set to 0's. the easiest way is to - // set all the elements to 0's before the loop. 
- memset(inpbuf, 0, stripesize*sizeof(inpbuf[0])); - - int z0 = zyx0 / HW0, yx0 = zyx0 - z0 * HW0; - int y0 = yx0 / W0, x0 = yx0 - y0 * W0; - - if (conv_dim == CONV_1D) - { - for (int slice_i = 0; slice_i < slice_len; y0++, x0=0) - { - int delta = std::min(slice_len - slice_i, W0 - x0); - int x1 = x0 + delta; - - int in_w = x0 * stride_w - pad_left; - float* inptrIn = inptr + in_w; - - int s0 = slice_i; - - for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w) - { - // Pack 8 - if (x0 + 8 <= x1 && 0 <= in_w && - in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w) - { - packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize); - } - else if (x0 + 2 <= x1 && 0 <= in_w && - in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w) - { - packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize); - } - else - { - int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w); - int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w); - - float* inpbufC = inpbuf + s0; - float* inptrInC = inptrIn; - for (int w = w0; w < w1; w++) - { - int imgofs = w*dilation_w; - inpbufC[w*CONV_NR] = inptrInC[imgofs]; - } - } - } - slice_i += delta; - } - } - else if (conv_dim == CONV_2D) - { - for (int slice_i = 0; slice_i < slice_len; y0++, x0=0) - { - int delta = std::min(slice_len - slice_i, W0 - x0); - int x1 = x0 + delta; - - int in_h = y0 * stride_h - pad_top; - int in_w = x0 * stride_w - pad_left; - - float* inptrIn = inptr + in_h*Wi + in_w; - - bool ok_i = 0 <= in_h && in_h < Hi - (Hk-1)*dilation_h; - int h0 = std::max(0, (-in_h + dilation_h-1)/dilation_h); - int h1 = std::min(Hk, (Hi - in_h + dilation_h-1)/dilation_h); - - int s0 = slice_i; - for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w) - { - // Pack 8 - if (ok_i && x0 + 8 <= x1 && 0 <= in_w && - in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w) - { - packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize); - } - else if (ok_i && x0 + 2 <= x1 && 0 <= in_w && - in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w) - { - packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize); - } - else - { - int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w); - int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w); - - float* inpbufC = inpbuf + s0; - float* inptrInC = inptrIn; - - for (int h = h0; h < h1; h++) - { - for (int w = w0; w < w1; w++) - { - int imgofs = h*(dilation_h*Wi) + w*dilation_w; - inpbufC[(h*Wk + w)*CONV_NR] = inptrInC[imgofs]; - } - } - } - } - slice_i += delta; - } - } - else if (conv_dim == CONV_3D) - { - for (int slice_i = 0; slice_i < slice_len; z0 += (y0+1)/H0, y0 = (y0+1)%H0, x0=0) - { - int delta = std::min(slice_len - slice_i, W0 - x0); - int x1 = x0 + delta; - - int in_d = z0 * stride_d - pad_front; - int in_h = y0 * stride_h - pad_top; - int in_w = x0 * stride_w - pad_left; - - float* inptrIn = inptr + in_d*HWi + in_h*Wi + in_w; - - int d0 = std::max(0, (-in_d + dilation_d - 1) / dilation_d); - int d1 = std::min(Dk, (Di - in_d + dilation_d - 1) / dilation_d); - - bool ok_i = 0 <= in_d && in_d < Di - (Dk-1)*dilation_d && - 0 <= in_h && in_h < Hi - (Hk-1)*dilation_h; - int h0 = std::max(0, (-in_h + dilation_h-1)/dilation_h); - int h1 = std::min(Hk, (Hi - in_h + dilation_h-1)/dilation_h); - - int s0 = slice_i; - for (; x0 < x1; x0++, s0++, inptrIn += stride_w, in_w += stride_w) - { - // Pack 8 - if (ok_i && x0 + 8 <= x1 && 0 <= in_w && - in_w + stride_w*8 <= Wi - (Wk-1)*dilation_w) - { - packData8(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize); - } - else if (ok_i && x0 + 2 <= x1 && 0 <= 
in_w && - in_w + stride_w*2 <= Wi - (Wk-1)*dilation_w) - { - packData2(inpbuf, inptrIn, in_w, x0, s0, ofstab, stride_w, ksize); - } - else - { - int w0 = std::max(0, (-in_w + dilation_w-1)/dilation_w); - int w1 = std::min(Wk, (Wi - in_w + dilation_w-1)/dilation_w); - - float* inpbufC = inpbuf + s0; - float* inptrInC = inptrIn; - - for ( int d = d0; d < d1; d++) - { - for (int h = h0; h < h1; h++) - { - for (int w = w0; w < w1; w++) - { - int imgofs = d*dilation_d*HWi + h*(dilation_h*Wi) + w*dilation_w; - inpbufC[((d*Hk + h)*Wk + w)*CONV_NR] = inptrInC[imgofs]; - } - } - } - } - } - slice_i += delta; - } - } - } - else - { - const int HW0 = H0 * W0; - const int HWi = Hi * Wi; - int z0_ = zyx0 / HW0, yx0 = zyx0 - z0_ * HW0; - int y0_ = yx0 / W0, x0_ = yx0 - y0_ * W0; - for (int k = 0; k < ksize; k++) - { - int dz = dhwTab[k * 3], dy = dhwTab[k * 3 + 1], dx = dhwTab[k * 3 + 2]; - int i = 0, z0 = z0_, y0 = y0_, x0 = x0_; - for (; i < CONV_NR;) - { - float *inpbuf_ki = inpbuf + k * CONV_NR * Cg + i; - int zi = z0 * stride_d + dz - pad_front; - int yi = y0 * stride_h + dy - pad_top; - int xi = x0 * stride_w + dx - pad_left; - - if ((unsigned) zi < (unsigned) Di && (unsigned) yi < (unsigned) Hi && - (unsigned) xi < (unsigned) Wi) - { - const float *inptr_ki = inptr + zi * HWi + yi * Wi + xi; - if (i + 8 <= CONV_NR && x0 + 8 <= W0 && xi + stride_w * 8 <= Wi) - { - if (stride_w == 1) - { - for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) - { - float t0 = inptr_ki[0], t1 = inptr_ki[1]; - float t2 = inptr_ki[2], t3 = inptr_ki[3]; - float t4 = inptr_ki[4], t5 = inptr_ki[5]; - float t6 = inptr_ki[6], t7 = inptr_ki[7]; - inpbuf_ki[0] = t0; - inpbuf_ki[1] = t1; - inpbuf_ki[2] = t2; - inpbuf_ki[3] = t3; - inpbuf_ki[4] = t4; - inpbuf_ki[5] = t5; - inpbuf_ki[6] = t6; - inpbuf_ki[7] = t7; - } - } - else if (stride_w == 2) - { - for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) - { - float t0 = inptr_ki[0], t1 = inptr_ki[2]; - float t2 = inptr_ki[4], t3 = inptr_ki[6]; - float t4 = inptr_ki[8], t5 = inptr_ki[10]; - float t6 = inptr_ki[12], t7 = inptr_ki[14]; - inpbuf_ki[0] = t0; - inpbuf_ki[1] = t1; - inpbuf_ki[2] = t2; - inpbuf_ki[3] = t3; - inpbuf_ki[4] = t4; - inpbuf_ki[5] = t5; - inpbuf_ki[6] = t6; - inpbuf_ki[7] = t7; - } - } - else - { - for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) - { - float t0 = inptr_ki[0], t1 = inptr_ki[stride_w]; - float t2 = inptr_ki[stride_w * 2], t3 = inptr_ki[stride_w * 3]; - float t4 = inptr_ki[stride_w * 4], t5 = inptr_ki[stride_w * 5]; - float t6 = inptr_ki[stride_w * 6], t7 = inptr_ki[stride_w * 7]; - inpbuf_ki[0] = t0; - inpbuf_ki[1] = t1; - inpbuf_ki[2] = t2; - inpbuf_ki[3] = t3; - inpbuf_ki[4] = t4; - inpbuf_ki[5] = t5; - inpbuf_ki[6] = t6; - inpbuf_ki[7] = t7; - } - } - i += 8; - x0 += 8; - } - else if (i + 4 <= CONV_NR && x0 + 4 <= W0 && xi + stride_w * 4 <= Wi) - { - if (stride_w == 1) - { - for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) - { - float t0 = inptr_ki[0], t1 = inptr_ki[1]; - float t2 = inptr_ki[2], t3 = inptr_ki[3]; - inpbuf_ki[0] = t0; - inpbuf_ki[1] = t1; - inpbuf_ki[2] = t2; - inpbuf_ki[3] = t3; - } - } - else - { - for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) - { - float t0 = inptr_ki[0], t1 = inptr_ki[stride_w]; - float t2 = inptr_ki[stride_w * 2], t3 = inptr_ki[stride_w * 3]; - inpbuf_ki[0] = t0; - inpbuf_ki[1] = t1; - inpbuf_ki[2] = t2; - inpbuf_ki[3] = t3; - } - } - i += 4; - x0 += 4; - } - else - 
{ - for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR, inptr_ki += inp_planesize) - *inpbuf_ki = *inptr_ki; - i++; - x0++; - } - } - else - { - for (int c = 0; c < Cg; c++, inpbuf_ki += CONV_NR) - inpbuf_ki[0] = 0.f; - i++; - x0++; - } - - int mask = x0 >= W0; - y0 += mask; - x0 &= mask - 1; - - mask = y0 >= H0; - z0 += mask; - y0 &= mask - 1; - } - } - } + packInputData(inpbuf_task, inp, ofstab, dhwTab, zyx0, zyx_block_limit, ksize, stride_d, stride_h, + stride_w, pad_front, pad_top, pad_left, Dk, Hk, Wk, dilation_d, dilation_h, dilation_w, + Di, Hi, Wi, H0, W0, Cg, stripesize, inp_plane_ofs, inp_planesize, conv->conv_dim, + conv->conv_type, CONV_NR, esz, fast_1x1, useFP16); } - zyx0 = zyx0_saved; - - // spacial branch for depth-wise convolution implemented using generic convolution. - // In this case, CONV_MR is 1, and CONV_NR is the same. + char *weights = nullptr; +#ifdef CONV_ARM_FP16 + if (useFP16) + { + CV_Assert(!conv->weightsBuf_FP16.empty()); + weights = (char *)conv->weightsBufPtr_FP16; + } + else +#endif + { + CV_Assert(!conv->weightsBuf.empty()); + weights = (char *)conv->weightsBufPtr; + } + // optional branch, only for depth-wise convolution which was implemented by generic convolution. + // In this case, CONV_MR is 1, and CONV_NR remains the same. if (conv->conv_type == CONV_TYPE_DEPTHWISE_REMAIN) { + CV_Assert(weights); size_t outofs = (n * ngroups + g) * out_planesize + zyx0; float *cptr0 = cbuf_task; - float *weights = conv->weightsBufPtr + g * padded_ksize; + weights += g * padded_ksize * esz; + int out_width = zyx_block_limit - zyx0; float *outptr = out + outofs; const float biasVal = *(conv->biasBuf.data() + g); + const char *inptr_ = seperateIm2col ? inpbuf_all_0 + (ng*stripes_per_plane0 + zyx0/CONV_NR) * stripesize * esz: + inpbuf_task; + for (int stripe = 0; stripe < nstripes; stripe++) { - const float *inptr = inpbuf_task + stripe * stripesize; + const char *inptr = inptr_ + stripe * stripesize * esz; const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR); bool ifBuffer = outLen < CONV_NR; float *cptr = outptr + stripe * CONV_NR; if (ifBuffer) { - memcpy(cptr0, cptr, outLen * sizeof(cptr[0])); + memcpy(cptr0, cptr, outLen * sizeof(float )); cptr = cptr0; } - - convBlockMR1(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR); +#if CV_NEON && CV_NEON_AARCH64 + if (conv->useNEON) + { +#ifdef CONV_ARM_FP16 + if (useFP16) + { + opt_NEON::convBlockMR1_FP16(DkHkWkCg, weights, inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR); + } + else +#endif + opt_NEON::convBlockMR1_F32(DkHkWkCg, (const float *)weights, (const float *)inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR); + } + else +#endif + convBlockMR1(DkHkWkCg, (const float *)weights, (const float *)inptr, cptr, biasVal, fusedAdd, minval, maxval, ifMinMaxAct, outLen, CONV_NR); if (ifBuffer) { - memcpy(outptr + stripe * CONV_NR, cptr, outLen * sizeof(cptr[0])); + memcpy(outptr + stripe * CONV_NR, cptr, outLen * sizeof(float )); } } if (activ) @@ -962,7 +1482,9 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co continue; } - float *weights = conv->weightsBufPtr + g * Kg_aligned * DkHkWkCg; + CV_Assert(weights); + weights += g * Kg_aligned * DkHkWkCg * esz; + const float *biasptr = conv->biasBuf.data() + Kg * g; int ldc = nstripes * CONV_NR; @@ -974,109 +1496,196 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co for (int c0 = 0; c0 < DkHkWkCg; c0 += C_BLOCK_SIZE) 
{ int c1 = c0 + C_BLOCK_SIZE < DkHkWkCg ? c0 + C_BLOCK_SIZE : DkHkWkCg; - for (int stripe = 0; stripe < nstripes; stripe++) + const char *inptr = seperateIm2col ? inpbuf_all_0 + (ng*stripes_per_plane0 + zyx0/CONV_NR)*stripesize*esz: + inpbuf_task; + inptr += (c0 * CONV_NR) * esz; + for (int stripe = 0; stripe < nstripes; stripe++, inptr += stripesize * esz) { const int outLen = std::min(out_width - stripe * CONV_NR, CONV_NR); -#if CV_TRY_AVX || CV_TRY_AVX2 || CV_NEON - // The possible CONV_NR is 28, 24, 12, so the possible CONV_NR/3 is 9, 8, 4. - bool runOpt = outLen > std::min(8, CONV_NR/3); -#endif - float *wptr = weights + k0_block * DkHkWkCg + c0 * CONV_MR; - const float *inptr = inpbuf_task + stripe * stripesize + c0 * CONV_NR; + char *wptr = weights + (k0_block * DkHkWkCg + c0 * CONV_MR) * esz; float *cptr = cbuf_task + stripe * CONV_NR; + float16_t* cptr_f16 = (float16_t*)cbuf_task + stripe*CONV_NR; for (int k = k0_block; k < k1_block; k += CONV_MR, - wptr += DkHkWkCg * CONV_MR, cptr += CONV_MR * ldc) + wptr += DkHkWkCg * CONV_MR * esz, cptr += CONV_MR * ldc, cptr_f16 += CONV_MR * ldc) { #if CV_TRY_AVX2 - if (conv->useAVX2 && runOpt) - opt_AVX2::convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, CONV_MR, CONV_NR); + if (conv->useAVX2) + opt_AVX2::convBlock(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR); else #endif #if CV_TRY_AVX - if (conv->useAVX && runOpt) - opt_AVX::convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, CONV_MR, CONV_NR); + if (conv->useAVX) + opt_AVX::convBlock(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR); else #endif #if CV_NEON - if (conv->useNEON && runOpt) - opt_NEON::convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, CONV_MR, CONV_NR); + if (conv->useNEON) + { +#ifdef CONV_ARM_FP16 + if (useFP16) + { + opt_NEON::convBlock_FP16(c1 - c0, wptr, inptr, (char *)cptr_f16, ldc, c0 == 0, outLen, CONV_MR, CONV_NR); + } + else +#endif + opt_NEON::convBlock(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR); + } else #endif // The possible outLen range is 24 or 8~1. - convBlock(c1 - c0, wptr, inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR); + convBlock(c1 - c0, (const float *)wptr, (const float *)inptr, cptr, ldc, c0 == 0, outLen, CONV_MR, CONV_NR); } } } size_t outofs = ((n * ngroups + g) * Kg + k0_block) * out_planesize + zyx0; const float *cptr = cbuf_task; - + const float16_t *cptr_fp16 = (const float16_t *)cbuf_task; float *outptr = out + outofs; const float *pbptr = fusedAddPtr0 ? fusedAddPtr0 + outofs : 0; for (int k = k0_block; k < k1_block; k++, - cptr += ldc, outptr += out_planesize, - pbptr += (pbptr ? out_planesize : 0)) { + cptr += ldc, cptr_fp16 += ldc, outptr += out_planesize, + pbptr += (pbptr ? 
out_planesize : 0)) + { float biasval = biasptr[k]; int j = 0; -#if CV_SIMD128 - v_float32x4 vbias = v_setall_f32(biasval); - v_float32x4 vmax = v_setall_f32(maxval); - v_float32x4 vmin = v_setall_f32(minval); - if (pbptr) +#ifdef CONV_ARM_FP16 + if (useFP16) { - for (; j + 7 < out_width; j += 8) + float32x4_t vbias = vdupq_n_f32(biasval); + float32x4_t vmax = vdupq_n_f32(maxval); + float32x4_t vmin = vdupq_n_f32(minval); + if (pbptr) + { + for (; j + 7 < out_width; j += 8) + { + float32x4_t v0 = vcvt_f32_f16(vld1_f16((const __fp16 *)cptr_fp16 + j)) + vbias; + float32x4_t v1 = vcvt_f32_f16(vld1_f16((const __fp16 *)cptr_fp16 + + j + 4)) + vbias; + + v0 += vld1q_f32(pbptr + j); + v1 += vld1q_f32(pbptr + j + 4); + + if (ifMinMaxAct) + { + v0 = vminq_f32(vmaxq_f32(v0, vmin), vmax); + v1 = vminq_f32(vmaxq_f32(v1, vmin), vmax); + } + + vst1q_f32(outptr + j, v0); + vst1q_f32(outptr + j + 4, v1); + } + } + else { - v_float32x4 v0 = v_add(v_load(cptr + j), vbias); - v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias); - v0 = v_add(v0, v_load(pbptr + j)); - v1 = v_add(v1, v_load(pbptr + j + 4)); + for (; j + 7 < out_width; j += 8) + { + float32x4_t v0 = vcvt_f32_f16(vld1_f16((const __fp16 *)cptr_fp16 + j)) + vbias; + float32x4_t v1 = vcvt_f32_f16(vld1_f16((const __fp16 *)cptr_fp16 + j + 4)) + vbias; + + if (ifMinMaxAct) + { + v0 = vminq_f32(vmaxq_f32(v0, vmin), vmax); + v1 = vminq_f32(vmaxq_f32(v1, vmin), vmax); + } + + vst1q_f32(outptr + j, v0); + vst1q_f32(outptr + j + 4, v1); + } + } - if (ifMinMaxAct) + if (pbptr) + { + for (; j < out_width; j++) { - v0 = v_min(v_max(v0, vmin), vmax); - v1 = v_min(v_max(v1, vmin), vmax); + float v = (float )cptr_fp16[j] + biasval; + v += pbptr[j]; + if (ifMinMaxAct) + v = std::min(std::max(v, minval), maxval); + outptr[j] = v; } + } + else + { + for (; j < out_width; j++) + { + float v = (float )cptr_fp16[j] + biasval; - v_store(outptr + j, v0); - v_store(outptr + j + 4, v1); + if (ifMinMaxAct) + v = std::min(std::max(v, minval), maxval); + outptr[j] = v; + } } } else +#endif { - for (; j + 7 < out_width; j += 8) - { - v_float32x4 v0 = v_add(v_load(cptr + j), vbias); - v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias); +#if CV_SIMD128 + v_float32x4 vbias = v_setall_f32(biasval); + v_float32x4 vmax = v_setall_f32(maxval); + v_float32x4 vmin = v_setall_f32(minval); - if (ifMinMaxAct) + if (pbptr) + { + for (; j + 7 < out_width; j += 8) { - v0 = v_min(v_max(v0, vmin), vmax); - v1 = v_min(v_max(v1, vmin), vmax); + v_float32x4 v0 = v_add(v_load(cptr + j), vbias); + v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias); + + v0 = v_add(v0, v_load(pbptr + j)); + v1 = v_add(v1, v_load(pbptr + j + 4)); + + if (ifMinMaxAct) + { + v0 = v_min(v_max(v0, vmin), vmax); + v1 = v_min(v_max(v1, vmin), vmax); + } + + v_store(outptr + j, v0); + v_store(outptr + j + 4, v1); } + } + else + { + for (; j + 7 < out_width; j += 8) + { + v_float32x4 v0 = v_add(v_load(cptr + j), vbias); + v_float32x4 v1 = v_add(v_load(cptr + j + 4), vbias); + + if (ifMinMaxAct) + { + v0 = v_min(v_max(v0, vmin), vmax); + v1 = v_min(v_max(v1, vmin), vmax); + } - v_store(outptr + j, v0); - v_store(outptr + j + 4, v1); + v_store(outptr + j, v0); + v_store(outptr + j + 4, v1); + } } - } #endif - if (pbptr) { - for (; j < out_width; j++) { - float v = cptr[j] + biasval; - v += pbptr[j]; - if (ifMinMaxAct) - v = std::min(std::max(v, minval), maxval); - outptr[j] = v; + if (pbptr) + { + for (; j < out_width; j++) + { + float v = cptr[j] + biasval; + v += pbptr[j]; + if (ifMinMaxAct) + v = std::min(std::max(v, 
minval), maxval); + outptr[j] = v; + } } - } else { - for (; j < out_width; j++) { - float v = cptr[j] + biasval; + else + { + for (; j < out_width; j++) + { + float v = cptr[j] + biasval; - if (ifMinMaxAct) - v = std::min(std::max(v, minval), maxval); - outptr[j] = v; + if (ifMinMaxAct) + v = std::min(std::max(v, minval), maxval); + outptr[j] = v; + } } } @@ -1095,7 +1704,7 @@ void runFastConv(InputArray _input, OutputArray _output, const Ptr& co SIMD and no-SIMD code for convBlock \****************************************************************************************/ -static void convBlockMR1NoSIMD(int np, const float* a, const float* b, float *c, const float bias, bool init_c, +static inline void convBlockMR1NoSIMD(int np, const float* a, const float* b, float *c, const float bias, bool init_c, const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR) { std::vector cbuffer(outLen, 0); @@ -1128,64 +1737,8 @@ static void convBlockMR1NoSIMD(int np, const float* a, const float* b, float *c, } #if CV_SIMD128 -static void convBlockMR1x28(int np, const float* a, const float* b, float *c, const float bias, bool init_c, - const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR) -{ - CV_Assert(convNR == 28); - v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0; - v_float32x4 c3 = c0, c4 = c0, c5 = c0; - v_float32x4 c6 = c0; - - for (int p = 0; p < np; p++, a++, b += convNR) - { - v_float32x4 a0 = v_setall_f32(a[0]); - v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8); - v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20); - v_float32x4 b6 = v_load(b + 24); - - c0 = v_fma(b0, a0, c0); - c1 = v_fma(b1, a0, c1); - c2 = v_fma(b2, a0, c2); - c3 = v_fma(b3, a0, c3); - c4 = v_fma(b4, a0, c4); - c5 = v_fma(b5, a0, c5); - c6 = v_fma(b6, a0, c6); - } - - if (init_c) - { - c0 = v_add(c0, v_load(c)); - c1 = v_add(c1, v_load(c + 4)); - c2 = v_add(c2, v_load(c + 8)); - c3 = v_add(c3, v_load(c + 12)); - c4 = v_add(c4, v_load(c + 16)); - c5 = v_add(c5, v_load(c + 20)); - c6 = v_add(c6, v_load(c + 24)); - } - - if (ifMinMaxAct) - { - v_float32x4 vmax = v_setall_f32(maxval), vmin = v_setall_f32(minval); - c0 = v_min(v_max(c0, vmin), vmax); - c1 = v_min(v_max(c1, vmin), vmax); - c2 = v_min(v_max(c2, vmin), vmax); - c3 = v_min(v_max(c3, vmin), vmax); - c4 = v_min(v_max(c4, vmin), vmax); - c5 = v_min(v_max(c5, vmin), vmax); - c6 = v_min(v_max(c6, vmin), vmax); - } - - v_store(c, c0); - v_store(c + 4, c1); - v_store(c + 8, c2); - v_store(c + 12, c3); - v_store(c + 16, c4); - v_store(c + 20, c5); - v_store(c + 24, c6); -} - -static void convBlockMR1x24(int np, const float* a, const float* b, float *c, const float bias, bool init_c, - const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR) +static inline void convBlockMR1x24(int np, const float* a, const float* b, float *c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int convNR) { CV_Assert(convNR == 24); v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0; @@ -1234,8 +1787,8 @@ static void convBlockMR1x24(int np, const float* a, const float* b, float *c, co v_store(c + 20, c5); } -static void convBlockMR1x12(int np, const float* a, const float* b, float *c, const float bias, bool init_c, - const float minval, const float maxval, bool ifMinMaxAct, const int outLen, const int convNR) +static inline void convBlockMR1x12(int np, const float* a, const float* b, float 
*c, const float bias, bool init_c, + const float minval, const float maxval, bool ifMinMaxAct, const int convNR) { CV_Assert(convNR == 12); v_float32x4 c0 = v_setall_f32(bias), c1 = c0, c2 = c0; @@ -1279,12 +1832,10 @@ void convBlockMR1(int np, const float* a, const float* b, float *c, const float const int convNRby3 = convNR/3; if (outLen > convNRby3) { - if (convNR == 28) - convBlockMR1x28(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR); - else if (convNR == 24) - convBlockMR1x24(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR); + if (convNR == 24) + convBlockMR1x24(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, convNR); else if (convNR == 12) - convBlockMR1x12(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR); + convBlockMR1x12(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, convNR); else convBlockMR1NoSIMD(np, a, b, c, bias, init_c, minval, maxval, ifMinMaxAct, outLen, convNR); } @@ -1296,7 +1847,7 @@ void convBlockMR1(int np, const float* a, const float* b, float *c, const float } #if CV_SIMD128 -static void convBlock4x24(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) +static inline void convBlock4x24(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) { v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0; v_float32x4 c6 = v_setzero_f32(), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6; @@ -1401,7 +1952,7 @@ static void convBlock4x24(int np, const float* a, const float* b, float* c, int v_store(c + ldc * 3 + 20, c23); } -static void convBlock4x8(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) +static inline void convBlock4x8(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) { CV_Assert(convNR >= 4); v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0; @@ -1454,7 +2005,7 @@ static void convBlock4x8(int np, const float* a, const float* b, float* c, int l v_store(c + ldc * 3 + 4, c7); } -static void convBlock4x4(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) +static inline void convBlock4x4(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int convMR, const int convNR) { CV_Assert(convNR >= 4); v_float32x4 c0 = v_setzero_f32(), c1 = c0, c2 = c0, c3 = c0; @@ -1489,7 +2040,7 @@ static void convBlock4x4(int np, const float* a, const float* b, float* c, int l } #endif -static void convBlockNoSIMD(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen, +static inline void convBlockNoSIMD(int np, const float* a, const float* b, float* c, int ldc, bool init_c, const int outLen, const int convMR, const int convNR) { std::vector cbuffer(convMR * outLen, 0); diff --git a/modules/dnn/src/layers/cpu_kernels/convolution.hpp b/modules/dnn/src/layers/cpu_kernels/convolution.hpp index 3d44c3189b..6fabc3da7c 100644 --- a/modules/dnn/src/layers/cpu_kernels/convolution.hpp +++ b/modules/dnn/src/layers/cpu_kernels/convolution.hpp @@ -10,14 +10,27 @@ #ifndef CONV_PRAM #define CONV_PRAM #if CV_NEON && CV_NEON_AARCH64 // 32 registers. -#define CONV_MR 4 -#define CONV_NR 28 +#define CONV_MR_FP32 4 +#define CONV_NR_FP32 28 + +// The FP16 can only be supported by ARM64 and with FP16 FMA supported. 
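For context, the FP16 kernels selected by these macros consume weights and im2row buffers stored as float16_t, converted from the FP32 model data via the NEON vcvt_f16_f32 path. Below is a minimal, self-contained sketch of that conversion under the same feature guards; the function and buffer names are illustrative, not the ones used by initFastConv.

    #if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    #include <arm_neon.h>
    #include <cstddef>

    // Convert an FP32 buffer to FP16, four lanes at a time, with a scalar tail.
    static void cvt_fp32_to_fp16(const float* src, __fp16* dst, size_t n)
    {
        size_t i = 0;
        for (; i + 4 <= n; i += 4)
            vst1_f16(dst + i, vcvt_f16_f32(vld1q_f32(src + i))); // float32x4 -> float16x4
        for (; i < n; i++)
            dst[i] = (__fp16)src[i];                             // scalar remainder
    }
    #endif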
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC // check FP16 FMA. +#define CONV_ARM_FP16 1 +#endif + +#ifdef CONV_ARM_FP16 +// Currently, only ARM 64 support FP16. +#define CONV_MR_FP16 8 +#define CONV_NR_FP16 24 +typedef __fp16 float16_t; // Fix conflict between float16_t in arm_neon.h and float16_t in cvdef.h. +#endif + #elif CV_NEON // 16 registers. -#define CONV_MR 4 -#define CONV_NR 12 +#define CONV_MR_FP32 4 +#define CONV_NR_FP32 12 #else // SIMD 128, AVX or AVX2 -#define CONV_MR 4 -#define CONV_NR 24 +#define CONV_MR_FP32 4 +#define CONV_NR_FP32 24 #endif // Winograd Params @@ -41,6 +54,10 @@ enum { #endif CONV_WINO_NATOMS_F32 = CONV_WINO_AREA / CONV_WINO_ATOM_F32, // for AVX2, it is 8, otherwise, it's 16. + + // FP 16 + CONV_WINO_ATOM_F16 = CONV_WINO_ATOM_F32 * 2, + CONV_WINO_NATOMS_F16 = CONV_WINO_AREA / CONV_WINO_ATOM_F16, }; // NOTE that: CONV_TYPE_DEPTHWISE is for 3x3 depthwise conv, and others depthwise will be set as CONV_TYPE_DEPTHWISE_REMAIN. @@ -64,8 +81,17 @@ struct FastConv std::vector weightsWinoBuf; // For Winograd F(6x6, 3x3). float* weightsWinoBufPtr; std::vector biasBuf; + +#if CV_NEON && CV_NEON_AARCH64 && CV_FP16 + std::vector weightsBuf_FP16; + float16_t* weightsBufPtr_FP16; + std::vector weightsWinoBuf_FP16; + float16_t* weightsWinoBufPtr_FP16; +#endif + int conv_type; int conv_dim; // Flag for conv1d, conv2d, or conv3d. + bool useFP16 = false; // Only ARMv8 is supported. #if CV_SIMD128 bool useSIMD128 = true; #else @@ -95,6 +121,7 @@ Ptr initFastConv( const std::vector& pads_begin, const std::vector& pads_end, int conv_dim, + const bool useFP16, bool useWinograd); // It contains different computing branches, like winograd, 1x1 conv. diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index ff53d4d114..02f56e7419 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -215,7 +215,7 @@ public: if (backendId == DNN_BACKEND_OPENCV) { if (kernel_size.size() == 3) - return preferableTarget == DNN_TARGET_CPU; + return IS_DNN_CPU_TARGET(preferableTarget); if (kernel_size.size() <= 2) return true; else diff --git a/modules/dnn/src/net_impl.cpp b/modules/dnn/src/net_impl.cpp index 775016b3b7..8024a05597 100644 --- a/modules/dnn/src/net_impl.cpp +++ b/modules/dnn/src/net_impl.cpp @@ -98,6 +98,7 @@ void Net::Impl::validateBackendAndTarget() CV_Assert(preferableBackend != DNN_BACKEND_OPENCV || preferableTarget == DNN_TARGET_CPU || + preferableTarget == DNN_TARGET_CPU_FP16 || preferableTarget == DNN_TARGET_OPENCL || preferableTarget == DNN_TARGET_OPENCL_FP16); CV_Assert(preferableBackend != DNN_BACKEND_HALIDE || @@ -972,7 +973,8 @@ void Net::Impl::forward(OutputArrayOfArrays outputBlobs, const String& outputNam } else if (outputBlobs.isMatVector()) { - if (preferableTarget != DNN_TARGET_CPU) + // The DNN_TARGET_CPU and DNN_TARGET_CPU_FP16 both use the CPU memory, do not need the copyToHost. + if (preferableTarget != DNN_TARGET_CPU && preferableTarget != DNN_TARGET_CPU_FP16) { for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i) { @@ -1336,7 +1338,7 @@ Mat Net::Impl::getBlob(const LayerPin& pin) const "the #%d was requested", ld.name.c_str(), ld.outputBlobs.size(), pin.oid)); } - if (preferableTarget != DNN_TARGET_CPU) + if (preferableTarget != DNN_TARGET_CPU && preferableTarget != DNN_TARGET_CPU_FP16) { CV_Assert(!ld.outputBlobsWrappers.empty() && !ld.outputBlobsWrappers[pin.oid].empty()); // Transfer data to CPU if it's require. 
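These CPU-target checks are what an application reaches once it opts into the new target; the selection itself is the usual backend/target pair. A small usage sketch (the model path and input size are placeholders):

    #include <opencv2/dnn.hpp>

    int main()
    {
        cv::dnn::Net net = cv::dnn::readNet("model.onnx");        // placeholder model file
        net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
        net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU_FP16);    // falls back to DNN_TARGET_CPU on non-ARMv8 builds
        cv::Mat img(224, 224, CV_8UC3, cv::Scalar::all(0));       // dummy input; real code would load an image
        net.setInput(cv::dnn::blobFromImage(img, 1.0, cv::Size(224, 224)));
        cv::Mat out = net.forward();
        return 0;
    }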
@@ -1552,7 +1554,7 @@ string Net::Impl::dump(bool forceAllocation) const prevNode = itBackend->second; } } - std::vector colors = { "#ffffb3", "#fccde5", "#8dd3c7", "#bebada", "#80b1d3", "#fdb462", "#ff4848", "#b35151", "#b266ff", "#b266ff", "#3cb371"}; + std::vector colors = { "#ffffb3", "#fccde5", "#8dd3c7", "#bebada", "#80b1d3", "#fdb462", "#ff4848", "#b35151", "#b266ff", "#b266ff", "#3cb371", "#ffcab3"}; string backend; switch (prefBackend) { @@ -1755,6 +1757,10 @@ string Net::Impl::dump(bool forceAllocation) const out << "NPU"; colorId = 9; break; + case DNN_TARGET_CPU_FP16: + out << "CPU_FP16"; + colorId = 10; + break; // don't use default: } CV_Assert(colorId < colors.size()); diff --git a/modules/dnn/src/net_impl_backend.cpp b/modules/dnn/src/net_impl_backend.cpp index ef816be66d..d29b6934a2 100644 --- a/modules/dnn/src/net_impl_backend.cpp +++ b/modules/dnn/src/net_impl_backend.cpp @@ -17,7 +17,8 @@ CV__DNN_INLINE_NS_BEGIN Ptr Net::Impl::wrap(Mat& host) { - if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_CPU) + if (preferableBackend == DNN_BACKEND_OPENCV && + (preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_CPU_FP16)) return Ptr(); MatShape shape(host.dims); @@ -104,7 +105,7 @@ void Net::Impl::initBackend(const std::vector& blobsToKeep_) CV_TRACE_FUNCTION(); if (preferableBackend == DNN_BACKEND_OPENCV) { - CV_Assert(preferableTarget == DNN_TARGET_CPU || IS_DNN_OPENCL_TARGET(preferableTarget)); + CV_Assert(preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_CPU_FP16 || IS_DNN_OPENCL_TARGET(preferableTarget)); } else if (preferableBackend == DNN_BACKEND_HALIDE) { @@ -232,6 +233,15 @@ void Net::Impl::setPreferableTarget(int targetId) preferableTarget = DNN_TARGET_OPENCL; #endif } + +#if !defined(__arm64__) || !__arm64__ + if (targetId == DNN_TARGET_CPU_FP16) + { + CV_LOG_WARNING(NULL, "DNN: fall back to DNN_TARGET_CPU. Only ARM v8 CPU is supported by DNN_TARGET_CPU_FP16."); + targetId = DNN_TARGET_CPU; + } +#endif + clear(); } } diff --git a/modules/dnn/src/registry.cpp b/modules/dnn/src/registry.cpp index f5c9e584c6..40630a93e4 100644 --- a/modules/dnn/src/registry.cpp +++ b/modules/dnn/src/registry.cpp @@ -61,6 +61,11 @@ private: } #endif + bool haveBackendCPU_FP16 = false; +#if defined(__arm64__) && __arm64__ + haveBackendCPU_FP16 = true; +#endif + if (haveBackendOpenVINO && openvino::checkTarget(DNN_TARGET_CPU)) { backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, DNN_TARGET_CPU)); @@ -104,6 +109,9 @@ private: backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)); + if (haveBackendCPU_FP16) + backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_CPU_FP16)); + #ifdef HAVE_VULKAN if (haveVulkan()) backends.push_back(std::make_pair(DNN_BACKEND_VKCOM, DNN_TARGET_VULKAN)); diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp index 7aa9c756fb..d8e69f3bbb 100644 --- a/modules/dnn/test/test_backends.cpp +++ b/modules/dnn/test/test_backends.cpp @@ -175,6 +175,8 @@ TEST_P(DNNTestNetwork, ENet) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16); + if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16); processNet("dnn/Enet-model-best.net", "", Size(512, 512), "l367_Deconvolution", target == DNN_TARGET_OPENCL ? 
"dnn/halide_scheduler_opencl_enet.yml" : "dnn/halide_scheduler_enet.yml", @@ -189,7 +191,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe) applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE); Mat sample = imread(findDataFile("dnn/street.png")); Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false); - float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 1.5e-2 : 0.0; + float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 1.5e-2 : 0.0; float iouDiff = (target == DNN_TARGET_MYRIAD) ? 0.063 : 0.0; float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.262 : FLT_MIN; processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt", @@ -225,7 +227,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe_Different_Width_Height) Mat sample = imread(findDataFile("dnn/street.png")); Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 560), Scalar(127.5, 127.5, 127.5), false); float scoreDiff = 0.0, iouDiff = 0.0; - if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) { scoreDiff = 0.029; iouDiff = 0.09; @@ -242,7 +244,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe_Different_Width_Height) TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow) { - applyTestTag(target == DNN_TARGET_CPU ? "" : CV_TEST_TAG_MEMORY_512MB); + applyTestTag((target == DNN_TARGET_CPU || target == DNN_TARGET_CPU_FP16) ? "" : CV_TEST_TAG_MEMORY_512MB); if (backend == DNN_BACKEND_HALIDE) applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE); @@ -250,7 +252,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow) Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false); float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.216 : 0.2; float scoreDiff = 0.0, iouDiff = 0.0; - if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) { scoreDiff = 0.095; iouDiff = 0.09; @@ -282,7 +284,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v1_TensorFlow_Different_Width_Height) Mat sample = imread(findDataFile("dnn/street.png")); Mat inp = blobFromImage(sample, 1.0f, Size(300, 560), Scalar(), false); float scoreDiff = 0.0, iouDiff = 0.0; - if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) { scoreDiff = 0.013; iouDiff = 0.06; @@ -306,7 +308,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_v2_TensorFlow) Mat sample = imread(findDataFile("dnn/street.png")); Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false); float scoreDiff = 2e-5, iouDiff = 0.0; - if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) { scoreDiff = 0.013; iouDiff = 0.062; @@ -332,7 +334,7 @@ TEST_P(DNNTestNetwork, SSD_VGG16) Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false); float scoreDiff = 0.0, iouDiff = 0.0; - if (target == DNN_TARGET_OPENCL_FP16) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16) { scoreDiff = 0.04; } @@ -387,7 +389,7 @@ TEST_P(DNNTestNetwork, OpenPose_pose_mpi) // output range: [-0.001, 0.97] const float l1 = (target == DNN_TARGET_MYRIAD) ? 
0.02 : 0.0; - const float lInf = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16) ? 0.2 : 0.0; + const float lInf = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16) ? 0.2 : 0.0; processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi.prototxt", Size(46, 46), "", "", l1, lInf); expectNoFallbacksFromIE(net); @@ -461,7 +463,7 @@ TEST_P(DNNTestNetwork, Inception_v2_SSD_TensorFlow) Mat sample = imread(findDataFile("dnn/street.png")); Mat inp = blobFromImage(sample, 1.0f, Size(300, 300), Scalar(), false); float scoreDiff = 0.0, iouDiff = 0.0; - if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) { scoreDiff = 0.02; iouDiff = 0.1; @@ -483,7 +485,7 @@ TEST_P(DNNTestNetwork, DenseNet_121) applyTestTag(CV_TEST_TAG_DNN_SKIP_HALIDE); // Reference output values are in range [-3.807, 4.605] float l1 = 0.0, lInf = 0.0; - if (target == DNN_TARGET_OPENCL_FP16) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16) { l1 = 2e-2; lInf = 9e-2; @@ -538,6 +540,11 @@ TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16) l1 = 0.3; lInf = 7.6; } + else if (target == DNN_TARGET_CPU_FP16) + { + l1 = 0.4; + lInf = 19.; + } #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL) diff --git a/modules/dnn/test/test_caffe_importer.cpp b/modules/dnn/test/test_caffe_importer.cpp index 8059fc6888..809b959a21 100644 --- a/modules/dnn/test/test_caffe_importer.cpp +++ b/modules/dnn/test/test_caffe_importer.cpp @@ -153,7 +153,7 @@ TEST_P(Test_Caffe_nets, Axpy) } } float l1 = 1e-5, lInf = 1e-4; - if (target == DNN_TARGET_OPENCL_FP16) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16) { l1 = 2e-4; lInf = 1e-3; @@ -180,7 +180,7 @@ TEST_P(Reproducibility_AlexNet, Accuracy) #else applyTestTag(targetId == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); #endif - ASSERT_TRUE(ocl::useOpenCL() || targetId == DNN_TARGET_CPU); + ASSERT_TRUE(ocl::useOpenCL() || targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_CPU_FP16); bool readFromMemory = get<0>(GetParam()); Net net; @@ -214,7 +214,7 @@ TEST_P(Reproducibility_AlexNet, Accuracy) ASSERT_EQ(inLayerShapes[0][3], 227); const float l1 = 1e-5; - const float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 4e-3 : 1e-4; + const float lInf = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_CPU_FP16) ? 4e-3 : 1e-4; net.setPreferableBackend(DNN_BACKEND_OPENCV); net.setPreferableTarget(targetId); @@ -308,7 +308,7 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy) ASSERT_EQ(out.size[2], 100); float scores_diff = 1e-5, boxes_iou_diff = 1e-4; - if (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD) + if (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_MYRIAD || targetId == DNN_TARGET_CPU_FP16) { scores_diff = 1.5e-2; boxes_iou_diff = 6.3e-2; @@ -375,7 +375,7 @@ TEST_P(Reproducibility_ResNet50, Accuracy) { Target targetId = GetParam(); applyTestTag(targetId == DNN_TARGET_CPU ? 
CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB); - ASSERT_TRUE(ocl::useOpenCL() || targetId == DNN_TARGET_CPU); + ASSERT_TRUE(ocl::useOpenCL() || targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_CPU_FP16); Net net = readNetFromCaffe(findDataFile("dnn/ResNet-50-deploy.prototxt"), findDataFile("dnn/ResNet-50-model.caffemodel", false)); @@ -383,8 +383,8 @@ TEST_P(Reproducibility_ResNet50, Accuracy) net.setPreferableBackend(DNN_BACKEND_OPENCV); net.setPreferableTarget(targetId); - float l1 = (targetId == DNN_TARGET_OPENCL_FP16) ? 3e-5 : 1e-5; - float lInf = (targetId == DNN_TARGET_OPENCL_FP16) ? 6e-3 : 1e-4; + float l1 = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_CPU_FP16) ? 3e-5 : 1e-5; + float lInf = (targetId == DNN_TARGET_OPENCL_FP16 || targetId == DNN_TARGET_CPU_FP16) ? 6e-3 : 1e-4; Mat input = blobFromImage(imread(_tf("googlenet_0.png")), 1.0f, Size(224,224), Scalar(), false); ASSERT_TRUE(!input.empty()); @@ -415,6 +415,8 @@ TEST_P(Reproducibility_SqueezeNet_v1_1, Accuracy) int targetId = GetParam(); if(targetId == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); + if(targetId == DNN_TARGET_CPU_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16); Net net = readNetFromCaffe(findDataFile("dnn/squeezenet_v1.1.prototxt"), findDataFile("dnn/squeezenet_v1.1.caffemodel", false)); net.setPreferableBackend(DNN_BACKEND_OPENCV); @@ -509,7 +511,7 @@ TEST_P(Test_Caffe_nets, Colorization) // Reference output values are in range [-29.1, 69.5] double l1 = 4e-4, lInf = 3e-3; - if (target == DNN_TARGET_OPENCL_FP16) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16) { l1 = 0.25; lInf = 5.3; @@ -566,7 +568,7 @@ TEST_P(Test_Caffe_nets, DenseNet_121) { l1 = 0.11; lInf = 0.5; } - else if (target == DNN_TARGET_CUDA_FP16) + else if (target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16) { l1 = 0.04; lInf = 0.2; } @@ -635,6 +637,8 @@ TEST_P(opencv_face_detector, Accuracy) if (targetId == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); + if (targetId == DNN_TARGET_CPU_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16); Net net = readNetFromCaffe(proto, model); Mat img = imread(findDataFile("gpu/lbpcascade/er.png")); @@ -665,6 +669,8 @@ TEST_P(opencv_face_detector, issue_15106) if (targetId == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16); + if (targetId == DNN_TARGET_CPU_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16); Net net = readNetFromCaffe(proto, model); Mat img = imread(findDataFile("cv/shared/lena.png")); @@ -768,6 +774,8 @@ TEST_P(Test_Caffe_nets, FasterRCNN_zf) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD); if (target == DNN_TARGET_CUDA_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16); + if (target == DNN_TARGET_CPU_FP16) + applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16); static Mat ref = (Mat_(3, 7) << 0, 2, 0.90121, 120.407, 115.83, 570.586, 528.395, 0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762, 0, 12, 0.967198, 138.588, 206.843, 329.766, 553.176); @@ -783,7 +791,7 @@ TEST_P(Test_Caffe_nets, RFCN) ); float scoreDiff = default_l1, iouDiff = default_lInf; - if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16) + if (backend == DNN_BACKEND_OPENCV && (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)) { scoreDiff = 4e-3; iouDiff = 8e-2; diff --git a/modules/dnn/test/test_common.hpp b/modules/dnn/test/test_common.hpp index e3c7a553f8..6262a0f7a4 100644 --- a/modules/dnn/test/test_common.hpp +++ b/modules/dnn/test/test_common.hpp @@ -21,6 
+21,7 @@ #define CV_TEST_TAG_DNN_SKIP_OPENCV_BACKEND "dnn_skip_opencv_backend" #define CV_TEST_TAG_DNN_SKIP_HALIDE "dnn_skip_halide" #define CV_TEST_TAG_DNN_SKIP_CPU "dnn_skip_cpu" +#define CV_TEST_TAG_DNN_SKIP_CPU_FP16 "dnn_skip_cpu_fp16" #define CV_TEST_TAG_DNN_SKIP_OPENCL "dnn_skip_ocl" #define CV_TEST_TAG_DNN_SKIP_OPENCL_FP16 "dnn_skip_ocl_fp16" #define CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER "dnn_skip_ie_nn_builder" @@ -164,7 +165,7 @@ public: static void getDefaultThresholds(int backend, int target, double* l1, double* lInf) { - if (target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) + if (target == DNN_TARGET_CPU_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) { *l1 = 4e-3; *lInf = 2e-2; diff --git a/modules/dnn/test/test_common.impl.hpp b/modules/dnn/test/test_common.impl.hpp index bad6a8d082..2a4ee34b02 100644 --- a/modules/dnn/test/test_common.impl.hpp +++ b/modules/dnn/test/test_common.impl.hpp @@ -49,6 +49,7 @@ void PrintTo(const cv::dnn::Target& v, std::ostream* os) case DNN_TARGET_CUDA: *os << "CUDA"; return; case DNN_TARGET_CUDA_FP16: *os << "CUDA_FP16"; return; case DNN_TARGET_NPU: *os << "NPU"; return; + case DNN_TARGET_CPU_FP16: *os << "CPU_FP16"; return; } // don't use "default:" to emit compiler warnings *os << "DNN_TARGET_UNKNOWN(" << (int)v << ")"; } @@ -439,7 +440,7 @@ void initDNNTests() registerGlobalSkipTag( CV_TEST_TAG_DNN_SKIP_OPENCV_BACKEND, - CV_TEST_TAG_DNN_SKIP_CPU, + CV_TEST_TAG_DNN_SKIP_CPU, CV_TEST_TAG_DNN_SKIP_CPU_FP16, CV_TEST_TAG_DNN_SKIP_OPENCL, CV_TEST_TAG_DNN_SKIP_OPENCL_FP16 ); #if defined(HAVE_HALIDE) diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp index 2160c81fad..2e6db5ef68 100644 --- a/modules/dnn/test/test_darknet_importer.cpp +++ b/modules/dnn/test/test_darknet_importer.cpp @@ -360,9 +360,9 @@ TEST_P(Test_Darknet_nets, YoloVoc) 1, 6, 0.667770f, 0.446555f, 0.453578f, 0.499986f, 0.519167f, // a car 1, 6, 0.844947f, 0.637058f, 0.460398f, 0.828508f, 0.66427f); // a car - double nmsThreshold = (target == DNN_TARGET_MYRIAD) ? 0.397 : 0.4; + double nmsThreshold = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.397 : 0.4; double scoreDiff = 8e-5, iouDiff = 3e-4; - if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) { scoreDiff = 1e-2; iouDiff = 0.018; @@ -451,7 +451,7 @@ TEST_P(Test_Darknet_nets, TinyYoloVoc) 1, 6, 0.928758f, 0.651024f, 0.463539f, 0.823784f, 0.654998f); // a car double scoreDiff = 8e-5, iouDiff = 3e-4; - if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) { scoreDiff = 8e-3; iouDiff = 0.018; @@ -636,7 +636,7 @@ TEST_P(Test_Darknet_nets, YOLOv3) Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_); double scoreDiff = 8e-5, iouDiff = 3e-4; - if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) + if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) { #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2022010000) if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) @@ -725,8 +725,8 @@ TEST_P(Test_Darknet_nets, YOLOv4) }; Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_); - double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 
0.006 : 8e-5;
-    double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.042 : 3e-4;
+    double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.006 : 8e-5;
+    double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.042 : 3e-4;
     if (target == DNN_TARGET_CUDA_FP16)
     {
         scoreDiff = 0.008;
@@ -847,7 +847,7 @@ TEST_P(Test_Darknet_nets, YOLOv4_tiny)
     Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_);
     double scoreDiff = 0.012f;
-    double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.15 : 0.01f;
+    double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.15 : 0.01f;
     if (target == DNN_TARGET_CUDA_FP16)
         iouDiff = 0.02;
@@ -930,7 +930,7 @@ TEST_P(Test_Darknet_nets, YOLOv4x_mish)
     double scoreDiff = 8e-5;
     double iouDiff = 3e-4;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
     {
         scoreDiff = 0.006;
         iouDiff = 0.042;
@@ -1093,6 +1093,8 @@ TEST_P(Test_Darknet_layers, connected)
 {
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     testDarknetLayer("connected", true);
 }
diff --git a/modules/dnn/test/test_googlenet.cpp b/modules/dnn/test/test_googlenet.cpp
index e51dcd0988..f911ff029f 100644
--- a/modules/dnn/test/test_googlenet.cpp
+++ b/modules/dnn/test/test_googlenet.cpp
@@ -58,6 +58,8 @@ TEST_P(Reproducibility_GoogLeNet, Batching)
     const int targetId = GetParam();
     if (targetId == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (targetId == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt"),
                                findDataFile("dnn/bvlc_googlenet.caffemodel", false));
     net.setPreferableBackend(DNN_BACKEND_OPENCV);
@@ -89,6 +91,8 @@ TEST_P(Reproducibility_GoogLeNet, IntermediateBlobs)
     const int targetId = GetParam();
     if (targetId == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (targetId == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt"),
                                findDataFile("dnn/bvlc_googlenet.caffemodel", false));
     net.setPreferableBackend(DNN_BACKEND_OPENCV);
@@ -120,6 +124,8 @@ TEST_P(Reproducibility_GoogLeNet, SeveralCalls)
     const int targetId = GetParam();
     if (targetId == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (targetId == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt"),
                                findDataFile("dnn/bvlc_googlenet.caffemodel", false));
     net.setPreferableBackend(DNN_BACKEND_OPENCV);
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index a5652f16b7..763d94b99c 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -212,6 +212,8 @@ TEST_P(Test_Caffe_layers, InnerProduct)
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     testLayerUsingCaffeModels("layer_inner_product", true);
 }
@@ -378,7 +380,7 @@ TEST_P(Test_Caffe_layers, Eltwise)
 TEST_P(Test_Caffe_layers, PReLU)
 {
-    double lInf = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16) ? 0.021 : 0.0;
+    double lInf = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16) ? 0.021 : 0.0;
     testLayerUsingCaffeModels("layer_prelu", true, true, 0.0, lInf);
 }
@@ -2459,7 +2461,7 @@ TEST_P(ConvolutionActivationFusion, Accuracy)
     std::vector<int> expectedFusedLayers;
     if (backendId == DNN_BACKEND_OPENCV)
     {
-        if (targetId == DNN_TARGET_CPU)
+        if (targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_CPU_FP16)
             expectedFusedLayers.push_back(activId); // all activations are fused
         else if (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)
         {
@@ -2594,7 +2596,7 @@ TEST_P(ConvolutionEltwiseActivationFusion, Accuracy)
     std::vector<int> expectedFusedLayers;
     if (backendId == DNN_BACKEND_OPENCV)
     {
-        if (targetId == DNN_TARGET_CPU)
+        if (targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_CPU_FP16)
             expectedFusedLayers.push_back(activId); // activation is fused with eltwise layer
         else if (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)
         {
@@ -2683,7 +2685,7 @@ TEST_P(ConvolutionActivationEltwiseFusion, Accuracy)
     std::vector<int> expectedFusedLayers;
     if (backendId == DNN_BACKEND_OPENCV)
     {
-        if (targetId == DNN_TARGET_CPU)
+        if (targetId == DNN_TARGET_CPU || targetId == DNN_TARGET_CPU_FP16)
             expectedFusedLayers.push_back(activId); // activation fused with convolution
         else if (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16)
         {
diff --git a/modules/dnn/test/test_model.cpp b/modules/dnn/test/test_model.cpp
index 2d6c4c7ac1..bd03551ab8 100644
--- a/modules/dnn/test/test_model.cpp
+++ b/modules/dnn/test/test_model.cpp
@@ -332,7 +332,7 @@ TEST_P(Test_Model, DetectRegion)
     double confThreshold = 0.24;
     double nmsThreshold = (target == DNN_TARGET_MYRIAD) ? 0.397 : 0.4;
     double scoreDiff = 8e-5, iouDiff = 1e-5;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
     {
         scoreDiff = 1e-2;
         iouDiff = 1.6e-2;
@@ -392,7 +392,7 @@ TEST_P(Test_Model, DetectRegionWithNmsAcrossClasses)
     double confThreshold = 0.24;
     double nmsThreshold = (target == DNN_TARGET_MYRIAD) ? 0.15: 0.15;
     double scoreDiff = 8e-5, iouDiff = 1e-5;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
     {
         scoreDiff = 1e-2;
         iouDiff = 1.6e-2;
@@ -443,7 +443,7 @@ TEST_P(Test_Model, DetectionOutput)
     double scoreDiff = default_l1, iouDiff = 1e-5;
     float confThreshold = 0.8;
     double nmsThreshold = 0.0;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CUDA_FP16)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
     {
         if (backend == DNN_BACKEND_OPENCV)
             scoreDiff = 4e-3;
@@ -495,7 +495,7 @@ TEST_P(Test_Model, DetectionMobilenetSSD)
     Size size{300, 300};
     double scoreDiff = 1e-5, iouDiff = 1e-5;
-    if (target == DNN_TARGET_OPENCL_FP16)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)
     {
         scoreDiff = 1.7e-2;
         iouDiff = 6.91e-2;
@@ -522,6 +522,8 @@ TEST_P(Test_Model, Keypoints_pose)
 {
     if (target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
 #ifdef HAVE_INF_ENGINE
     if (target == DNN_TARGET_MYRIAD)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
@@ -569,7 +571,7 @@ TEST_P(Test_Model, Keypoints_face)
     // Ref. Range: [-1.1784188, 1.7758257]
     float norm = 1e-4;
-    if (target == DNN_TARGET_OPENCL_FP16)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)
         norm = 5e-3;
     if (target == DNN_TARGET_MYRIAD)
     {
@@ -605,7 +607,7 @@ TEST_P(Test_Model, Detection_normalized)
         scoreDiff = 3e-4;
         iouDiff = 0.018;
     }
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_CPU_FP16)
     {
         scoreDiff = 5e-3;
         iouDiff = 0.09;
@@ -654,7 +656,7 @@ TEST_P(Test_Model, Segmentation)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
 #endif
-    if ((backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
+    if ((backend == DNN_BACKEND_OPENCV && (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16))
         || (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16))
     {
         norm = 2.0f; // l1 = 0.01 lInf = 2
@@ -741,6 +743,8 @@ TEST_P(Test_Model, TextDetectionByDB)
 {
     if (target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     std::string imgPath = _tf("text_det_test1.png");
     std::string weightPathDB = _tf("onnx/models/DB_TD500_resnet50.onnx", false);
@@ -801,7 +805,7 @@ TEST_P(Test_Model, TextDetectionByEAST)
     double eps_size = 5/*pixels*/;
     double eps_angle = 1;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         eps_center = 10;
         eps_size = 25;
diff --git a/modules/dnn/test/test_onnx_conformance.cpp b/modules/dnn/test/test_onnx_conformance.cpp
index 8f24fdf135..b238427dfb 100644
--- a/modules/dnn/test/test_onnx_conformance.cpp
+++ b/modules/dnn/test/test_onnx_conformance.cpp
@@ -957,7 +957,7 @@ public:
         backend = get<0>(get<1>(GetParam()));
         target = get<1>(get<1>(GetParam()));
-        if (target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+        if (target == DNN_TARGET_CUDA_FP16 || target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
         {
             default_l1 = 7e-3;
             default_lInf = 2e-2;
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index a2fed56b68..12a5ad1957 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -2179,7 +2179,7 @@ TEST_P(Test_ONNX_nets, TinyYolov2)
     // output range: [-11; 8]
     double l1 = default_l1, lInf = default_lInf;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         l1 = 0.02;
         lInf = 0.2;
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index bcedcffe15..b795076f55 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -486,6 +486,11 @@ TEST_P(Test_TensorFlow_layers, slim_batch_norm)
         l1 = 0.005;
         lInf = 0.33;
     }
+    else if (target == DNN_TARGET_CPU_FP16)
+    {
+        l1 = 0.041;
+        lInf = 0.37;
+    }
     runTensorFlowNet("slim_batch_norm", false, l1, lInf);
 }
@@ -710,6 +715,9 @@ TEST_P(Test_TensorFlow_layers, matmul)
 {
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
+
     runTensorFlowNet("matmul");
     runTensorFlowNet("nhwc_transpose_reshape_matmul");
     // Reference output values are in range [-5.688, 4.484]
@@ -723,6 +731,8 @@ TEST_P(Test_TensorFlow_layers, batch_matmul)
 {
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     runTensorFlowNet("batch_matmul");
 }
@@ -730,6 +740,8 @@ TEST_P(Test_TensorFlow_layers, square)
 {
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     runTensorFlowNet("square");
 }
@@ -924,7 +936,7 @@ TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
     Mat out = net.forward();
     double scoreDiff = default_l1, iouDiff = default_lInf;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         scoreDiff = 0.01;
         iouDiff = 0.1;
@@ -971,7 +983,7 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_SSD)
                                     0, 10, 0.93973452, 0.66561931, 0.37841269, 0.68074018, 0.42907384);
     double scoreDiff = default_l1, iouDiff = default_lInf;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         scoreDiff = 0.0097;
         iouDiff = 0.09;
@@ -1004,7 +1016,7 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD)
     Mat ref = blobFromNPY(findDataFile("dnn/tensorflow/ssd_mobilenet_v1_coco_2017_11_17.detection_out.npy"));
     float scoreDiff = 1.5e-5, iouDiff = 1e-3;
     float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.35 : 0.3;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         scoreDiff = 0.011;
         iouDiff = 0.012;
@@ -1053,6 +1065,8 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN_inception_v2_coco_2018_01_28)
     if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
+    if (target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     checkBackend();
@@ -1085,6 +1099,9 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN_inception_v2_coco_2018_01_28)
         if (target == DNN_TARGET_OPENCL_FP16)
             applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+        if (target == DNN_TARGET_CPU_FP16)
+            applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
+
         normAssertDetections(ref, out, name.c_str(), 0.3, scoresDiff, iouDiff);
     }
 }
@@ -1164,6 +1181,9 @@ TEST_P(Test_TensorFlow_nets, Faster_RCNN_resnet50_coco_2018_01_28)
         if (target == DNN_TARGET_OPENCL_FP16)
             applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+        if (target == DNN_TARGET_CPU_FP16)
+            applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
+
         normAssertDetections(ref, out, name.c_str(), 0.3, scoresDiff, iouDiff);
     }
 }
@@ -1191,7 +1211,7 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD_PPN)
     Mat out = net.forward();
     double scoreDiff = 1.1e-5, iouDiff = default_lInf;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         scoreDiff = 0.048;
         iouDiff = 0.058;
@@ -1230,7 +1250,7 @@ TEST_P(Test_TensorFlow_nets, opencv_face_detector_uint8)
                                     0, 1, 0.97203469, 0.67965847, 0.06876482, 0.73999709, 0.1513494,
                                     0, 1, 0.95097077, 0.51901293, 0.45863652, 0.5777427, 0.5347801);
     double scoreDiff = 3.4e-3, iouDiff = 1e-2;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         scoreDiff = 4e-3;
         iouDiff = 0.024;
@@ -1317,6 +1337,11 @@ TEST_P(Test_TensorFlow_nets, EAST_text_detection)
         lInf_scores = 0.1;
         l1_geometry = 0.3; lInf_geometry = 7;
     }
+    else if (target == DNN_TARGET_CPU_FP16)
+    {
+        lInf_scores = 0.1;
+        l1_geometry = 0.28; lInf_geometry = 5.94;
+    }
     else
     {
         l1_geometry = 1e-4, lInf_geometry = 4.3e-3;
@@ -1360,6 +1385,10 @@ TEST_P(Test_TensorFlow_layers, fp16_weights_fp16_pad_and_concat)
 TEST_P(Test_TensorFlow_layers, fp16_weights_fp16_padding_valid)
 {
     float l1 = 0.00078, lInf = 0.012;
+
+    if (target == DNN_TARGET_CPU_FP16)
+        l1 = 0.00083;
+
     runTensorFlowNet("fp16_padding_valid", false, l1, lInf);
 }
 TEST_P(Test_TensorFlow_layers, fp16_weights_fp16_max_pool_even)
@@ -1407,8 +1436,13 @@ TEST_P(Test_TensorFlow_layers, fp16_weights_fp16_max_pool_odd_valid)
 TEST_P(Test_TensorFlow_layers, fp16_padding_same)
 {
+    float l1 = 7e-4, lInf = 4e-3;
+
+    if (target == DNN_TARGET_CPU_FP16)
+        lInf = 5e-3;
+
     // Reference output values are in range [-3.504, -0.002]
-    runTensorFlowNet("fp16_padding_same", false, 7e-4, 4e-3);
+    runTensorFlowNet("fp16_padding_same", false, l1, lInf);
 }
 TEST_P(Test_TensorFlow_layers, defun)
@@ -1450,6 +1484,9 @@ TEST_P(Test_TensorFlow_layers, lstm)
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
+
     runTensorFlowNet("lstm", true);
     runTensorFlowNet("lstm", true, 0.0, 0.0, true);
 }
@@ -1771,8 +1808,8 @@ TEST_P(Test_TensorFlow_nets, Mask_RCNN)
     Mat outDetections = outs[0];
     Mat outMasks = outs[1];
-    double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.2 : 2e-5;
-    double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.018 : default_lInf;
+    double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.2 : 2e-5;
+    double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.018 : default_lInf;
     normAssertDetections(refDetections, outDetections, "", /*threshold for zero confidence*/1e-5, scoreDiff, iouDiff);
     // Output size of masks is NxCxHxW where
@@ -1805,7 +1842,7 @@ TEST_P(Test_TensorFlow_nets, Mask_RCNN)
     double inter = cv::countNonZero(masks & refMasks);
     double area = cv::countNonZero(masks | refMasks);
-    EXPECT_GE(inter / area, (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.98 : 0.99);
+    EXPECT_GE(inter / area, (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 0.98 : 0.99);
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
         expectNoFallbacks(net);
@@ -1815,6 +1852,7 @@ TEST_P(Test_TensorFlow_nets, EfficientDet)
 {
     if (target != DNN_TARGET_CPU)
     {
+        if (target == DNN_TARGET_CPU_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
         if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
         if (target == DNN_TARGET_OPENCL) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
         if (target == DNN_TARGET_MYRIAD) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp
index 2e18ac8c48..8510ec4e64 100644
--- a/modules/dnn/test/test_torch_importer.cpp
+++ b/modules/dnn/test/test_torch_importer.cpp
@@ -113,7 +113,7 @@ TEST_P(Test_Torch_layers, run_convolution)
 {
     // Output reference values are in range [23.4018, 72.0181]
     double l1 = default_l1, lInf = default_lInf;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         l1 = 0.08;
         lInf = 0.43;
@@ -132,6 +132,8 @@ TEST_P(Test_Torch_layers, run_pool_max)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
     if (target == DNN_TARGET_CUDA_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
+    if (target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     double l1 = 0.0, lInf = 0.0;
     runTorchNet("net_pool_max", "", true, false, true, l1, lInf);
 }
@@ -158,7 +160,7 @@ TEST_P(Test_Torch_layers, run_reshape_single_sample)
 {
     // Reference output values in range [14.4586, 18.4492].
     double l1 = default_l1, lInf = default_lInf;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         l1 = 0.033;
         lInf = 0.05;
@@ -175,6 +177,8 @@ TEST_P(Test_Torch_layers, run_linear)
 {
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+    if (target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
     runTorchNet("net_linear_2d");
 }
@@ -186,7 +190,7 @@ TEST_P(Test_Torch_layers, run_concat)
 TEST_P(Test_Torch_layers, run_depth_concat)
 {
     double lInf = 0.0;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         lInf = 0.032;
     }
@@ -252,7 +256,7 @@ TEST_P(Test_Torch_layers, net_conv_gemm_lrn)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
 #endif
     double l1 = 0.0, lInf = 0.0;
-    if (target == DNN_TARGET_OPENCL_FP16)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_CPU_FP16)
     {
         l1 = 0.046;
         lInf = 0.023;
@@ -369,7 +373,7 @@ TEST_P(Test_Torch_nets, OpenFace_accuracy)
     // Reference output values are in range [-0.17212, 0.263492]
     // on Myriad problem layer: l4_Pooling - does not use pads_begin
     float l1 = 1e-5, lInf = 1e-3;
-    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD)
+    if (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16)
     {
         l1 = 2e-3;
         lInf = 5e-3;
@@ -431,6 +435,8 @@ TEST_P(Test_Torch_nets, ENet_accuracy)
         throw SkipTestException("");
     if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
+    if (target == DNN_TARGET_CPU_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CPU_FP16);
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020010000)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
@@ -562,6 +568,10 @@ TEST_P(Test_Torch_nets, FastNeuralStyle_accuracy)
     {
         normAssert(out, refBlob, "", 0.6, 25);
     }
+    else if (target == DNN_TARGET_CPU_FP16)
+    {
+        normAssert(out, refBlob, "", 0.62, 25);
+    }
     else
         normAssert(out, refBlob, "", 0.5, 1.1);
 }