From c240355cc6cd56083b064866d37ff437ed0eeef3 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Tue, 15 Dec 2020 01:34:20 +0000 Subject: [PATCH] dnn(ocl): avoid mess FP16/FP32 in convolution layer --- modules/core/src/convert.dispatch.cpp | 2 +- modules/core/src/opencl/halfconvert.cl | 13 ++- modules/dnn/src/layers/convolution_layer.cpp | 26 +++--- modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp | 2 - .../src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp | 88 ++++++++++--------- modules/dnn/src/opencl/conv_spatial_helper.cl | 7 +- 6 files changed, 77 insertions(+), 61 deletions(-) diff --git a/modules/core/src/convert.dispatch.cpp b/modules/core/src/convert.dispatch.cpp index 4d396b5e99..c32c5bb420 100644 --- a/modules/core/src/convert.dispatch.cpp +++ b/modules/core/src/convert.dispatch.cpp @@ -138,7 +138,7 @@ static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int sdepth, int sdepth == CV_32F ? "half" : "float", rowsPerWI, sdepth == CV_32F ? " -D FLOAT_TO_HALF " : ""); - ocl::Kernel k("convertFp16", ocl::core::halfconvert_oclsrc, build_opt); + ocl::Kernel k(sdepth == CV_32F ? "convertFp16_FP32_to_FP16" : "convertFp16_FP16_to_FP32", ocl::core::halfconvert_oclsrc, build_opt); if (k.empty()) return false; diff --git a/modules/core/src/opencl/halfconvert.cl b/modules/core/src/opencl/halfconvert.cl index 506df69faf..9df602f406 100644 --- a/modules/core/src/opencl/halfconvert.cl +++ b/modules/core/src/opencl/halfconvert.cl @@ -47,8 +47,17 @@ #endif #endif -__kernel void convertFp16(__global const uchar * srcptr, int src_step, int src_offset, - __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols) +__kernel void +#ifdef FLOAT_TO_HALF + convertFp16_FP32_to_FP16 +#else + convertFp16_FP16_to_FP32 +#endif +( + __global const uchar * srcptr, int src_step, int src_offset, + __global uchar * dstptr, int dst_step, int dst_offset, + int dst_rows, int dst_cols +) { int x = get_global_id(0); int y0 = get_global_id(1) * rowsPerWI; diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 63bd386119..f131c023ab 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -1461,16 +1461,7 @@ public: umat_blobs.resize(n); for (size_t i = 0; i < n; i++) { - if (use_half) - { - Mat matFP32; - convertFp16(inputs[i + 1], matFP32); - matFP32.copyTo(umat_blobs[i]); - } - else - { - inputs[i + 1].copyTo(umat_blobs[i]); - } + inputs[i + 1].copyTo(umat_blobs[i]); } inputs.resize(1); } @@ -1481,7 +1472,10 @@ public: umat_blobs.resize(n); for (size_t i = 0; i < n; i++) { - blobs[i].copyTo(umat_blobs[i]); + if (use_half) + convertFp16(blobs[i], umat_blobs[i]); + else + blobs[i].copyTo(umat_blobs[i]); } } @@ -1537,14 +1531,20 @@ public: if (fusedWeights) { - weightsMat.copyTo(umat_blobs[0]); + if (use_half) + convertFp16(weightsMat, umat_blobs[0]); + else + weightsMat.copyTo(umat_blobs[0]); fusedWeights = false; } if (fusedBias) { if ( umat_blobs.size() < 2 ) umat_blobs.resize(2); - umat_blobs[1] = UMat(biasvec, true); + if (use_half) + convertFp16(Mat(biasvec, true), umat_blobs[1]); + else + Mat(biasvec, true).copyTo(umat_blobs[1]); convolutionOp->setBias(true); fusedBias = false; } diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp index 8de7ba26e2..7bb277d102 100644 --- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp +++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp @@ -274,8 +274,6 @@ class OCL4DNNConvSpatial int32_t group_; bool bias_term_; UMat swizzled_weights_umat; - UMat weights_half; - UMat bias_half; UMat bottom_data2_; int32_t bottom_index_; diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp index 3707a31846..fd98919343 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp @@ -588,16 +588,16 @@ bool OCL4DNNConvSpatial::Forward(const UMat& bottom, fused_eltwise_ = false; } - if (use_half_ && bias_half.empty() && !bias.empty()) - convertFp16(bias, bias_half); + if (use_half_ && !bias.empty()) + CV_CheckTypeEQ(bias.type(), CV_16SC1, ""); - if (use_half_ && weights_half.empty()) - convertFp16(weight, weights_half); + if (use_half_) + CV_CheckTypeEQ(weight.type(), CV_16SC1, ""); - prepareKernel(bottom, top, weight, (use_half_) ? bias_half : bias, numImages); + prepareKernel(bottom, top, weight, bias, numImages); if (bestKernelConfig.empty()) return false; - return convolve(bottom, top, weight, (use_half_) ? bias_half : bias, numImages, bestKernelConfig); + return convolve(bottom, top, weight, bias, numImages, bestKernelConfig); } template @@ -744,29 +744,26 @@ bool OCL4DNNConvSpatial::swizzleWeight(const UMat &weight, kernel_h_ * (int)alignSize(kernel_w_, 2), (use_half_) ? CV_16SC1 : CV_32FC1); - UMat swizzled_weights_tmp; - if (use_half_) - swizzled_weights_tmp.create(shape(swizzled_weights_umat), CV_32F); - if (!interleave) { - cl_uint argIdx = 0; int32_t channels = channels_ / group_; - ocl::Kernel oclk_copy_weight(CL_KERNEL_SELECT("copyWeightsSwizzled"), - cv::ocl::dnn::conv_spatial_helper_oclsrc); + ocl::Kernel oclk_copy_weight( + use_half_ ? "copyWeightsSwizzled_half" : "copyWeightsSwizzled_float", + cv::ocl::dnn::conv_spatial_helper_oclsrc, + use_half_ ? "-DHALF_SUPPORT=1 -DDtype=half" : "-DDtype=float" + ); if (oclk_copy_weight.empty()) return false; - oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); - if (use_half_) - oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_tmp)); - else - oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat)); - oclk_copy_weight.set(argIdx++, kernel_w_); - oclk_copy_weight.set(argIdx++, kernel_h_); - oclk_copy_weight.set(argIdx++, channels); - oclk_copy_weight.set(argIdx++, num_output_); - oclk_copy_weight.set(argIdx++, swizzled_factor); + oclk_copy_weight.args( + ocl::KernelArg::PtrReadOnly(weight), + ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat), + kernel_w_, + kernel_h_, + channels, + num_output_, + swizzled_factor + ); size_t global_work_size_copy[3] = { (size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 }; @@ -778,13 +775,24 @@ bool OCL4DNNConvSpatial::swizzleWeight(const UMat &weight, } } else { // assumption: kernel dimension is 2 - Mat weightMat = weight.getMat(ACCESS_READ); - Dtype* cpu_weight = (Dtype *)weightMat.ptr(); + Mat weightMat; Mat swizzledWeightMat; + UMat weight_tmp; // FP32 in half mode, TODO implement FP16 repack if (use_half_) - swizzledWeightMat = swizzled_weights_tmp.getMat(ACCESS_WRITE); + { + CV_CheckTypeEQ(weight.type(), CV_16SC1, ""); + convertFp16(weight, weight_tmp); + weightMat = weight_tmp.getMat(ACCESS_READ); + swizzledWeightMat.create(shape(swizzled_weights_umat), CV_32F); + } else + { + weightMat = weight.getMat(ACCESS_READ); swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE); + } + + CV_CheckTypeEQ(weightMat.type(), CV_32FC1, ""); + Dtype* cpu_weight = (Dtype *)weightMat.ptr(); Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr(); int interleavedRows = (kernel_w_ / 2) * 2; @@ -792,26 +800,28 @@ bool OCL4DNNConvSpatial::swizzleWeight(const UMat &weight, int blockWidth = swizzled_factor; // should equal to simd size. int rowAlignment = 32; size_t interleaved_filter_size = M_ * kernel_w_ * kernel_h_ * channels_ * sizeof(Dtype); - Dtype * tmpSwizzledWeight = reinterpret_cast(malloc(interleaved_filter_size)); - CHECK_EQ(tmpSwizzledWeight != NULL, true) << "Failed to allocate temporary swizzled weight"; + cv::AutoBuffer tmpSwizzledWeight(interleaved_filter_size); for (int od = 0; od < M_; od++) for (int id = 0; id < channels_; id++) for (int r = 0; r < kernel_h_; r++) for (int c = 0; c < kernel_w_; c++) tmpSwizzledWeight[((id * kernel_h_ + r)* kernel_w_ + c) * M_ + od] = cpu_weight[((od * channels_ + id) * kernel_h_ + r)*kernel_w_+c]; + interleaveMatrix(cpu_swizzled_weight, - tmpSwizzledWeight, + tmpSwizzledWeight.data(), kernel_w_ * kernel_h_ * channels_, M_, interleavedRows, nonInterleavedRows, blockWidth, rowAlignment); - free(tmpSwizzledWeight); - } - if (use_half_) - convertFp16(swizzled_weights_tmp, swizzled_weights_umat); + // unmap OpenCL buffers + weightMat.release(); + + if (use_half_) + convertFp16(swizzledWeightMat, swizzled_weights_umat); + } return true; } @@ -1104,10 +1114,7 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, cl_uint argIdx = 0; setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); - if (use_half_) - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half)); - else - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); if (bias_term_) kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); @@ -1148,10 +1155,7 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); kernel.set(argIdx++, image_offset); - if (use_half_) - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half)); - else - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); kernel.set(argIdx++, kernel_offset); if (bias_term_) kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); @@ -1956,7 +1960,7 @@ void OCL4DNNConvSpatial::prepareKernel(const UMat &bottom, UMat &top, UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1); - calculateBenchmark(bottom, benchData, (use_half_) ? weights_half : weight, bias, numImages); + calculateBenchmark(bottom, benchData, weight, bias, numImages); if (run_auto_tuning_ || force_auto_tuning_) { diff --git a/modules/dnn/src/opencl/conv_spatial_helper.cl b/modules/dnn/src/opencl/conv_spatial_helper.cl index 9d5a89f7b1..33d9db57c8 100644 --- a/modules/dnn/src/opencl/conv_spatial_helper.cl +++ b/modules/dnn/src/opencl/conv_spatial_helper.cl @@ -39,9 +39,14 @@ // //M*/ +#ifdef HALF_SUPPORT +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16:enable +#endif +#endif + #define CONCAT(A,B) A##_##B #define TEMPLATE(name,type) CONCAT(name,type) -#define Dtype float __kernel void TEMPLATE(copyWeightsSwizzled, Dtype) (__global Dtype* weightIn,