dnn(ocl): avoid mess FP16/FP32 in convolution layer

pull/19115/head
Alexander Alekhin 4 years ago
parent 1bfc75ac23
commit c240355cc6
  1. 2
      modules/core/src/convert.dispatch.cpp
  2. 13
      modules/core/src/opencl/halfconvert.cl
  3. 26
      modules/dnn/src/layers/convolution_layer.cpp
  4. 2
      modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
  5. 88
      modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
  6. 7
      modules/dnn/src/opencl/conv_spatial_helper.cl

@ -138,7 +138,7 @@ static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int sdepth, int
sdepth == CV_32F ? "half" : "float", sdepth == CV_32F ? "half" : "float",
rowsPerWI, rowsPerWI,
sdepth == CV_32F ? " -D FLOAT_TO_HALF " : ""); sdepth == CV_32F ? " -D FLOAT_TO_HALF " : "");
ocl::Kernel k("convertFp16", ocl::core::halfconvert_oclsrc, build_opt); ocl::Kernel k(sdepth == CV_32F ? "convertFp16_FP32_to_FP16" : "convertFp16_FP16_to_FP32", ocl::core::halfconvert_oclsrc, build_opt);
if (k.empty()) if (k.empty())
return false; return false;

@ -47,8 +47,17 @@
#endif #endif
#endif #endif
__kernel void convertFp16(__global const uchar * srcptr, int src_step, int src_offset, __kernel void
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols) #ifdef FLOAT_TO_HALF
convertFp16_FP32_to_FP16
#else
convertFp16_FP16_to_FP32
#endif
(
__global const uchar * srcptr, int src_step, int src_offset,
__global uchar * dstptr, int dst_step, int dst_offset,
int dst_rows, int dst_cols
)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y0 = get_global_id(1) * rowsPerWI; int y0 = get_global_id(1) * rowsPerWI;

@ -1461,16 +1461,7 @@ public:
umat_blobs.resize(n); umat_blobs.resize(n);
for (size_t i = 0; i < n; i++) for (size_t i = 0; i < n; i++)
{ {
if (use_half) inputs[i + 1].copyTo(umat_blobs[i]);
{
Mat matFP32;
convertFp16(inputs[i + 1], matFP32);
matFP32.copyTo(umat_blobs[i]);
}
else
{
inputs[i + 1].copyTo(umat_blobs[i]);
}
} }
inputs.resize(1); inputs.resize(1);
} }
@ -1481,7 +1472,10 @@ public:
umat_blobs.resize(n); umat_blobs.resize(n);
for (size_t i = 0; i < n; i++) for (size_t i = 0; i < n; i++)
{ {
blobs[i].copyTo(umat_blobs[i]); if (use_half)
convertFp16(blobs[i], umat_blobs[i]);
else
blobs[i].copyTo(umat_blobs[i]);
} }
} }
@ -1537,14 +1531,20 @@ public:
if (fusedWeights) if (fusedWeights)
{ {
weightsMat.copyTo(umat_blobs[0]); if (use_half)
convertFp16(weightsMat, umat_blobs[0]);
else
weightsMat.copyTo(umat_blobs[0]);
fusedWeights = false; fusedWeights = false;
} }
if (fusedBias) if (fusedBias)
{ {
if ( umat_blobs.size() < 2 ) if ( umat_blobs.size() < 2 )
umat_blobs.resize(2); umat_blobs.resize(2);
umat_blobs[1] = UMat(biasvec, true); if (use_half)
convertFp16(Mat(biasvec, true), umat_blobs[1]);
else
Mat(biasvec, true).copyTo(umat_blobs[1]);
convolutionOp->setBias(true); convolutionOp->setBias(true);
fusedBias = false; fusedBias = false;
} }

@ -274,8 +274,6 @@ class OCL4DNNConvSpatial
int32_t group_; int32_t group_;
bool bias_term_; bool bias_term_;
UMat swizzled_weights_umat; UMat swizzled_weights_umat;
UMat weights_half;
UMat bias_half;
UMat bottom_data2_; UMat bottom_data2_;
int32_t bottom_index_; int32_t bottom_index_;

@ -588,16 +588,16 @@ bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
fused_eltwise_ = false; fused_eltwise_ = false;
} }
if (use_half_ && bias_half.empty() && !bias.empty()) if (use_half_ && !bias.empty())
convertFp16(bias, bias_half); CV_CheckTypeEQ(bias.type(), CV_16SC1, "");
if (use_half_ && weights_half.empty()) if (use_half_)
convertFp16(weight, weights_half); CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
prepareKernel(bottom, top, weight, (use_half_) ? bias_half : bias, numImages); prepareKernel(bottom, top, weight, bias, numImages);
if (bestKernelConfig.empty()) if (bestKernelConfig.empty())
return false; return false;
return convolve(bottom, top, weight, (use_half_) ? bias_half : bias, numImages, bestKernelConfig); return convolve(bottom, top, weight, bias, numImages, bestKernelConfig);
} }
template<typename Dtype> template<typename Dtype>
@ -744,29 +744,26 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
kernel_h_ * (int)alignSize(kernel_w_, 2), kernel_h_ * (int)alignSize(kernel_w_, 2),
(use_half_) ? CV_16SC1 : CV_32FC1); (use_half_) ? CV_16SC1 : CV_32FC1);
UMat swizzled_weights_tmp;
if (use_half_)
swizzled_weights_tmp.create(shape(swizzled_weights_umat), CV_32F);
if (!interleave) { if (!interleave) {
cl_uint argIdx = 0;
int32_t channels = channels_ / group_; int32_t channels = channels_ / group_;
ocl::Kernel oclk_copy_weight(CL_KERNEL_SELECT("copyWeightsSwizzled"), ocl::Kernel oclk_copy_weight(
cv::ocl::dnn::conv_spatial_helper_oclsrc); use_half_ ? "copyWeightsSwizzled_half" : "copyWeightsSwizzled_float",
cv::ocl::dnn::conv_spatial_helper_oclsrc,
use_half_ ? "-DHALF_SUPPORT=1 -DDtype=half" : "-DDtype=float"
);
if (oclk_copy_weight.empty()) if (oclk_copy_weight.empty())
return false; return false;
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight)); oclk_copy_weight.args(
if (use_half_) ocl::KernelArg::PtrReadOnly(weight),
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_tmp)); ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat),
else kernel_w_,
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat)); kernel_h_,
oclk_copy_weight.set(argIdx++, kernel_w_); channels,
oclk_copy_weight.set(argIdx++, kernel_h_); num_output_,
oclk_copy_weight.set(argIdx++, channels); swizzled_factor
oclk_copy_weight.set(argIdx++, num_output_); );
oclk_copy_weight.set(argIdx++, swizzled_factor);
size_t global_work_size_copy[3] = { size_t global_work_size_copy[3] = {
(size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 }; (size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 };
@ -778,13 +775,24 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
} }
} else { } else {
// assumption: kernel dimension is 2 // assumption: kernel dimension is 2
Mat weightMat = weight.getMat(ACCESS_READ); Mat weightMat;
Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>();
Mat swizzledWeightMat; Mat swizzledWeightMat;
UMat weight_tmp; // FP32 in half mode, TODO implement FP16 repack
if (use_half_) if (use_half_)
swizzledWeightMat = swizzled_weights_tmp.getMat(ACCESS_WRITE); {
CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
convertFp16(weight, weight_tmp);
weightMat = weight_tmp.getMat(ACCESS_READ);
swizzledWeightMat.create(shape(swizzled_weights_umat), CV_32F);
}
else else
{
weightMat = weight.getMat(ACCESS_READ);
swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE); swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE);
}
CV_CheckTypeEQ(weightMat.type(), CV_32FC1, "");
Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>();
Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr<float>(); Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr<float>();
int interleavedRows = (kernel_w_ / 2) * 2; int interleavedRows = (kernel_w_ / 2) * 2;
@ -792,26 +800,28 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
int blockWidth = swizzled_factor; // should equal to simd size. int blockWidth = swizzled_factor; // should equal to simd size.
int rowAlignment = 32; int rowAlignment = 32;
size_t interleaved_filter_size = M_ * kernel_w_ * kernel_h_ * channels_ * sizeof(Dtype); size_t interleaved_filter_size = M_ * kernel_w_ * kernel_h_ * channels_ * sizeof(Dtype);
Dtype * tmpSwizzledWeight = reinterpret_cast<Dtype*>(malloc(interleaved_filter_size)); cv::AutoBuffer<Dtype, 0> tmpSwizzledWeight(interleaved_filter_size);
CHECK_EQ(tmpSwizzledWeight != NULL, true) << "Failed to allocate temporary swizzled weight";
for (int od = 0; od < M_; od++) for (int od = 0; od < M_; od++)
for (int id = 0; id < channels_; id++) for (int id = 0; id < channels_; id++)
for (int r = 0; r < kernel_h_; r++) for (int r = 0; r < kernel_h_; r++)
for (int c = 0; c < kernel_w_; c++) for (int c = 0; c < kernel_w_; c++)
tmpSwizzledWeight[((id * kernel_h_ + r)* kernel_w_ + c) * M_ + od] = tmpSwizzledWeight[((id * kernel_h_ + r)* kernel_w_ + c) * M_ + od] =
cpu_weight[((od * channels_ + id) * kernel_h_ + r)*kernel_w_+c]; cpu_weight[((od * channels_ + id) * kernel_h_ + r)*kernel_w_+c];
interleaveMatrix(cpu_swizzled_weight, interleaveMatrix(cpu_swizzled_weight,
tmpSwizzledWeight, tmpSwizzledWeight.data(),
kernel_w_ * kernel_h_ * channels_, M_, kernel_w_ * kernel_h_ * channels_, M_,
interleavedRows, interleavedRows,
nonInterleavedRows, nonInterleavedRows,
blockWidth, blockWidth,
rowAlignment); rowAlignment);
free(tmpSwizzledWeight);
}
if (use_half_) // unmap OpenCL buffers
convertFp16(swizzled_weights_tmp, swizzled_weights_umat); weightMat.release();
if (use_half_)
convertFp16(swizzledWeightMat, swizzled_weights_umat);
}
return true; return true;
} }
@ -1104,10 +1114,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
cl_uint argIdx = 0; cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
if (use_half_) kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
else
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
if (bias_term_) if (bias_term_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
@ -1148,10 +1155,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx); setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, image_offset); kernel.set(argIdx++, image_offset);
if (use_half_) kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
else
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
kernel.set(argIdx++, kernel_offset); kernel.set(argIdx++, kernel_offset);
if (bias_term_) if (bias_term_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
@ -1956,7 +1960,7 @@ void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1); UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1);
calculateBenchmark(bottom, benchData, (use_half_) ? weights_half : weight, bias, numImages); calculateBenchmark(bottom, benchData, weight, bias, numImages);
if (run_auto_tuning_ || force_auto_tuning_) if (run_auto_tuning_ || force_auto_tuning_)
{ {

@ -39,9 +39,14 @@
// //
//M*/ //M*/
#ifdef HALF_SUPPORT
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16:enable
#endif
#endif
#define CONCAT(A,B) A##_##B #define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type) #define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float
__kernel void TEMPLATE(copyWeightsSwizzled, Dtype) __kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
(__global Dtype* weightIn, (__global Dtype* weightIn,

Loading…
Cancel
Save