dnn(ocl): avoid mess FP16/FP32 in convolution layer

pull/19115/head
Alexander Alekhin 4 years ago
parent 1bfc75ac23
commit c240355cc6
  1. 2
      modules/core/src/convert.dispatch.cpp
  2. 13
      modules/core/src/opencl/halfconvert.cl
  3. 26
      modules/dnn/src/layers/convolution_layer.cpp
  4. 2
      modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
  5. 88
      modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
  6. 7
      modules/dnn/src/opencl/conv_spatial_helper.cl

@ -138,7 +138,7 @@ static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int sdepth, int
sdepth == CV_32F ? "half" : "float",
rowsPerWI,
sdepth == CV_32F ? " -D FLOAT_TO_HALF " : "");
ocl::Kernel k("convertFp16", ocl::core::halfconvert_oclsrc, build_opt);
ocl::Kernel k(sdepth == CV_32F ? "convertFp16_FP32_to_FP16" : "convertFp16_FP16_to_FP32", ocl::core::halfconvert_oclsrc, build_opt);
if (k.empty())
return false;

@ -47,8 +47,17 @@
#endif
#endif
__kernel void convertFp16(__global const uchar * srcptr, int src_step, int src_offset,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols)
__kernel void
#ifdef FLOAT_TO_HALF
convertFp16_FP32_to_FP16
#else
convertFp16_FP16_to_FP32
#endif
(
__global const uchar * srcptr, int src_step, int src_offset,
__global uchar * dstptr, int dst_step, int dst_offset,
int dst_rows, int dst_cols
)
{
int x = get_global_id(0);
int y0 = get_global_id(1) * rowsPerWI;

@ -1461,16 +1461,7 @@ public:
umat_blobs.resize(n);
for (size_t i = 0; i < n; i++)
{
if (use_half)
{
Mat matFP32;
convertFp16(inputs[i + 1], matFP32);
matFP32.copyTo(umat_blobs[i]);
}
else
{
inputs[i + 1].copyTo(umat_blobs[i]);
}
inputs[i + 1].copyTo(umat_blobs[i]);
}
inputs.resize(1);
}
@ -1481,7 +1472,10 @@ public:
umat_blobs.resize(n);
for (size_t i = 0; i < n; i++)
{
blobs[i].copyTo(umat_blobs[i]);
if (use_half)
convertFp16(blobs[i], umat_blobs[i]);
else
blobs[i].copyTo(umat_blobs[i]);
}
}
@ -1537,14 +1531,20 @@ public:
if (fusedWeights)
{
weightsMat.copyTo(umat_blobs[0]);
if (use_half)
convertFp16(weightsMat, umat_blobs[0]);
else
weightsMat.copyTo(umat_blobs[0]);
fusedWeights = false;
}
if (fusedBias)
{
if ( umat_blobs.size() < 2 )
umat_blobs.resize(2);
umat_blobs[1] = UMat(biasvec, true);
if (use_half)
convertFp16(Mat(biasvec, true), umat_blobs[1]);
else
Mat(biasvec, true).copyTo(umat_blobs[1]);
convolutionOp->setBias(true);
fusedBias = false;
}

@ -274,8 +274,6 @@ class OCL4DNNConvSpatial
int32_t group_;
bool bias_term_;
UMat swizzled_weights_umat;
UMat weights_half;
UMat bias_half;
UMat bottom_data2_;
int32_t bottom_index_;

@ -588,16 +588,16 @@ bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
fused_eltwise_ = false;
}
if (use_half_ && bias_half.empty() && !bias.empty())
convertFp16(bias, bias_half);
if (use_half_ && !bias.empty())
CV_CheckTypeEQ(bias.type(), CV_16SC1, "");
if (use_half_ && weights_half.empty())
convertFp16(weight, weights_half);
if (use_half_)
CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
prepareKernel(bottom, top, weight, (use_half_) ? bias_half : bias, numImages);
prepareKernel(bottom, top, weight, bias, numImages);
if (bestKernelConfig.empty())
return false;
return convolve(bottom, top, weight, (use_half_) ? bias_half : bias, numImages, bestKernelConfig);
return convolve(bottom, top, weight, bias, numImages, bestKernelConfig);
}
template<typename Dtype>
@ -744,29 +744,26 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
kernel_h_ * (int)alignSize(kernel_w_, 2),
(use_half_) ? CV_16SC1 : CV_32FC1);
UMat swizzled_weights_tmp;
if (use_half_)
swizzled_weights_tmp.create(shape(swizzled_weights_umat), CV_32F);
if (!interleave) {
cl_uint argIdx = 0;
int32_t channels = channels_ / group_;
ocl::Kernel oclk_copy_weight(CL_KERNEL_SELECT("copyWeightsSwizzled"),
cv::ocl::dnn::conv_spatial_helper_oclsrc);
ocl::Kernel oclk_copy_weight(
use_half_ ? "copyWeightsSwizzled_half" : "copyWeightsSwizzled_float",
cv::ocl::dnn::conv_spatial_helper_oclsrc,
use_half_ ? "-DHALF_SUPPORT=1 -DDtype=half" : "-DDtype=float"
);
if (oclk_copy_weight.empty())
return false;
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
if (use_half_)
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_tmp));
else
oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat));
oclk_copy_weight.set(argIdx++, kernel_w_);
oclk_copy_weight.set(argIdx++, kernel_h_);
oclk_copy_weight.set(argIdx++, channels);
oclk_copy_weight.set(argIdx++, num_output_);
oclk_copy_weight.set(argIdx++, swizzled_factor);
oclk_copy_weight.args(
ocl::KernelArg::PtrReadOnly(weight),
ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat),
kernel_w_,
kernel_h_,
channels,
num_output_,
swizzled_factor
);
size_t global_work_size_copy[3] = {
(size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 };
@ -778,13 +775,24 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
}
} else {
// assumption: kernel dimension is 2
Mat weightMat = weight.getMat(ACCESS_READ);
Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>();
Mat weightMat;
Mat swizzledWeightMat;
UMat weight_tmp; // FP32 in half mode, TODO implement FP16 repack
if (use_half_)
swizzledWeightMat = swizzled_weights_tmp.getMat(ACCESS_WRITE);
{
CV_CheckTypeEQ(weight.type(), CV_16SC1, "");
convertFp16(weight, weight_tmp);
weightMat = weight_tmp.getMat(ACCESS_READ);
swizzledWeightMat.create(shape(swizzled_weights_umat), CV_32F);
}
else
{
weightMat = weight.getMat(ACCESS_READ);
swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE);
}
CV_CheckTypeEQ(weightMat.type(), CV_32FC1, "");
Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>();
Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr<float>();
int interleavedRows = (kernel_w_ / 2) * 2;
@ -792,26 +800,28 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
int blockWidth = swizzled_factor; // should equal to simd size.
int rowAlignment = 32;
size_t interleaved_filter_size = M_ * kernel_w_ * kernel_h_ * channels_ * sizeof(Dtype);
Dtype * tmpSwizzledWeight = reinterpret_cast<Dtype*>(malloc(interleaved_filter_size));
CHECK_EQ(tmpSwizzledWeight != NULL, true) << "Failed to allocate temporary swizzled weight";
cv::AutoBuffer<Dtype, 0> tmpSwizzledWeight(interleaved_filter_size);
for (int od = 0; od < M_; od++)
for (int id = 0; id < channels_; id++)
for (int r = 0; r < kernel_h_; r++)
for (int c = 0; c < kernel_w_; c++)
tmpSwizzledWeight[((id * kernel_h_ + r)* kernel_w_ + c) * M_ + od] =
cpu_weight[((od * channels_ + id) * kernel_h_ + r)*kernel_w_+c];
interleaveMatrix(cpu_swizzled_weight,
tmpSwizzledWeight,
tmpSwizzledWeight.data(),
kernel_w_ * kernel_h_ * channels_, M_,
interleavedRows,
nonInterleavedRows,
blockWidth,
rowAlignment);
free(tmpSwizzledWeight);
}
if (use_half_)
convertFp16(swizzled_weights_tmp, swizzled_weights_umat);
// unmap OpenCL buffers
weightMat.release();
if (use_half_)
convertFp16(swizzledWeightMat, swizzled_weights_umat);
}
return true;
}
@ -1104,10 +1114,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
cl_uint argIdx = 0;
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
if (use_half_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
else
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
if (bias_term_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
@ -1148,10 +1155,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
setFusionArg(fused_activ_, fused_eltwise_, kernel, argIdx);
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
kernel.set(argIdx++, image_offset);
if (use_half_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weights_half));
else
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
kernel.set(argIdx++, kernel_offset);
if (bias_term_)
kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
@ -1956,7 +1960,7 @@ void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
UMat benchData(1, numImages * top_dim_, (use_half_) ? CV_16SC1 : CV_32FC1);
calculateBenchmark(bottom, benchData, (use_half_) ? weights_half : weight, bias, numImages);
calculateBenchmark(bottom, benchData, weight, bias, numImages);
if (run_auto_tuning_ || force_auto_tuning_)
{

@ -39,9 +39,14 @@
//
//M*/
#ifdef HALF_SUPPORT
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16:enable
#endif
#endif
#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
#define Dtype float
__kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
(__global Dtype* weightIn,

Loading…
Cancel
Save