dnn(ocl4dnn): cleanup dead code, improve logging

pull/20840/head
Alexander Alekhin 3 years ago
parent a3d7811f24
commit 8c2dd5fb9a
  1. 2
      modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
  2. 100
      modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
  3. 39
      modules/dnn/src/opencl/conv_layer_spatial.cl

@ -222,8 +222,6 @@ class OCL4DNNConvSpatial
bool createDWConvKernel(int32_t blockWidth,
int32_t blockHeight,
int32_t blockDepth);
void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
int32_t offset, int32_t size, bool write_only);
bool convolve(const UMat &bottom, UMat &top,
const UMat &weight, const UMat &bias,
int32_t numImages,

@ -218,14 +218,7 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
#endif
if (!use_cache_path_)
{
static int warn_ = 0;
if (!warn_)
{
std::cerr
<< "OpenCV(ocl4dnn): Kernel configuration cache directory doesn't exist: " << cache_path_ << std::endl
<< std::endl;
warn_ = true;
}
CV_LOG_ONCE_ERROR(NULL, "OpenCV(ocl4dnn): Kernel configuration cache directory doesn't exist: " << cache_path_);
}
}
@ -418,7 +411,6 @@ void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
addDef("CHANNELS", channels_ / group_);
addDef("APPLY_BIAS", bias_term_);
addDef("OUTPUT_Z", M_);
addDef("ZPAR", 1);
setFusionDefine(fused_activ_, fused_eltwise_);
src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
@ -672,8 +664,7 @@ void interleaveMatrix(Dtype* mem_dst, const Dtype *mem,
int r, int c, int interleavedRows, int nonInterleavedRows,
int blockWidth, int rowAlignment )
{
CHECK_EQ(interleavedRows % 2, 0) <<
"interleaveMatrix only supports even values for interleavedRows.";
CV_Check(interleavedRows, interleavedRows % 2 == 0, "interleaveMatrix only supports even values for interleavedRows.");
size_t memSize = r * c * sizeof(float);
size_t dstSize = memSize *
@ -685,9 +676,12 @@ void interleaveMatrix(Dtype* mem_dst, const Dtype *mem,
const int yStride = c * 2;
const Dtype *pSrc = mem;
Dtype* pDst = mem_dst;
for (int y = 0; y < r;) {
for (int rows = 0; rows < interleavedRows; rows += 2) {
if ( y >= r ) break;
for (int y = 0; y < r;)
{
for (int rows = 0; rows < interleavedRows; rows += 2)
{
if (y >= r)
break;
if ((c % xStride) == 0) {
for (int x = 0; x < c / xStride; x++) {
memcpy(pDst + x * xStride * 2, // NOLINT
@ -712,11 +706,14 @@ void interleaveMatrix(Dtype* mem_dst, const Dtype *mem,
y += 2;
}
for (int rows = 0; rows < nonInterleavedRows; rows++) {
if (y >= r) break;
for (int rows = 0; rows < nonInterleavedRows; rows++)
{
if (y >= r)
break;
const int stride = rowAlignment;
int remaining = c;
for (int x = 0; x < c; x += stride) {
for (int x = 0; x < c; x += stride)
{
if (remaining >= stride) {
memcpy(pDst + x * 2, pSrc + x, stride * sizeof(Dtype)); // NOLINT
remaining -=stride;
@ -852,34 +849,6 @@ bool OCL4DNNConvSpatial<float>::createBasicKernel(int32_t blockWidth,
return false;
}
template<>
void OCL4DNNConvSpatial<float>::CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
int32_t offset, int32_t size, bool write_only)
{
cl_mem sub_mem;
cl_buffer_region region;
cl_int err;
size_t element_size = (use_half_) ? sizeof(short) : sizeof(float);
region.origin = offset * element_size + buffer.offset;
region.size = size * element_size;
sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ),
write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY,
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
if (err)
{
std::cout << "Failed to create sub buffer." << std::endl;
return;
}
int step = element_size, rows = size, cols = 1;
ocl::convertFromBuffer(sub_mem, step, rows, cols,
(use_half_) ? CV_16SC1 : CV_32FC1, sub_buffer);
//decrease ocl mem refcount
clReleaseMemObject(sub_mem);
}
template<>
bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
const UMat &weight, const UMat &bias,
@ -938,7 +907,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
kernel.set(argIdx++, (uint16_t)output_h_);
if (!kernel.run_(3, config->global_work_size, config->local_work_size, false))
{
std::cout << "IDLF kernel run failed." << std::endl;
CV_LOG_ERROR(NULL, "DNN/OpenCL: IDLF kernel run failed");
return false;
}
}
@ -1012,7 +981,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
if (!kernel.run_(3, global_size, config->local_work_size, false))
{
std::cout << "GEMM like kernel run failed." << std::endl;
CV_LOG_ERROR(NULL, "DNN/OpenCL: GEMM like kernel run failed");
return false;
}
}
@ -1115,14 +1084,9 @@ float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
{
queue = cv::ocl::Queue::getDefault();
}
catch (const cv::Exception&)
catch (const std::exception& e)
{
static int warn_ = 0;
if (!warn_)
{
std::cout << "OpenCV(ocl4dnn): Can't get OpenCL default queue for auto-tuning." << std::endl;
warn_ = true;
}
CV_LOG_ONCE_ERROR(NULL, "OpenCV(ocl4dnn): Can't get OpenCL default queue for auto-tuning: " << e.what());
return 1e6;
}
@ -1326,9 +1290,9 @@ ocl::Program OCL4DNNConvSpatial<Dtype>::compileKernel()
phash.insert(std::pair<std::string, ocl::Program>(kernel_name_, program));
if (!program.ptr())
{
std::cout << "Failed to compile kernel: " << kernel_name_
<< ", buildflags: " << options
<< ", errmsg: " << errmsg << std::endl;
CV_LOG_WARNING(NULL, "DNN/OpenCL: Failed to compile kernel: " << kernel_name_
<< ", buildflags: '" << options << "', errmsg: '" << errmsg << "'"
);
}
return program;
}
@ -1754,7 +1718,8 @@ void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
fastestTime = kernelQueue[x]->executionTime;
}
}
if (fastestKernel < 0) break;
if (fastestKernel < 0)
break;
// Test fastest kernel
bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[fastestKernel], verifyTop);
if (verified == true) {
@ -1913,17 +1878,18 @@ bool OCL4DNNConvSpatial<Dtype>::setupKernelByConfig(int x, int y, int z, int typ
{
if (z == 1)
z = 16;
CHECK_EQ(z == 16 || z == 8, true) << "invalid SIMD size" << std::endl;
CV_Check(z, z == 16 || z == 8, "DNN/OpenCL: IDLF - invalid SIMD size");
}
kernelQueue.clear();
createConvolutionKernel(type, x, y, z);
if (kernelQueue.size() != 1) {
std::cerr << "Failed setup kernel by config:"
if (kernelQueue.size() != 1)
{
CV_LOG_ERROR(NULL, "DNN/OpenCL: Failed setup kernel by config: "
<< " x = " << x
<< " y = " << y
<< " z = " << z
<< " type = " << type
<< std::endl;
);
return false;
}
bestKernelConfig = kernelQueue[0];
@ -1955,13 +1921,9 @@ bool OCL4DNNConvSpatial<Dtype>::loadTunedConfig()
{
if (cache_path_.empty())
{
static int warn_ = 0;
if (!warn_)
{
std::cout << "OpenCV(ocl4dnn): consider to specify kernel configuration cache directory " << std::endl
<< " via OPENCV_OCL4DNN_CONFIG_PATH parameter." << std::endl;
warn_ = true;
}
CV_LOG_ONCE_WARNING(NULL, "OpenCV(ocl4dnn): consider to specify kernel configuration cache directory "
"through OPENCV_OCL4DNN_CONFIG_PATH parameter."
);
}
return false;
}

@ -161,23 +161,15 @@ __kernel void ConvolveBasic(
const int out_idx = get_global_id(0); // 1D task layout: [output_width * output_height * OUTPUT_Z]
const int plane_size = output_width * output_height;
const int out_plane_idx = out_idx % plane_size;
const int outputZ = out_idx / plane_size;
const int outputZ = out_idx / plane_size; // kernelNum
const int outputY = out_plane_idx / output_width;
const int outputX = out_plane_idx % output_width;
const int kernelNum = outputZ * ZPAR;
if (kernelNum < OUTPUT_Z)
if (outputZ < OUTPUT_Z)
{
Dtype sum[ZPAR];
for (int kern = 0; kern < ZPAR; kern++)
{
sum[kern] = 0.0f;
}
Dtype sum = 0.0f;
const int org_y = outputY * STRIDE_Y - pad_h;
const int org_x = outputX * STRIDE_X - pad_w;
const int currentKernelOffset = kernel_offset + kernelNum*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS;
#if APPLY_BIAS
const int biasIndex = bias_offset + kernelNum;
#endif
const int currentKernelOffset = kernel_offset + outputZ*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS;
const int local_image_offset = org_y * input_width + org_x;
const int imageSize = input_width * input_height;
__global Dtype* image_dataPtr = (image_data + (image_offset + local_image_offset));
@ -186,17 +178,13 @@ __kernel void ConvolveBasic(
{
for (int y = 0; y < KERNEL_HEIGHT; y++)
{
int y_ = org_y + y * DILATION_Y;
for (int x = 0; x < KERNEL_WIDTH; x++)
{
int y_ = org_y + y * DILATION_Y;
int x_ = org_x + x * DILATION_X;
if (!(y_ >= 0 && y_ < input_height && x_ >= 0 && x_ < input_width))
{
continue;
}
for (int kern = 0; kern < ZPAR; kern++)
if (y_ >= 0 && y_ < input_height && x_ >= 0 && x_ < input_width)
{
sum[kern] += image_dataPtr[x * DILATION_X] * kernel_dataPtr[kern*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS + x];
sum = mad(image_dataPtr[x * DILATION_X], kernel_dataPtr[x], sum);
}
}
image_dataPtr += input_width * DILATION_Y;
@ -205,18 +193,13 @@ __kernel void ConvolveBasic(
image_dataPtr += imageSize - input_width*KERNEL_HEIGHT*DILATION_Y;
}
for (int kern = 0; kern < ZPAR; kern++)
{
if (kernelNum + kern < OUTPUT_Z)
{
int offset = convolved_image_offset + (kernelNum+kern)*output_height*output_width + outputY*output_width + outputX;
int offset = convolved_image_offset + out_idx;
#if APPLY_BIAS
ACTIVATION_FUNCTION(convolved_image, offset, sum[kern] + bias[biasIndex + kern], biasIndex + kern);
int biasIndex = bias_offset + outputZ;
ACTIVATION_FUNCTION(convolved_image, offset, sum + bias[biasIndex], biasIndex);
#else
ACTIVATION_FUNCTION(convolved_image, offset, sum[kern], kernelNum + kern);
ACTIVATION_FUNCTION(convolved_image, offset, sum, outputZ);
#endif
}
}
}
}

Loading…
Cancel
Save