diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp index 3b73da801c..5eee1da4a0 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp @@ -945,9 +945,11 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, } else if (config->kernelType == KERNEL_TYPE_GEMM_LIKE) { if (!swizzleWeight(weight, config->workItem_output[1], true)) return false; +#if 0 size_t total_bottom_size = bottom_dim_ * numImages; size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_; size_t total_bias_size = M_ * group_; +#endif size_t total_top_size = top_dim_ * numImages; for (int32_t g = 0; g < group_; ++g) { bias_offset = M_ * g; @@ -960,72 +962,25 @@ bool OCL4DNNConvSpatial::convolve(const UMat &bottom, UMat &top, return false; cl_uint argIdx = 0; - setFusionArg(fused_activ_, fused_eltwise_, -1, kernel, argIdx); - - UMat img_buffer; - if (image_offset) - { - CreateSubBuffer(bottom, img_buffer, image_offset, - total_bottom_size - image_offset, false); - if (img_buffer.empty()) - return false; - - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer)); - } - else - { - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); - } + setFusionArg(fused_activ_, fused_eltwise_, output_image_offset, kernel, argIdx); - UMat kernel_buffer; - if (kernel_offset) - { - CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset, - total_kernel_size - kernel_offset, false); - if (kernel_buffer.empty()) - return false; + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom)); + kernel.set(argIdx++, (int)image_offset); + kernel.set(argIdx++, (int)(bottom.total() - image_offset)); - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer)); - } - else - { - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat)); - } + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat)); + kernel.set(argIdx++, (int)kernel_offset); + kernel.set(argIdx++, (int)(swizzled_weights_umat.total() - kernel_offset)); - UMat bias_buffer; if (bias_term_) { - if (bias_offset) - { - CreateSubBuffer(bias, bias_buffer, bias_offset, - total_bias_size - bias_offset, false); - if (bias_buffer.empty()) - return false; - - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer)); - } - else - { - kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); - } + kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias)); + kernel.set(argIdx++, (int)bias_offset); } - UMat out_buffer; - if (output_image_offset) - { - CreateSubBuffer(top, out_buffer, output_image_offset, - total_top_size - output_image_offset, true); - if (out_buffer.empty()) - return false; - - kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer)); - kernel.set(argIdx++, (int)(out_buffer.offset / element_size)); - } - else - { - kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); - kernel.set(argIdx++, (int)(top.offset / element_size)); - } + kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top)); + kernel.set(argIdx++, (int)(top.offset / element_size) + output_image_offset); + kernel.set(argIdx++, (int)total_top_size - (int)(top.offset / element_size)); kernel.set(argIdx++, (uint16_t)width_); kernel.set(argIdx++, (uint16_t)height_); diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl index 55015557a0..e7bbacd4c4 100644 --- a/modules/dnn/src/opencl/conv_layer_spatial.cl +++ b/modules/dnn/src/opencl/conv_layer_spatial.cl @@ -401,13 +401,12 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy. #define ROW_PITCH input_width #define GEMM_LIKE_KERNEL_ARGS \ - ELTWISE_DATA_ARG \ + ELTWISE_DATA_ARG_WITH_OFFSET \ FUSED_ARG \ - const __global Dtype *src0, \ - const __global Dtype *src1, \ - BIAS_KERNEL_ARG \ - __global Dtype *dst_base, \ - const int dst_offset, \ + const __global Dtype *src0_ptr, const unsigned int src0_offset, const unsigned int src0_limit, \ + const __global Dtype *src1_ptr, const unsigned int src1_offset, const unsigned int src1_limit, \ + BIAS_KERNEL_ARG_WITH_OFFSET \ + __global Dtype *dst_base, const unsigned int dst_offset, const unsigned int dst_limit, \ const ushort input_width, \ const ushort input_height, \ const ushort output_width, \ @@ -437,7 +436,17 @@ typedef struct half0 { half s0; } half0; //never used but makes compiler happy. __attribute__((intel_reqd_sub_group_size(8))) __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) { + const __global Dtype *src0 = src0_ptr + src0_offset; + const __global Dtype *src1 = src1_ptr + src1_offset; +#if APPLY_BIAS + __global Dtype* biases_base = biases_base_ptr + biases_base_offset; +#endif + __global Dtype *dst = dst_base + dst_offset; +#ifdef FUSED_CONV_ELTWISE + __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset; +#endif + const int group_x = get_group_id(0); const int group_y = get_group_id(1); const int global_x = get_global_id(0); @@ -460,6 +469,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) } typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t; +// U_GEMM_LIKE_CONV_k11x11_cn3_g1_s4x4_d1x1_b1_in240x240_p0x0_num1_M96_activ1_eltwise0_FP32_5_1_8_32_SIMD8 doesn't run properly (src0_read out of bounds) +// Test: DNNTestNetwork.AlexNet/0 (to run all kernels use OPENCV_OCL4DNN_FORCE_AUTO_TUNING=1) +#if 0 // INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 + #define OPTIMIZE_READ 1 +#else + #define OPTIMIZE_READ 0 +#endif + // True for all threads if filter_width is multiple of TILE_N // else, true for all but right-most column of threads. if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N ) @@ -476,7 +493,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // atile is M rows x K columns. int curr_x = ( global_y % output_width ) * STRIDE_X; int curr_y = ( global_y / output_width ) * STRIDE_Y; -#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 +#if !OPTIMIZE_READ int saved_y = curr_y; #endif const __global Dtype *src0_read = src0 @@ -496,7 +513,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) do { int patch_row = 0; -#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 +#if !OPTIMIZE_READ curr_y = saved_y; #endif @@ -514,11 +531,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // ... const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; -#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 +#if OPTIMIZE_READ #if KERNEL_WIDTH == 3 Dtype_t blockA00 = vload3(0, src0_read); Dtype* pblockA00 = (Dtype*)(&blockA00); #else + #if 0 // debug + if ((int)(src0_read - src0) >= src0_limit - KERNEL_WIDTH) + { + printf("CATCH: src0_read-src0: %d limit=%d curr_y,curr_x=%d,%d\n", (int)(src0_read - src0), src0_limit, curr_y, curr_x); + } + #endif Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; Dtype* pblockA00 = (Dtype*)(&blockA00); #endif @@ -639,7 +662,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // atile is M rows x K columns. int curr_x = ( global_y % output_width ) * STRIDE_X; int curr_y = ( global_y / output_width ) * STRIDE_Y; -#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 +#if !OPTIMIZE_READ int saved_y = curr_y; #endif const __global Dtype *src0_read = src0 @@ -659,14 +682,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) do { int patch_row = 0; -#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 +#if !OPTIMIZE_READ curr_y = saved_y; #endif do { // Load atile and interleaved btile. const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; -#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 +#if OPTIMIZE_READ Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; Dtype* pblockA00 = (Dtype*)(&blockA00); #else @@ -803,7 +826,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) } } } -#endif +#endif // TILE_N_LAST > 0 } #endif #ifdef GEMM_LIKE_CONV_32_2 @@ -826,7 +849,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) __attribute__((intel_reqd_sub_group_size(8))) __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) { + const __global Dtype *src0 = src0_ptr + src0_offset; + const __global Dtype *src1 = src1_ptr + src1_offset; +#if APPLY_BIAS + __global Dtype* biases_base = biases_base_ptr + biases_base_offset; +#endif + __global Dtype *dst = dst_base + dst_offset; +#ifdef FUSED_CONV_ELTWISE + __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset; +#endif + const int group_x = get_group_id(0); const int group_y = get_group_id(1); const int global_x = get_global_id(0); @@ -1388,7 +1421,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) __attribute__((intel_reqd_sub_group_size(16))) __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) { + const __global Dtype *src0 = src0_ptr + src0_offset; + const __global Dtype *src1 = src1_ptr + src1_offset; +#if APPLY_BIAS + __global Dtype* biases_base = biases_base_ptr + biases_base_offset; +#endif + __global Dtype *dst = dst_base + dst_offset; +#ifdef FUSED_CONV_ELTWISE + __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset; +#endif + const int group_x = get_group_id(0); const int group_y = get_group_id(1); const int global_x = get_global_id(0); @@ -1574,7 +1617,17 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) __attribute__((intel_reqd_sub_group_size(16))) __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) { + const __global Dtype *src0 = src0_ptr + src0_offset; + const __global Dtype *src1 = src1_ptr + src1_offset; +#if APPLY_BIAS + __global Dtype* biases_base = biases_base_ptr + biases_base_offset; +#endif + __global Dtype *dst = dst_base + dst_offset; +#ifdef FUSED_CONV_ELTWISE + __global Dtype* eltwise_data = eltwise_ptr + eltwise_offset; +#endif + const int group_x = get_group_id(0); const int group_y = get_group_id(1); const int global_x = get_global_id(0);