add one more convolution kernel tuning candidate

Signed-off-by: Li Peng <peng.li@intel.com>
7 years ago · 181b448c4d
parent 1bc1f3d311
commit 181b448c4d
2 changed files with 236 additions and 11 deletions
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@ -1432,6 +1432,7 @@ void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerPar
        generate_gemmlike_tuneritems(tunerItems, 1, 8, 32);
        generate_gemmlike_tuneritems(tunerItems, 2, 8, 32);
        generate_gemmlike_tuneritems(tunerItems, 1, 16, 32);
        generate_gemmlike_tuneritems(tunerItems, 2, 16, 32);
        // idlf kernel
        for (int simd_size = 8; simd_size <= 16; simd_size += 8)
--- a/modules/dnn/src/opencl/conv_layer_spatial.cl
+++ b/modules/dnn/src/opencl/conv_layer_spatial.cl
@ -384,7 +384,6 @@ convolve_simd(
 #elif defined KERNEL_GEMM_LIKE
 #if APPLY_BIAS
 // Dtype bias[4];
 #define SUBGROUP_GET_BIAS(k, i) intel_sub_group_shuffle(bias[k], i)
 #else
 #define SUBGROUP_GET_BIAS(k, i) ((Dtype)0)
@ -446,9 +445,7 @@ typedef struct float0 { float s0; } float0; //never used but makes compiler happ
 #define TILE_K          KERNEL_WIDTH
 #define TILE_N          32
 #ifndef __BEIGNET__
 __attribute__((intel_reqd_sub_group_size(8)))
 #endif
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
    const int group_x = get_group_id(0);
@ -608,6 +605,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
        Dtype4 *bias_vec;
        bias_vec = (Dtype4*)bias;
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
        if (group_x > 0xFFFFFFFEul) {
          dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
        }
 #else
        const Dtype bias[4] = {0, 0, 0, 0};
 #endif
        if (global_y * TILE_M < output_width * output_height )
        {
@ -768,6 +770,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
        Dtype4 *bias_vec;
        bias_vec = (Dtype4*)bias;
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
        if (group_x > 0xFFFFFFFEul) {
          dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
        }
 #else
        const Dtype bias[4] = {0, 0, 0, 0};
 #endif
        if (global_y * TILE_M < output_width * output_height )
@ -813,9 +820,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 #define TILE_K          KERNEL_WIDTH
 #define TILE_N          32
 #ifndef __BEIGNET__
 __attribute__((intel_reqd_sub_group_size(8)))
 #endif
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
    const int group_x = get_group_id(0);
@ -1012,6 +1017,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
        Dtype4 *bias_vec;
        bias_vec = (Dtype4*)bias;
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
        if (group_x > 0xFFFFFFFEul) {
          dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
        }
 #else
        const Dtype bias[4] = {0, 0, 0, 0};
 #endif
        if( global_y * TILE_M < output_width * output_height )
@ -1221,6 +1231,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
        Dtype4 *bias_vec;
        bias_vec = (Dtype4*)bias;
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
        if (group_x > 0xFFFFFFFEul) {
          dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
        }
 #else
        const Dtype bias[4] = {0, 0, 0, 0};
 #endif
        if( global_y * TILE_M < output_width * output_height )
        {
@ -1334,9 +1349,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 #define TILE_K          KERNEL_WIDTH
 #define TILE_N          32
 #ifndef __BEIGNET__
 __attribute__((intel_reqd_sub_group_size(16)))
 #endif
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
    const int group_x = get_group_id(0);
@ -1396,18 +1409,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
    // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
    // and KERNEL_WIDTH/2 rows of interleaved filter.
    int patch_depth = 0;
 #ifndef __BEIGNET__
    __attribute__((opencl_unroll_hint(1)))
 #endif
    do
    {
        int patch_row = 0;
 #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
        curr_y = saved_y;
 #endif
 #ifndef __BEIGNET__
        __attribute__((opencl_unroll_hint(1)))
 #endif
        do
        {
            // Load atile and btile.
@ -1495,11 +1504,226 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
    Dtype2 *bias_vec;
    bias_vec = (Dtype2*)bias;
    *bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));
    if (group_x > 0xFFFFFFFEul) {
      dst[0] = bias[0] + bias[1];
    }
 #else
    const Dtype bias[2] = {0, 0};
 #endif
    INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);
 }
 #endif
 #ifdef GEMM_LIKE_CONV_32_2_SIMD16
 //////////////////////////////////////////////////////////////////////////////
 // Conv_Interleaved_32_2_SIMD16
 //
 // Convolution: each workitem computes 1 patch x 32 filters worth of output
 // data.
 #define TILE_M          2
 #define TILE_K          KERNEL_WIDTH
 #define TILE_N          32
 __attribute__((intel_reqd_sub_group_size(16)))
 __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 {
    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int global_x = get_global_id(0);
    const int global_y = get_global_id(1);
    const int global_z = get_global_id(2);
    int interleaved_y;
    int kernel_y;
    int kernel_idx;
 #define DOT_PRODUCT_16( _result, _rowA, colB )    \
    {   \
        _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 );  \
        _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 );  \
        _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 );  \
        _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 );  \
        _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 );  \
        _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 );  \
        _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 );  \
        _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 );  \
        _result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 );  \
        _result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 );  \
        _result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa );  \
        _result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb );  \
        _result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc );  \
        _result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd );  \
        _result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se );  \
        _result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf );  \
    }
        typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
    // True for all threads if filter_width is multiple of TILE_N
    // else, true for all but right-most column of threads.
    {
        // Result ctile (*dst) is M rows x N columns
        // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
        Dtype16  blockC00 = 0.f;
        Dtype16  blockC10 = 0.f;
        Dtype16  blockC01 = 0.f;
        Dtype16  blockC11 = 0.f;
        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
        int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
        int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
        int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
        int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
 #if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
        int saved_y0 = curr_y0;
        int saved_y1 = curr_y1;
 #endif
        const __global Dtype *src0_read0 = src0
         + aligned_input_size * global_z                                            // batch offset
         + (curr_y0 - INPUT_PAD_H) * ROW_PITCH   // y offset
         + curr_x0 - INPUT_PAD_W;                // x offset
        const __global Dtype *src0_read1 = src0
         + aligned_input_size * global_z                                            // batch offset
         + (curr_y1 - INPUT_PAD_H) * ROW_PITCH   // y offset
         + curr_x1 - INPUT_PAD_W;                // x offset
        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);
        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
            do
            {
                // Load atile and btile.
                // Kernel data is partially interleaved.  Every 2 rows are interleaved at Dtype8 granularity.
                // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved.  The non
                // interleaved row is padded with zero to ensure same size as interleaved rows. This
                // interleaving is done to ensure 0% GDR bank conflicts.  For example, this is how the
                // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
                // (0, 0) (8, 0) (16, 0) (24, 0) ...       (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
                // (0, 1) (8, 1) (16, 1) (24, 1) ... =>    (0, 2) (8, 2) (16, 2) (24, 2) ...
                // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
                // ...
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
 #if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[  0  ]; src0_read0 += ROW_PITCH;
                Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[  0  ]; src0_read1 += ROW_PITCH;
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
                Dtype*  pblockA01 = (Dtype*)(&blockA01);
 #else
                Dtype_t blockA00;
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                  if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
                    pblockA00[pos] = src0_read0[pos * DILATION_X];
                  else
                    pblockA00[pos] = 0;
                })
                curr_y0 += DILATION_Y;
                Dtype_t blockA01;
                Dtype*  pblockA01 = (Dtype*)(&blockA01);
                pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                  if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
                    pblockA01[pos] = src0_read1[pos * DILATION_X];
                  else
                    pblockA01[pos] = 0;
                })
                curr_y1 += DILATION_Y;
                src0_read0 += (ROW_PITCH * DILATION_Y);
                src0_read1 += (ROW_PITCH * DILATION_Y);
 #endif
                Dtype blockB00[KERNEL_WIDTH*2];
                Dtype4* p4BlockB00 = (Dtype4*)blockB00;
                Dtype2* p2BlockB00 = (Dtype2*)blockB00;
                Dtype*  pBlockB00 =  (Dtype* )blockB00;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    p4BlockB00[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
                    p2BlockB00[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
                    src1_read += WIDTH1 * 2;
                }
                // Perform MADs
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                } )
                if ( kernel_width_is_odd )
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                }
            }
            //while( ++patch_row < 1 ); //debug
            while( ++patch_row < KERNEL_HEIGHT );
 #if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1
            curr_y0 = saved_y0;
            curr_y1 = saved_y1;
 #endif
            src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch
            src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
        }
        //while ( ++patch_depth < 1 );  //debug
        while ( ++patch_depth < INPUT_DEPTH );
        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
        int out0_offset = global_z * out_pitch_z                                                       // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                           // channel offset
         + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
         + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;               // x offset
        int out1_offset = global_z * out_pitch_z                                                       // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                           // channel offset
         + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
         + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;               // x offset
 #if APPLY_BIAS
        Dtype bias[2];
        Dtype2 *bias_vec;
        bias_vec = (Dtype2*)bias;
        *bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));
        if (group_x > 0xFFFFFFFEul) {
          dst[0] = bias[0] + bias[1];
        }
 #else
        const Dtype bias[2] = {0, 0};
 #endif
        INTERLEAVED_SIMD16_OUTPUT(dst, out0_offset, 0);
        INTERLEAVED_SIMD16_OUTPUT(dst, out1_offset, 1);
    }
 }
 #endif
 #elif defined KERNEL_DWCONV
 __kernel void DWCONV(