From 4c5a86828a81cc3445c102004591157f28f331b3 Mon Sep 17 00:00:00 2001 From: Li Peng Date: Tue, 10 Jul 2018 12:43:03 +0800 Subject: [PATCH] Fix gemmlike convolution input reading use vload3 for half3 or float3 input vector reading, also check read position to see if it exceed input width Signed-off-by: Li Peng --- modules/dnn/src/opencl/conv_layer_spatial.cl | 169 +++++++++++++------ 1 file changed, 115 insertions(+), 54 deletions(-) diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl index dc7b047fe5..2cc161d3ff 100644 --- a/modules/dnn/src/opencl/conv_layer_spatial.cl +++ b/modules/dnn/src/opencl/conv_layer_spatial.cl @@ -467,7 +467,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int saved_y = curr_y; #endif const __global Dtype *src0_read = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset + (curr_x - INPUT_PAD_W); // x offset @@ -502,15 +502,23 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; #if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 + #if KERNEL_WIDTH == 3 + Dtype_t blockA00 = vload3(0, src0_read); + Dtype* pblockA00 = (Dtype*)(&blockA00); + #else Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; Dtype* pblockA00 = (Dtype*)(&blockA00); + #endif #else Dtype_t blockA00; Dtype* pblockA00 = (Dtype*)(&blockA00); int pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y >= INPUT_PAD_H && + curr_y < input_height + INPUT_PAD_H && + curr_x + pos * DILATION_X >= INPUT_PAD_W && + curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read[pos * DILATION_X]; else pblockA00[pos] = 0; @@ -564,17 +572,18 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) //while( ++patch_row < 1 ); //debug while( ++patch_row < KERNEL_HEIGHT ); - src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch + // reset to start of next slice of patch + src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); } //while ( ++patch_depth < 1 ); //debug while ( ++patch_depth < INPUT_DEPTH ); // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. - int out_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + int out_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset + + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset __global Dtype *out = dst + out_offset; #if APPLY_BIAS @@ -621,7 +630,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int saved_y = curr_y; #endif const __global Dtype *src0_read = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset + (curr_x - INPUT_PAD_W); // x offset @@ -653,7 +662,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y >= INPUT_PAD_H && + curr_y < input_height + INPUT_PAD_H && + curr_x + pos * DILATION_X >= INPUT_PAD_W && + curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read[pos * DILATION_X]; else pblockA00[pos] = 0; @@ -730,17 +742,18 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) //while( ++patch_row < 1 ); //debug while( ++patch_row < KERNEL_HEIGHT ); - src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch + // reset to start of next slice of patch + src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); } //while ( ++patch_depth < 1 ); //debug while ( ++patch_depth < INPUT_DEPTH ); // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. - int out_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + int out_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset + + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset __global Dtype *out = dst + out_offset; #if APPLY_BIAS Dtype bias[4]; @@ -849,11 +862,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int saved_y1 = curr_y1; #endif const __global Dtype *src0_read0 = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x0 - INPUT_PAD_W; // x offset const __global Dtype *src0_read1 = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x1 - INPUT_PAD_W; // x offset @@ -883,17 +896,38 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // ... const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; #if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 - Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH; - Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH; + #if KERNEL_WIDTH == 3 + Dtype_t blockA00 = vload3(0, src0_read0); src0_read0 += ROW_PITCH; + Dtype_t blockA01 = vload3(0, src0_read1); src0_read1 += ROW_PITCH; Dtype* pblockA00 = (Dtype*)(&blockA00); Dtype* pblockA01 = (Dtype*)(&blockA01); + #else + Dtype_t blockA00 = { (Dtype)0.f }; + Dtype_t blockA01 = { (Dtype)0.f }; + Dtype* pblockA00 = (Dtype*)(&blockA00); + Dtype* pblockA01 = (Dtype*)(&blockA01); + int pos = 0; + LOOP(KERNEL_WIDTH, pos, + { + if (curr_x0 + pos < input_width) + pblockA00[pos] = src0_read0[pos]; + + if (curr_x1 + pos < input_width) + pblockA01[pos] = src0_read1[pos]; + }) + src0_read0 += ROW_PITCH; + src0_read1 += ROW_PITCH; + #endif #else Dtype_t blockA00; Dtype* pblockA00 = (Dtype*)(&blockA00); int pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y0 >= INPUT_PAD_H && + curr_y0 < input_height + INPUT_PAD_H && + curr_x0 + pos * DILATION_X >= INPUT_PAD_W && + curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read0[pos * DILATION_X]; else pblockA00[pos] = 0; @@ -904,7 +938,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y1 >= INPUT_PAD_H && + curr_y1 < input_height + INPUT_PAD_H && + curr_x1 + pos * DILATION_X >= INPUT_PAD_W && + curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA01[pos] = src0_read1[pos * DILATION_X]; else pblockA01[pos] = 0; @@ -972,7 +1009,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) curr_y0 = saved_y0; curr_y1 = saved_y1; #endif - src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch + // reset to start of next slice of patch + src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); } //while ( ++patch_depth < 1 ); //debug @@ -980,14 +1018,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. - int out0_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + int out0_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset - int out1_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset + int out1_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset + + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset #if APPLY_BIAS Dtype bias[4]; @@ -1049,11 +1087,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int saved_y1 = curr_y1; #endif const __global Dtype *src0_read0 = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x0 - INPUT_PAD_W; // x offset const __global Dtype *src0_read1 = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x1 - INPUT_PAD_W; // x offset @@ -1084,7 +1122,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y0 >= INPUT_PAD_H && + curr_y0 < input_height + INPUT_PAD_H && + curr_x0 + pos * DILATION_X >= INPUT_PAD_W && + curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read0[pos * DILATION_X]; else pblockA00[pos] = 0; @@ -1095,7 +1136,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y1 >= INPUT_PAD_H && + curr_y1 < input_height + INPUT_PAD_H && + curr_x1 + pos * DILATION_X >= INPUT_PAD_W && + curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA01[pos] = src0_read1[pos * DILATION_X]; else pblockA01[pos] = 0; @@ -1185,7 +1229,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) curr_y0 = saved_y0; curr_y1 = saved_y1; #endif - src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch + // reset to start of next slice of patch + src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); } //while ( ++patch_depth < 1 ); //debug @@ -1193,14 +1238,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. - int out0_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + int out0_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset - int out1_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset + int out1_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset + + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset __global Dtype *out1 = dst + out1_offset; #if APPLY_BIAS @@ -1352,9 +1397,9 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int saved_y = curr_y; #endif const __global Dtype *src0_read = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset - + curr_x - INPUT_PAD_W; // x offset + + curr_x - INPUT_PAD_W; // x offset const __global Dtype *src0_read_orig = src0_read; // Src1 (filter) is directly used as btile. @@ -1409,15 +1454,23 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; #if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 + #if KERNEL_WIDTH == 3 + Dtype_t blockA00 = vload3(0, src0_read); + Dtype* pblockA00 = (Dtype*)(&blockA00); + #else Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; Dtype* pblockA00 = (Dtype*)(&blockA00); + #endif #else Dtype_t blockA00; Dtype* pblockA00 = (Dtype*)(&blockA00); int pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y >= INPUT_PAD_H && + curr_y < input_height + INPUT_PAD_H && + curr_x + pos * DILATION_X >= INPUT_PAD_W && + curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read[pos * DILATION_X]; else pblockA00[pos] = 0; @@ -1463,17 +1516,18 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) //while( ++patch_row < 1 ); //debug while( ++patch_row < KERNEL_HEIGHT ); - src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch + // reset to start of next slice of patch + src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); } //while ( ++patch_depth < 1 ); //debug while ( ++patch_depth < INPUT_DEPTH ); // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. - int out_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + int out_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset + + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset __global Dtype *out = dst + out_offset; #if APPLY_BIAS @@ -1556,11 +1610,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int saved_y1 = curr_y1; #endif const __global Dtype *src0_read0 = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x0 - INPUT_PAD_W; // x offset const __global Dtype *src0_read1 = src0 - + aligned_input_size * global_z // batch offset + + aligned_input_size * global_z // batch offset + (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset + curr_x1 - INPUT_PAD_W; // x offset @@ -1600,7 +1654,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) int pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y0 >= INPUT_PAD_H && + curr_y0 < input_height + INPUT_PAD_H && + curr_x0 + pos * DILATION_X >= INPUT_PAD_W && + curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA00[pos] = src0_read0[pos * DILATION_X]; else pblockA00[pos] = 0; @@ -1611,7 +1668,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) pos = 0; LOOP(KERNEL_WIDTH, pos, { - if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) + if (curr_y1 >= INPUT_PAD_H && + curr_y1 < input_height + INPUT_PAD_H && + curr_x1 + pos * DILATION_X >= INPUT_PAD_W && + curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) pblockA01[pos] = src0_read1[pos * DILATION_X]; else pblockA01[pos] = 0; @@ -1667,7 +1727,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) curr_y0 = saved_y0; curr_y1 = saved_y1; #endif - src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch + // reset to start of next slice of patch + src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); } //while ( ++patch_depth < 1 ); //debug @@ -1675,14 +1736,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. - int out0_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + int out0_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset - int out1_offset = global_z * out_pitch_z // batch offset - + ( group_x * TILE_N ) * out_pitch_y // channel offset + + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset + int out1_offset = global_z * out_pitch_z // batch offset + + ( group_x * TILE_N ) * out_pitch_y // channel offset + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset - + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset + + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset #if APPLY_BIAS Dtype bias[2];