|
|
|
@ -467,7 +467,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
int saved_y = curr_y; |
|
|
|
|
#endif |
|
|
|
|
const __global Dtype *src0_read = src0 |
|
|
|
|
+ aligned_input_size * global_z // batch offset |
|
|
|
|
+ aligned_input_size * global_z // batch offset |
|
|
|
|
+ (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset |
|
|
|
|
+ (curr_x - INPUT_PAD_W); // x offset |
|
|
|
|
|
|
|
|
@ -502,15 +502,23 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; |
|
|
|
|
|
|
|
|
|
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 |
|
|
|
|
#if KERNEL_WIDTH == 3 |
|
|
|
|
Dtype_t blockA00 = vload3(0, src0_read); |
|
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
|
|
#else |
|
|
|
|
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; |
|
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
|
|
#endif |
|
|
|
|
#else |
|
|
|
|
Dtype_t blockA00; |
|
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
|
|
int pos = 0; |
|
|
|
|
LOOP(KERNEL_WIDTH, pos, |
|
|
|
|
{ |
|
|
|
|
if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) |
|
|
|
|
if (curr_y >= INPUT_PAD_H && |
|
|
|
|
curr_y < input_height + INPUT_PAD_H && |
|
|
|
|
curr_x + pos * DILATION_X >= INPUT_PAD_W && |
|
|
|
|
curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) |
|
|
|
|
pblockA00[pos] = src0_read[pos * DILATION_X]; |
|
|
|
|
else |
|
|
|
|
pblockA00[pos] = 0; |
|
|
|
@ -564,17 +572,18 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
//while( ++patch_row < 1 ); //debug |
|
|
|
|
while( ++patch_row < KERNEL_HEIGHT ); |
|
|
|
|
|
|
|
|
|
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch |
|
|
|
|
// reset to start of next slice of patch |
|
|
|
|
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); |
|
|
|
|
} |
|
|
|
|
//while ( ++patch_depth < 1 ); //debug |
|
|
|
|
while ( ++patch_depth < INPUT_DEPTH ); |
|
|
|
|
|
|
|
|
|
// Dst resembles a cube of width x height x (output channel * batches). Each tile writes: |
|
|
|
|
// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. |
|
|
|
|
int out_offset = global_z * out_pitch_z // batch offset |
|
|
|
|
+ ( group_x * TILE_N ) * out_pitch_y // channel offset |
|
|
|
|
int out_offset = global_z * out_pitch_z // batch offset |
|
|
|
|
+ ( group_x * TILE_N ) * out_pitch_y // channel offset |
|
|
|
|
+ ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset |
|
|
|
|
+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset |
|
|
|
|
+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset |
|
|
|
|
|
|
|
|
|
__global Dtype *out = dst + out_offset; |
|
|
|
|
#if APPLY_BIAS |
|
|
|
@ -621,7 +630,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
int saved_y = curr_y; |
|
|
|
|
#endif |
|
|
|
|
const __global Dtype *src0_read = src0 |
|
|
|
|
+ aligned_input_size * global_z // batch offset |
|
|
|
|
+ aligned_input_size * global_z // batch offset |
|
|
|
|
+ (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset |
|
|
|
|
+ (curr_x - INPUT_PAD_W); // x offset |
|
|
|
|
|
|
|
|
@ -653,7 +662,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
int pos = 0; |
|
|
|
|
LOOP(KERNEL_WIDTH, pos, |
|
|
|
|
{ |
|
|
|
|
if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) |
|
|
|
|
if (curr_y >= INPUT_PAD_H && |
|
|
|
|
curr_y < input_height + INPUT_PAD_H && |
|
|
|
|
curr_x + pos * DILATION_X >= INPUT_PAD_W && |
|
|
|
|
curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) |
|
|
|
|
pblockA00[pos] = src0_read[pos * DILATION_X]; |
|
|
|
|
else |
|
|
|
|
pblockA00[pos] = 0; |
|
|
|
@ -730,17 +742,18 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
//while( ++patch_row < 1 ); //debug |
|
|
|
|
while( ++patch_row < KERNEL_HEIGHT ); |
|
|
|
|
|
|
|
|
|
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch |
|
|
|
|
// reset to start of next slice of patch |
|
|
|
|
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); |
|
|
|
|
} |
|
|
|
|
//while ( ++patch_depth < 1 ); //debug |
|
|
|
|
while ( ++patch_depth < INPUT_DEPTH ); |
|
|
|
|
|
|
|
|
|
// Dst resembles a cube of width x height x (output channel * batches). Each tile writes: |
|
|
|
|
// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. |
|
|
|
|
int out_offset = global_z * out_pitch_z // batch offset |
|
|
|
|
+ ( group_x * TILE_N ) * out_pitch_y // channel offset |
|
|
|
|
int out_offset = global_z * out_pitch_z // batch offset |
|
|
|
|
+ ( group_x * TILE_N ) * out_pitch_y // channel offset |
|
|
|
|
+ ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset |
|
|
|
|
+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset |
|
|
|
|
+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset |
|
|
|
|
__global Dtype *out = dst + out_offset; |
|
|
|
|
#if APPLY_BIAS |
|
|
|
|
Dtype bias[4]; |
|
|
|
@ -849,11 +862,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
int saved_y1 = curr_y1; |
|
|
|
|
#endif |
|
|
|
|
const __global Dtype *src0_read0 = src0 |
|
|
|
|
+ aligned_input_size * global_z // batch offset |
|
|
|
|
+ aligned_input_size * global_z // batch offset |
|
|
|
|
+ (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset |
|
|
|
|
+ curr_x0 - INPUT_PAD_W; // x offset |
|
|
|
|
const __global Dtype *src0_read1 = src0 |
|
|
|
|
+ aligned_input_size * global_z // batch offset |
|
|
|
|
+ aligned_input_size * global_z // batch offset |
|
|
|
|
+ (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset |
|
|
|
|
+ curr_x1 - INPUT_PAD_W; // x offset |
|
|
|
|
|
|
|
|
@ -883,17 +896,38 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
// ... |
|
|
|
|
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; |
|
|
|
|
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 |
|
|
|
|
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH; |
|
|
|
|
Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH; |
|
|
|
|
#if KERNEL_WIDTH == 3 |
|
|
|
|
Dtype_t blockA00 = vload3(0, src0_read0); src0_read0 += ROW_PITCH; |
|
|
|
|
Dtype_t blockA01 = vload3(0, src0_read1); src0_read1 += ROW_PITCH; |
|
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
|
|
Dtype* pblockA01 = (Dtype*)(&blockA01); |
|
|
|
|
#else |
|
|
|
|
Dtype_t blockA00 = { (Dtype)0.f }; |
|
|
|
|
Dtype_t blockA01 = { (Dtype)0.f }; |
|
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
|
|
Dtype* pblockA01 = (Dtype*)(&blockA01); |
|
|
|
|
int pos = 0; |
|
|
|
|
LOOP(KERNEL_WIDTH, pos, |
|
|
|
|
{ |
|
|
|
|
if (curr_x0 + pos < input_width) |
|
|
|
|
pblockA00[pos] = src0_read0[pos]; |
|
|
|
|
|
|
|
|
|
if (curr_x1 + pos < input_width) |
|
|
|
|
pblockA01[pos] = src0_read1[pos]; |
|
|
|
|
}) |
|
|
|
|
src0_read0 += ROW_PITCH; |
|
|
|
|
src0_read1 += ROW_PITCH; |
|
|
|
|
#endif |
|
|
|
|
#else |
|
|
|
|
Dtype_t blockA00; |
|
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
|
|
int pos = 0; |
|
|
|
|
LOOP(KERNEL_WIDTH, pos, |
|
|
|
|
{ |
|
|
|
|
if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) |
|
|
|
|
if (curr_y0 >= INPUT_PAD_H && |
|
|
|
|
curr_y0 < input_height + INPUT_PAD_H && |
|
|
|
|
curr_x0 + pos * DILATION_X >= INPUT_PAD_W && |
|
|
|
|
curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) |
|
|
|
|
pblockA00[pos] = src0_read0[pos * DILATION_X]; |
|
|
|
|
else |
|
|
|
|
pblockA00[pos] = 0; |
|
|
|
@ -904,7 +938,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
pos = 0; |
|
|
|
|
LOOP(KERNEL_WIDTH, pos, |
|
|
|
|
{ |
|
|
|
|
if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) |
|
|
|
|
if (curr_y1 >= INPUT_PAD_H && |
|
|
|
|
curr_y1 < input_height + INPUT_PAD_H && |
|
|
|
|
curr_x1 + pos * DILATION_X >= INPUT_PAD_W && |
|
|
|
|
curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) |
|
|
|
|
pblockA01[pos] = src0_read1[pos * DILATION_X]; |
|
|
|
|
else |
|
|
|
|
pblockA01[pos] = 0; |
|
|
|
@ -972,7 +1009,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
curr_y0 = saved_y0; |
|
|
|
|
curr_y1 = saved_y1; |
|
|
|
|
#endif |
|
|
|
|
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch |
|
|
|
|
// reset to start of next slice of patch |
|
|
|
|
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); |
|
|
|
|
src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); |
|
|
|
|
} |
|
|
|
|
//while ( ++patch_depth < 1 ); //debug |
|
|
|
@ -980,14 +1018,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
|
|
|
|
|
// Dst resembles a cube of width x height x (output channel * batches). Each tile writes: |
|
|
|
|
// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. |
|
|
|
|
int out0_offset = global_z * out_pitch_z // batch offset |
|
|
|
|
+ ( group_x * TILE_N ) * out_pitch_y // channel offset |
|
|
|
|
int out0_offset = global_z * out_pitch_z // batch offset |
|
|
|
|
+ ( group_x * TILE_N ) * out_pitch_y // channel offset |
|
|
|
|
+ ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset |
|
|
|
|
+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset |
|
|
|
|
int out1_offset = global_z * out_pitch_z // batch offset |
|
|
|
|
+ ( group_x * TILE_N ) * out_pitch_y // channel offset |
|
|
|
|
+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset |
|
|
|
|
int out1_offset = global_z * out_pitch_z // batch offset |
|
|
|
|
+ ( group_x * TILE_N ) * out_pitch_y // channel offset |
|
|
|
|
+ ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset |
|
|
|
|
+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset |
|
|
|
|
+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset |
|
|
|
|
|
|
|
|
|
#if APPLY_BIAS |
|
|
|
|
Dtype bias[4]; |
|
|
|
@ -1049,11 +1087,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
int saved_y1 = curr_y1; |
|
|
|
|
#endif |
|
|
|
|
const __global Dtype *src0_read0 = src0 |
|
|
|
|
+ aligned_input_size * global_z // batch offset |
|
|
|
|
+ aligned_input_size * global_z // batch offset |
|
|
|
|
+ (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset |
|
|
|
|
+ curr_x0 - INPUT_PAD_W; // x offset |
|
|
|
|
const __global Dtype *src0_read1 = src0 |
|
|
|
|
+ aligned_input_size * global_z // batch offset |
|
|
|
|
+ aligned_input_size * global_z // batch offset |
|
|
|
|
+ (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset |
|
|
|
|
+ curr_x1 - INPUT_PAD_W; // x offset |
|
|
|
|
|
|
|
|
@ -1084,7 +1122,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
int pos = 0; |
|
|
|
|
LOOP(KERNEL_WIDTH, pos, |
|
|
|
|
{ |
|
|
|
|
if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) |
|
|
|
|
if (curr_y0 >= INPUT_PAD_H && |
|
|
|
|
curr_y0 < input_height + INPUT_PAD_H && |
|
|
|
|
curr_x0 + pos * DILATION_X >= INPUT_PAD_W && |
|
|
|
|
curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) |
|
|
|
|
pblockA00[pos] = src0_read0[pos * DILATION_X]; |
|
|
|
|
else |
|
|
|
|
pblockA00[pos] = 0; |
|
|
|
@ -1095,7 +1136,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
pos = 0; |
|
|
|
|
LOOP(KERNEL_WIDTH, pos, |
|
|
|
|
{ |
|
|
|
|
if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) |
|
|
|
|
if (curr_y1 >= INPUT_PAD_H && |
|
|
|
|
curr_y1 < input_height + INPUT_PAD_H && |
|
|
|
|
curr_x1 + pos * DILATION_X >= INPUT_PAD_W && |
|
|
|
|
curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) |
|
|
|
|
pblockA01[pos] = src0_read1[pos * DILATION_X]; |
|
|
|
|
else |
|
|
|
|
pblockA01[pos] = 0; |
|
|
|
@ -1185,7 +1229,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
curr_y0 = saved_y0; |
|
|
|
|
curr_y1 = saved_y1; |
|
|
|
|
#endif |
|
|
|
|
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch |
|
|
|
|
// reset to start of next slice of patch |
|
|
|
|
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); |
|
|
|
|
src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); |
|
|
|
|
} |
|
|
|
|
//while ( ++patch_depth < 1 ); //debug |
|
|
|
@ -1193,14 +1238,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
|
|
|
|
|
// Dst resembles a cube of width x height x (output channel * batches). Each tile writes: |
|
|
|
|
// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. |
|
|
|
|
int out0_offset = global_z * out_pitch_z // batch offset |
|
|
|
|
+ ( group_x * TILE_N ) * out_pitch_y // channel offset |
|
|
|
|
int out0_offset = global_z * out_pitch_z // batch offset |
|
|
|
|
+ ( group_x * TILE_N ) * out_pitch_y // channel offset |
|
|
|
|
+ ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset |
|
|
|
|
+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset |
|
|
|
|
int out1_offset = global_z * out_pitch_z // batch offset |
|
|
|
|
+ ( group_x * TILE_N ) * out_pitch_y // channel offset |
|
|
|
|
+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset |
|
|
|
|
int out1_offset = global_z * out_pitch_z // batch offset |
|
|
|
|
+ ( group_x * TILE_N ) * out_pitch_y // channel offset |
|
|
|
|
+ ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset |
|
|
|
|
+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset |
|
|
|
|
+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset |
|
|
|
|
__global Dtype *out1 = dst + out1_offset; |
|
|
|
|
|
|
|
|
|
#if APPLY_BIAS |
|
|
|
@ -1352,9 +1397,9 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
int saved_y = curr_y; |
|
|
|
|
#endif |
|
|
|
|
const __global Dtype *src0_read = src0 |
|
|
|
|
+ aligned_input_size * global_z // batch offset |
|
|
|
|
+ aligned_input_size * global_z // batch offset |
|
|
|
|
+ (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset |
|
|
|
|
+ curr_x - INPUT_PAD_W; // x offset |
|
|
|
|
+ curr_x - INPUT_PAD_W; // x offset |
|
|
|
|
const __global Dtype *src0_read_orig = src0_read; |
|
|
|
|
|
|
|
|
|
// Src1 (filter) is directly used as btile. |
|
|
|
@ -1409,15 +1454,23 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; |
|
|
|
|
|
|
|
|
|
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 |
|
|
|
|
#if KERNEL_WIDTH == 3 |
|
|
|
|
Dtype_t blockA00 = vload3(0, src0_read); |
|
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
|
|
#else |
|
|
|
|
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; |
|
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
|
|
#endif |
|
|
|
|
#else |
|
|
|
|
Dtype_t blockA00; |
|
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
|
|
int pos = 0; |
|
|
|
|
LOOP(KERNEL_WIDTH, pos, |
|
|
|
|
{ |
|
|
|
|
if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) |
|
|
|
|
if (curr_y >= INPUT_PAD_H && |
|
|
|
|
curr_y < input_height + INPUT_PAD_H && |
|
|
|
|
curr_x + pos * DILATION_X >= INPUT_PAD_W && |
|
|
|
|
curr_x + pos * DILATION_X < input_width + INPUT_PAD_W) |
|
|
|
|
pblockA00[pos] = src0_read[pos * DILATION_X]; |
|
|
|
|
else |
|
|
|
|
pblockA00[pos] = 0; |
|
|
|
@ -1463,17 +1516,18 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
//while( ++patch_row < 1 ); //debug |
|
|
|
|
while( ++patch_row < KERNEL_HEIGHT ); |
|
|
|
|
|
|
|
|
|
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch |
|
|
|
|
// reset to start of next slice of patch |
|
|
|
|
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); |
|
|
|
|
} |
|
|
|
|
//while ( ++patch_depth < 1 ); //debug |
|
|
|
|
while ( ++patch_depth < INPUT_DEPTH ); |
|
|
|
|
|
|
|
|
|
// Dst resembles a cube of width x height x (output channel * batches). Each tile writes: |
|
|
|
|
// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. |
|
|
|
|
int out_offset = global_z * out_pitch_z // batch offset |
|
|
|
|
+ ( group_x * TILE_N ) * out_pitch_y // channel offset |
|
|
|
|
int out_offset = global_z * out_pitch_z // batch offset |
|
|
|
|
+ ( group_x * TILE_N ) * out_pitch_y // channel offset |
|
|
|
|
+ ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset |
|
|
|
|
+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset |
|
|
|
|
+ ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset |
|
|
|
|
__global Dtype *out = dst + out_offset; |
|
|
|
|
|
|
|
|
|
#if APPLY_BIAS |
|
|
|
@ -1556,11 +1610,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
int saved_y1 = curr_y1; |
|
|
|
|
#endif |
|
|
|
|
const __global Dtype *src0_read0 = src0 |
|
|
|
|
+ aligned_input_size * global_z // batch offset |
|
|
|
|
+ aligned_input_size * global_z // batch offset |
|
|
|
|
+ (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset |
|
|
|
|
+ curr_x0 - INPUT_PAD_W; // x offset |
|
|
|
|
const __global Dtype *src0_read1 = src0 |
|
|
|
|
+ aligned_input_size * global_z // batch offset |
|
|
|
|
+ aligned_input_size * global_z // batch offset |
|
|
|
|
+ (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset |
|
|
|
|
+ curr_x1 - INPUT_PAD_W; // x offset |
|
|
|
|
|
|
|
|
@ -1600,7 +1654,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
int pos = 0; |
|
|
|
|
LOOP(KERNEL_WIDTH, pos, |
|
|
|
|
{ |
|
|
|
|
if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) |
|
|
|
|
if (curr_y0 >= INPUT_PAD_H && |
|
|
|
|
curr_y0 < input_height + INPUT_PAD_H && |
|
|
|
|
curr_x0 + pos * DILATION_X >= INPUT_PAD_W && |
|
|
|
|
curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) |
|
|
|
|
pblockA00[pos] = src0_read0[pos * DILATION_X]; |
|
|
|
|
else |
|
|
|
|
pblockA00[pos] = 0; |
|
|
|
@ -1611,7 +1668,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
pos = 0; |
|
|
|
|
LOOP(KERNEL_WIDTH, pos, |
|
|
|
|
{ |
|
|
|
|
if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) |
|
|
|
|
if (curr_y1 >= INPUT_PAD_H && |
|
|
|
|
curr_y1 < input_height + INPUT_PAD_H && |
|
|
|
|
curr_x1 + pos * DILATION_X >= INPUT_PAD_W && |
|
|
|
|
curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) |
|
|
|
|
pblockA01[pos] = src0_read1[pos * DILATION_X]; |
|
|
|
|
else |
|
|
|
|
pblockA01[pos] = 0; |
|
|
|
@ -1667,7 +1727,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
curr_y0 = saved_y0; |
|
|
|
|
curr_y1 = saved_y1; |
|
|
|
|
#endif |
|
|
|
|
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch |
|
|
|
|
// reset to start of next slice of patch |
|
|
|
|
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); |
|
|
|
|
src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); |
|
|
|
|
} |
|
|
|
|
//while ( ++patch_depth < 1 ); //debug |
|
|
|
@ -1675,14 +1736,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
|
|
|
|
|
// Dst resembles a cube of width x height x (output channel * batches). Each tile writes: |
|
|
|
|
// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. |
|
|
|
|
int out0_offset = global_z * out_pitch_z // batch offset |
|
|
|
|
+ ( group_x * TILE_N ) * out_pitch_y // channel offset |
|
|
|
|
int out0_offset = global_z * out_pitch_z // batch offset |
|
|
|
|
+ ( group_x * TILE_N ) * out_pitch_y // channel offset |
|
|
|
|
+ ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset |
|
|
|
|
+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset |
|
|
|
|
int out1_offset = global_z * out_pitch_z // batch offset |
|
|
|
|
+ ( group_x * TILE_N ) * out_pitch_y // channel offset |
|
|
|
|
+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset |
|
|
|
|
int out1_offset = global_z * out_pitch_z // batch offset |
|
|
|
|
+ ( group_x * TILE_N ) * out_pitch_y // channel offset |
|
|
|
|
+ ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset |
|
|
|
|
+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset |
|
|
|
|
+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset |
|
|
|
|
|
|
|
|
|
#if APPLY_BIAS |
|
|
|
|
Dtype bias[2]; |
|
|
|
|