|
|
@ -238,7 +238,7 @@ convolve_simd( |
|
|
|
int curr_local_x = ( lid % ( TILE_X / 4 ) ) * 4; |
|
|
|
int curr_local_x = ( lid % ( TILE_X / 4 ) ) * 4; |
|
|
|
int curr_y = or * STRIDE_Y + curr_local_y; |
|
|
|
int curr_y = or * STRIDE_Y + curr_local_y; |
|
|
|
int curr_x = oc * STRIDE_X + curr_local_x; |
|
|
|
int curr_x = oc * STRIDE_X + curr_local_x; |
|
|
|
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 |
|
|
|
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 |
|
|
|
int saved_y = curr_y; |
|
|
|
int saved_y = curr_y; |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
in_addr = input_batch_offset |
|
|
|
in_addr = input_batch_offset |
|
|
@ -256,19 +256,22 @@ convolve_simd( |
|
|
|
LOOP(INVEC_SIZE, reg, |
|
|
|
LOOP(INVEC_SIZE, reg, |
|
|
|
{ |
|
|
|
{ |
|
|
|
if (curr_local_y + reg * TILE_Y_STRIDE < TILE_Y || INVEC_SIZE * TILE_Y_STRIDE <= (TILE_Y + 2) || reg < INVEC_SIZE - 1) { |
|
|
|
if (curr_local_y + reg * TILE_Y_STRIDE < TILE_Y || INVEC_SIZE * TILE_Y_STRIDE <= (TILE_Y + 2) || reg < INVEC_SIZE - 1) { |
|
|
|
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 |
|
|
|
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 |
|
|
|
if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + 3 >= INPUT_PAD_W && curr_x < input_width + INPUT_PAD_W) { |
|
|
|
if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + 3 >= INPUT_PAD_W && curr_x < input_width + INPUT_PAD_W) { |
|
|
|
if (curr_x < INPUT_PAD_W) { |
|
|
|
if (curr_x < INPUT_PAD_W) { |
|
|
|
in_buf.in_vec[reg].s0 = 0; |
|
|
|
in_buf.in_vec[reg].s0 = 0; |
|
|
|
if (curr_x + 1 >= INPUT_PAD_W) |
|
|
|
if (curr_x + 1 >= INPUT_PAD_W && curr_x + 1 < input_width + INPUT_PAD_W) |
|
|
|
in_buf.in_vec[reg].s1 = *(inputs + in_offset + 1); |
|
|
|
in_buf.in_vec[reg].s1 = *(inputs + in_offset + 1); |
|
|
|
else |
|
|
|
else |
|
|
|
in_buf.in_vec[reg].s1 = 0; |
|
|
|
in_buf.in_vec[reg].s1 = 0; |
|
|
|
if (curr_x + 2 >= INPUT_PAD_W) |
|
|
|
if (curr_x + 2 >= INPUT_PAD_W && curr_x + 2 < input_width + INPUT_PAD_W) |
|
|
|
in_buf.in_vec[reg].s2 = *(inputs + in_offset + 2); |
|
|
|
in_buf.in_vec[reg].s2 = *(inputs + in_offset + 2); |
|
|
|
else |
|
|
|
else |
|
|
|
in_buf.in_vec[reg].s2 = 0; |
|
|
|
in_buf.in_vec[reg].s2 = 0; |
|
|
|
in_buf.in_vec[reg].s3 = *(inputs + in_offset + 3); |
|
|
|
if (curr_x + 3 < input_width + INPUT_PAD_W) |
|
|
|
|
|
|
|
in_buf.in_vec[reg].s3 = *(inputs + in_offset + 3); |
|
|
|
|
|
|
|
else |
|
|
|
|
|
|
|
in_buf.in_vec[reg].s3 = 0; |
|
|
|
} else { |
|
|
|
} else { |
|
|
|
VLOAD4(in_buf.in_vec[reg], inputs + in_offset); |
|
|
|
VLOAD4(in_buf.in_vec[reg], inputs + in_offset); |
|
|
|
if (curr_x + 1 >= input_width + INPUT_PAD_W) |
|
|
|
if (curr_x + 1 >= input_width + INPUT_PAD_W) |
|
|
@ -289,7 +292,7 @@ convolve_simd( |
|
|
|
in_offset += input_width * TILE_Y_STRIDE; |
|
|
|
in_offset += input_width * TILE_Y_STRIDE; |
|
|
|
}); |
|
|
|
}); |
|
|
|
in_addr += input_height * input_width; |
|
|
|
in_addr += input_height * input_width; |
|
|
|
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 |
|
|
|
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 |
|
|
|
curr_y = saved_y; |
|
|
|
curr_y = saved_y; |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
@ -492,7 +495,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
// atile is M rows x K columns. |
|
|
|
// atile is M rows x K columns. |
|
|
|
int curr_x = ( global_y % output_width ) * STRIDE_X; |
|
|
|
int curr_x = ( global_y % output_width ) * STRIDE_X; |
|
|
|
int curr_y = ( global_y / output_width ) * STRIDE_Y; |
|
|
|
int curr_y = ( global_y / output_width ) * STRIDE_Y; |
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 |
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 |
|
|
|
int saved_y = curr_y; |
|
|
|
int saved_y = curr_y; |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
const __global Dtype *src0_read = src0 |
|
|
|
const __global Dtype *src0_read = src0 |
|
|
@ -512,7 +515,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
do |
|
|
|
do |
|
|
|
{ |
|
|
|
{ |
|
|
|
int patch_row = 0; |
|
|
|
int patch_row = 0; |
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 |
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 |
|
|
|
curr_y = saved_y; |
|
|
|
curr_y = saved_y; |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
@ -530,7 +533,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
// ... |
|
|
|
// ... |
|
|
|
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; |
|
|
|
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; |
|
|
|
|
|
|
|
|
|
|
|
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 |
|
|
|
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 |
|
|
|
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; |
|
|
|
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; |
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
|
#else |
|
|
|
#else |
|
|
@ -646,7 +649,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
// atile is M rows x K columns. |
|
|
|
// atile is M rows x K columns. |
|
|
|
int curr_x = ( global_y % output_width ) * STRIDE_X; |
|
|
|
int curr_x = ( global_y % output_width ) * STRIDE_X; |
|
|
|
int curr_y = ( global_y / output_width ) * STRIDE_Y; |
|
|
|
int curr_y = ( global_y / output_width ) * STRIDE_Y; |
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 |
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 |
|
|
|
int saved_y = curr_y; |
|
|
|
int saved_y = curr_y; |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
const __global Dtype *src0_read = src0 |
|
|
|
const __global Dtype *src0_read = src0 |
|
|
@ -666,14 +669,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
do |
|
|
|
do |
|
|
|
{ |
|
|
|
{ |
|
|
|
int patch_row = 0; |
|
|
|
int patch_row = 0; |
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 |
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 |
|
|
|
curr_y = saved_y; |
|
|
|
curr_y = saved_y; |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
do |
|
|
|
do |
|
|
|
{ |
|
|
|
{ |
|
|
|
// Load atile and interleaved btile. |
|
|
|
// Load atile and interleaved btile. |
|
|
|
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; |
|
|
|
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; |
|
|
|
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 |
|
|
|
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 |
|
|
|
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; |
|
|
|
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; |
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
|
#else |
|
|
|
#else |
|
|
@ -873,7 +876,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X; |
|
|
|
int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X; |
|
|
|
int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y; |
|
|
|
int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y; |
|
|
|
int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y; |
|
|
|
int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y; |
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 |
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 |
|
|
|
int saved_y0 = curr_y0; |
|
|
|
int saved_y0 = curr_y0; |
|
|
|
int saved_y1 = curr_y1; |
|
|
|
int saved_y1 = curr_y1; |
|
|
|
#endif |
|
|
|
#endif |
|
|
@ -911,7 +914,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
// (0, 2) (8, 2) (16, 2) (24, 2) ... ... |
|
|
|
// (0, 2) (8, 2) (16, 2) (24, 2) ... ... |
|
|
|
// ... |
|
|
|
// ... |
|
|
|
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; |
|
|
|
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; |
|
|
|
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 |
|
|
|
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 |
|
|
|
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH; |
|
|
|
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH; |
|
|
|
Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH; |
|
|
|
Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH; |
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
@ -997,7 +1000,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
|
|
|
|
|
|
|
//while( ++patch_row < 1 ); //debug |
|
|
|
//while( ++patch_row < 1 ); //debug |
|
|
|
while( ++patch_row < KERNEL_HEIGHT ); |
|
|
|
while( ++patch_row < KERNEL_HEIGHT ); |
|
|
|
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 |
|
|
|
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 |
|
|
|
curr_y0 = saved_y0; |
|
|
|
curr_y0 = saved_y0; |
|
|
|
curr_y1 = saved_y1; |
|
|
|
curr_y1 = saved_y1; |
|
|
|
#endif |
|
|
|
#endif |
|
|
@ -1073,7 +1076,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X; |
|
|
|
int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X; |
|
|
|
int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y; |
|
|
|
int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y; |
|
|
|
int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y; |
|
|
|
int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y; |
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 |
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 |
|
|
|
int saved_y0 = curr_y0; |
|
|
|
int saved_y0 = curr_y0; |
|
|
|
int saved_y1 = curr_y1; |
|
|
|
int saved_y1 = curr_y1; |
|
|
|
#endif |
|
|
|
#endif |
|
|
@ -1102,7 +1105,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
{ |
|
|
|
{ |
|
|
|
// Load atile and interleaved btile. |
|
|
|
// Load atile and interleaved btile. |
|
|
|
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; |
|
|
|
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; |
|
|
|
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 |
|
|
|
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 |
|
|
|
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH; |
|
|
|
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH; |
|
|
|
Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH; |
|
|
|
Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH; |
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
@ -1210,7 +1213,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
|
|
|
|
|
|
|
//while( ++patch_row < 1 ); //debug |
|
|
|
//while( ++patch_row < 1 ); //debug |
|
|
|
while( ++patch_row < KERNEL_HEIGHT ); |
|
|
|
while( ++patch_row < KERNEL_HEIGHT ); |
|
|
|
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 |
|
|
|
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 |
|
|
|
curr_y0 = saved_y0; |
|
|
|
curr_y0 = saved_y0; |
|
|
|
curr_y1 = saved_y1; |
|
|
|
curr_y1 = saved_y1; |
|
|
|
#endif |
|
|
|
#endif |
|
|
@ -1377,7 +1380,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
// atile is M rows x K columns. |
|
|
|
// atile is M rows x K columns. |
|
|
|
int curr_x = ( global_y % output_width ) * STRIDE_X; |
|
|
|
int curr_x = ( global_y % output_width ) * STRIDE_X; |
|
|
|
int curr_y = ( global_y / output_width ) * STRIDE_Y; |
|
|
|
int curr_y = ( global_y / output_width ) * STRIDE_Y; |
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 |
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 |
|
|
|
int saved_y = curr_y; |
|
|
|
int saved_y = curr_y; |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
const __global Dtype *src0_read = src0 |
|
|
|
const __global Dtype *src0_read = src0 |
|
|
@ -1419,7 +1422,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
do |
|
|
|
do |
|
|
|
{ |
|
|
|
{ |
|
|
|
int patch_row = 0; |
|
|
|
int patch_row = 0; |
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 |
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 |
|
|
|
curr_y = saved_y; |
|
|
|
curr_y = saved_y; |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
__attribute__((opencl_unroll_hint(1))) |
|
|
|
__attribute__((opencl_unroll_hint(1))) |
|
|
@ -1437,7 +1440,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
// ... |
|
|
|
// ... |
|
|
|
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; |
|
|
|
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; |
|
|
|
|
|
|
|
|
|
|
|
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 |
|
|
|
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 |
|
|
|
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; |
|
|
|
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ]; |
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
|
#else |
|
|
|
#else |
|
|
@ -1580,7 +1583,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X; |
|
|
|
int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X; |
|
|
|
int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y; |
|
|
|
int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y; |
|
|
|
int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y; |
|
|
|
int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y; |
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 |
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 |
|
|
|
int saved_y0 = curr_y0; |
|
|
|
int saved_y0 = curr_y0; |
|
|
|
int saved_y1 = curr_y1; |
|
|
|
int saved_y1 = curr_y1; |
|
|
|
#endif |
|
|
|
#endif |
|
|
@ -1618,7 +1621,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
// (0, 2) (8, 2) (16, 2) (24, 2) ... ... |
|
|
|
// (0, 2) (8, 2) (16, 2) (24, 2) ... ... |
|
|
|
// ... |
|
|
|
// ... |
|
|
|
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; |
|
|
|
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; |
|
|
|
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 |
|
|
|
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0 |
|
|
|
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH; |
|
|
|
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH; |
|
|
|
Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH; |
|
|
|
Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH; |
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
@ -1692,7 +1695,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
|
|
|
|
|
|
|
//while( ++patch_row < 1 ); //debug |
|
|
|
//while( ++patch_row < 1 ); //debug |
|
|
|
while( ++patch_row < KERNEL_HEIGHT ); |
|
|
|
while( ++patch_row < KERNEL_HEIGHT ); |
|
|
|
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 |
|
|
|
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0 |
|
|
|
curr_y0 = saved_y0; |
|
|
|
curr_y0 = saved_y0; |
|
|
|
curr_y1 = saved_y1; |
|
|
|
curr_y1 = saved_y1; |
|
|
|
#endif |
|
|
|
#endif |
|
|
|