@ -502,15 +502,23 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1 ;
# if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
# if KERNEL_WIDTH == 3
Dtype_t blockA00 = vload3 ( 0 , src0_read ) ;
Dtype* pblockA00 = ( Dtype* ) ( &blockA00 ) ;
# else
Dtype_t blockA00 = ( ( const __global Dtype_t* ) src0_read ) [ 0 ] ;
Dtype* pblockA00 = ( Dtype* ) ( &blockA00 ) ;
# endif
# else
Dtype_t blockA00 ;
Dtype* pblockA00 = ( Dtype* ) ( &blockA00 ) ;
int pos = 0 ;
LOOP ( KERNEL_WIDTH, pos,
{
if ( curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W )
if ( curr_y >= INPUT_PAD_H &&
curr_y < input_height + INPUT_PAD_H &&
curr_x + pos * DILATION_X >= INPUT_PAD_W &&
curr_x + pos * DILATION_X < input_width + INPUT_PAD_W )
pblockA00[pos] = src0_read[pos * DILATION_X] ;
else
pblockA00[pos] = 0 ;
@ -564,7 +572,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
//while ( ++patch_row < 1 ) ; //debug
while ( ++patch_row < KERNEL_HEIGHT ) ;
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ) ; // reset to start of next slice of patch
// reset to start of next slice of patch
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ) ;
}
//while ( ++patch_depth < 1 ) ; //debug
while ( ++patch_depth < INPUT_DEPTH ) ;
@ -653,7 +662,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
int pos = 0 ;
LOOP ( KERNEL_WIDTH, pos,
{
if ( curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W )
if ( curr_y >= INPUT_PAD_H &&
curr_y < input_height + INPUT_PAD_H &&
curr_x + pos * DILATION_X >= INPUT_PAD_W &&
curr_x + pos * DILATION_X < input_width + INPUT_PAD_W )
pblockA00[pos] = src0_read[pos * DILATION_X] ;
else
pblockA00[pos] = 0 ;
@ -730,7 +742,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
//while ( ++patch_row < 1 ) ; //debug
while ( ++patch_row < KERNEL_HEIGHT ) ;
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ) ; // reset to start of next slice of patch
// reset to start of next slice of patch
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ) ;
}
//while ( ++patch_depth < 1 ) ; //debug
while ( ++patch_depth < INPUT_DEPTH ) ;
@ -883,17 +896,38 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
// . . .
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1 ;
# if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
Dtype_t blockA00 = ( ( const __global Dtype_t* ) src0_read0 ) [ 0 ] ; src0_read0 += ROW_PITCH;
Dtype_t blockA01 = ( ( const __global Dtype_t* ) src0_read1 ) [ 0 ] ; src0_read1 += ROW_PITCH;
# if KERNEL_WIDTH == 3
Dtype_t blockA00 = vload3 ( 0 , src0_read0 ) ; src0_read0 += ROW_PITCH;
Dtype_t blockA01 = vload3 ( 0 , src0_read1 ) ; src0_read1 += ROW_PITCH;
Dtype* pblockA00 = ( Dtype* ) ( &blockA00 ) ;
Dtype* pblockA01 = ( Dtype* ) ( &blockA01 ) ;
# else
Dtype_t blockA00 = { ( Dtype ) 0.f } ;
Dtype_t blockA01 = { ( Dtype ) 0.f } ;
Dtype* pblockA00 = ( Dtype* ) ( &blockA00 ) ;
Dtype* pblockA01 = ( Dtype* ) ( &blockA01 ) ;
int pos = 0 ;
LOOP ( KERNEL_WIDTH, pos,
{
if ( curr_x0 + pos < input_width )
pblockA00[pos] = src0_read0[pos] ;
if ( curr_x1 + pos < input_width )
pblockA01[pos] = src0_read1[pos] ;
} )
src0_read0 += ROW_PITCH ;
src0_read1 += ROW_PITCH ;
# endif
# else
Dtype_t blockA00 ;
Dtype* pblockA00 = ( Dtype* ) ( &blockA00 ) ;
int pos = 0 ;
LOOP ( KERNEL_WIDTH, pos,
{
if ( curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W )
if ( curr_y0 >= INPUT_PAD_H &&
curr_y0 < input_height + INPUT_PAD_H &&
curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W )
pblockA00[pos] = src0_read0[pos * DILATION_X] ;
else
pblockA00[pos] = 0 ;
@ -904,7 +938,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
pos = 0 ;
LOOP ( KERNEL_WIDTH, pos,
{
if ( curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W )
if ( curr_y1 >= INPUT_PAD_H &&
curr_y1 < input_height + INPUT_PAD_H &&
curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W )
pblockA01[pos] = src0_read1[pos * DILATION_X] ;
else
pblockA01[pos] = 0 ;
@ -972,7 +1009,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
curr_y0 = saved_y0 ;
curr_y1 = saved_y1 ;
# endif
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ) ; // reset to start of next slice of patch
// reset to start of next slice of patch
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ) ;
src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ) ;
}
//while ( ++patch_depth < 1 ) ; //debug
@ -1084,7 +1122,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
int pos = 0 ;
LOOP ( KERNEL_WIDTH, pos,
{
if ( curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W )
if ( curr_y0 >= INPUT_PAD_H &&
curr_y0 < input_height + INPUT_PAD_H &&
curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W )
pblockA00[pos] = src0_read0[pos * DILATION_X] ;
else
pblockA00[pos] = 0 ;
@ -1095,7 +1136,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
pos = 0 ;
LOOP ( KERNEL_WIDTH, pos,
{
if ( curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W )
if ( curr_y1 >= INPUT_PAD_H &&
curr_y1 < input_height + INPUT_PAD_H &&
curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W )
pblockA01[pos] = src0_read1[pos * DILATION_X] ;
else
pblockA01[pos] = 0 ;
@ -1185,7 +1229,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
curr_y0 = saved_y0 ;
curr_y1 = saved_y1 ;
# endif
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ) ; // reset to start of next slice of patch
// reset to start of next slice of patch
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ) ;
src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ) ;
}
//while ( ++patch_depth < 1 ) ; //debug
@ -1409,15 +1454,23 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1 ;
# if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
# if KERNEL_WIDTH == 3
Dtype_t blockA00 = vload3 ( 0 , src0_read ) ;
Dtype* pblockA00 = ( Dtype* ) ( &blockA00 ) ;
# else
Dtype_t blockA00 = ( ( const __global Dtype_t* ) src0_read ) [ 0 ] ;
Dtype* pblockA00 = ( Dtype* ) ( &blockA00 ) ;
# endif
# else
Dtype_t blockA00 ;
Dtype* pblockA00 = ( Dtype* ) ( &blockA00 ) ;
int pos = 0 ;
LOOP ( KERNEL_WIDTH, pos,
{
if ( curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W )
if ( curr_y >= INPUT_PAD_H &&
curr_y < input_height + INPUT_PAD_H &&
curr_x + pos * DILATION_X >= INPUT_PAD_W &&
curr_x + pos * DILATION_X < input_width + INPUT_PAD_W )
pblockA00[pos] = src0_read[pos * DILATION_X] ;
else
pblockA00[pos] = 0 ;
@ -1463,7 +1516,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
//while ( ++patch_row < 1 ) ; //debug
while ( ++patch_row < KERNEL_HEIGHT ) ;
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ) ; // reset to start of next slice of patch
// reset to start of next slice of patch
src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ) ;
}
//while ( ++patch_depth < 1 ) ; //debug
while ( ++patch_depth < INPUT_DEPTH ) ;
@ -1600,7 +1654,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
int pos = 0 ;
LOOP ( KERNEL_WIDTH, pos,
{
if ( curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W )
if ( curr_y0 >= INPUT_PAD_H &&
curr_y0 < input_height + INPUT_PAD_H &&
curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W )
pblockA00[pos] = src0_read0[pos * DILATION_X] ;
else
pblockA00[pos] = 0 ;
@ -1611,7 +1668,10 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
pos = 0 ;
LOOP ( KERNEL_WIDTH, pos,
{
if ( curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W )
if ( curr_y1 >= INPUT_PAD_H &&
curr_y1 < input_height + INPUT_PAD_H &&
curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W )
pblockA01[pos] = src0_read1[pos * DILATION_X] ;
else
pblockA01[pos] = 0 ;
@ -1667,7 +1727,8 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
curr_y0 = saved_y0 ;
curr_y1 = saved_y1 ;
# endif
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ) ; // reset to start of next slice of patch
// reset to start of next slice of patch
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ) ;
src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ) ;
}
//while ( ++patch_depth < 1 ) ; //debug