|
|
@ -384,7 +384,6 @@ convolve_simd( |
|
|
|
#elif defined KERNEL_GEMM_LIKE |
|
|
|
#elif defined KERNEL_GEMM_LIKE |
|
|
|
|
|
|
|
|
|
|
|
#if APPLY_BIAS |
|
|
|
#if APPLY_BIAS |
|
|
|
// Dtype bias[4]; |
|
|
|
|
|
|
|
#define SUBGROUP_GET_BIAS(k, i) intel_sub_group_shuffle(bias[k], i) |
|
|
|
#define SUBGROUP_GET_BIAS(k, i) intel_sub_group_shuffle(bias[k], i) |
|
|
|
#else |
|
|
|
#else |
|
|
|
#define SUBGROUP_GET_BIAS(k, i) ((Dtype)0) |
|
|
|
#define SUBGROUP_GET_BIAS(k, i) ((Dtype)0) |
|
|
@ -446,9 +445,7 @@ typedef struct float0 { float s0; } float0; //never used but makes compiler happ |
|
|
|
#define TILE_K KERNEL_WIDTH |
|
|
|
#define TILE_K KERNEL_WIDTH |
|
|
|
#define TILE_N 32 |
|
|
|
#define TILE_N 32 |
|
|
|
|
|
|
|
|
|
|
|
#ifndef __BEIGNET__ |
|
|
|
|
|
|
|
__attribute__((intel_reqd_sub_group_size(8))) |
|
|
|
__attribute__((intel_reqd_sub_group_size(8))) |
|
|
|
#endif |
|
|
|
|
|
|
|
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
{ |
|
|
|
{ |
|
|
|
const int group_x = get_group_id(0); |
|
|
|
const int group_x = get_group_id(0); |
|
|
@ -608,6 +605,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
Dtype4 *bias_vec; |
|
|
|
Dtype4 *bias_vec; |
|
|
|
bias_vec = (Dtype4*)bias; |
|
|
|
bias_vec = (Dtype4*)bias; |
|
|
|
*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N)); |
|
|
|
*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N)); |
|
|
|
|
|
|
|
if (group_x > 0xFFFFFFFEul) { |
|
|
|
|
|
|
|
dst[0] = bias[0] + bias[1] + bias[2] + bias[3]; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
const Dtype bias[4] = {0, 0, 0, 0}; |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
if (global_y * TILE_M < output_width * output_height ) |
|
|
|
if (global_y * TILE_M < output_width * output_height ) |
|
|
|
{ |
|
|
|
{ |
|
|
@ -768,6 +770,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
Dtype4 *bias_vec; |
|
|
|
Dtype4 *bias_vec; |
|
|
|
bias_vec = (Dtype4*)bias; |
|
|
|
bias_vec = (Dtype4*)bias; |
|
|
|
*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N)); |
|
|
|
*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N)); |
|
|
|
|
|
|
|
if (group_x > 0xFFFFFFFEul) { |
|
|
|
|
|
|
|
dst[0] = bias[0] + bias[1] + bias[2] + bias[3]; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
const Dtype bias[4] = {0, 0, 0, 0}; |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
if (global_y * TILE_M < output_width * output_height ) |
|
|
|
if (global_y * TILE_M < output_width * output_height ) |
|
|
@ -813,9 +820,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
#define TILE_K KERNEL_WIDTH |
|
|
|
#define TILE_K KERNEL_WIDTH |
|
|
|
#define TILE_N 32 |
|
|
|
#define TILE_N 32 |
|
|
|
|
|
|
|
|
|
|
|
#ifndef __BEIGNET__ |
|
|
|
|
|
|
|
__attribute__((intel_reqd_sub_group_size(8))) |
|
|
|
__attribute__((intel_reqd_sub_group_size(8))) |
|
|
|
#endif |
|
|
|
|
|
|
|
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
{ |
|
|
|
{ |
|
|
|
const int group_x = get_group_id(0); |
|
|
|
const int group_x = get_group_id(0); |
|
|
@ -1012,6 +1017,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
Dtype4 *bias_vec; |
|
|
|
Dtype4 *bias_vec; |
|
|
|
bias_vec = (Dtype4*)bias; |
|
|
|
bias_vec = (Dtype4*)bias; |
|
|
|
*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N)); |
|
|
|
*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N)); |
|
|
|
|
|
|
|
if (group_x > 0xFFFFFFFEul) { |
|
|
|
|
|
|
|
dst[0] = bias[0] + bias[1] + bias[2] + bias[3]; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
const Dtype bias[4] = {0, 0, 0, 0}; |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
if( global_y * TILE_M < output_width * output_height ) |
|
|
|
if( global_y * TILE_M < output_width * output_height ) |
|
|
@ -1221,6 +1231,11 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
Dtype4 *bias_vec; |
|
|
|
Dtype4 *bias_vec; |
|
|
|
bias_vec = (Dtype4*)bias; |
|
|
|
bias_vec = (Dtype4*)bias; |
|
|
|
*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N)); |
|
|
|
*bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N)); |
|
|
|
|
|
|
|
if (group_x > 0xFFFFFFFEul) { |
|
|
|
|
|
|
|
dst[0] = bias[0] + bias[1] + bias[2] + bias[3]; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
const Dtype bias[4] = {0, 0, 0, 0}; |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
if( global_y * TILE_M < output_width * output_height ) |
|
|
|
if( global_y * TILE_M < output_width * output_height ) |
|
|
|
{ |
|
|
|
{ |
|
|
@ -1334,9 +1349,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
#define TILE_K KERNEL_WIDTH |
|
|
|
#define TILE_K KERNEL_WIDTH |
|
|
|
#define TILE_N 32 |
|
|
|
#define TILE_N 32 |
|
|
|
|
|
|
|
|
|
|
|
#ifndef __BEIGNET__ |
|
|
|
|
|
|
|
__attribute__((intel_reqd_sub_group_size(16))) |
|
|
|
__attribute__((intel_reqd_sub_group_size(16))) |
|
|
|
#endif |
|
|
|
|
|
|
|
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
{ |
|
|
|
{ |
|
|
|
const int group_x = get_group_id(0); |
|
|
|
const int group_x = get_group_id(0); |
|
|
@ -1396,18 +1409,14 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
// Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch |
|
|
|
// Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch |
|
|
|
// and KERNEL_WIDTH/2 rows of interleaved filter. |
|
|
|
// and KERNEL_WIDTH/2 rows of interleaved filter. |
|
|
|
int patch_depth = 0; |
|
|
|
int patch_depth = 0; |
|
|
|
#ifndef __BEIGNET__ |
|
|
|
|
|
|
|
__attribute__((opencl_unroll_hint(1))) |
|
|
|
__attribute__((opencl_unroll_hint(1))) |
|
|
|
#endif |
|
|
|
|
|
|
|
do |
|
|
|
do |
|
|
|
{ |
|
|
|
{ |
|
|
|
int patch_row = 0; |
|
|
|
int patch_row = 0; |
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 |
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 |
|
|
|
curr_y = saved_y; |
|
|
|
curr_y = saved_y; |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
#ifndef __BEIGNET__ |
|
|
|
|
|
|
|
__attribute__((opencl_unroll_hint(1))) |
|
|
|
__attribute__((opencl_unroll_hint(1))) |
|
|
|
#endif |
|
|
|
|
|
|
|
do |
|
|
|
do |
|
|
|
{ |
|
|
|
{ |
|
|
|
// Load atile and btile. |
|
|
|
// Load atile and btile. |
|
|
@ -1495,11 +1504,226 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
Dtype2 *bias_vec; |
|
|
|
Dtype2 *bias_vec; |
|
|
|
bias_vec = (Dtype2*)bias; |
|
|
|
bias_vec = (Dtype2*)bias; |
|
|
|
*bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N)); |
|
|
|
*bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N)); |
|
|
|
|
|
|
|
if (group_x > 0xFFFFFFFEul) { |
|
|
|
|
|
|
|
dst[0] = bias[0] + bias[1]; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
const Dtype bias[2] = {0, 0}; |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0); |
|
|
|
INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0); |
|
|
|
} |
|
|
|
} |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef GEMM_LIKE_CONV_32_2_SIMD16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////// |
|
|
|
|
|
|
|
// Conv_Interleaved_32_2_SIMD16 |
|
|
|
|
|
|
|
// |
|
|
|
|
|
|
|
// Convolution: each workitem computes 1 patch x 32 filters worth of output |
|
|
|
|
|
|
|
// data. |
|
|
|
|
|
|
|
#define TILE_M 2 |
|
|
|
|
|
|
|
#define TILE_K KERNEL_WIDTH |
|
|
|
|
|
|
|
#define TILE_N 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__attribute__((intel_reqd_sub_group_size(16))) |
|
|
|
|
|
|
|
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
const int group_x = get_group_id(0); |
|
|
|
|
|
|
|
const int group_y = get_group_id(1); |
|
|
|
|
|
|
|
const int global_x = get_global_id(0); |
|
|
|
|
|
|
|
const int global_y = get_global_id(1); |
|
|
|
|
|
|
|
const int global_z = get_global_id(2); |
|
|
|
|
|
|
|
int interleaved_y; |
|
|
|
|
|
|
|
int kernel_y; |
|
|
|
|
|
|
|
int kernel_idx; |
|
|
|
|
|
|
|
#define DOT_PRODUCT_16( _result, _rowA, colB ) \ |
|
|
|
|
|
|
|
{ \ |
|
|
|
|
|
|
|
_result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \ |
|
|
|
|
|
|
|
_result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \ |
|
|
|
|
|
|
|
_result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \ |
|
|
|
|
|
|
|
_result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \ |
|
|
|
|
|
|
|
_result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \ |
|
|
|
|
|
|
|
_result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \ |
|
|
|
|
|
|
|
_result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \ |
|
|
|
|
|
|
|
_result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \ |
|
|
|
|
|
|
|
_result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 ); \ |
|
|
|
|
|
|
|
_result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 ); \ |
|
|
|
|
|
|
|
_result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa ); \ |
|
|
|
|
|
|
|
_result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb ); \ |
|
|
|
|
|
|
|
_result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc ); \ |
|
|
|
|
|
|
|
_result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd ); \ |
|
|
|
|
|
|
|
_result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se ); \ |
|
|
|
|
|
|
|
_result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf ); \ |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// True for all threads if filter_width is multiple of TILE_N |
|
|
|
|
|
|
|
// else, true for all but right-most column of threads. |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
// Result ctile (*dst) is M rows x N columns |
|
|
|
|
|
|
|
// LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile. |
|
|
|
|
|
|
|
Dtype16 blockC00 = 0.f; |
|
|
|
|
|
|
|
Dtype16 blockC10 = 0.f; |
|
|
|
|
|
|
|
Dtype16 blockC01 = 0.f; |
|
|
|
|
|
|
|
Dtype16 blockC11 = 0.f; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Src0 (patch input) is directly used as atile. |
|
|
|
|
|
|
|
// Each work item points to the start of a different patch. |
|
|
|
|
|
|
|
// atile is M rows x K columns. |
|
|
|
|
|
|
|
int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X; |
|
|
|
|
|
|
|
int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X; |
|
|
|
|
|
|
|
int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y; |
|
|
|
|
|
|
|
int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y; |
|
|
|
|
|
|
|
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 |
|
|
|
|
|
|
|
int saved_y0 = curr_y0; |
|
|
|
|
|
|
|
int saved_y1 = curr_y1; |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
const __global Dtype *src0_read0 = src0 |
|
|
|
|
|
|
|
+ aligned_input_size * global_z // batch offset |
|
|
|
|
|
|
|
+ (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset |
|
|
|
|
|
|
|
+ curr_x0 - INPUT_PAD_W; // x offset |
|
|
|
|
|
|
|
const __global Dtype *src0_read1 = src0 |
|
|
|
|
|
|
|
+ aligned_input_size * global_z // batch offset |
|
|
|
|
|
|
|
+ (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset |
|
|
|
|
|
|
|
+ curr_x1 - INPUT_PAD_W; // x offset |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Src1 (filter) is directly used as btile. |
|
|
|
|
|
|
|
// It starts at the top of src1 and walks down. |
|
|
|
|
|
|
|
// btile is K rows x N columns. |
|
|
|
|
|
|
|
const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1. |
|
|
|
|
|
|
|
// Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch |
|
|
|
|
|
|
|
// and KERNEL_WIDTH/2 rows of interleaved filter. |
|
|
|
|
|
|
|
int patch_depth = 0; |
|
|
|
|
|
|
|
do |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
int patch_row = 0; |
|
|
|
|
|
|
|
do |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
// Load atile and btile. |
|
|
|
|
|
|
|
// Kernel data is partially interleaved. Every 2 rows are interleaved at Dtype8 granularity. |
|
|
|
|
|
|
|
// The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non |
|
|
|
|
|
|
|
// interleaved row is padded with zero to ensure same size as interleaved rows. This |
|
|
|
|
|
|
|
// interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the |
|
|
|
|
|
|
|
// kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3. |
|
|
|
|
|
|
|
// (0, 0) (8, 0) (16, 0) (24, 0) ... (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) .. |
|
|
|
|
|
|
|
// (0, 1) (8, 1) (16, 1) (24, 1) ... => (0, 2) (8, 2) (16, 2) (24, 2) ... |
|
|
|
|
|
|
|
// (0, 2) (8, 2) (16, 2) (24, 2) ... ... |
|
|
|
|
|
|
|
// ... |
|
|
|
|
|
|
|
const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1; |
|
|
|
|
|
|
|
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 |
|
|
|
|
|
|
|
Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH; |
|
|
|
|
|
|
|
Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH; |
|
|
|
|
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
|
|
|
|
|
Dtype* pblockA01 = (Dtype*)(&blockA01); |
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
Dtype_t blockA00; |
|
|
|
|
|
|
|
Dtype* pblockA00 = (Dtype*)(&blockA00); |
|
|
|
|
|
|
|
int pos = 0; |
|
|
|
|
|
|
|
LOOP(KERNEL_WIDTH, pos, |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W) |
|
|
|
|
|
|
|
pblockA00[pos] = src0_read0[pos * DILATION_X]; |
|
|
|
|
|
|
|
else |
|
|
|
|
|
|
|
pblockA00[pos] = 0; |
|
|
|
|
|
|
|
}) |
|
|
|
|
|
|
|
curr_y0 += DILATION_Y; |
|
|
|
|
|
|
|
Dtype_t blockA01; |
|
|
|
|
|
|
|
Dtype* pblockA01 = (Dtype*)(&blockA01); |
|
|
|
|
|
|
|
pos = 0; |
|
|
|
|
|
|
|
LOOP(KERNEL_WIDTH, pos, |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W) |
|
|
|
|
|
|
|
pblockA01[pos] = src0_read1[pos * DILATION_X]; |
|
|
|
|
|
|
|
else |
|
|
|
|
|
|
|
pblockA01[pos] = 0; |
|
|
|
|
|
|
|
}) |
|
|
|
|
|
|
|
curr_y1 += DILATION_Y; |
|
|
|
|
|
|
|
src0_read0 += (ROW_PITCH * DILATION_Y); |
|
|
|
|
|
|
|
src0_read1 += (ROW_PITCH * DILATION_Y); |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
Dtype blockB00[KERNEL_WIDTH*2]; |
|
|
|
|
|
|
|
Dtype4* p4BlockB00 = (Dtype4*)blockB00; |
|
|
|
|
|
|
|
Dtype2* p2BlockB00 = (Dtype2*)blockB00; |
|
|
|
|
|
|
|
Dtype* pBlockB00 = (Dtype* )blockB00; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
interleaved_y = 0; |
|
|
|
|
|
|
|
LOOP(KERNEL_WIDTH_DIV2, interleaved_y, |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
p4BlockB00[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) ); |
|
|
|
|
|
|
|
src1_read += WIDTH1 * 2; |
|
|
|
|
|
|
|
} ) |
|
|
|
|
|
|
|
if ( kernel_width_is_odd ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
p2BlockB00[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) ); |
|
|
|
|
|
|
|
src1_read += WIDTH1 * 2; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
// Perform MADs |
|
|
|
|
|
|
|
kernel_idx = 0; |
|
|
|
|
|
|
|
interleaved_y = 0; |
|
|
|
|
|
|
|
LOOP(KERNEL_WIDTH_DIV2, interleaved_y, |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
kernel_y = interleaved_y * 2; |
|
|
|
|
|
|
|
DOT_PRODUCT_16( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); |
|
|
|
|
|
|
|
DOT_PRODUCT_16( blockC01, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; |
|
|
|
|
|
|
|
DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); |
|
|
|
|
|
|
|
DOT_PRODUCT_16( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; |
|
|
|
|
|
|
|
DOT_PRODUCT_16( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); |
|
|
|
|
|
|
|
DOT_PRODUCT_16( blockC11, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; |
|
|
|
|
|
|
|
DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); |
|
|
|
|
|
|
|
DOT_PRODUCT_16( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; |
|
|
|
|
|
|
|
} ) |
|
|
|
|
|
|
|
if ( kernel_width_is_odd ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
kernel_y = interleaved_y * 2; |
|
|
|
|
|
|
|
DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); |
|
|
|
|
|
|
|
DOT_PRODUCT_16( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; |
|
|
|
|
|
|
|
DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); |
|
|
|
|
|
|
|
DOT_PRODUCT_16( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//while( ++patch_row < 1 ); //debug |
|
|
|
|
|
|
|
while( ++patch_row < KERNEL_HEIGHT ); |
|
|
|
|
|
|
|
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 |
|
|
|
|
|
|
|
curr_y0 = saved_y0; |
|
|
|
|
|
|
|
curr_y1 = saved_y1; |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch |
|
|
|
|
|
|
|
src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
//while ( ++patch_depth < 1 ); //debug |
|
|
|
|
|
|
|
while ( ++patch_depth < INPUT_DEPTH ); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Dst resembles a cube of width x height x (output channel * batches). Each tile writes: |
|
|
|
|
|
|
|
// (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. |
|
|
|
|
|
|
|
int out0_offset = global_z * out_pitch_z // batch offset |
|
|
|
|
|
|
|
+ ( group_x * TILE_N ) * out_pitch_y // channel offset |
|
|
|
|
|
|
|
+ ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset |
|
|
|
|
|
|
|
+ ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset |
|
|
|
|
|
|
|
int out1_offset = global_z * out_pitch_z // batch offset |
|
|
|
|
|
|
|
+ ( group_x * TILE_N ) * out_pitch_y // channel offset |
|
|
|
|
|
|
|
+ ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset |
|
|
|
|
|
|
|
+ ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#if APPLY_BIAS |
|
|
|
|
|
|
|
Dtype bias[2]; |
|
|
|
|
|
|
|
Dtype2 *bias_vec; |
|
|
|
|
|
|
|
bias_vec = (Dtype2*)bias; |
|
|
|
|
|
|
|
*bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N)); |
|
|
|
|
|
|
|
if (group_x > 0xFFFFFFFEul) { |
|
|
|
|
|
|
|
dst[0] = bias[0] + bias[1]; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
const Dtype bias[2] = {0, 0}; |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
INTERLEAVED_SIMD16_OUTPUT(dst, out0_offset, 0); |
|
|
|
|
|
|
|
INTERLEAVED_SIMD16_OUTPUT(dst, out1_offset, 1); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
#elif defined KERNEL_DWCONV |
|
|
|
#elif defined KERNEL_DWCONV |
|
|
|
|
|
|
|
|
|
|
|
__kernel void DWCONV( |
|
|
|
__kernel void DWCONV( |
|
|
|