@@ -39,24 +39,42 @@
//
//M*/
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define CONCAT(A,B) A##_##B
#define TEMPLATE(name,type) CONCAT(name,type)
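// TEMPLATE pastes the element type onto a kernel name, e.g.
// TEMPLATE(gemm_32_1_NN_1_0, float) expands to gemm_32_1_NN_1_0_float,
// which lets the host select the specialization it compiled.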
// Types used for parameters, offset computations and so on
#define int_tp int
#define uint_tp unsigned int
#define KERNEL_ARG_DTYPE float
#define TYPE_FLOAT 1
#define TYPE_HALF 2
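// TYPE is expected to be supplied by the host at build time
// (e.g. -D TYPE=2 to compile the half-precision variants).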
#if TYPE == TYPE_HALF
#define Dtype half
#define Dtype2 half2
#define Dtype4 half4
#define Dtype8 half8
#define Dtype16 half16
#define as_Dtype as_half
#define as_Dtype2 as_half2
#define as_Dtype4 as_half4
#define as_Dtype8 as_half8
#define as_Dtype16 as_half16
#else
#define Dtype float
#define Dtype2 float2
#define Dtype4 float4
#define Dtype8 float8
#define Dtype16 float16
#define as_Dtype as_float
#define as_Dtype2 as_float2
#define as_Dtype4 as_float4
#define as_Dtype8 as_float8
#define as_Dtype16 as_float16
#endif
#if defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
@@ -67,6 +85,15 @@
// Common block to calculate (alpha * AxB + beta * C) and output to destination image.
#if TYPE == TYPE_HALF
#define SUBGROUP_BLOCK_READ8(__image, __coord) intel_sub_group_block_read_us8(__image, __coord)
#define SHUFFLE_TYPE2(val) as_ushort2(val)
#define SHUFFLE_TYPE8(val) as_ushort8(val)
#define READ_IMAGE(__image, __coord) read_imageh(__image, sampler, __coord)
#define SIZE_OF_ELEMENT sizeof(ushort)
#define SIMD_SIZE_GEMM 16
#define TILE_N 16
#else
#define SUBGROUP_BLOCK_READ8(__image, __coord) intel_sub_group_block_read8(__image, __coord)
#define SHUFFLE_TYPE2(val) val
#define SHUFFLE_TYPE8(val) val
@@ -74,11 +101,17 @@
#define SIZE_OF_ELEMENT sizeof(uint)
#define SIMD_SIZE_GEMM 8
#define TILE_N 8
#endif
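// The half path runs 16 lanes over 16-bit elements and the float path 8 lanes
// over 32-bit elements, so a subgroup block read covers 32 bytes per row in
// either configuration.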
//#define USE_IMAGE_C
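// With USE_IMAGE_C defined, C is read and written as an image2d_t through the
// subgroup block accessors below; otherwise the buffer-based path is used.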
#ifdef USE_IMAGE_C
#if TYPE == TYPE_HALF
#define BLOCKC_READ8(_C, _coordC) as_Dtype8(intel_sub_group_block_read_us8(_C, _coordC))
#define BLOCKC_WRITE8(_C, _coordC, _val) intel_sub_group_block_write_us8(_C, _coordC, as_ushort8(_val))
#else
#define BLOCKC_READ8(_C, _coordC) as_Dtype8(intel_sub_group_block_read8(_C, _coordC))
#define BLOCKC_WRITE8(_C, _coordC, _val) intel_sub_group_block_write8(_C, _coordC, as_uint8(_val))
#endif
#define MATC_PARAMETER __read_only image2d_t C, __write_only image2d_t dst
#define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, C, dst, sizeof(uint))
#else
@@ -139,10 +172,10 @@
            blockC03 += blockAxB03; \
        } \
    } else { \
        blockC00 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8(_C, coordC); coordC.y += 8; \
        blockC01 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8(_C, coordC); coordC.y += 8; \
        blockC02 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8(_C, coordC); coordC.y += 8; \
        blockC03 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8(_C, coordC); \
        if (!ALPHA1) { \
            blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \
            blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \
@@ -172,6 +205,43 @@
        intel_sub_group_shuffle(_block.s7, _col));
// A's column block multiplies B's row block.
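// TRANSPOSE_BLOCK_8(_blockA, k) gathers column k of the A tile across the
// subgroup, so each mad() in MULTIPLY_BLOCKS_8x8 accumulates one rank-1
// update: column k of A scaled by row k of B (held lane-wise in _blockB).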
#if TYPE == TYPE_HALF
#define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB00, _blockB01) \
{ \
    const Dtype8 acol0 = TRANSPOSE_BLOCK_8(_blockA, 0); \
    const Dtype8 acol1 = TRANSPOSE_BLOCK_8(_blockA, 1); \
    const Dtype8 acol2 = TRANSPOSE_BLOCK_8(_blockA, 2); \
    const Dtype8 acol3 = TRANSPOSE_BLOCK_8(_blockA, 3); \
    const Dtype8 acol4 = TRANSPOSE_BLOCK_8(_blockA, 4); \
    const Dtype8 acol5 = TRANSPOSE_BLOCK_8(_blockA, 5); \
    const Dtype8 acol6 = TRANSPOSE_BLOCK_8(_blockA, 6); \
    const Dtype8 acol7 = TRANSPOSE_BLOCK_8(_blockA, 7); \
    const Dtype8 acol8 = TRANSPOSE_BLOCK_8(_blockA, 8); \
    const Dtype8 acol9 = TRANSPOSE_BLOCK_8(_blockA, 9); \
    const Dtype8 acola = TRANSPOSE_BLOCK_8(_blockA, 10); \
    const Dtype8 acolb = TRANSPOSE_BLOCK_8(_blockA, 11); \
    const Dtype8 acolc = TRANSPOSE_BLOCK_8(_blockA, 12); \
    const Dtype8 acold = TRANSPOSE_BLOCK_8(_blockA, 13); \
    const Dtype8 acole = TRANSPOSE_BLOCK_8(_blockA, 14); \
    const Dtype8 acolf = TRANSPOSE_BLOCK_8(_blockA, 15); \
    _result = mad((Dtype8)(_blockB00.s0), acol0, _result); \
    _result = mad((Dtype8)(_blockB00.s1), acol1, _result); \
    _result = mad((Dtype8)(_blockB00.s2), acol2, _result); \
    _result = mad((Dtype8)(_blockB00.s3), acol3, _result); \
    _result = mad((Dtype8)(_blockB00.s4), acol4, _result); \
    _result = mad((Dtype8)(_blockB00.s5), acol5, _result); \
    _result = mad((Dtype8)(_blockB00.s6), acol6, _result); \
    _result = mad((Dtype8)(_blockB00.s7), acol7, _result); \
    _result = mad((Dtype8)(_blockB01.s0), acol8, _result); \
    _result = mad((Dtype8)(_blockB01.s1), acol9, _result); \
    _result = mad((Dtype8)(_blockB01.s2), acola, _result); \
    _result = mad((Dtype8)(_blockB01.s3), acolb, _result); \
    _result = mad((Dtype8)(_blockB01.s4), acolc, _result); \
    _result = mad((Dtype8)(_blockB01.s5), acold, _result); \
    _result = mad((Dtype8)(_blockB01.s6), acole, _result); \
    _result = mad((Dtype8)(_blockB01.s7), acolf, _result); \
}
#else
#define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB) \
{ \
    const Dtype8 acol0 = TRANSPOSE_BLOCK_8(_blockA, 0); \
@@ -191,7 +261,50 @@
    _result = mad((Dtype8)(_blockB.s6), acol6, _result); \
    _result = mad((Dtype8)(_blockB.s7), acol7, _result); \
}
#endif
#if TYPE == TYPE_HALF
#define GEMM_NN(ALPHA1, BETA_NOT0) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
__kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
    __read_only image2d_t A, \
    __read_only image2d_t B, \
    MATC_PARAMETER, \
    KERNEL_ARG_DTYPE alpha_in, \
    KERNEL_ARG_DTYPE beta_in, \
    int width0, \
    int isFirstColBlock) \
{ \
    const Dtype alpha = (Dtype)alpha_in; \
    const Dtype beta = (Dtype)beta_in; \
    const int group_x = get_group_id(0); \
    const int group_y = get_group_id(1); \
    Dtype8 blockAxB00 = 0; \
    Dtype8 blockAxB01 = 0; \
    Dtype8 blockAxB02 = 0; \
    Dtype8 blockAxB03 = 0; \
    int2 coordA = (int2)(0, group_y * TILE_M); \
    int2 coordB = (int2)((group_x * TILE_N) * SIZE_OF_ELEMENT, 0); \
    do \
    { \
        int2 coordBTemp = coordB; \
        Dtype8 blockB00 = as_Dtype8(SUBGROUP_BLOCK_READ8(B, coordBTemp)); coordB.y += TILE_K; \
        coordBTemp = coordB; \
        Dtype8 blockB01 = as_Dtype8(SUBGROUP_BLOCK_READ8(B, coordBTemp)); coordB.y += TILE_K; \
        int2 coordATemp = coordA; \
        Dtype8 blockA00 = as_Dtype8(SUBGROUP_BLOCK_READ8(A, coordATemp)); coordATemp.y += 8; \
        Dtype8 blockA01 = as_Dtype8(SUBGROUP_BLOCK_READ8(A, coordATemp)); coordATemp.y += 8; \
        Dtype8 blockA02 = as_Dtype8(SUBGROUP_BLOCK_READ8(A, coordATemp)); coordATemp.y += 8; \
        Dtype8 blockA03 = as_Dtype8(SUBGROUP_BLOCK_READ8(A, coordATemp)); coordA.x += TILE_K * SIZE_OF_ELEMENT * 2; \
        MULTIPLY_BLOCKS_8x8(blockAxB00, blockA00, blockB00, blockB01); \
        MULTIPLY_BLOCKS_8x8(blockAxB01, blockA01, blockB00, blockB01); \
        MULTIPLY_BLOCKS_8x8(blockAxB02, blockA02, blockB00, blockB01); \
        MULTIPLY_BLOCKS_8x8(blockAxB03, blockA03, blockB00, blockB01); \
    } \
    while (coordB.y < width0); \
    GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#else
#define GEMM_NN(ALPHA1, BETA_NOT0) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
@@ -231,6 +344,7 @@ __kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
    while (coordB.y < width0); \
    GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#endif
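// Instantiate the four (alpha, beta) specializations; TEMPLATE appends the
// element type, yielding e.g. gemm_32_1_NN_1_0_float or gemm_32_1_NN_1_0_half.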
GEMM_NN(1, 0) // ALPHA == 1, BETA == 0
GEMM_NN(1, 1) // ALPHA == 1, BETA != 0
@@ -264,6 +378,45 @@ GEMM_NN(0, 1) // ALPHA != 1, BETA != 0
    _result = mad((Dtype8)(_blockB.s7), TRANSPOSE_BLOCK_8(_blockA.s7, _col), _result); \
}
#if TYPE == TYPE_HALF
#define GEMM_TN(ALPHA1, BETA_NOT0) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
__kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
    __read_only image2d_t A, \
    __read_only image2d_t B, \
    MATC_PARAMETER, \
    KERNEL_ARG_DTYPE alpha_in, \
    KERNEL_ARG_DTYPE beta_in, \
    int width0, \
    int isFirstColBlock) \
{ \
    const Dtype alpha = (Dtype)alpha_in; \
    const Dtype beta = (Dtype)beta_in; \
    const int group_x = get_group_id(0); \
    const int group_y = get_group_id(1); \
    Dtype8 blockAxB00 = 0; \
    Dtype8 blockAxB01 = 0; \
    Dtype8 blockAxB02 = 0; \
    Dtype8 blockAxB03 = 0; \
    int2 coordA = (int2)(group_y * TILE_M * SIZE_OF_ELEMENT, 0); \
    int2 coordB = (int2)((group_x * TILE_N) * SIZE_OF_ELEMENT, 0); \
    do \
    { \
        int2 coordBTemp = coordB; \
        Dtype8 blockB00 = as_Dtype8(SUBGROUP_BLOCK_READ8(B, coordBTemp)); coordB.y += TILE_K; \
        int2 coordATemp = coordA; \
        Dtype8 blockA00 = as_Dtype8(SUBGROUP_BLOCK_READ8(A, coordATemp)); coordATemp.x += 16 * SIZE_OF_ELEMENT; \
        Dtype8 blockA01 = as_Dtype8(SUBGROUP_BLOCK_READ8(A, coordATemp)); coordA.y += TILE_K; \
        MULTIPLY_BLOCKS_8x8(blockAxB00, blockA00, blockB00, 0); \
        MULTIPLY_BLOCKS_8x8(blockAxB01, blockA00, blockB00, 8); \
        MULTIPLY_BLOCKS_8x8(blockAxB02, blockA01, blockB00, 0); \
        MULTIPLY_BLOCKS_8x8(blockAxB03, blockA01, blockB00, 8); \
    } \
    while (coordB.y < width0); \
    GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#else
#define GEMM_TN(ALPHA1, BETA_NOT0) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
@@ -303,6 +456,7 @@ __kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
    while (coordB.y < width0); \
    GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#endif
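// In the TN variants A is stored transposed (K x M), so the M offset is
// carried in coordA.x as a byte offset instead of in coordA.y.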
GEMM_TN(1, 0) // ALPHA == 1, BETA == 0
GEMM_TN(1, 1) // ALPHA == 1, BETA != 0
@@ -324,6 +478,7 @@ GEMM_TN(0, 1) // ALPHA != 1, BETA != 0
        intel_sub_group_shuffle(_block.s6, _col), \
        intel_sub_group_shuffle(_block.s7, _col))
#if TYPE == TYPE_HALF
#define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB) \
{ \
    const Dtype8 acol0 = TRANSPOSE_BLOCK_8(_blockA, 0); \
@@ -334,6 +489,14 @@ GEMM_TN(0, 1) // ALPHA != 1, BETA != 0
    const Dtype8 acol5 = TRANSPOSE_BLOCK_8(_blockA, 5); \
    const Dtype8 acol6 = TRANSPOSE_BLOCK_8(_blockA, 6); \
    const Dtype8 acol7 = TRANSPOSE_BLOCK_8(_blockA, 7); \
    const Dtype8 acol8 = TRANSPOSE_BLOCK_8(_blockA, 8); \
    const Dtype8 acol9 = TRANSPOSE_BLOCK_8(_blockA, 9); \
    const Dtype8 acola = TRANSPOSE_BLOCK_8(_blockA, 10); \
    const Dtype8 acolb = TRANSPOSE_BLOCK_8(_blockA, 11); \
    const Dtype8 acolc = TRANSPOSE_BLOCK_8(_blockA, 12); \
    const Dtype8 acold = TRANSPOSE_BLOCK_8(_blockA, 13); \
    const Dtype8 acole = TRANSPOSE_BLOCK_8(_blockA, 14); \
    const Dtype8 acolf = TRANSPOSE_BLOCK_8(_blockA, 15); \
    _result = mad((Dtype8)_blockB.s0, acol0, _result); \
    _result = mad((Dtype8)_blockB.s1, acol1, _result); \
    _result = mad((Dtype8)_blockB.s2, acol2, _result); \
@@ -342,8 +505,80 @@ GEMM_TN(0, 1) // ALPHA != 1, BETA != 0
    _result = mad((Dtype8)_blockB.s5, acol5, _result); \
    _result = mad((Dtype8)_blockB.s6, acol6, _result); \
    _result = mad((Dtype8)_blockB.s7, acol7, _result); \
    _result = mad((Dtype8)_blockB.s8, acol8, _result); \
    _result = mad((Dtype8)_blockB.s9, acol9, _result); \
    _result = mad((Dtype8)_blockB.sa, acola, _result); \
    _result = mad((Dtype8)_blockB.sb, acolb, _result); \
    _result = mad((Dtype8)_blockB.sc, acolc, _result); \
    _result = mad((Dtype8)_blockB.sd, acold, _result); \
    _result = mad((Dtype8)_blockB.se, acole, _result); \
    _result = mad((Dtype8)_blockB.sf, acolf, _result); \
}
#else
#define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB) \
{ \
    const Dtype8 acol0 = TRANSPOSE_BLOCK_8(_blockA, 0); \
    const Dtype8 acol1 = TRANSPOSE_BLOCK_8(_blockA, 1); \
    const Dtype8 acol2 = TRANSPOSE_BLOCK_8(_blockA, 2); \
    const Dtype8 acol3 = TRANSPOSE_BLOCK_8(_blockA, 3); \
    const Dtype8 acol4 = TRANSPOSE_BLOCK_8(_blockA, 4); \
    const Dtype8 acol5 = TRANSPOSE_BLOCK_8(_blockA, 5); \
    const Dtype8 acol6 = TRANSPOSE_BLOCK_8(_blockA, 6); \
    const Dtype8 acol7 = TRANSPOSE_BLOCK_8(_blockA, 7); \
    _result = mad((Dtype8)_blockB.s0, acol0, _result); \
    _result = mad((Dtype8)_blockB.s1, acol1, _result); \
    _result = mad((Dtype8)_blockB.s2, acol2, _result); \
    _result = mad((Dtype8)_blockB.s3, acol3, _result); \
    _result = mad((Dtype8)_blockB.s4, acol4, _result); \
    _result = mad((Dtype8)_blockB.s5, acol5, _result); \
    _result = mad((Dtype8)_blockB.s6, acol6, _result); \
    _result = mad((Dtype8)_blockB.s7, acol7, _result); \
}
#endif
#if TYPE == TYPE_HALF
#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
__kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
    __read_only image2d_t A, \
    MATB_PARAMETER, \
    MATC_PARAMETER, \
    KERNEL_ARG_DTYPE alpha_in, \
    KERNEL_ARG_DTYPE beta_in, \
    int padded_k, \
    int k, \
    int isFirstColBlock) \
{ \
    const Dtype alpha = (Dtype)alpha_in; \
    const Dtype beta = (Dtype)beta_in; \
    const int group_x = get_group_id(0); \
    const int group_y = get_group_id(1); \
    Dtype8 blockAxB00 = 0; \
    Dtype8 blockAxB01 = 0; \
    Dtype8 blockAxB02 = 0; \
    Dtype8 blockAxB03 = 0; \
    int2 coordA = (int2)(0, group_y * TILE_M); \
    int2 coordB = (int2)(0, (group_x * TILE_N)); \
    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \
    do \
    { \
        Dtype16 blockB00; \
        BLOCKB_READ8(blockB00, B, coordB); \
        int2 coordATemp = coordA; \
        Dtype8 blockA00 = as_Dtype8(SUBGROUP_BLOCK_READ8(A, coordATemp)); coordATemp.y += 8; \
        Dtype8 blockA01 = as_Dtype8(SUBGROUP_BLOCK_READ8(A, coordATemp)); coordATemp.y += 8; \
        Dtype8 blockA02 = as_Dtype8(SUBGROUP_BLOCK_READ8(A, coordATemp)); coordATemp.y += 8; \
        Dtype8 blockA03 = as_Dtype8(SUBGROUP_BLOCK_READ8(A, coordATemp)); coordA.x += TILE_K * SIZE_OF_ELEMENT * 2; \
        MULTIPLY_BLOCKS_8x8(blockAxB00, blockA00, blockB00); \
        MULTIPLY_BLOCKS_8x8(blockAxB01, blockA01, blockB00); \
        MULTIPLY_BLOCKS_8x8(blockAxB02, blockA02, blockB00); \
        MULTIPLY_BLOCKS_8x8(blockAxB03, blockA03, blockB00); \
    } \
    while (coordB.x < padded_k / VECSIZE); \
    GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#else
#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
@@ -385,12 +620,23 @@ __kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
    while (coordB.x < padded_k / VECSIZE); \
    GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#endif
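// GEMM_NT is instantiated below with several B access schemes (VEC4 image
// reads, raw BUFFER reads, per-element SCALAR reads); BLOCKB_READ8 and
// MATB_PARAMETER are redefined and #undef'd around each group of
// instantiations.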
#if TYPE == TYPE_HALF
#define BLOCKB_READ8(_blockb, _B, _coordB) \
        int2 _coordBTemp = _coordB; \
        _coordBTemp.y += get_local_id(0); \
        _blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.s89ab = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.scdef = READ_IMAGE(_B, _coordBTemp); _coordB.x += 4;
#else
#define BLOCKB_READ8(_blockb, _B, _coordB) \
        int2 _coordBTemp = _coordB; \
        _coordBTemp.y += get_local_id(0); \
        _blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2;
#endif
#define MATB_PARAMETER __read_only image2d_t B
@@ -401,12 +647,21 @@ GEMM_NT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0
#undef BLOCKB_READ8
#undef MATB_PARAMETER
#if TYPE == TYPE_HALF
#define BLOCKB_READ8(_blockb, _B, _coordB) \
        int2 _coordBTemp = _coordB; \
        _coordBTemp.y += get_local_id(0); \
        const __global float *B_read = (__global float *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \
        _blockb = as_Dtype16(as_ushort16(vload8(0, B_read))); \
        _coordB.x += TILE_K * 2;
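// The half path above loads 8 floats (32 bytes) and reinterprets them as 16
// half values, which is why _coordB.x advances by TILE_K * 2 elements.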
#else
#define BLOCKB_READ8(_blockb, _B, _coordB) \
        int2 _coordBTemp = _coordB; \
        _coordBTemp.y += get_local_id(0); \
        const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \
        _blockb = vload8(0, B_read); \
        _coordB.x += TILE_K;
#endif
#define MATB_PARAMETER __global Dtype *B, int offB, int ldb
@@ -417,6 +672,45 @@ GEMM_NT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0
#undef BLOCKB_READ8
#undef MATB_PARAMETER
#if TYPE == TYPE_HALF
#define BLOCKB_READ8(_blockb, _B, _coordB) \
        int2 _coordBTemp = _coordB; \
        _coordBTemp.y += get_local_id(0); \
        Dtype4 temp; \
        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.s0 = temp.s0; \
        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.s1 = temp.s0; \
        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.s2 = temp.s0; \
        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.s3 = temp.s0; \
        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.s4 = temp.s0; \
        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.s5 = temp.s0; \
        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.s6 = temp.s0; \
        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.s7 = temp.s0; \
        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.s8 = temp.s0; \
        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.s9 = temp.s0; \
        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.sa = temp.s0; \
        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.sb = temp.s0; \
        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.sc = temp.s0; \
        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.sd = temp.s0; \
        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.se = temp.s0; \
        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.sf = temp.s0; \
        _coordB.x += 16;
#else
#define BLOCKB_READ8(_blockb, _B, _coordB) \
        int2 _coordBTemp = _coordB; \
        _coordBTemp.y += get_local_id(0); \
@@ -438,6 +732,7 @@ GEMM_NT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0
        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
        _blockb.s7 = temp.s0; \
        _coordB.x += 8;
#endif
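// The SCALAR variants fetch one element per image read; presumably the host
// falls back to them when B's width or alignment rules out the wider VEC4
// reads.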
#define MATB_PARAMETER __read_only image2d_t B
@@ -483,6 +778,47 @@ GEMM_NT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0
    _result = mad((Dtype8)_blockB.s7, acol7, _result); \
}
#if TYPE == TYPE_HALF
#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
__kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
    __read_only image2d_t A, \
    MATB_PARAMETER, \
    MATC_PARAMETER, \
    KERNEL_ARG_DTYPE alpha_in, \
    KERNEL_ARG_DTYPE beta_in, \
    int padded_k, \
    int k, \
    int isFirstColBlock) \
{ \
    const Dtype alpha = (Dtype)alpha_in; \
    const Dtype beta = (Dtype)beta_in; \
    const int group_x = get_group_id(0); \
    const int group_y = get_group_id(1); \
    Dtype8 blockAxB00 = 0; \
    Dtype8 blockAxB01 = 0; \
    Dtype8 blockAxB02 = 0; \
    Dtype8 blockAxB03 = 0; \
    int2 coordA = (int2)(group_y * TILE_M * SIZE_OF_ELEMENT, 0); \
    int2 coordB = (int2)(0, (group_x * TILE_N)); \
    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \
    do \
    { \
        Dtype8 blockB00; \
        BLOCKB_READ8(blockB00, B, coordB); \
        int2 coordATemp = coordA; \
        Dtype8 blockA00 = as_Dtype8(SUBGROUP_BLOCK_READ8(A, coordATemp)); coordATemp.x += 16 * SIZE_OF_ELEMENT; \
        Dtype8 blockA01 = as_Dtype8(SUBGROUP_BLOCK_READ8(A, coordATemp)); coordA.y += TILE_K; \
        MULTIPLY_BLOCKS_8x8(blockAxB00, blockA00, blockB00, 0); \
        MULTIPLY_BLOCKS_8x8(blockAxB01, blockA00, blockB00, 8); \
        MULTIPLY_BLOCKS_8x8(blockAxB02, blockA01, blockB00, 0); \
        MULTIPLY_BLOCKS_8x8(blockAxB03, blockA01, blockB00, 8); \
    } \
    while (coordB.x < padded_k / VECSIZE); \
    GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#else
#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
@@ -524,6 +860,7 @@ __kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
    while (coordB.x < padded_k / VECSIZE); \
    GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
}
#endif
#define BLOCKB_READ8(_blockb, _B, _coordB) \
        int2 _coordBTemp = _coordB; \
@@ -540,12 +877,21 @@ GEMM_TT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0
#undef BLOCKB_READ8
#undef MATB_PARAMETER
#if TYPE == TYPE_HALF
#define BLOCKB_READ8(_blockb, _B, _coordB) \
        int2 _coordBTemp = _coordB; \
        _coordBTemp.y += get_local_id(0); \
        const __global float *B_read = (__global float *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \
        _blockb = as_Dtype8(as_ushort8(vload4(0, B_read))); \
        _coordB.x += TILE_K;
#else
#define BLOCKB_READ8(_blockb, _B, _coordB) \
        int2 _coordBTemp = _coordB; \
        _coordBTemp.y += get_local_id(0); \
        const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \
        _blockb = vload8(0, B_read); \
        _coordB.x += TILE_K;
#endif
#define MATB_PARAMETER __global Dtype *B, int offB, int ldb
@@ -598,7 +944,7 @@ GEMM_TT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0
#undef READ_IMAGE
#undef SIZE_OF_ELEMENT
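// Helper kernels that stage a matrix buffer into an image2d_t so that the
// GEMM kernels above can access it through subgroup block reads and samplers.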
__kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)(
    __global Dtype* A,
    __write_only image2d_t ImA,
    int offA,
@@ -611,10 +957,14 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_transpose,Dtype)(
    int2 coord_dst = (int2)(gidx, gidy);
    __global Dtype* A_off = A + offA;
    Dtype srcA = A_off[gidy * ldA + gidx];
#if TYPE == TYPE_HALF
    write_imageh(ImA, coord_dst, (Dtype4)srcA);
#else
    write_imagef(ImA, coord_dst, (Dtype4)srcA);
#endif
}
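// The float path of the kernel below copies the raw bytes of each element via
// write_imageui, so the staging image only needs to match the element size
// rather than its arithmetic type.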
__kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)(
    __global Dtype* A,
    __write_only image2d_t ImA,
    int offA,
@@ -625,6 +975,14 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose,Dtype)(
    const int gidx = get_global_id(0);
    const int gidy = get_global_id(1);
    int2 coord_dst = (int2)(gidx, gidy);
#if TYPE == TYPE_HALF
    if (gidx >= width || gidy >= height) {
        write_imageh(ImA, coord_dst, (Dtype4)0);
        return;
    }
    __global Dtype* A_off = A + offA;
    write_imageh(ImA, coord_dst, (Dtype4)A_off[gidy * ldA + gidx]);
#else
    if (gidx >= width || gidy >= height) {
        write_imageui(ImA, coord_dst, (uint4)0);
        return;
    }
@@ -632,4 +990,5 @@ __kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose,Dtype)(
    __global Dtype* A_off = A + offA;
    uint4 srcA = convert_uint4(as_uchar4(A_off[gidy * ldA + gidx]));
    write_imageui(ImA, coord_dst, srcA);
#endif
}