@@ -15,7 +15,9 @@
// Third party copyrights are property of their respective owners.
//
// @Authors
// Pang Erping, erping@multicorewareinc.com
// Jia Haipeng, jiahaipeng95@gmail.com
// Peng Xiao, pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
@@ -42,292 +44,229 @@
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#define BORDER_REFLECT_101
//#define BORDER_REFLECT_101
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro for border type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef BORDER_REPLICATE
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif

#ifdef BORDER_REFLECT
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i)-1 : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i)-1 : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
#endif

#ifdef BORDER_REFLECT_101
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
#endif
#ifdef BORDER_WRAP
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
#endif // BORDER_WRAP
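// Usage sketch (illustrative, not part of the original kernels): the ADDR_* macros are meant
// to be chained, so an index is first fixed up at the low edge and the result is then fed to
// the high-edge macro through its 'addr' argument, e.g. for a column index 'c' in an image of
// width 'wholecols' with BORDER_REFLECT_101:
//
//     int cc = ADDR_L(c, 0, wholecols);   // c == -2        ->  2
//     cc     = ADDR_R(c, wholecols, cc);  // c == wholecols ->  wholecols - 2
//
// which matches the gfedcb|abcdefgh|gfedcba pattern documented above.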
#ifdef IMG_C_1_0
#define T_IMG uchar
#define T_IMGx4 uchar4
#define T_IMG_C1 uchar
#define CONVERT_TYPE convert_uchar_sat
#define CONVERT_TYPEx4 convert_uchar4_sat
#endif
#ifdef IMG_C_4_0
#define T_IMG uchar4
#define T_IMGx4 uchar16
#define T_IMG_C1 uchar
#define CONVERT_TYPE convert_uchar4_sat
#define CONVERT_TYPEx4 convert_uchar16_sat
#endif
#ifdef IMG_C_1_5
#define T_IMG float
#define T_IMGx4 float4
#define T_IMG_C1 float
#define CONVERT_TYPE convert_float
#define CONVERT_TYPEx4 convert_float4
#endif
#ifdef IMG_C_4_5
#define T_IMG float4
#define T_IMGx4 float16
#define T_IMG_C1 float
#define CONVERT_TYPE convert_float4
#define CONVERT_TYPEx4 convert_float16
#endif
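// Illustrative note (assumption about the host side, based on OpenCV's depth codes): the
// IMG_C_<cn>_<depth> switches above are expected to arrive as program build options, e.g.
//     "-D IMG_C_4_0 -D CN=4"   for CV_8UC4  (T_IMG = uchar4, CONVERT_TYPE = convert_uchar4_sat)
//     "-D IMG_C_1_5 -D CN=1"   for CV_32FC1 (T_IMG = float,  CONVERT_TYPE = convert_float)
// Only the macro names themselves come from this file; the exact flag strings are an assumption.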
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////Macro for define elements number per thread/////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
//#define ANCHOR 3
//#define ANX 1
//#define ANY 1

#define ROWS_PER_GROUP 4
#define ROWS_PER_GROUP_BITS 2
#define ROWS_FETCH (ROWS_PER_GROUP + ANY + ANY) //(ROWS_PER_GROUP + anY * 2)

#define THREADS_PER_ROW 64
#define THREADS_PER_ROW_BIT 6
#ifndef CN
#define CN 1
#endif

#define ELEMENTS_PER_THREAD 4
#define ELEMENTS_PER_THREAD_BIT 2
#if CN == 1
#define T_SUM float
#define T_SUMx4 float4
#define CONVERT_TYPE_SUM convert_float
#define CONVERT_TYPE_SUMx4 convert_float4
#define SUM_ZERO (0.0f)
#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f)
#define VLOAD4 vload4
#define SX x
#define SY y
#define SZ z
#define SW w
#elif CN == 4
#define T_SUM float4
#define T_SUMx4 float16
#define CONVERT_TYPE_SUM convert_float4
#define CONVERT_TYPE_SUMx4 convert_float16
#define SUM_ZERO (0.0f, 0.0f, 0.0f, 0.0f)
#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f)
#define VLOAD4 vload16
#define SX s0123
#define SY s4567
#define SZ s89ab
#define SW scdef
#endif
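// Expansion example (for illustration only): with CN == 4 the accumulator declaration used
// later in the kernels,
//     T_SUMx4 sum = (T_SUMx4)SUM_ZEROx4;
// expands to
//     float16 sum = (float16)(0.0f, 0.0f, /* sixteen zeros in total */ 0.0f);
// and VLOAD4 becomes vload16, so one "x4" load fetches four 4-channel pixels at once.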
#define LOCAL_MEM_STEP 260 //divup((get_local_size(0) + anX * 2), 4) * 4
#ifndef FILTER_SIZE
#define FILTER_SIZE 3
#endif
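// Worked example for the constant above (assuming a work-group width of 256 and a one-pixel
// apron, anX = 1): 256 + 2 = 258, and rounding up to a multiple of 4 gives
// divup(258, 4) * 4 = 65 * 4 = 260, i.e. LOCAL_MEM_STEP.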
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void filter2D_C1_D0(__global uchar *src, int src_step, int src_offset_x, int src_offset_y,
__global uchar *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
int cols,int rows, int operate_cols, int wholecols, int wholerows)
#define LOCAL_GROUP_SIZE 16

#define LOCAL_WIDTH ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE)
#define LOCAL_HEIGHT ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE)

#define FILTER_RADIUS (FILTER_SIZE >> 1)
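// Worked example: with the defaults FILTER_SIZE = 3 and LOCAL_GROUP_SIZE = 16,
// FILTER_RADIUS = 1 and LOCAL_WIDTH = LOCAL_HEIGHT = (3/2)*2 + 16 = 18, i.e. a 16x16 tile
// plus a one-pixel apron on each side is cached in local memory.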
__kernel void filter2D(
__global T_IMG *src,
__global T_IMG *dst,
int src_step,
int dst_step,
__constant float *mat_kernel,
__local T_IMG *local_data,
int wholerows,
int wholecols,
int src_offset_x,
int src_offset_y,
int dst_offset_x,
int dst_offset_y,
int cols,
int rows,
int operate_cols
)
{
int gX = get_global_id(0);
int gY = get_global_id(1);

int lX = get_local_id(0);

int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0);

#define dst_align (dst_offset_x & 3)
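// Note (illustration): dst_align is dst_offset_x modulo 4. E.g. with dst_offset_x == 5,
// dst_align == 1, so the group's first column is shifted one pixel to the left and the
// vector store of four results below stays 4-element aligned; the per-component checks
// against dst_cols_start/dst_cols_end keep the extra lanes from being written outside the ROI.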
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;

__local uchar local_data[LOCAL_MEM_STEP * ROWS_FETCH];
if((gY << 2) < rows)
int groupStartCol = get_group_id(0) * get_local_size(0);
int groupStartRow = get_group_id(1) * get_local_size(1);

int localCol = get_local_id(0);
int localRow = get_local_id(1);
int globalCol = groupStartCol + localCol;
int globalRow = groupStartRow + localRow;
const int src_offset = mad24(src_offset_y, src_step, src_offset_x);
const int dst_offset = mad24(dst_offset_y, dst_step, dst_offset_x);
#ifdef BORDER_CONSTANT
for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1))
{
for(int i = 0; i < ROWS_FETCH; ++i)
int curRow = groupStartRow + i;
for(int j = localCol; j < LOCAL_WIDTH; j += get_local_size(0))
{
if((rows_start_index - src_offset_y) + i < rows + ANY)
int curCol = groupStartCol + j;
if(curRow < FILTER_RADIUS - src_offset_y || (curRow - FILTER_RADIUS) >= wholerows - src_offset_y||
curCol < FILTER_RADIUS - src_offset_x || (curCol - FILTER_RADIUS) >= wholecols - src_offset_x)
{
#ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX;

uchar data = *(src + selected_row * src_step + selected_cols);
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX ] =data;

if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;

data = *(src + selected_row * src_step + selected_cols);
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);

int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);

uchar data = *(src + selected_row * src_step + selected_cols);

local_data[i * LOCAL_MEM_STEP + lX ] =data;

if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);

data = *(src + selected_row * src_step + selected_cols);
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#endif
local_data[(i) * LOCAL_WIDTH + j] = 0;
}
else
{
local_data[(i) * LOCAL_WIDTH + j] = src[(curRow - FILTER_RADIUS) * src_step + curCol - FILTER_RADIUS + src_offset];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
if(((gY << 2) < rows) && (process_col < operate_cols))
#else
for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1))
{
int dst_cols_start = dst_offset_x;
int dst_cols_end = dst_offset_x + cols;
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;

int dst_rows_end = dst_offset_y + rows;
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
int curRow = groupStartRow + i;

uchar4 dst_data = *((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index));
curRow = ADDR_H(curRow, FILTER_RADIUS - src_offset_y, wholerows - src_offset_y);

int4 sum = (int4)(0);
uchar4 data;
curRow = ADDR_B(curRow - FILTER_RADIUS, wholerows - src_offset_y, curRow - FILTER_RADIUS);

for(int i = 0; i < ANCHOR; i++)
for(int j = localCol; j < LOCAL_WIDTH; j += get_local_size(0))
{
#pragma unroll
for(int j = 0; j < ANCHOR; j++)
int curCol = groupStartCol + j;
curCol = ADDR_L(curCol, FILTER_RADIUS - src_offset_x, wholecols - src_offset_x);
curCol = ADDR_R(curCol - FILTER_RADIUS, wholecols - src_offset_x, curCol - FILTER_RADIUS);
if(curRow < wholerows && curCol < wholecols)
{
if(dst_rows_index < dst_rows_end)
{
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;

data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int4_sat(data));
}
local_data[(i) * LOCAL_WIDTH + j] = src[(curRow) * src_step + curCol + src_offset];
}
}

if(dst_rows_index < dst_rows_end)
{
sum.x = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ? sum.x : dst_data.x;
sum.y = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ? sum.y : dst_data.y;
sum.z = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? sum.z : dst_data.z;
sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w;
*((__global uchar4 *)(dst + dst_rows_index * dst_step + dst_cols_index)) = convert_uchar4_sat(sum);
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32FC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void filter2D_C1_D5(__global float *src, int src_step, int src_offset_x, int src_offset_y,
__global float *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
int cols,int rows, int operate_cols, int wholecols, int wholerows)
{
int gX = get_global_id(0);
int gY = get_global_id(1);

int lX = get_local_id(0);

int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0);

#define dst_align (dst_offset_x & 3)
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;

__local float local_data[LOCAL_MEM_STEP * ROWS_FETCH];
if(((gY << 2) < rows))
#endif
barrier(CLK_LOCAL_MEM_FENCE);
if(globalRow < rows && globalCol < cols)
{
for(int i = 0; i < ROWS_FETCH; ++i)
T_SUM sum = (T_SUM)SUM_ZERO;
int filterIdx = 0;
for(int i = 0; i < FILTER_SIZE; i++)
{
if((rows_start_index - src_offset_y) + i < rows + ANY)
{
#ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX;

float data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
int offset = (i + localRow) * LOCAL_WIDTH;

if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;

data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);

int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);

float data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX] =data;

if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);

data = *((__global float *)((__global char *)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#endif
for(int j = 0; j < FILTER_SIZE; j++)
{
sum += CONVERT_TYPE_SUM(local_data[offset + j + localCol]) * mat_kernel[filterIdx++];
}
}
dst[(globalRow)*dst_step + (globalCol) + dst_offset] = CONVERT_TYPE(sum);
}
barrier(CLK_LOCAL_MEM_FENCE);
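// Illustration (not part of the original diff): with FILTER_SIZE == 3 the accumulation loop
// of the generic kernel above reduces to nine multiply-adds per output pixel, conceptually
//     sum += CONVERT_TYPE_SUM(local_data[offset + 0 + localCol]) * mat_kernel[0];
//     sum += CONVERT_TYPE_SUM(local_data[offset + 1 + localCol]) * mat_kernel[1];
//     sum += CONVERT_TYPE_SUM(local_data[offset + 2 + localCol]) * mat_kernel[2];
// repeated for the three rows, with offset advancing by LOCAL_WIDTH and filterIdx continuing
// to count through the 3x3 coefficient array.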
int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
if(((gY << 2) < rows) && (process_col < operate_cols))
{
int dst_cols_start = dst_offset_x;
int dst_cols_end = dst_offset_x + cols;
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;

int dst_rows_end = dst_offset_y + rows;
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
}

float4 dst_data = *((__global float4*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2)));
/// following is specific for 3x3 kernels

float4 sum = (float4)(0);
float4 data;
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////Macro for define elements number per thread/////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
#define ANX 1
#define ANY 1

for(int i = 0; i < ANCHOR; i++)
{
#pragma unroll
for(int j = 0; j < ANCHOR; j++)
{
if(dst_rows_index < dst_rows_end)
{
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
#define ROWS_PER_GROUP 4
#define ROWS_PER_GROUP_BITS 2
#define ROWS_FETCH (ROWS_PER_GROUP + ANY + ANY) //(ROWS_PER_GROUP + anY * 2)

data = vload4(0, local_data+local_row * LOCAL_MEM_STEP + local_cols);
sum = sum + ((float)(mat_kernel[i * ANCHOR + j]) * data);
}
}
}
#define THREADS_PER_ROW 64
#define THREADS_PER_ROW_BIT 6

if(dst_rows_index < dst_rows_end)
{
sum.x = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ? sum.x : dst_data.x;
sum.y = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ? sum.y : dst_data.y;
sum.z = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? sum.z : dst_data.z;
sum.w = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? sum.w : dst_data.w;
#define ELEMENTS_PER_THREAD 4
#define ELEMENTS_PER_THREAD_BIT 2

*((__global float4 *)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum;
}
}
}
#define LOCAL_MEM_STEP 260 //divup((get_local_size(0) + anX * 2), 4) * 4

///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC4////////////////////////////////////////////////////////
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_x, int src_offset_y,
__global uchar4 *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
int cols,int rows, int operate_cols, int wholecols, int wholerows)
__kernel void filter2D_3x3(
__global T_IMG *src,
__global T_IMG *dst,
int src_step,
int dst_step,
__constant float *mat_kernel,
__local T_IMG *local_data,
int wholerows,
int wholecols,
int src_offset_x,
int src_offset_y,
int dst_offset_x,
int dst_offset_y,
int cols,
int rows,
int operate_cols
)
{
int gX = get_global_id(0);
int gY = get_global_id(1);
@@ -341,9 +280,7 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;

__local uchar4 local_data[LOCAL_MEM_STEP * ROWS_FETCH];

if(((gY << 2) < rows))
if((gY << 2) < rows)
{
for(int i = 0; i < ROWS_FETCH; ++i)
{
@@ -353,19 +290,19 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX;

uchar4 data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
T_IMG data = src[mad24(selected_row, src_step, selected_cols)];
int con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX ] =data;
local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data;

if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;

data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = src[mad24(selected_row, src_step, selected_cols)];
con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
@@ -374,17 +311,17 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_
int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);

uchar4 data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
T_IMG data = src[mad24(selected_row, src_step, selected_cols)];

local_data[i * LOCAL_MEM_STEP + lX] =data;
local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data;

if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);

data = *((__global uchar4*)((__global char*)src + selected_row * src_step + (selected_cols << 2)));
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
data = src[mad24(selected_row, src_step, selected_cols)];
local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data;
}
#endif
}
@@ -401,131 +338,40 @@ __kernel void filter2D_C4_D0(__global uchar4 *src, int src_step, int src_offset_

int dst_rows_end = dst_offset_y + rows;
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
dst = dst + mad24(dst_rows_index, dst_step, dst_cols_index);

uchar16 dst_data;
dst_data = *((__global uchar16*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2)));
T_IMGx4 dst_data = *(__global T_IMGx4 *)dst;

int16 sum = (int16)(0);
uchar16 data;
T_SUMx4 sum = (T_SUMx4)SUM_ZEROx4;
T_IMGx4 data;

for(int i = 0; i < ANCHOR; i++)
for(int i = 0; i < FILTER_SIZE; i++)
{
#pragma unroll
for(int j = 0; j < ANCHOR; j++)
for(int j = 0; j < FILTER_SIZE; j++)
{
if(dst_rows_index < dst_rows_end)
{
int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;

data = vload16(0, (__local uchar *)(local_data+local_row * LOCAL_MEM_STEP + local_cols));
sum = sum + (mat_kernel[i * ANCHOR + j] * convert_int16_sat(data));
data = VLOAD4(0, (__local T_IMG_C1 *)(local_data + local_row * LOCAL_MEM_STEP + local_cols));
sum = sum + (mat_kernel[i * FILTER_SIZE + j] * CONVERT_TYPE_SUMx4(data));
}
}
}

if(dst_rows_index < dst_rows_end)
{
uchar16 sum1 = convert_uchar16_sat(sum);
sum1.s0123 = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end))?
sum1.s0123 : dst_data.s0123;
sum1.s4567 = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end))?
sum1.s4567 : dst_data.s4567;
sum1.s89ab = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end))?
sum1.s89ab : dst_data.s89ab;
sum1.scdef = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end))?
sum1.scdef : dst_data.scdef;

*((__global uchar16*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 2))) = sum1;
T_IMGx4 tmp_dst = CONVERT_TYPEx4(sum);
tmp_dst.SX = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ?
tmp_dst.SX : dst_data.SX;
tmp_dst.SY = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ?
tmp_dst.SY : dst_data.SY;
tmp_dst.SZ = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ?
tmp_dst.SZ : dst_data.SZ;
tmp_dst.SW = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ?
tmp_dst.SW : dst_data.SW;
*(__global T_IMGx4 *)dst = tmp_dst;
}
}
}
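// Illustration: the SX/SY/SZ/SW selectors above hide the channel layout. With CN == 1 they
// are the plain .x/.y/.z/.w components of a 4-pixel vector; with CN == 4, tmp_dst.SX expands
// to tmp_dst.s0123, i.e. the first 4-channel pixel inside the uchar16/float16 block, so the
// same per-column ROI test works for both layouts.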
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32FC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
#define ROWS_FETCH_C4 (1 + ANY + ANY) //(ROWS_PER_GROUP + anY * 2)
#define LOCAL_MEM_STEP_C4 260 //divup((get_local_size(0) + anX * 2), 4) * 4)
__kernel void filter2D_C4_D5(__global float4 *src, int src_step, int src_offset_x, int src_offset_y,
__global float4 *dst, int dst_step, int dst_offset_x, int dst_offset_y,
__constant int *mat_kernel __attribute__((max_constant_size (16384))),
int cols,int rows, int operate_cols, int wholecols, int wholerows)
{
int gX = get_global_id(0);
int gY = get_global_id(1);

int lX = get_local_id(0);

int groupX_size = get_local_size(0);
int groupX_id = get_group_id(0);

int cols_start_index_group = src_offset_x + groupX_size * groupX_id - ANX;
int rows_start_index = src_offset_y + gY - ANY;

__local float4 local_data[LOCAL_MEM_STEP_C4 * ROWS_FETCH_C4];
if((gY < rows) && (gX < (operate_cols + ANX + ANX)))
{
for(int i = 0; i < ROWS_FETCH_C4; ++i)
{
if((rows_start_index - src_offset_y) + i < rows + ANY)
{
#ifdef BORDER_CONSTANT
int selected_row = rows_start_index + i;
int selected_cols = cols_start_index_group + lX;

float4 data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
int con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX ] =data;

if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;

data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
con = selected_row >=0 && selected_row < wholerows && selected_cols >=0 && selected_cols < wholecols;
data = con ? data : 0;
local_data[i * LOCAL_MEM_STEP + lX + groupX_size] =data;
}
#else
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);

int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);

float4 data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
local_data[i * LOCAL_MEM_STEP_C4 + lX] =data;

if(lX < (ANX << 1))
{
selected_cols = cols_start_index_group + lX + groupX_size;
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);

data = *((__global float4*)((__global char*)src + selected_row * src_step + (selected_cols << 4)));
local_data[i * LOCAL_MEM_STEP_C4 + lX + groupX_size] =data;
}
#endif
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);

if((gY < rows) && (gX < operate_cols))
{
int dst_cols_index = dst_offset_x + gX;
int dst_rows_index = dst_offset_y + gY;

float4 sum = (float4)(0);

for(int i = 0; i < ANCHOR; i++)
{
for(int j = 0; j < ANCHOR; j++)
{
int local_cols = lX + j;
sum = sum + ((float)mat_kernel[i * ANCHOR + j] * local_data[i * LOCAL_MEM_STEP_C4 + local_cols]);
}
}

*((__global float4*)((__global char *)dst + dst_rows_index * dst_step + (dst_cols_index << 4))) = sum;
}
}