|
|
|
@ -79,11 +79,69 @@ |
|
|
|
|
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr)) |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
#define THREADS 256 |
|
|
|
|
#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2) |
|
|
|
|
|
|
|
|
|
inline void update_dst_C1_D0(__global uchar *dst, __local uint* temp, |
|
|
|
|
int dst_rows, int dst_cols, |
|
|
|
|
int dst_startX, int dst_x_off, |
|
|
|
|
float alpha) |
|
|
|
|
{ |
|
|
|
|
if(get_local_id(0) < anX || get_local_id(0) >= (THREADS-ksX+anX+1)) |
|
|
|
|
{ |
|
|
|
|
return; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
uint4 tmp_sum = 0; |
|
|
|
|
int posX = dst_startX - dst_x_off + (get_local_id(0)-anX)*4; |
|
|
|
|
int posY = (get_group_id(1) << 1); |
|
|
|
|
|
|
|
|
|
for(int i=-anX; i<=anX; i++) |
|
|
|
|
{ |
|
|
|
|
tmp_sum += vload4(get_local_id(0), temp+i); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if(posY < dst_rows && posX < dst_cols) |
|
|
|
|
{ |
|
|
|
|
tmp_sum /= (uint4) alpha; |
|
|
|
|
if(posX >= 0 && posX < dst_cols) |
|
|
|
|
*(dst) = tmp_sum.x; |
|
|
|
|
if(posX+1 >= 0 && posX+1 < dst_cols) |
|
|
|
|
*(dst + 1) = tmp_sum.y; |
|
|
|
|
if(posX+2 >= 0 && posX+2 < dst_cols) |
|
|
|
|
*(dst + 2) = tmp_sum.z; |
|
|
|
|
if(posX+3 >= 0 && posX+3 < dst_cols) |
|
|
|
|
*(dst + 3) = tmp_sum.w; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
inline void update_dst_C4_D0(__global uchar4 *dst, __local uint4* temp, |
|
|
|
|
int dst_rows, int dst_cols, |
|
|
|
|
int dst_startX, int dst_x_off, |
|
|
|
|
float alpha) |
|
|
|
|
{ |
|
|
|
|
if(get_local_id(0) >= (THREADS-ksX+1)) |
|
|
|
|
{ |
|
|
|
|
return; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
int posX = dst_startX - dst_x_off + get_local_id(0); |
|
|
|
|
int posY = (get_group_id(1) << 1); |
|
|
|
|
|
|
|
|
|
uint4 temp_sum = 0; |
|
|
|
|
for(int i=-anX; i<=anX; i++) |
|
|
|
|
{ |
|
|
|
|
temp_sum += temp[get_local_id(0) + anX + i]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if(posX >= 0 && posX < dst_cols && posY >= 0 && posY < dst_rows) |
|
|
|
|
*dst = convert_uchar4(convert_float4(temp_sum)/alpha); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////// |
|
|
|
|
/////////////////////////////////////////8uC1//////////////////////////////////////////////////////// |
|
|
|
|
//////////////////////////////////////////////////////////////////////////////////////////////////// |
|
|
|
|
#define THREADS 256 |
|
|
|
|
#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2) |
|
|
|
|
__kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global uchar *dst, float alpha, |
|
|
|
|
int src_offset, int src_whole_rows, int src_whole_cols, int src_step, |
|
|
|
|
int dst_offset, int dst_rows, int dst_cols, int dst_step |
|
|
|
@ -105,14 +163,19 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha |
|
|
|
|
int dst_startY = (gY << 1) + dst_y_off; |
|
|
|
|
|
|
|
|
|
uint4 data[ksY+1]; |
|
|
|
|
__local uint4 temp[(THREADS<<1)]; |
|
|
|
|
__local uint4 temp[2][THREADS]; |
|
|
|
|
|
|
|
|
|
#ifdef BORDER_CONSTANT |
|
|
|
|
|
|
|
|
|
for(int i=0; i < ksY+1; i++) |
|
|
|
|
{ |
|
|
|
|
if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3<src_whole_cols) |
|
|
|
|
data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX))); |
|
|
|
|
{ |
|
|
|
|
data[i].x = *(src+(startY+i)*src_step + startX + col * 4); |
|
|
|
|
data[i].y = *(src+(startY+i)*src_step + startX + col * 4 + 1); |
|
|
|
|
data[i].z = *(src+(startY+i)*src_step + startX + col * 4 + 2); |
|
|
|
|
data[i].w = *(src+(startY+i)*src_step + startX + col * 4 + 3); |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
{ |
|
|
|
|
data[i]=0; |
|
|
|
@ -163,57 +226,21 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
uint4 sum0 = 0, sum1 = 0, sum2 = 0; |
|
|
|
|
uint4 tmp_sum = 0; |
|
|
|
|
for(int i=1; i < ksY; i++) |
|
|
|
|
{ |
|
|
|
|
sum0 += (data[i]); |
|
|
|
|
} |
|
|
|
|
sum1 = sum0 + (data[0]); |
|
|
|
|
sum2 = sum0 + (data[ksY]); |
|
|
|
|
|
|
|
|
|
temp[col] = sum1; |
|
|
|
|
temp[col+THREADS] = sum2; |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
if(col >= anX && col < (THREADS-ksX+anX+1)) |
|
|
|
|
{ |
|
|
|
|
int posX = dst_startX - dst_x_off + (col-anX)*4; |
|
|
|
|
int posY = (gY << 1); |
|
|
|
|
uint4 tmp_sum1=0, tmp_sum2=0; |
|
|
|
|
for(int i=-anX; i<=anX; i++) |
|
|
|
|
{ |
|
|
|
|
tmp_sum1 += vload4(col, (__local uint*)temp+i); |
|
|
|
|
tmp_sum += (data[i]); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for(int i=-anX; i<=anX; i++) |
|
|
|
|
{ |
|
|
|
|
tmp_sum2 += vload4(col, (__local uint*)(temp+THREADS)+i); |
|
|
|
|
} |
|
|
|
|
int index = dst_startY * dst_step + dst_startX + (col-anX)*4; |
|
|
|
|
|
|
|
|
|
if(posY < dst_rows && posX < dst_cols) |
|
|
|
|
{ |
|
|
|
|
if(posX >= 0 && posX < dst_cols) |
|
|
|
|
*(dst+dst_startY * dst_step + dst_startX + (col-anX)*4) = tmp_sum1.x/alpha; |
|
|
|
|
if(posX+1 >= 0 && posX+1 < dst_cols) |
|
|
|
|
*(dst+dst_startY * dst_step + dst_startX+1 + (col-anX)*4) = tmp_sum1.y/alpha; |
|
|
|
|
if(posX+2 >= 0 && posX+2 < dst_cols) |
|
|
|
|
*(dst+dst_startY * dst_step + dst_startX+2 + (col-anX)*4) = tmp_sum1.z/alpha; |
|
|
|
|
if(posX+3 >= 0 && posX+3 < dst_cols) |
|
|
|
|
*(dst+dst_startY * dst_step + dst_startX+3 + (col-anX)*4) = tmp_sum1.w/alpha; |
|
|
|
|
} |
|
|
|
|
if(posY+1 < dst_rows && posX < dst_cols) |
|
|
|
|
{ |
|
|
|
|
dst_startY+=1; |
|
|
|
|
if(posX >= 0 && posX < dst_cols) |
|
|
|
|
*(dst+dst_startY * dst_step + dst_startX + (col-anX)*4) = tmp_sum2.x/alpha; |
|
|
|
|
if(posX+1 >= 0 && posX+1 < dst_cols) |
|
|
|
|
*(dst+dst_startY * dst_step + dst_startX+1 + (col-anX)*4) = tmp_sum2.y/alpha; |
|
|
|
|
if(posX+2 >= 0 && posX+2 < dst_cols) |
|
|
|
|
*(dst+dst_startY * dst_step + dst_startX+2 + (col-anX)*4) = tmp_sum2.z/alpha; |
|
|
|
|
if(posX+3 >= 0 && posX+3 < dst_cols) |
|
|
|
|
*(dst+dst_startY * dst_step + dst_startX+3 + (col-anX)*4) = tmp_sum2.w/alpha; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
temp[0][col] = tmp_sum + (data[0]); |
|
|
|
|
temp[1][col] = tmp_sum + (data[ksY]); |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
update_dst_C1_D0(dst+index, (__local uint *)(temp[0]), |
|
|
|
|
dst_rows, dst_cols, dst_startX, dst_x_off, alpha); |
|
|
|
|
update_dst_C1_D0(dst+index+dst_step, (__local uint *)(temp[1]), |
|
|
|
|
dst_rows, dst_cols, dst_startX, dst_x_off, alpha); |
|
|
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -238,26 +265,21 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch |
|
|
|
|
int startY = (gY << 1) - anY + src_y_off; |
|
|
|
|
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; |
|
|
|
|
int dst_startY = (gY << 1) + dst_y_off; |
|
|
|
|
//int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4; |
|
|
|
|
|
|
|
|
|
int end_addr = src_whole_cols-4; |
|
|
|
|
uint4 data[ksY+1]; |
|
|
|
|
__local uint4 temp[2][THREADS]; |
|
|
|
|
|
|
|
|
|
#ifdef BORDER_CONSTANT |
|
|
|
|
bool con; |
|
|
|
|
uint4 ss; |
|
|
|
|
for(int i=0; i < ksY+1; i++) |
|
|
|
|
{ |
|
|
|
|
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; |
|
|
|
|
|
|
|
|
|
//int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr); |
|
|
|
|
//ss = convert_uint4(src[cur_addr]); |
|
|
|
|
|
|
|
|
|
int cur_col = clamp(startX + col, 0, src_whole_cols); |
|
|
|
|
if(con) |
|
|
|
|
ss = convert_uint4(src[(startY+i)*(src_step>>2) + cur_col]); |
|
|
|
|
|
|
|
|
|
data[i] = con ? ss : 0; |
|
|
|
|
data[i].x = con ? src[(startY+i)*(src_step>>2) + cur_col].x : 0; |
|
|
|
|
data[i].y = con ? src[(startY+i)*(src_step>>2) + cur_col].y : 0; |
|
|
|
|
data[i].z = con ? src[(startY+i)*(src_step>>2) + cur_col].z : 0; |
|
|
|
|
data[i].w = con ? src[(startY+i)*(src_step>>2) + cur_col].w : 0; |
|
|
|
|
} |
|
|
|
|
#else |
|
|
|
|
for(int i=0; i < ksY+1; i++) |
|
|
|
@ -275,35 +297,22 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#endif |
|
|
|
|
uint4 sum0 = 0, sum1 = 0, sum2 = 0; |
|
|
|
|
uint4 tmp_sum = 0; |
|
|
|
|
for(int i=1; i < ksY; i++) |
|
|
|
|
{ |
|
|
|
|
sum0 += (data[i]); |
|
|
|
|
tmp_sum += (data[i]); |
|
|
|
|
} |
|
|
|
|
sum1 = sum0 + (data[0]); |
|
|
|
|
sum2 = sum0 + (data[ksY]); |
|
|
|
|
temp[0][col] = sum1; |
|
|
|
|
temp[1][col] = sum2; |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
if(col < (THREADS-(ksX-1))) |
|
|
|
|
{ |
|
|
|
|
col += anX; |
|
|
|
|
int posX = dst_startX - dst_x_off + col - anX; |
|
|
|
|
int posY = (gY << 1); |
|
|
|
|
|
|
|
|
|
uint4 tmp_sum[2]={(uint4)(0,0,0,0),(uint4)(0,0,0,0)}; |
|
|
|
|
for(int k=0; k<2; k++) |
|
|
|
|
for(int i=-anX; i<=anX; i++) |
|
|
|
|
{ |
|
|
|
|
tmp_sum[k] += temp[k][col+i]; |
|
|
|
|
} |
|
|
|
|
for(int i=0; i<2; i++) |
|
|
|
|
{ |
|
|
|
|
if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows) |
|
|
|
|
dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = convert_uchar4(convert_float4(tmp_sum[i])/alpha); |
|
|
|
|
} |
|
|
|
|
int index = dst_startY * (dst_step>>2)+ dst_startX + col; |
|
|
|
|
|
|
|
|
|
temp[0][col] = tmp_sum + (data[0]); |
|
|
|
|
temp[1][col] = tmp_sum + (data[ksY]); |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
update_dst_C4_D0(dst+index, (__local uint4 *)(temp[0]), |
|
|
|
|
dst_rows, dst_cols, dst_startX, dst_x_off, alpha); |
|
|
|
|
update_dst_C4_D0(dst+index+(dst_step>>2), (__local uint4 *)(temp[1]), |
|
|
|
|
dst_rows, dst_cols, dst_startX, dst_x_off, alpha); |
|
|
|
|
|
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////// |
|
|
|
@ -327,7 +336,6 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float |
|
|
|
|
int startY = (gY << 1) - anY + src_y_off; |
|
|
|
|
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; |
|
|
|
|
int dst_startY = (gY << 1) + dst_y_off; |
|
|
|
|
int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4; |
|
|
|
|
float data[ksY+1]; |
|
|
|
|
__local float temp[2][THREADS]; |
|
|
|
|
#ifdef BORDER_CONSTANT |
|
|
|
@ -336,12 +344,9 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float |
|
|
|
|
for(int i=0; i < ksY+1; i++) |
|
|
|
|
{ |
|
|
|
|
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; |
|
|
|
|
//int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr); |
|
|
|
|
//ss = src[cur_addr]; |
|
|
|
|
|
|
|
|
|
int cur_col = clamp(startX + col, 0, src_whole_cols); |
|
|
|
|
//ss = src[(startY+i)*(src_step>>2) + cur_col]; |
|
|
|
|
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>2) + cur_col]:0; |
|
|
|
|
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>2) + cur_col]:(float)0; |
|
|
|
|
|
|
|
|
|
data[i] = con ? ss : 0.f; |
|
|
|
|
} |
|
|
|
@ -376,7 +381,7 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float |
|
|
|
|
int posX = dst_startX - dst_x_off + col - anX; |
|
|
|
|
int posY = (gY << 1); |
|
|
|
|
|
|
|
|
|
float tmp_sum[2]={0.0, 0.0}; |
|
|
|
|
float tmp_sum[2]= {0.0, 0.0}; |
|
|
|
|
for(int k=0; k<2; k++) |
|
|
|
|
for(int i=-anX; i<=anX; i++) |
|
|
|
|
{ |
|
|
|
@ -412,7 +417,6 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa |
|
|
|
|
int startY = (gY << 1) - anY + src_y_off; |
|
|
|
|
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; |
|
|
|
|
int dst_startY = (gY << 1) + dst_y_off; |
|
|
|
|
int end_addr = (src_whole_rows-1)*(src_step>>4) + src_whole_cols-16; |
|
|
|
|
float4 data[ksY+1]; |
|
|
|
|
__local float4 temp[2][THREADS]; |
|
|
|
|
#ifdef BORDER_CONSTANT |
|
|
|
@ -421,12 +425,9 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa |
|
|
|
|
for(int i=0; i < ksY+1; i++) |
|
|
|
|
{ |
|
|
|
|
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; |
|
|
|
|
//int cur_addr = clamp((startY+i)*(src_step>>4)+(startX+col),0,end_addr); |
|
|
|
|
//ss = src[cur_addr]; |
|
|
|
|
|
|
|
|
|
int cur_col = clamp(startX + col, 0, src_whole_cols); |
|
|
|
|
//ss = src[(startY+i)*(src_step>>4) + cur_col]; |
|
|
|
|
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>4) + cur_col]:0; |
|
|
|
|
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>4) + cur_col]:(float4)0; |
|
|
|
|
|
|
|
|
|
data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0); |
|
|
|
|
} |
|
|
|
@ -461,7 +462,7 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa |
|
|
|
|
int posX = dst_startX - dst_x_off + col - anX; |
|
|
|
|
int posY = (gY << 1); |
|
|
|
|
|
|
|
|
|
float4 tmp_sum[2]={(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)}; |
|
|
|
|
float4 tmp_sum[2]= {(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)}; |
|
|
|
|
for(int k=0; k<2; k++) |
|
|
|
|
for(int i=-anX; i<=anX; i++) |
|
|
|
|
{ |
|
|
|
|