From dec6a3b080374004d9b748998c583b48abf5ab50 Mon Sep 17 00:00:00 2001 From: yao Date: Mon, 15 Apr 2013 16:46:25 +0800 Subject: [PATCH] make boxfilter kernel compile on Mac GPU OCL --- modules/ocl/src/opencl/filtering_boxFilter.cl | 367 +++++++++--------- 1 file changed, 184 insertions(+), 183 deletions(-) diff --git a/modules/ocl/src/opencl/filtering_boxFilter.cl b/modules/ocl/src/opencl/filtering_boxFilter.cl index 79ca8d735b..512e32997d 100644 --- a/modules/ocl/src/opencl/filtering_boxFilter.cl +++ b/modules/ocl/src/opencl/filtering_boxFilter.cl @@ -79,15 +79,73 @@ #define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr)) #endif +#define THREADS 256 +#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2) + +inline void update_dst_C1_D0(__global uchar *dst, __local uint* temp, + int dst_rows, int dst_cols, + int dst_startX, int dst_x_off, + float alpha) +{ + if(get_local_id(0) < anX || get_local_id(0) >= (THREADS-ksX+anX+1)) + { + return; + } + + uint4 tmp_sum = 0; + int posX = dst_startX - dst_x_off + (get_local_id(0)-anX)*4; + int posY = (get_group_id(1) << 1); + + for(int i=-anX; i<=anX; i++) + { + tmp_sum += vload4(get_local_id(0), temp+i); + } + + if(posY < dst_rows && posX < dst_cols) + { + tmp_sum /= (uint4) alpha; + if(posX >= 0 && posX < dst_cols) + *(dst) = tmp_sum.x; + if(posX+1 >= 0 && posX+1 < dst_cols) + *(dst + 1) = tmp_sum.y; + if(posX+2 >= 0 && posX+2 < dst_cols) + *(dst + 2) = tmp_sum.z; + if(posX+3 >= 0 && posX+3 < dst_cols) + *(dst + 3) = tmp_sum.w; + } +} + + +inline void update_dst_C4_D0(__global uchar4 *dst, __local uint4* temp, + int dst_rows, int dst_cols, + int dst_startX, int dst_x_off, + float alpha) +{ + if(get_local_id(0) >= (THREADS-ksX+1)) + { + return; + } + + int posX = dst_startX - dst_x_off + get_local_id(0); + int posY = (get_group_id(1) << 1); + + uint4 temp_sum = 0; + for(int i=-anX; i<=anX; i++) + { + temp_sum += temp[get_local_id(0) + anX + i]; + } + + if(posX >= 0 && posX < dst_cols && posY >= 0 && posY < dst_rows) + *dst = convert_uchar4(convert_float4(temp_sum)/alpha); +} + /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////8uC1//////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// -#define THREADS 256 -#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2) __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global uchar *dst, float alpha, - int src_offset, int src_whole_rows, int src_whole_cols, int src_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step - ) + int src_offset, int src_whole_rows, int src_whole_cols, int src_step, + int dst_offset, int dst_rows, int dst_cols, int dst_step + ) { int col = get_local_id(0); @@ -105,115 +163,84 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha int dst_startY = (gY << 1) + dst_y_off; uint4 data[ksY+1]; - __local uint4 temp[(THREADS<<1)]; + __local uint4 temp[2][THREADS]; #ifdef BORDER_CONSTANT - for(int i=0; i < ksY+1; i++) + for(int i=0; i < ksY+1; i++) + { + if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3src_whole_cols-1) - | (startY+i<0) | (startY+i>src_whole_rows-1); - if(not_all_in_range) - { - int selected_row; - int4 selected_col; - selected_row = ADDR_H(startY+i, 0, src_whole_rows); - selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); - - selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols); - selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x); - - selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols); - selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y); - - selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols); - selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z); - - selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols); - selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w); - - data[i].x = *(src + selected_row * src_step + selected_col.x); - data[i].y = *(src + selected_row * src_step + selected_col.y); - data[i].z = *(src + selected_row * src_step + selected_col.z); - data[i].w = *(src + selected_row * src_step + selected_col.w); - } - else - { - data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX))); - } - } -#endif - uint4 sum0 = 0, sum1 = 0, sum2 = 0; - for(int i=1; i < ksY; i++) + int not_all_in_range; + for(int i=0; i < ksY+1; i++) { - sum0 += (data[i]); - } - sum1 = sum0 + (data[0]); - sum2 = sum0 + (data[ksY]); + not_all_in_range = (startX+col*4<0) | (startX+col*4+3>src_whole_cols-1) + | (startY+i<0) | (startY+i>src_whole_rows-1); + if(not_all_in_range) + { + int selected_row; + int4 selected_col; + selected_row = ADDR_H(startY+i, 0, src_whole_rows); + selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); - temp[col] = sum1; - temp[col+THREADS] = sum2; - barrier(CLK_LOCAL_MEM_FENCE); + selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols); + selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x); - if(col >= anX && col < (THREADS-ksX+anX+1)) - { - int posX = dst_startX - dst_x_off + (col-anX)*4; - int posY = (gY << 1); - uint4 tmp_sum1=0, tmp_sum2=0; - for(int i=-anX; i<=anX; i++) - { - tmp_sum1 += vload4(col, (__local uint*)temp+i); - } + selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols); + selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y); - for(int i=-anX; i<=anX; i++) - { - tmp_sum2 += vload4(col, (__local uint*)(temp+THREADS)+i); - } + selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols); + selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z); - if(posY < dst_rows && posX < dst_cols) - { - if(posX >= 0 && posX < dst_cols) - *(dst+dst_startY * dst_step + dst_startX + (col-anX)*4) = tmp_sum1.x/alpha; - if(posX+1 >= 0 && posX+1 < dst_cols) - *(dst+dst_startY * dst_step + dst_startX+1 + (col-anX)*4) = tmp_sum1.y/alpha; - if(posX+2 >= 0 && posX+2 < dst_cols) - *(dst+dst_startY * dst_step + dst_startX+2 + (col-anX)*4) = tmp_sum1.z/alpha; - if(posX+3 >= 0 && posX+3 < dst_cols) - *(dst+dst_startY * dst_step + dst_startX+3 + (col-anX)*4) = tmp_sum1.w/alpha; + selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols); + selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w); + + data[i].x = *(src + selected_row * src_step + selected_col.x); + data[i].y = *(src + selected_row * src_step + selected_col.y); + data[i].z = *(src + selected_row * src_step + selected_col.z); + data[i].w = *(src + selected_row * src_step + selected_col.w); } - if(posY+1 < dst_rows && posX < dst_cols) + else { - dst_startY+=1; - if(posX >= 0 && posX < dst_cols) - *(dst+dst_startY * dst_step + dst_startX + (col-anX)*4) = tmp_sum2.x/alpha; - if(posX+1 >= 0 && posX+1 < dst_cols) - *(dst+dst_startY * dst_step + dst_startX+1 + (col-anX)*4) = tmp_sum2.y/alpha; - if(posX+2 >= 0 && posX+2 < dst_cols) - *(dst+dst_startY * dst_step + dst_startX+2 + (col-anX)*4) = tmp_sum2.z/alpha; - if(posX+3 >= 0 && posX+3 < dst_cols) - *(dst+dst_startY * dst_step + dst_startX+3 + (col-anX)*4) = tmp_sum2.w/alpha; + data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX))); } } +#endif + uint4 tmp_sum = 0; + for(int i=1; i < ksY; i++) + { + tmp_sum += (data[i]); + } + + int index = dst_startY * dst_step + dst_startX + (col-anX)*4; + + temp[0][col] = tmp_sum + (data[0]); + temp[1][col] = tmp_sum + (data[ksY]); + barrier(CLK_LOCAL_MEM_FENCE); + update_dst_C1_D0(dst+index, (__local uint *)(temp[0]), + dst_rows, dst_cols, dst_startX, dst_x_off, alpha); + update_dst_C1_D0(dst+index+dst_step, (__local uint *)(temp[1]), + dst_rows, dst_cols, dst_startX, dst_x_off, alpha); } @@ -221,9 +248,9 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha /////////////////////////////////////////8uC4//////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, float alpha, - int src_offset, int src_whole_rows, int src_whole_cols, int src_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step - ) + int src_offset, int src_whole_rows, int src_whole_cols, int src_step, + int dst_offset, int dst_rows, int dst_cols, int dst_step + ) { int col = get_local_id(0); const int gX = get_group_id(0); @@ -238,81 +265,63 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch int startY = (gY << 1) - anY + src_y_off; int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; int dst_startY = (gY << 1) + dst_y_off; - //int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4; - int end_addr = src_whole_cols-4; uint4 data[ksY+1]; __local uint4 temp[2][THREADS]; + #ifdef BORDER_CONSTANT bool con; - uint4 ss; for(int i=0; i < ksY+1; i++) { con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; - - //int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr); - //ss = convert_uint4(src[cur_addr]); - int cur_col = clamp(startX + col, 0, src_whole_cols); - if(con) - ss = convert_uint4(src[(startY+i)*(src_step>>2) + cur_col]); - data[i] = con ? ss : 0; + data[i].x = con ? src[(startY+i)*(src_step>>2) + cur_col].x : 0; + data[i].y = con ? src[(startY+i)*(src_step>>2) + cur_col].y : 0; + data[i].z = con ? src[(startY+i)*(src_step>>2) + cur_col].z : 0; + data[i].w = con ? src[(startY+i)*(src_step>>2) + cur_col].w : 0; } #else - for(int i=0; i < ksY+1; i++) - { - int selected_row; - int selected_col; - selected_row = ADDR_H(startY+i, 0, src_whole_rows); - selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); + for(int i=0; i < ksY+1; i++) + { + int selected_row; + int selected_col; + selected_row = ADDR_H(startY+i, 0, src_whole_rows); + selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); - selected_col = ADDR_L(startX+col, 0, src_whole_cols); - selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); + selected_col = ADDR_L(startX+col, 0, src_whole_cols); + selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); - data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]); - } + data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]); + } #endif - uint4 sum0 = 0, sum1 = 0, sum2 = 0; + uint4 tmp_sum = 0; for(int i=1; i < ksY; i++) { - sum0 += (data[i]); + tmp_sum += (data[i]); } - sum1 = sum0 + (data[0]); - sum2 = sum0 + (data[ksY]); - temp[0][col] = sum1; - temp[1][col] = sum2; - barrier(CLK_LOCAL_MEM_FENCE); - if(col < (THREADS-(ksX-1))) - { - col += anX; - int posX = dst_startX - dst_x_off + col - anX; - int posY = (gY << 1); - uint4 tmp_sum[2]={(uint4)(0,0,0,0),(uint4)(0,0,0,0)}; - for(int k=0; k<2; k++) - for(int i=-anX; i<=anX; i++) - { - tmp_sum[k] += temp[k][col+i]; - } - for(int i=0; i<2; i++) - { - if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows) - dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = convert_uchar4(convert_float4(tmp_sum[i])/alpha); - } + int index = dst_startY * (dst_step>>2)+ dst_startX + col; + + temp[0][col] = tmp_sum + (data[0]); + temp[1][col] = tmp_sum + (data[ksY]); + barrier(CLK_LOCAL_MEM_FENCE); + update_dst_C4_D0(dst+index, (__local uint4 *)(temp[0]), + dst_rows, dst_cols, dst_startX, dst_x_off, alpha); + update_dst_C4_D0(dst+index+(dst_step>>2), (__local uint4 *)(temp[1]), + dst_rows, dst_cols, dst_startX, dst_x_off, alpha); - } } /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////32fC1//////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float *dst, float alpha, - int src_offset, int src_whole_rows, int src_whole_cols, int src_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step - ) + int src_offset, int src_whole_rows, int src_whole_cols, int src_step, + int dst_offset, int dst_rows, int dst_cols, int dst_step + ) { int col = get_local_id(0); const int gX = get_group_id(0); @@ -327,7 +336,6 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float int startY = (gY << 1) - anY + src_y_off; int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; int dst_startY = (gY << 1) + dst_y_off; - int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4; float data[ksY+1]; __local float temp[2][THREADS]; #ifdef BORDER_CONSTANT @@ -336,28 +344,25 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float for(int i=0; i < ksY+1; i++) { con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; - //int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr); - //ss = src[cur_addr]; int cur_col = clamp(startX + col, 0, src_whole_cols); - //ss = src[(startY+i)*(src_step>>2) + cur_col]; - ss = (startY+i)=0&&cur_col>=0&&cur_col>2) + cur_col]:0; + ss = (startY+i)=0&&cur_col>=0&&cur_col>2) + cur_col]:(float)0; data[i] = con ? ss : 0.f; } #else - for(int i=0; i < ksY+1; i++) - { - int selected_row; - int selected_col; - selected_row = ADDR_H(startY+i, 0, src_whole_rows); - selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); + for(int i=0; i < ksY+1; i++) + { + int selected_row; + int selected_col; + selected_row = ADDR_H(startY+i, 0, src_whole_rows); + selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); - selected_col = ADDR_L(startX+col, 0, src_whole_cols); - selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); + selected_col = ADDR_L(startX+col, 0, src_whole_cols); + selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); - data[i] = src[selected_row * (src_step>>2) + selected_col]; - } + data[i] = src[selected_row * (src_step>>2) + selected_col]; + } #endif float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; @@ -376,7 +381,7 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float int posX = dst_startX - dst_x_off + col - anX; int posY = (gY << 1); - float tmp_sum[2]={0.0, 0.0}; + float tmp_sum[2]= {0.0, 0.0}; for(int k=0; k<2; k++) for(int i=-anX; i<=anX; i++) { @@ -395,9 +400,9 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float /////////////////////////////////////////32fC4//////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global float4 *dst, float alpha, - int src_offset, int src_whole_rows, int src_whole_cols, int src_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step - ) + int src_offset, int src_whole_rows, int src_whole_cols, int src_step, + int dst_offset, int dst_rows, int dst_cols, int dst_step + ) { int col = get_local_id(0); const int gX = get_group_id(0); @@ -412,7 +417,6 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa int startY = (gY << 1) - anY + src_y_off; int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; int dst_startY = (gY << 1) + dst_y_off; - int end_addr = (src_whole_rows-1)*(src_step>>4) + src_whole_cols-16; float4 data[ksY+1]; __local float4 temp[2][THREADS]; #ifdef BORDER_CONSTANT @@ -421,28 +425,25 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa for(int i=0; i < ksY+1; i++) { con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; - //int cur_addr = clamp((startY+i)*(src_step>>4)+(startX+col),0,end_addr); - //ss = src[cur_addr]; int cur_col = clamp(startX + col, 0, src_whole_cols); - //ss = src[(startY+i)*(src_step>>4) + cur_col]; - ss = (startY+i)=0&&cur_col>=0&&cur_col>4) + cur_col]:0; + ss = (startY+i)=0&&cur_col>=0&&cur_col>4) + cur_col]:(float4)0; data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0); } #else - for(int i=0; i < ksY+1; i++) - { - int selected_row; - int selected_col; - selected_row = ADDR_H(startY+i, 0, src_whole_rows); - selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); + for(int i=0; i < ksY+1; i++) + { + int selected_row; + int selected_col; + selected_row = ADDR_H(startY+i, 0, src_whole_rows); + selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); - selected_col = ADDR_L(startX+col, 0, src_whole_cols); - selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); + selected_col = ADDR_L(startX+col, 0, src_whole_cols); + selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); - data[i] = src[selected_row * (src_step>>4) + selected_col]; - } + data[i] = src[selected_row * (src_step>>4) + selected_col]; + } #endif float4 sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; @@ -461,7 +462,7 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa int posX = dst_startX - dst_x_off + col - anX; int posY = (gY << 1); - float4 tmp_sum[2]={(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)}; + float4 tmp_sum[2]= {(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)}; for(int k=0; k<2; k++) for(int i=-anX; i<=anX; i++) {