@ -16,7 +16,7 @@
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclM aterials provided with the distribution.
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
@ -48,34 +48,43 @@
# define ALIGN ( RADIUS )
# endif
// Border-extrapolation helpers; which definitions are active depends on the BORDER_* symbol
// supplied at build time.
// NOTE(review): this span looks like a flattened diff - an ELEM / ADDR_L / ADDR_R macro family
// and an EXTRAPOLATE macro family appear interleaved, and the #ifdef / #elif / #endif nesting
// does not balance as listed. Confirm against the real file before editing.
# ifdef BORDER_CONSTANT
//BORDER_CONSTANT: iiiiii |abcdefgh| iiiiiii
// ELEM selects elem1 when i is below l_edge or at/after r_edge, otherwise elem2.
# define ELEM ( i,l_edge,r_edge,elem1,elem2 ) ( i ) < ( l_edge ) | ( i ) >= ( r_edge ) ? ( elem1 ) : ( elem2 )
# endif
# ifdef BORDER_REPLICATE
//BORDER_REPLICATE: aaaaaa |abcdefgh| hhhhhhh
// ADDR_L yields l_edge for an index below l_edge; ADDR_R yields r_edge-1 for an index at or
// past r_edge; both fall back to addr otherwise.
# define ADDR_L ( i,l_edge,r_edge,addr ) ( i ) < ( l_edge ) ? ( l_edge ) : ( addr )
# define ADDR_R ( i,r_edge,addr ) ( i ) >= ( r_edge ) ? ( r_edge ) -1 : ( addr )
# endif
# elif defined BORDER_REPLICATE
// EXTRAPOLATE (replicate): clamps x in place to [0, maxV - 1].
# define EXTRAPOLATE ( x, maxV ) \
{ \
x = max ( min ( x, maxV - 1 ) , 0 ) ; \
}
# elif defined BORDER_WRAP
// EXTRAPOLATE (wrap): a negative x is adjusted in place by a multiple of maxV, and an
// x >= maxV is reduced with the % operator.
# define EXTRAPOLATE ( x, maxV ) \
{ \
if ( x < 0 ) \
x -= ( ( x - maxV + 1 ) / maxV ) * maxV ; \
if ( x >= maxV ) \
x %= maxV ; \
}
# elif defined ( BORDER_REFLECT ) | | defined ( BORDER_REFLECT_101 )
// NOTE(review): the "| |" in the condition above (and in the loop condition of the macro
// below) looks like a "||" that was split by whitespace - confirm against the real file.
// EXTRAPOLATE_: sets x to 0 when maxV == 1; otherwise mirrors x (shifted by delta) inside a
// do/while loop that repeats while x is still outside [0, maxV).
# define EXTRAPOLATE_ ( x, maxV, delta ) \
{ \
if ( maxV == 1 ) \
x = 0 ; \
else \
do \
{ \
if ( x < 0 ) \
x = -x - 1 + delta ; \
else \
x = maxV - 1 - ( x - maxV ) - delta ; \
} \
while ( x >= maxV | | x < 0 ) ; \
}
# ifdef BORDER_REFLECT
//BORDER_REFLECT: fedcba |abcdefgh| hgfedcb
// ADDR_L maps an index below l_edge to -i-1; ADDR_R maps an index at or past r_edge to
// -i-1+2*r_edge; both fall back to addr otherwise.
# define ADDR_L ( i,l_edge,r_edge,addr ) ( i ) < ( l_edge ) ? - ( i ) -1 : ( addr )
# define ADDR_R ( i,r_edge,addr ) ( i ) >= ( r_edge ) ? - ( i ) -1+ ( ( r_edge ) <<1 ) : ( addr )
# endif
# ifdef BORDER_REFLECT_101
//BORDER_REFLECT_101: gfedcb |abcdefgh| gfedcba
// ADDR_L maps an index below l_edge to -i; ADDR_R maps an index at or past r_edge to
// -i-2+2*r_edge; both fall back to addr otherwise.
# define ADDR_L ( i,l_edge,r_edge,addr ) ( i ) < ( l_edge ) ? - ( i ) : ( addr )
# define ADDR_R ( i,r_edge,addr ) ( i ) >= ( r_edge ) ? - ( i ) -2+ ( ( r_edge ) <<1 ) : ( addr )
// EXTRAPOLATE forwards to EXTRAPOLATE_ with delta 0 here and with delta 1 in the #else branch.
// NOTE(review): as listed, the delta-0 form sits under BORDER_REFLECT_101 - check which
// #ifdef actually guards it in the real file.
# define EXTRAPOLATE ( x, maxV ) EXTRAPOLATE_ ( x, maxV, 0 )
# else
# define EXTRAPOLATE ( x, maxV ) EXTRAPOLATE_ ( x, maxV, 1 )
# endif
# ifdef BORDER_WRAP
//BORDER_WRAP: cdefgh |abcdefgh| abcdefg
// ADDR_L maps an index below l_edge to i+r_edge; ADDR_R maps an index at or past r_edge to
// i-r_edge; both fall back to addr otherwise.
# define ADDR_L ( i,l_edge,r_edge,addr ) ( i ) < ( l_edge ) ? ( i ) + ( r_edge ) : ( addr )
# define ADDR_R ( i,r_edge,addr ) ( i ) >= ( r_edge ) ? ( i ) - ( r_edge ) : ( addr )
# else
// Compilation stops here when none of the preceding border branches was taken.
# error No extrapolation method
# endif
/**********************************************************************************
@ -96,23 +105,20 @@ The info above maybe obsolete.
***********************************************************************************/
// row_filter_C1_D0: row-filter kernel that reads uchar pixels from src and writes float
// results to dst. The work-group size is fixed to LSIZE0 x LSIZE1 x 1, and each work-item
// covers four consecutive output columns of row y (x = get_global_id(0) << 2).
// Parameters, as used by the visible code:
//   src                - input pixels, loaded four at a time as uchar4
//   dst                - output, written at mad24(y, dst_step_in_pixel, x)
//   dst_cols, dst_rows - bounds checked before the final store
//   src_whole_cols, src_whole_rows - extents used by the border handling
//   src_step_in_pixel  - row stride used in mad24 address computations (e.g. end_addr)
//   src_offset_x, src_offset_y - added to x / y to form the source start position
//   dst_step_in_pixel  - row stride of the destination address
//   radiusy            - subtracted when forming the source row start_y
//   mat_kernel         - filter coefficients, indexed at RADIUSX-i and RADIUSX+i
// NOTE(review): this listing looks like a flattened diff. Two parameter lists appear back to
// back (a const-qualified one and a compact one), and the "@ -a,b +c,d @" lines are hunk
// markers; code between hunks (including the declarations of i, temp, sum, start_addr,
// current_addr and not_all_in_range) is not visible here. Confirm against the real file.
__kernel __attribute__ ( ( reqd_work_group_size ( LSIZE0,LSIZE1,1 ) ) ) void row_filter_C1_D0
( __global const uchar * restrict src,
( __global uchar * restrict src,
__global float * dst,
const int dst_cols,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
const int src_offset_x,
const int src_offset_y,
const int dst_step_in_pixel,
const int radiusy,
int dst_cols, int dst_rows,
int src_whole_cols, int src_whole_rows,
int src_step_in_pixel,
int src_offset_x, int src_offset_y,
int dst_step_in_pixel, int radiusy,
__constant float * mat_kernel __attribute__ ( ( max_constant_size ( 4* ( 2*RADIUSX+1 ) ) ) ) )
{
// First of the four output columns handled by this work-item, and its output row.
int x = get_global_id ( 0 ) <<2 ;
int y = get_global_id ( 1 ) ;
// Position of this work-item inside its work-group; used to index the local tile.
int l_x = get_local_id ( 0 ) ;
int l_y = get_local_id ( 1 ) ;
// The mask clears the low two bits, so start_x is rounded down to a multiple of 4;
// offset keeps the low two bits of src_offset_x - RADIUSX.
int start_x = x+src_offset_x - RADIUSX & 0xfffffffc ;
int offset = src_offset_x - RADIUSX & 3 ;
// Source row for this work-item.
int start_y = y + src_offset_y - radiusy ;
@ -124,6 +130,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
// Work-group local tile: READ_TIMES_ROW*LSIZE0 uchar4 entries per row plus one extra column.
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1] ;
# ifdef BORDER_CONSTANT
// Upper limit used below when validating source addresses.
int end_addr = mad24 ( src_whole_rows - 1 , src_step_in_pixel, src_whole_cols ) ;
// read pixels from src
for ( i = 0 ; i < READ_TIMES_ROW; i++)
{
@ -131,6 +138,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
// An address that is not strictly between 0 and end_addr is replaced by 0 before the
// load - presumably to keep the read inside the buffer; confirm.
current_addr = ( ( current_addr < end_addr ) && ( current_addr > 0 ) ) ? current_addr : 0 ;
temp[i] = * ( __global uchar4* ) &src[current_addr] ;
}
// judge if read out of boundary
for ( i = 0 ; i<READ_TIMES_ROW; i++)
{
@ -145,22 +153,21 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
// Per-load column indices (one int4 per uchar4 load) and the border-adjusted row.
int4 index[READ_TIMES_ROW] ;
int4 addr ;
int s_y ;
if ( not_all_in_range )
{
// judge if read out of boundary
for ( i = 0 ; i < READ_TIMES_ROW; i++)
{
// NOTE(review): two variants follow - per-component ADDR_L / ADDR_R assignments, then a
// vector initialisation followed by EXTRAPOLATE on each component. Presumably only one of
// them belongs in the real file; confirm.
index[i].x= ADDR_L ( start_x+i*LSIZE0*4,0,src_whole_cols,start_x+i*LSIZE0*4 ) ;
index[i].x= ADDR_R ( start_x+i*LSIZE0*4,src_whole_cols,index[i].x ) ;
index[i].y= ADDR_L ( start_x+i*LSIZE0*4+1,0,src_whole_cols,start_x+i*LSIZE0*4+1 ) ;
index[i].y= ADDR_R ( start_x+i*LSIZE0*4+1,src_whole_cols,index[i].y ) ;
index[i].z= ADDR_L ( start_x+i*LSIZE0*4+2,0,src_whole_cols,start_x+i*LSIZE0*4+2 ) ;
index[i].z= ADDR_R ( start_x+i*LSIZE0*4+2,src_whole_cols,index[i].z ) ;
index[i].w= ADDR_L ( start_x+i*LSIZE0*4+3,0,src_whole_cols,start_x+i*LSIZE0*4+3 ) ;
index[i].w= ADDR_R ( start_x+i*LSIZE0*4+3,src_whole_cols,index[i].w ) ;
index[i] = ( int4 ) ( start_x+i*LSIZE0*4 ) + ( int4 ) ( 0 , 1 , 2 , 3 ) ;
EXTRAPOLATE ( index[i].x, src_whole_cols ) ;
EXTRAPOLATE ( index[i].y, src_whole_cols ) ;
EXTRAPOLATE ( index[i].z, src_whole_cols ) ;
EXTRAPOLATE ( index[i].w, src_whole_cols ) ;
}
// Border-adjust the source row (again listed in two variants, see the note above).
s_y= ADDR_L ( start_y,0,src_whole_rows,start_y ) ;
s_y= ADDR_R ( start_y,src_whole_rows,s_y ) ;
s_y = start_y ;
EXTRAPOLATE ( s_y, src_whole_rows ) ;
// read pixels from src
for ( i = 0 ; i<READ_TIMES_ROW; i++)
{
@ -175,17 +182,13 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
{
// read pixels from src
for ( i = 0 ; i<READ_TIMES_ROW; i++)
{
// Direct uchar4 load with no per-component index adjustment.
temp[i] = * ( __global uchar4* ) &src[start_addr+i*LSIZE0*4] ;
}
}
# endif
// save pixels to lds
for ( i = 0 ; i<READ_TIMES_ROW; i++)
{
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i] ;
}
// Work-group barrier with a local-memory fence, placed between the tile writes above and
// the tile reads below.
barrier ( CLK_LOCAL_MEM_FENCE ) ;
// read pixels from lds and calculate the result
@ -196,40 +199,35 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
// temp[1] is four bytes loaded from the tile at byte offset RADIUSX + offset + i; the pair
// temp[0] / temp[1] is weighted with the coefficients at RADIUSX-i and RADIUSX+i.
temp[1] = vload4 ( 0 , ( __local uchar* ) &LDS_DAT[l_y][l_x] + RADIUSX + offset + i ) ;
sum += convert_float4 ( temp[0] ) * mat_kernel[RADIUSX-i] + convert_float4 ( temp[1] ) * mat_kernel[RADIUSX+i] ;
}
start_addr = mad24 ( y,dst_step_in_pixel,x ) ;
// write the result to dst
// A full float4 is stored when x+3 < dst_cols; otherwise only the leading components whose
// column is still below dst_cols are stored. Nothing is stored when y >= dst_rows.
// NOTE(review): several else-if lines below appear twice (one form using "&", one using
// "&&"), and "& &" looks like a "&&" split by whitespace - confirm against the real file.
if ( ( x+3<dst_cols ) & ( y<dst_rows ) )
{
* ( __global float4* ) &dst[start_addr] = sum ;
}
else if ( ( x+2<dst_cols ) & ( y<dst_rows ) )
else if ( ( x+2<dst_cols ) && ( y<dst_rows ) )
{
dst[start_addr] = sum.x ;
dst[start_addr+1] = sum.y ;
dst[start_addr+2] = sum.z ;
}
else if ( ( x+1<dst_cols ) & ( y<dst_rows ) )
else if ( ( x+1<dst_cols ) & & ( y<dst_rows ) )
{
dst[start_addr] = sum.x ;
dst[start_addr+1] = sum.y ;
}
else if ( ( x<dst_cols ) & ( y<dst_rows ) )
{
else if ( x<dst_cols && y<dst_rows )
dst[start_addr] = sum.x ;
}
}
// row_filter_C4_D0: row-filter kernel that reads uchar4 pixels from src and writes float4
// results to dst, one output element per work-item. The work-group size is fixed to
// LSIZE0 x LSIZE1 x 1.
// Parameters, as used by the visible code:
//   src                - input pixels, read as whole uchar4 elements
//   dst                - output, written at mad24(y, dst_step_in_pixel, x)
//   dst_cols, dst_rows - bounds checked before the final store
//   src_whole_cols, src_whole_rows - extents used by the border handling
//   src_step_in_pixel  - row stride used when forming source addresses
//   dst_step_in_pixel  - row stride of the destination address
//   mat_kernel         - filter coefficients, indexed at RADIUSX-i and RADIUSX+i
//   src_offset_x, src_offset_y, radiusy - not referenced in the visible lines
// NOTE(review): this listing looks like a flattened diff. Two parameter lists appear back to
// back, and the "@ -a,b +c,d @" lines are hunk markers; code between hunks (including the
// declarations of y, l_x, l_y, start_x, start_y, i, temp, sum and start_addr) is not visible
// here. Confirm against the real file.
__kernel __attribute__ ( ( reqd_work_group_size ( LSIZE0,LSIZE1,1 ) ) ) void row_filter_C4_D0
( __global const uchar4 * restrict src,
( __global uchar4 * restrict src,
__global float4 * dst,
const int dst_cols,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
const int src_offset_x,
const int src_offset_y,
const int dst_step_in_pixel,
const int radiusy,
int dst_cols, int dst_rows,
int src_whole_cols, int src_whole_rows,
int src_step_in_pixel,
int src_offset_x, int src_offset_y,
int dst_step_in_pixel, int radiusy,
__constant float * mat_kernel __attribute__ ( ( max_constant_size ( 4* ( 2*RADIUSX+1 ) ) ) ) )
{
// Output column handled by this work-item.
int x = get_global_id ( 0 ) ;
@ -246,6 +244,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
// Work-group local tile: READ_TIMES_ROW*LSIZE0 uchar4 entries per row plus one extra column.
__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1] ;
# ifdef BORDER_CONSTANT
// Upper limit used below when validating source addresses.
int end_addr = mad24 ( src_whole_rows - 1 , src_step_in_pixel,src_whole_cols ) ;
// read pixels from src
for ( i = 0 ; i<READ_TIMES_ROW; i++)
{
@ -253,6 +252,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
// An address that is not strictly between 0 and end_addr is replaced by 0 before the
// load - presumably to keep the read inside the buffer; confirm.
current_addr = ( ( current_addr < end_addr ) && ( current_addr > 0 ) ) ? current_addr : 0 ;
temp[i] = src[current_addr] ;
}
//judge if read out of boundary
for ( i = 0 ; i<READ_TIMES_ROW; i++)
{
@ -262,27 +262,25 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
# else
// Border-adjusted source address for each of the READ_TIMES_ROW loads.
int index[READ_TIMES_ROW] ;
int s_x,s_y ;
// judge if read out of boundary
for ( i = 0 ; i<READ_TIMES_ROW; i++)
{
// NOTE(review): two variants follow - ADDR_L / ADDR_R assignments, then plain assignments
// followed by EXTRAPOLATE. Presumably only one of them belongs in the real file; confirm.
s_x= ADDR_L ( start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0 ) ;
s_x= ADDR_R ( start_x+i*LSIZE0,src_whole_cols,s_x ) ;
s_y= ADDR_L ( start_y,0,src_whole_rows,start_y ) ;
s_y= ADDR_R ( start_y,src_whole_rows,s_y ) ;
s_x = start_x+i*LSIZE0 ;
EXTRAPOLATE ( s_x, src_whole_cols ) ;
s_y = start_y ;
EXTRAPOLATE ( s_y, src_whole_rows ) ;
// Linear source address: row s_y times the row stride, plus column s_x.
index[i]=mad24 ( s_y,src_step_in_pixel,s_x ) ;
}
//read pixels from src
for ( i = 0 ; i<READ_TIMES_ROW; i++)
{
temp[i] = src[index[i]] ;
}
# endif
//save pixels to lds
for ( i = 0 ; i<READ_TIMES_ROW; i++)
{
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i] ;
}
// Work-group barrier with a local-memory fence, placed between the tile writes above and
// the tile reads below.
barrier ( CLK_LOCAL_MEM_FENCE ) ;
//read pixels from lds and calculate the result
@ -294,7 +292,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
// temp[0] and temp[1] are weighted with the coefficients at RADIUSX-i and RADIUSX+i.
sum += convert_float4 ( temp[0] ) *mat_kernel[RADIUSX-i]+convert_float4 ( temp[1] ) *mat_kernel[RADIUSX+i] ;
}
//write the result to dst
// NOTE(review): the bounds check below appears twice (one form using "&", one using "&&");
// presumably only one belongs in the real file - confirm.
if ( ( x<dst_cols ) & ( y<dst_rows ) )
if ( x<dst_cols && y<dst_rows )
{
start_addr = mad24 ( y,dst_step_in_pixel,x ) ;
dst[start_addr] = sum ;
@ -302,17 +300,13 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
}
// row_filter_C1_D5: row-filter kernel that reads float pixels from src and writes float
// results to dst, one output element per work-item. The work-group size is fixed to
// LSIZE0 x LSIZE1 x 1.
// Parameters, as used by the visible code:
//   src                - input pixels, read one float at a time
//   dst                - output, written at mad24(y, dst_step_in_pixel, x)
//   dst_cols, dst_rows - bounds checked before the final store
//   src_whole_cols, src_whole_rows - extents used by the border handling
//   src_step_in_pixel  - row stride used when forming source addresses
//   dst_step_in_pixel  - row stride of the destination address
//   mat_kernel         - filter coefficients, indexed at RADIUSX-i and RADIUSX+i
//   src_offset_x, src_offset_y, radiusy - not referenced in the visible lines
// NOTE(review): this listing looks like a flattened diff. Two parameter lists appear back to
// back, and the "@ -a,b +c,d @" lines are hunk markers; code between hunks (including the
// declarations of y, l_x, l_y, start_x, start_y, i, temp, sum, index, s_x, s_y and
// start_addr) is not visible here. Confirm against the real file.
__kernel __attribute__ ( ( reqd_work_group_size ( LSIZE0,LSIZE1,1 ) ) ) void row_filter_C1_D5
( __global const float * restrict src,
( __global float * restrict src,
__global float * dst,
const int dst_cols,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
const int src_offset_x,
const int src_offset_y,
const int dst_step_in_pixel,
const int radiusy,
int dst_cols, int dst_rows,
int src_whole_cols, int src_whole_rows,
int src_step_in_pixel,
int src_offset_x, int src_offset_y,
int dst_step_in_pixel, int radiusy,
__constant float * mat_kernel __attribute__ ( ( max_constant_size ( 4* ( 2*RADIUSX+1 ) ) ) ) )
{
// Output column handled by this work-item.
int x = get_global_id ( 0 ) ;
@ -329,6 +323,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
// Work-group local tile: READ_TIMES_ROW*LSIZE0 float entries per row plus one extra column.
__local float LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1] ;
# ifdef BORDER_CONSTANT
// Upper limit used below when validating source addresses.
int end_addr = mad24 ( src_whole_rows - 1 , src_step_in_pixel,src_whole_cols ) ;
// read pixels from src
for ( i = 0 ; i<READ_TIMES_ROW; i++)
{
@ -336,6 +331,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
// An address that is not strictly between 0 and end_addr is replaced by 0 before the
// load - presumably to keep the read inside the buffer; confirm.
current_addr = ( ( current_addr < end_addr ) && ( current_addr > 0 ) ) ? current_addr : 0 ;
temp[i] = src[current_addr] ;
}
// judge if read out of boundary
for ( i = 0 ; i<READ_TIMES_ROW; i++)
{
@ -348,24 +344,20 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
// judge if read out of boundary
for ( i = 0 ; i<READ_TIMES_ROW; i++)
{
// NOTE(review): two variants follow - ADDR_L / ADDR_R assignments, then plain assignments
// followed by EXTRAPOLATE. Presumably only one of them belongs in the real file; confirm.
s_x= ADDR_L ( start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0 ) ;
s_x= ADDR_R ( start_x+i*LSIZE0,src_whole_cols,s_x ) ;
s_y= ADDR_L ( start_y,0,src_whole_rows,start_y ) ;
s_y= ADDR_R ( start_y,src_whole_rows,s_y ) ;
s_x = start_x + i*LSIZE0, s_y = start_y ;
EXTRAPOLATE ( s_x, src_whole_cols ) ;
EXTRAPOLATE ( s_y, src_whole_rows ) ;
// Linear source address: row s_y times the row stride, plus column s_x.
index[i]=mad24 ( s_y, src_step_in_pixel, s_x ) ;
}
// read pixels from src
for ( i = 0 ; i<READ_TIMES_ROW; i++)
{
temp[i] = src[index[i]] ;
}
# endif
//save pixels to lds
for ( i = 0 ; i<READ_TIMES_ROW; i++)
{
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i] ;
}
// Work-group barrier with a local-memory fence, placed between the tile writes above and
// the tile reads below.
barrier ( CLK_LOCAL_MEM_FENCE ) ;
// read pixels from lds and calculate the result
@ -376,8 +368,9 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
// temp[1] is the tile entry i positions to the right of column l_x+RADIUSX; temp[0] and
// temp[1] are weighted with the coefficients at RADIUSX-i and RADIUSX+i.
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i] ;
sum += temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i] ;
}
// write the result to dst
// NOTE(review): the bounds check below appears twice (one form using "&", one using "&&");
// presumably only one belongs in the real file - confirm.
if ( ( x<dst_cols ) & ( y<dst_rows ) )
if ( x<dst_cols && y<dst_rows )
{
start_addr = mad24 ( y,dst_step_in_pixel,x ) ;
dst[start_addr] = sum ;
@ -385,17 +378,13 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
}
// row_filter_C4_D5: row-filter kernel that reads float4 pixels from src and writes float4
// results to dst, one output element per work-item. The work-group size is fixed to
// LSIZE0 x LSIZE1 x 1.
// Parameters, as used by the visible code:
//   src                - input pixels, read as whole float4 elements
//   dst                - output, written at mad24(y, dst_step_in_pixel, x)
//   dst_cols, dst_rows - bounds checked before the final store
//   src_whole_cols, src_whole_rows - extents used by the border handling
//   src_step_in_pixel  - row stride used when forming source addresses
//   dst_step_in_pixel  - row stride of the destination address
//   mat_kernel         - filter coefficients, indexed at RADIUSX-i and RADIUSX+i
//   src_offset_x, src_offset_y, radiusy - not referenced in the visible lines
// NOTE(review): this listing looks like a flattened diff. Two parameter lists appear back to
// back, and the "@ -a,b +c,d @" lines are hunk markers; code between hunks (including the
// declarations of y, l_x, l_y, start_x, start_y, i, temp, sum and start_addr) is not visible
// here. Confirm against the real file.
__kernel __attribute__ ( ( reqd_work_group_size ( LSIZE0,LSIZE1,1 ) ) ) void row_filter_C4_D5
( __global const float4 * restrict src,
( __global float4 * restrict src,
__global float4 * dst,
const int dst_cols,
const int dst_rows,
const int src_whole_cols,
const int src_whole_rows,
const int src_step_in_pixel,
const int src_offset_x,
const int src_offset_y,
const int dst_step_in_pixel,
const int radiusy,
int dst_cols, int dst_rows,
int src_whole_cols, int src_whole_rows,
int src_step_in_pixel,
int src_offset_x, int src_offset_y,
int dst_step_in_pixel, int radiusy,
__constant float * mat_kernel __attribute__ ( ( max_constant_size ( 4* ( 2*RADIUSX+1 ) ) ) ) )
{
// Output column handled by this work-item.
int x = get_global_id ( 0 ) ;
@ -412,6 +401,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
// Work-group local tile: READ_TIMES_ROW*LSIZE0 float4 entries per row plus one extra column.
__local float4 LDS_DAT[LSIZE1][READ_TIMES_ROW*LSIZE0+1] ;
# ifdef BORDER_CONSTANT
// Upper limit used below when validating source addresses.
int end_addr = mad24 ( src_whole_rows - 1 , src_step_in_pixel,src_whole_cols ) ;
// read pixels from src
for ( i = 0 ; i<READ_TIMES_ROW; i++)
{
@ -419,6 +409,7 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
// An address that is not strictly between 0 and end_addr is replaced by 0 before the
// load - presumably to keep the read inside the buffer; confirm.
current_addr = ( ( current_addr < end_addr ) && ( current_addr > 0 ) ) ? current_addr : 0 ;
temp[i] = src[current_addr] ;
}
// judge if read out of boundary
for ( i = 0 ; i<READ_TIMES_ROW; i++)
{
@ -428,27 +419,24 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
# else
// Border-adjusted source address for each of the READ_TIMES_ROW loads.
int index[READ_TIMES_ROW] ;
int s_x,s_y ;
// judge if read out of boundary
for ( i = 0 ; i<READ_TIMES_ROW; i++)
{
// NOTE(review): two variants follow - ADDR_L / ADDR_R assignments, then plain assignments
// followed by EXTRAPOLATE. Presumably only one of them belongs in the real file; confirm.
s_x= ADDR_L ( start_x+i*LSIZE0,0,src_whole_cols,start_x+i*LSIZE0 ) ;
s_x= ADDR_R ( start_x+i*LSIZE0,src_whole_cols,s_x ) ;
s_y= ADDR_L ( start_y,0,src_whole_rows,start_y ) ;
s_y= ADDR_R ( start_y,src_whole_rows,s_y ) ;
s_x = start_x + i*LSIZE0, s_y = start_y ;
EXTRAPOLATE ( s_x, src_whole_cols ) ;
EXTRAPOLATE ( s_y, src_whole_rows ) ;
// Linear source address: row s_y times the row stride, plus column s_x.
index[i]=mad24 ( s_y,src_step_in_pixel,s_x ) ;
}
// read pixels from src
for ( i = 0 ; i<READ_TIMES_ROW; i++)
{
temp[i] = src[index[i]] ;
}
# endif
// save pixels to lds
for ( i = 0 ; i<READ_TIMES_ROW; i++)
{
LDS_DAT[l_y][l_x+i*LSIZE0]=temp[i] ;
}
// Work-group barrier with a local-memory fence, placed between the tile writes above and
// the tile reads below.
barrier ( CLK_LOCAL_MEM_FENCE ) ;
// read pixels from lds and calculate the result
@ -459,11 +447,11 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
// temp[1] is the tile entry i positions to the right of column l_x+RADIUSX; temp[0] and
// temp[1] are weighted with the coefficients at RADIUSX-i and RADIUSX+i.
temp[1]=LDS_DAT[l_y][l_x+RADIUSX+i] ;
sum += temp[0]*mat_kernel[RADIUSX-i]+temp[1]*mat_kernel[RADIUSX+i] ;
}
// write the result to dst
// NOTE(review): the bounds check below appears twice (one form using "&", one using "&&");
// presumably only one belongs in the real file - confirm.
if ( ( x<dst_cols ) & ( y<dst_rows ) )
if ( x<dst_cols && y<dst_rows )
{
start_addr = mad24 ( y,dst_step_in_pixel,x ) ;
dst[start_addr] = sum ;
}
}