|
|
|
@ -1,32 +1,31 @@ |
|
|
|
|
/* See LICENSE file in the root OpenCV directory */ |
|
|
|
|
|
|
|
|
|
#if TILE_SIZE > 16 |
|
|
|
|
#error "TILE SIZE should be <= 16" |
|
|
|
|
#if TILE_SIZE != 32 |
|
|
|
|
#error "TILE SIZE should be 32" |
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
__kernel void moments(__global const uchar* src, int src_step, int src_offset, |
|
|
|
|
int src_rows, int src_cols, __global int* mom0, int xtiles) |
|
|
|
|
{ |
|
|
|
|
int x = get_global_id(0); |
|
|
|
|
int y = get_global_id(1); |
|
|
|
|
int x_min = x*TILE_SIZE; |
|
|
|
|
int y_min = y*TILE_SIZE; |
|
|
|
|
int x0 = get_global_id(0); |
|
|
|
|
int y0 = get_group_id(1); |
|
|
|
|
int x, y = get_local_id(1); |
|
|
|
|
int x_min = x0*TILE_SIZE; |
|
|
|
|
int ypix = y0*TILE_SIZE + y; |
|
|
|
|
__local int mom[TILE_SIZE][10]; |
|
|
|
|
|
|
|
|
|
if( x_min < src_cols && y_min < src_rows ) |
|
|
|
|
if( x_min < src_cols && y0*TILE_SIZE < src_rows ) |
|
|
|
|
{ |
|
|
|
|
int x_max = min(src_cols - x_min, TILE_SIZE); |
|
|
|
|
int y_max = min(src_rows - y_min, TILE_SIZE); |
|
|
|
|
int m00=0, m10=0, m01=0, m20=0, m11=0, m02=0, m30=0, m21=0, m12=0, m03=0; |
|
|
|
|
__global const uchar* ptr = src + src_offset + y_min*src_step + x_min; |
|
|
|
|
__global int* mom = mom0 + (xtiles*y + x)*10; |
|
|
|
|
x = x_max & -4; |
|
|
|
|
|
|
|
|
|
for( y = 0; y < y_max; y++, ptr += src_step ) |
|
|
|
|
if( ypix < src_rows ) |
|
|
|
|
{ |
|
|
|
|
int x_max = min(src_cols - x_min, TILE_SIZE); |
|
|
|
|
__global const uchar* ptr = src + src_offset + ypix*src_step + x_min; |
|
|
|
|
int4 S = (int4)(0,0,0,0), p; |
|
|
|
|
|
|
|
|
|
#define SUM_ELEM(elem, ofs) \ |
|
|
|
|
(int4)(1, (ofs), ((ofs)*(ofs)), ((ofs)*(ofs)*(ofs)))*elem |
|
|
|
|
(int4)(1, (ofs), (ofs)*(ofs), (ofs)*(ofs)*(ofs))*elem |
|
|
|
|
|
|
|
|
|
x = x_max & -4; |
|
|
|
|
if( x_max >= 4 ) |
|
|
|
|
{ |
|
|
|
|
p = convert_int4(vload4(0, ptr)); |
|
|
|
@ -51,6 +50,30 @@ __kernel void moments(__global const uchar* src, int src_step, int src_offset, |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if( x_max >= 20 ) |
|
|
|
|
{ |
|
|
|
|
p = convert_int4(vload4(0, ptr+16)); |
|
|
|
|
S += SUM_ELEM(p.s0, 16) + SUM_ELEM(p.s1, 17) + SUM_ELEM(p.s2, 18) + SUM_ELEM(p.s3, 19); |
|
|
|
|
|
|
|
|
|
if( x_max >= 24 ) |
|
|
|
|
{ |
|
|
|
|
p = convert_int4(vload4(0, ptr+20)); |
|
|
|
|
S += SUM_ELEM(p.s0, 20) + SUM_ELEM(p.s1, 21) + SUM_ELEM(p.s2, 22) + SUM_ELEM(p.s3, 23); |
|
|
|
|
|
|
|
|
|
if( x_max >= 28 ) |
|
|
|
|
{ |
|
|
|
|
p = convert_int4(vload4(0, ptr+24)); |
|
|
|
|
S += SUM_ELEM(p.s0, 24) + SUM_ELEM(p.s1, 25) + SUM_ELEM(p.s2, 26) + SUM_ELEM(p.s3, 27); |
|
|
|
|
|
|
|
|
|
if( x_max >= 32 ) |
|
|
|
|
{ |
|
|
|
|
p = convert_int4(vload4(0, ptr+28)); |
|
|
|
|
S += SUM_ELEM(p.s0, 28) + SUM_ELEM(p.s1, 29) + SUM_ELEM(p.s2, 30) + SUM_ELEM(p.s3, 31); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if( x < x_max ) |
|
|
|
|
{ |
|
|
|
|
int ps = ptr[x]; |
|
|
|
@ -68,27 +91,57 @@ __kernel void moments(__global const uchar* src, int src_step, int src_offset, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
int sy = y*y; |
|
|
|
|
m00 += S.s0; |
|
|
|
|
m10 += S.s1; |
|
|
|
|
m01 += y*S.s0; |
|
|
|
|
m20 += S.s2; |
|
|
|
|
m11 += y*S.s1; |
|
|
|
|
m02 += sy*S.s0; |
|
|
|
|
m30 += S.s3; |
|
|
|
|
m21 += y*S.s2; |
|
|
|
|
m12 += sy*S.s1; |
|
|
|
|
m03 += y*sy*S.s0; |
|
|
|
|
|
|
|
|
|
mom[y][0] = S.s0; |
|
|
|
|
mom[y][1] = S.s1; |
|
|
|
|
mom[y][2] = y*S.s0; |
|
|
|
|
mom[y][3] = S.s2; |
|
|
|
|
mom[y][4] = y*S.s1; |
|
|
|
|
mom[y][5] = sy*S.s0; |
|
|
|
|
mom[y][6] = S.s3; |
|
|
|
|
mom[y][7] = y*S.s2; |
|
|
|
|
mom[y][8] = sy*S.s1; |
|
|
|
|
mom[y][9] = y*sy*S.s0; |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
mom[y][0] = mom[y][1] = mom[y][2] = mom[y][3] = mom[y][4] = |
|
|
|
|
mom[y][5] = mom[y][6] = mom[y][7] = mom[y][8] = mom[y][9] = 0; |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
mom[0] = m00; |
|
|
|
|
mom[1] = m10; |
|
|
|
|
mom[2] = m01; |
|
|
|
|
mom[3] = m20; |
|
|
|
|
mom[4] = m11; |
|
|
|
|
mom[5] = m02; |
|
|
|
|
mom[6] = m30; |
|
|
|
|
mom[7] = m21; |
|
|
|
|
mom[8] = m12; |
|
|
|
|
mom[9] = m03; |
|
|
|
|
#define REDUCE(d) \ |
|
|
|
|
if( y < d ) \ |
|
|
|
|
{ \ |
|
|
|
|
mom[y][0] += mom[y+d][0]; \ |
|
|
|
|
mom[y][1] += mom[y+d][1]; \ |
|
|
|
|
mom[y][2] += mom[y+d][2]; \ |
|
|
|
|
mom[y][3] += mom[y+d][3]; \ |
|
|
|
|
mom[y][4] += mom[y+d][4]; \ |
|
|
|
|
mom[y][5] += mom[y+d][5]; \ |
|
|
|
|
mom[y][6] += mom[y+d][6]; \ |
|
|
|
|
mom[y][7] += mom[y+d][7]; \ |
|
|
|
|
mom[y][8] += mom[y+d][8]; \ |
|
|
|
|
mom[y][9] += mom[y+d][9]; \ |
|
|
|
|
} \ |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE) |
|
|
|
|
|
|
|
|
|
REDUCE(16); |
|
|
|
|
REDUCE(8); |
|
|
|
|
REDUCE(4); |
|
|
|
|
REDUCE(2); |
|
|
|
|
|
|
|
|
|
if( y == 0 ) |
|
|
|
|
{ |
|
|
|
|
__global int* momout = mom0 + (y0*xtiles + x0)*10; |
|
|
|
|
momout[0] = mom[0][0] + mom[1][0]; |
|
|
|
|
momout[1] = mom[0][1] + mom[1][1]; |
|
|
|
|
momout[2] = mom[0][2] + mom[1][2]; |
|
|
|
|
momout[3] = mom[0][3] + mom[1][3]; |
|
|
|
|
momout[4] = mom[0][4] + mom[1][4]; |
|
|
|
|
momout[5] = mom[0][5] + mom[1][5]; |
|
|
|
|
momout[6] = mom[0][6] + mom[1][6]; |
|
|
|
|
momout[7] = mom[0][7] + mom[1][7]; |
|
|
|
|
momout[8] = mom[0][8] + mom[1][8]; |
|
|
|
|
momout[9] = mom[0][9] + mom[1][9]; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|