|
|
|
@ -6,25 +6,27 @@ |
|
|
|
|
#pragma OPENCL EXTENSION cl_amd_fp64:enable |
|
|
|
|
#endif |
|
|
|
|
typedef double T; |
|
|
|
|
typedef double F; |
|
|
|
|
typedef double4 F4; |
|
|
|
|
#define convert_F4 convert_double4 |
|
|
|
|
|
|
|
|
|
#else |
|
|
|
|
typedef float double; |
|
|
|
|
typedef float4 double4; |
|
|
|
|
typedef float F; |
|
|
|
|
typedef float4 F4; |
|
|
|
|
typedef long T; |
|
|
|
|
#define convert_double4 convert_float4 |
|
|
|
|
#define convert_F4 convert_float4 |
|
|
|
|
#endif |
|
|
|
|
//#pragma OPENCL EXTENSION cl_amd_printf:enable |
|
|
|
|
//#if defined (DOUBLE_SUPPORT) |
|
|
|
|
#define DST_ROW_A00 0 |
|
|
|
|
#define DST_ROW_A10 1 |
|
|
|
|
#define DST_ROW_A01 2 |
|
|
|
|
#define DST_ROW_A20 3 |
|
|
|
|
#define DST_ROW_A11 4 |
|
|
|
|
#define DST_ROW_A02 5 |
|
|
|
|
#define DST_ROW_A30 6 |
|
|
|
|
#define DST_ROW_A21 7 |
|
|
|
|
#define DST_ROW_A12 8 |
|
|
|
|
#define DST_ROW_A03 9 |
|
|
|
|
|
|
|
|
|
#define DST_ROW_00 0 |
|
|
|
|
#define DST_ROW_10 1 |
|
|
|
|
#define DST_ROW_01 2 |
|
|
|
|
#define DST_ROW_20 3 |
|
|
|
|
#define DST_ROW_11 4 |
|
|
|
|
#define DST_ROW_02 5 |
|
|
|
|
#define DST_ROW_30 6 |
|
|
|
|
#define DST_ROW_21 7 |
|
|
|
|
#define DST_ROW_12 8 |
|
|
|
|
#define DST_ROW_03 9 |
|
|
|
|
|
|
|
|
|
__kernel void icvContourMoments(int contour_total, |
|
|
|
|
__global float* reader_oclmat_data, |
|
|
|
@ -60,36 +62,76 @@ __kernel void icvContourMoments(int contour_total, |
|
|
|
|
yii_1 = yi_1 + yi; |
|
|
|
|
|
|
|
|
|
dst_step /= sizeof(T); |
|
|
|
|
*( dst_a + DST_ROW_A00 * dst_step + idx) = dxy; |
|
|
|
|
*( dst_a + DST_ROW_A10 * dst_step + idx) = dxy * xii_1; |
|
|
|
|
*( dst_a + DST_ROW_A01 * dst_step + idx) = dxy * yii_1; |
|
|
|
|
*( dst_a + DST_ROW_A20 * dst_step + idx) = dxy * (xi_1 * xii_1 + xi2); |
|
|
|
|
*( dst_a + DST_ROW_A11 * dst_step + idx) = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi)); |
|
|
|
|
*( dst_a + DST_ROW_A02 * dst_step + idx) = dxy * (yi_1 * yii_1 + yi2); |
|
|
|
|
*( dst_a + DST_ROW_A30 * dst_step + idx) = dxy * xii_1 * (xi_12 + xi2); |
|
|
|
|
*( dst_a + DST_ROW_A03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2); |
|
|
|
|
*( dst_a + DST_ROW_A21 * dst_step + idx) = |
|
|
|
|
*( dst_a + DST_ROW_00 * dst_step + idx) = dxy; |
|
|
|
|
*( dst_a + DST_ROW_10 * dst_step + idx) = dxy * xii_1; |
|
|
|
|
*( dst_a + DST_ROW_01 * dst_step + idx) = dxy * yii_1; |
|
|
|
|
*( dst_a + DST_ROW_20 * dst_step + idx) = dxy * (xi_1 * xii_1 + xi2); |
|
|
|
|
*( dst_a + DST_ROW_11 * dst_step + idx) = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi)); |
|
|
|
|
*( dst_a + DST_ROW_02 * dst_step + idx) = dxy * (yi_1 * yii_1 + yi2); |
|
|
|
|
*( dst_a + DST_ROW_30 * dst_step + idx) = dxy * xii_1 * (xi_12 + xi2); |
|
|
|
|
*( dst_a + DST_ROW_03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2); |
|
|
|
|
*( dst_a + DST_ROW_21 * dst_step + idx) = |
|
|
|
|
dxy * (xi_12 * (3 * yi_1 + yi) + 2 * xi * xi_1 * yii_1 + |
|
|
|
|
xi2 * (yi_1 + 3 * yi)); |
|
|
|
|
*( dst_a + DST_ROW_A12 * dst_step + idx) = |
|
|
|
|
*( dst_a + DST_ROW_12 * dst_step + idx) = |
|
|
|
|
dxy * (yi_12 * (3 * xi_1 + xi) + 2 * yi * yi_1 * xii_1 + |
|
|
|
|
yi2 * (xi_1 + 3 * xi)); |
|
|
|
|
} |
|
|
|
|
//#endif |
|
|
|
|
|
|
|
|
|
//#if defined (DOUBLE_SUPPORT) |
|
|
|
|
__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE, |
|
|
|
|
__global F* sum, __global F* dst_m, int dst_step) |
|
|
|
|
{ |
|
|
|
|
int gidy = get_global_id(0); |
|
|
|
|
int gidx = get_global_id(1); |
|
|
|
|
int block_y = src_rows/tile_height; |
|
|
|
|
int block_x = src_cols/tile_width; |
|
|
|
|
int block_num; |
|
|
|
|
|
|
|
|
|
if(src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0) |
|
|
|
|
block_y ++; |
|
|
|
|
if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0) |
|
|
|
|
block_x ++; |
|
|
|
|
block_num = block_y * block_x; |
|
|
|
|
__local F dst_sum[10][128]; |
|
|
|
|
if(gidy<128-block_num) |
|
|
|
|
for(int i=0; i<10; i++) |
|
|
|
|
dst_sum[i][gidy+block_num]=0; |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
dst_step /= sizeof(F); |
|
|
|
|
if(gidy<block_num) |
|
|
|
|
{ |
|
|
|
|
dst_sum[0][gidy] = *(dst_m + mad24(DST_ROW_00 * block_y, dst_step, gidy)); |
|
|
|
|
dst_sum[1][gidy] = *(dst_m + mad24(DST_ROW_10 * block_y, dst_step, gidy)); |
|
|
|
|
dst_sum[2][gidy] = *(dst_m + mad24(DST_ROW_01 * block_y, dst_step, gidy)); |
|
|
|
|
dst_sum[3][gidy] = *(dst_m + mad24(DST_ROW_20 * block_y, dst_step, gidy)); |
|
|
|
|
dst_sum[4][gidy] = *(dst_m + mad24(DST_ROW_11 * block_y, dst_step, gidy)); |
|
|
|
|
dst_sum[5][gidy] = *(dst_m + mad24(DST_ROW_02 * block_y, dst_step, gidy)); |
|
|
|
|
dst_sum[6][gidy] = *(dst_m + mad24(DST_ROW_30 * block_y, dst_step, gidy)); |
|
|
|
|
dst_sum[7][gidy] = *(dst_m + mad24(DST_ROW_21 * block_y, dst_step, gidy)); |
|
|
|
|
dst_sum[8][gidy] = *(dst_m + mad24(DST_ROW_12 * block_y, dst_step, gidy)); |
|
|
|
|
dst_sum[9][gidy] = *(dst_m + mad24(DST_ROW_03 * block_y, dst_step, gidy)); |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
for(int lsize=64; lsize>0; lsize>>=1) |
|
|
|
|
{ |
|
|
|
|
if(gidy<lsize) |
|
|
|
|
{ |
|
|
|
|
int lsize2 = gidy + lsize; |
|
|
|
|
for(int i=0; i<10; i++) |
|
|
|
|
dst_sum[i][gidy] += dst_sum[i][lsize2]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
} |
|
|
|
|
if(gidy==0) |
|
|
|
|
for(int i=0; i<10; i++) |
|
|
|
|
sum[i] = dst_sum[i][0]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, |
|
|
|
|
__global double* dst_m00, |
|
|
|
|
__global double* dst_m10, |
|
|
|
|
__global double* dst_m01, |
|
|
|
|
__global double* dst_m20, |
|
|
|
|
__global double* dst_m11, |
|
|
|
|
__global double* dst_m02, |
|
|
|
|
__global double* dst_m30, |
|
|
|
|
__global double* dst_m21, |
|
|
|
|
__global double* dst_m12, |
|
|
|
|
__global double* dst_m03, |
|
|
|
|
int dst_cols, int dst_step, int type, int depth, int cn, int coi, int binary, int TILE_SIZE) |
|
|
|
|
__global F* dst_m, |
|
|
|
|
int dst_cols, int dst_step, int blocky, |
|
|
|
|
int type, int depth, int cn, int coi, int binary, int TILE_SIZE) |
|
|
|
|
{ |
|
|
|
|
uchar tmp_coi[16]; // get the coi data |
|
|
|
|
uchar16 tmp[16]; |
|
|
|
@ -127,7 +169,7 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col |
|
|
|
|
if( binary ) |
|
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_C) |
|
|
|
|
tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero; |
|
|
|
|
double mom[10]; |
|
|
|
|
F mom[10]; |
|
|
|
|
__local int m[10][128]; |
|
|
|
|
if(lidy == 0) |
|
|
|
|
for(int i=0; i<10; i++) |
|
|
|
@ -197,119 +239,53 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col |
|
|
|
|
if(lidy == 0&&lidx == 0) |
|
|
|
|
{ |
|
|
|
|
for( int mt = 0; mt < 10; mt++ ) |
|
|
|
|
mom[mt] = (double)lm[mt]; |
|
|
|
|
mom[mt] = (F)lm[mt]; |
|
|
|
|
if(binary) |
|
|
|
|
{ |
|
|
|
|
double s = 1./255; |
|
|
|
|
F s = 1./255; |
|
|
|
|
for( int mt = 0; mt < 10; mt++ ) |
|
|
|
|
mom[mt] *= s; |
|
|
|
|
} |
|
|
|
|
double xm = x * mom[0], ym = y * mom[0]; |
|
|
|
|
F xm = x * mom[0], ym = y * mom[0]; |
|
|
|
|
|
|
|
|
|
// accumulate moments computed in each tile |
|
|
|
|
dst_step /= sizeof(F); |
|
|
|
|
|
|
|
|
|
// + m00 ( = m00' ) |
|
|
|
|
dst_m00[wgidy*dst_cols+wgidx] = mom[0]; |
|
|
|
|
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; |
|
|
|
|
|
|
|
|
|
// + m10 ( = m10' + x*m00' ) |
|
|
|
|
dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm; |
|
|
|
|
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; |
|
|
|
|
|
|
|
|
|
// + m01 ( = m01' + y*m00' ) |
|
|
|
|
dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym; |
|
|
|
|
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; |
|
|
|
|
|
|
|
|
|
// + m20 ( = m20' + 2*x*m10' + x*x*m00' ) |
|
|
|
|
dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm); |
|
|
|
|
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); |
|
|
|
|
|
|
|
|
|
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) |
|
|
|
|
dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1]; |
|
|
|
|
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; |
|
|
|
|
|
|
|
|
|
// + m02 ( = m02' + 2*y*m01' + y*y*m00' ) |
|
|
|
|
dst_m02[wgidy*dst_cols+wgidx] = mom[5] + y * (mom[2] * 2 + ym); |
|
|
|
|
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); |
|
|
|
|
|
|
|
|
|
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) |
|
|
|
|
dst_m30[wgidy*dst_cols+wgidx] = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); |
|
|
|
|
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); |
|
|
|
|
|
|
|
|
|
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') |
|
|
|
|
dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; |
|
|
|
|
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; |
|
|
|
|
|
|
|
|
|
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') |
|
|
|
|
dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; |
|
|
|
|
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; |
|
|
|
|
|
|
|
|
|
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) |
|
|
|
|
dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); |
|
|
|
|
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
//#endif |
|
|
|
|
//#if defined (DOUBLE_SUPPORT) |
|
|
|
|
__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE, __global double* sum, __global double* dst_m00, |
|
|
|
|
__global double* dst_m10, |
|
|
|
|
__global double* dst_m01, |
|
|
|
|
__global double* dst_m20, |
|
|
|
|
__global double* dst_m11, |
|
|
|
|
__global double* dst_m02, |
|
|
|
|
__global double* dst_m30, |
|
|
|
|
__global double* dst_m21, |
|
|
|
|
__global double* dst_m12, |
|
|
|
|
__global double* dst_m03) |
|
|
|
|
{ |
|
|
|
|
int gidy = get_global_id(0); |
|
|
|
|
int gidx = get_global_id(1); |
|
|
|
|
int block_y = src_rows/tile_height; |
|
|
|
|
int block_x = src_cols/tile_width; |
|
|
|
|
int block_num; |
|
|
|
|
|
|
|
|
|
if(src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0) |
|
|
|
|
block_y ++; |
|
|
|
|
if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0) |
|
|
|
|
block_x ++; |
|
|
|
|
block_num = block_y * block_x; |
|
|
|
|
__local double dst_sum[10][128]; |
|
|
|
|
if(gidy<128-block_num) |
|
|
|
|
for(int i=0; i<10; i++) |
|
|
|
|
dst_sum[i][gidy+block_num]=0; |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
if(gidy<block_num) |
|
|
|
|
{ |
|
|
|
|
dst_sum[0][gidy] = dst_m00[gidy]; |
|
|
|
|
dst_sum[1][gidy] = dst_m10[gidy]; |
|
|
|
|
dst_sum[2][gidy] = dst_m01[gidy]; |
|
|
|
|
dst_sum[3][gidy] = dst_m20[gidy]; |
|
|
|
|
dst_sum[4][gidy] = dst_m11[gidy]; |
|
|
|
|
dst_sum[5][gidy] = dst_m02[gidy]; |
|
|
|
|
dst_sum[6][gidy] = dst_m30[gidy]; |
|
|
|
|
dst_sum[7][gidy] = dst_m21[gidy]; |
|
|
|
|
dst_sum[8][gidy] = dst_m12[gidy]; |
|
|
|
|
dst_sum[9][gidy] = dst_m03[gidy]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
for(int lsize=64; lsize>0; lsize>>=1) |
|
|
|
|
{ |
|
|
|
|
if(gidy<lsize) |
|
|
|
|
{ |
|
|
|
|
int lsize2 = gidy + lsize; |
|
|
|
|
for(int i=0; i<10; i++) |
|
|
|
|
dst_sum[i][gidy] += dst_sum[i][lsize2]; |
|
|
|
|
} |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
} |
|
|
|
|
if(gidy==0) |
|
|
|
|
for(int i=0; i<10; i++) |
|
|
|
|
sum[i] = dst_sum[i][0]; |
|
|
|
|
} |
|
|
|
|
//#endif |
|
|
|
|
//#if defined (DOUBLE_SUPPORT) |
|
|
|
|
__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, |
|
|
|
|
__global double* dst_m00, |
|
|
|
|
__global double* dst_m10, |
|
|
|
|
__global double* dst_m01, |
|
|
|
|
__global double* dst_m20, |
|
|
|
|
__global double* dst_m11, |
|
|
|
|
__global double* dst_m02, |
|
|
|
|
__global double* dst_m30, |
|
|
|
|
__global double* dst_m21, |
|
|
|
|
__global double* dst_m12, |
|
|
|
|
__global double* dst_m03, |
|
|
|
|
int dst_cols, int dst_step, |
|
|
|
|
__global F* dst_m, |
|
|
|
|
int dst_cols, int dst_step, int blocky, |
|
|
|
|
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE) |
|
|
|
|
{ |
|
|
|
|
ushort tmp_coi[8]; // get the coi data |
|
|
|
@ -345,7 +321,7 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col |
|
|
|
|
if( binary ) |
|
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_US) |
|
|
|
|
tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero; |
|
|
|
|
double mom[10]; |
|
|
|
|
F mom[10]; |
|
|
|
|
__local long m[10][128]; |
|
|
|
|
if(lidy == 0) |
|
|
|
|
for(int i=0; i<10; i++) |
|
|
|
@ -415,64 +391,55 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col |
|
|
|
|
if(lidy == 0&&lidx == 0) |
|
|
|
|
{ |
|
|
|
|
for(int mt = 0; mt < 10; mt++ ) |
|
|
|
|
mom[mt] = (double)lm[mt]; |
|
|
|
|
mom[mt] = (F)lm[mt]; |
|
|
|
|
|
|
|
|
|
if(binary) |
|
|
|
|
{ |
|
|
|
|
double s = 1./255; |
|
|
|
|
F s = 1./255; |
|
|
|
|
for( int mt = 0; mt < 10; mt++ ) |
|
|
|
|
mom[mt] *= s; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
double xm = x *mom[0], ym = y * mom[0]; |
|
|
|
|
F xm = x *mom[0], ym = y * mom[0]; |
|
|
|
|
|
|
|
|
|
// accumulate moments computed in each tile |
|
|
|
|
dst_step /= sizeof(F); |
|
|
|
|
|
|
|
|
|
// + m00 ( = m00' ) |
|
|
|
|
dst_m00[wgidy*dst_cols+wgidx] = mom[0]; |
|
|
|
|
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; |
|
|
|
|
|
|
|
|
|
// + m10 ( = m10' + x*m00' ) |
|
|
|
|
dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm; |
|
|
|
|
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; |
|
|
|
|
|
|
|
|
|
// + m01 ( = m01' + y*m00' ) |
|
|
|
|
dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym; |
|
|
|
|
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; |
|
|
|
|
|
|
|
|
|
// + m20 ( = m20' + 2*x*m10' + x*x*m00' ) |
|
|
|
|
dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm); |
|
|
|
|
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); |
|
|
|
|
|
|
|
|
|
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) |
|
|
|
|
dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1]; |
|
|
|
|
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; |
|
|
|
|
|
|
|
|
|
// + m02 ( = m02' + 2*y*m01' + y*y*m00' ) |
|
|
|
|
dst_m02[wgidy*dst_cols+wgidx] = mom[5] + y * (mom[2] * 2 + ym); |
|
|
|
|
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); |
|
|
|
|
|
|
|
|
|
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) |
|
|
|
|
dst_m30[wgidy*dst_cols+wgidx] = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); |
|
|
|
|
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); |
|
|
|
|
|
|
|
|
|
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') |
|
|
|
|
dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; |
|
|
|
|
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; |
|
|
|
|
|
|
|
|
|
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') |
|
|
|
|
dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; |
|
|
|
|
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; |
|
|
|
|
|
|
|
|
|
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) |
|
|
|
|
dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); |
|
|
|
|
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
//#endif |
|
|
|
|
//#if defined (DOUBLE_SUPPORT) |
|
|
|
|
|
|
|
|
|
__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, |
|
|
|
|
__global double* dst_m00, |
|
|
|
|
__global double* dst_m10, |
|
|
|
|
__global double* dst_m01, |
|
|
|
|
__global double* dst_m20, |
|
|
|
|
__global double* dst_m11, |
|
|
|
|
__global double* dst_m02, |
|
|
|
|
__global double* dst_m30, |
|
|
|
|
__global double* dst_m21, |
|
|
|
|
__global double* dst_m12, |
|
|
|
|
__global double* dst_m03, |
|
|
|
|
int dst_cols, int dst_step, |
|
|
|
|
__global F* dst_m, |
|
|
|
|
int dst_cols, int dst_step, int blocky, |
|
|
|
|
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE) |
|
|
|
|
{ |
|
|
|
|
short tmp_coi[8]; // get the coi data |
|
|
|
@ -509,7 +476,7 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols |
|
|
|
|
for(int i=0; i < tileSize_width; i+=(VLEN_S)) |
|
|
|
|
tmp[i/VLEN_S] = (tmp[i/VLEN_S]!=zero)?full:zero; |
|
|
|
|
|
|
|
|
|
double mom[10]; |
|
|
|
|
F mom[10]; |
|
|
|
|
__local long m[10][128]; |
|
|
|
|
if(lidy == 0) |
|
|
|
|
for(int i=0; i<10; i++) |
|
|
|
@ -579,64 +546,55 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols |
|
|
|
|
if(lidy ==0 &&lidx ==0) |
|
|
|
|
{ |
|
|
|
|
for(int mt = 0; mt < 10; mt++ ) |
|
|
|
|
mom[mt] = (double)lm[mt]; |
|
|
|
|
mom[mt] = (F)lm[mt]; |
|
|
|
|
|
|
|
|
|
if(binary) |
|
|
|
|
{ |
|
|
|
|
double s = 1./255; |
|
|
|
|
F s = 1./255; |
|
|
|
|
for( int mt = 0; mt < 10; mt++ ) |
|
|
|
|
mom[mt] *= s; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
double xm = x * mom[0], ym = y*mom[0]; |
|
|
|
|
F xm = x * mom[0], ym = y*mom[0]; |
|
|
|
|
|
|
|
|
|
// accumulate moments computed in each tile |
|
|
|
|
dst_step /= sizeof(F); |
|
|
|
|
|
|
|
|
|
// + m00 ( = m00' ) |
|
|
|
|
dst_m00[wgidy*dst_cols+wgidx] = mom[0]; |
|
|
|
|
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; |
|
|
|
|
|
|
|
|
|
// + m10 ( = m10' + x*m00' ) |
|
|
|
|
dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm; |
|
|
|
|
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; |
|
|
|
|
|
|
|
|
|
// + m01 ( = m01' + y*m00' ) |
|
|
|
|
dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym; |
|
|
|
|
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; |
|
|
|
|
|
|
|
|
|
// + m20 ( = m20' + 2*x*m10' + x*x*m00' ) |
|
|
|
|
dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm); |
|
|
|
|
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); |
|
|
|
|
|
|
|
|
|
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) |
|
|
|
|
dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1]; |
|
|
|
|
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; |
|
|
|
|
|
|
|
|
|
// + m02 ( = m02' + 2*y*m01' + y*y*m00' ) |
|
|
|
|
dst_m02[wgidy*dst_cols+wgidx] = mom[5] + y * (mom[2] * 2 + ym); |
|
|
|
|
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); |
|
|
|
|
|
|
|
|
|
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) |
|
|
|
|
dst_m30[wgidy*dst_cols+wgidx] = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); |
|
|
|
|
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); |
|
|
|
|
|
|
|
|
|
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') |
|
|
|
|
dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; |
|
|
|
|
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; |
|
|
|
|
|
|
|
|
|
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') |
|
|
|
|
dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; |
|
|
|
|
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; |
|
|
|
|
|
|
|
|
|
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) |
|
|
|
|
dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); |
|
|
|
|
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
//#endif |
|
|
|
|
//#if defined (DOUBLE_SUPPORT) |
|
|
|
|
|
|
|
|
|
__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, |
|
|
|
|
__global double* dst_m00, |
|
|
|
|
__global double* dst_m10, |
|
|
|
|
__global double* dst_m01, |
|
|
|
|
__global double* dst_m20, |
|
|
|
|
__global double* dst_m11, |
|
|
|
|
__global double* dst_m02, |
|
|
|
|
__global double* dst_m30, |
|
|
|
|
__global double* dst_m21, |
|
|
|
|
__global double* dst_m12, |
|
|
|
|
__global double* dst_m03, |
|
|
|
|
int dst_cols, int dst_step, |
|
|
|
|
__global F* dst_m, |
|
|
|
|
int dst_cols, int dst_step, int blocky, |
|
|
|
|
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE) |
|
|
|
|
{ |
|
|
|
|
float tmp_coi[4]; // get the coi data |
|
|
|
@ -672,23 +630,23 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols |
|
|
|
|
if( binary ) |
|
|
|
|
for(int i=0; i < tileSize_width; i+=4) |
|
|
|
|
tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero; |
|
|
|
|
double mom[10]; |
|
|
|
|
__local double m[10][128]; |
|
|
|
|
F mom[10]; |
|
|
|
|
__local F m[10][128]; |
|
|
|
|
if(lidy == 0) |
|
|
|
|
for(int i = 0; i < 10; i ++) |
|
|
|
|
for(int j = 0; j < 128; j ++) |
|
|
|
|
m[i][j] = 0; |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
double lm[10] = {0}; |
|
|
|
|
double4 x0 = (double4)(0); |
|
|
|
|
double4 x1 = (double4)(0); |
|
|
|
|
double4 x2 = (double4)(0); |
|
|
|
|
double4 x3 = (double4)(0); |
|
|
|
|
F lm[10] = {0}; |
|
|
|
|
F4 x0 = (F4)(0); |
|
|
|
|
F4 x1 = (F4)(0); |
|
|
|
|
F4 x2 = (F4)(0); |
|
|
|
|
F4 x3 = (F4)(0); |
|
|
|
|
for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_F ) |
|
|
|
|
{ |
|
|
|
|
double4 v_xt = (double4)(xt, xt+1, xt+2, xt+3); |
|
|
|
|
double4 p = convert_double4(tmp[xt/VLEN_F]); |
|
|
|
|
double4 xp = v_xt * p, xxp = xp * v_xt; |
|
|
|
|
F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3); |
|
|
|
|
F4 p = convert_F4(tmp[xt/VLEN_F]); |
|
|
|
|
F4 xp = v_xt * p, xxp = xp * v_xt; |
|
|
|
|
x0 += p; |
|
|
|
|
x1 += xp; |
|
|
|
|
x2 += xxp; |
|
|
|
@ -698,14 +656,14 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols |
|
|
|
|
x1.s0 += x1.s1 + x1.s2 + x1.s3; |
|
|
|
|
x2.s0 += x2.s1 + x2.s2 + x2.s3; |
|
|
|
|
x3.s0 += x3.s1 + x3.s2 + x3.s3; |
|
|
|
|
/* |
|
|
|
|
double py = lidy * x0.s0, sy = lidy*lidy; |
|
|
|
|
|
|
|
|
|
F py = lidy * x0.s0, sy = lidy*lidy; |
|
|
|
|
int bheight = min(tileSize_height, TILE_SIZE/2); |
|
|
|
|
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) |
|
|
|
|
{ |
|
|
|
|
m[9][lidy-bheight] = ((double)py) * sy; // m03 |
|
|
|
|
m[8][lidy-bheight] = ((double)x1.s0) * sy; // m12 |
|
|
|
|
m[7][lidy-bheight] = ((double)x2.s0) * lidy; // m21 |
|
|
|
|
m[9][lidy-bheight] = ((F)py) * sy; // m03 |
|
|
|
|
m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12 |
|
|
|
|
m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21 |
|
|
|
|
m[6][lidy-bheight] = x3.s0; // m30 |
|
|
|
|
m[5][lidy-bheight] = x0.s0 * sy; // m02 |
|
|
|
|
m[4][lidy-bheight] = x1.s0 * lidy; // m11 |
|
|
|
@ -714,11 +672,12 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols |
|
|
|
|
m[1][lidy-bheight] = x1.s0; // m10 |
|
|
|
|
m[0][lidy-bheight] = x0.s0; // m00 |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
else if(lidy < bheight) |
|
|
|
|
{ |
|
|
|
|
lm[9] = ((double)py) * sy; // m03 |
|
|
|
|
lm[8] = ((double)x1.s0) * sy; // m12 |
|
|
|
|
lm[7] = ((double)x2.s0) * lidy; // m21 |
|
|
|
|
lm[9] = ((F)py) * sy; // m03 |
|
|
|
|
lm[8] = ((F)x1.s0) * sy; // m12 |
|
|
|
|
lm[7] = ((F)x2.s0) * lidy; // m21 |
|
|
|
|
lm[6] = x3.s0; // m30 |
|
|
|
|
lm[5] = x0.s0 * sy; // m02 |
|
|
|
|
lm[4] = x1.s0 * lidy; // m11 |
|
|
|
@ -741,69 +700,59 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols |
|
|
|
|
} |
|
|
|
|
if(lidy == 0&&lidx == 0) |
|
|
|
|
{ |
|
|
|
|
for(int mt = 0; mt < 10; mt++ ) |
|
|
|
|
mom[mt] = (double)lm[mt]; |
|
|
|
|
|
|
|
|
|
for( int mt = 0; mt < 10; mt++ ) |
|
|
|
|
mom[mt] = (F)lm[mt]; |
|
|
|
|
if(binary) |
|
|
|
|
{ |
|
|
|
|
double s = 1./255; |
|
|
|
|
F s = 1./255; |
|
|
|
|
for( int mt = 0; mt < 10; mt++ ) |
|
|
|
|
mom[mt] *= s; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
double xm = x * mom[0], ym = y * mom[0]; |
|
|
|
|
F xm = x * mom[0], ym = y * mom[0]; |
|
|
|
|
|
|
|
|
|
// accumulate moments computed in each tile |
|
|
|
|
dst_step /= sizeof(F); |
|
|
|
|
|
|
|
|
|
// + m00 ( = m00' ) |
|
|
|
|
dst_m00[wgidy*dst_cols+wgidx]= mom[0]; |
|
|
|
|
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; |
|
|
|
|
|
|
|
|
|
// + m10 ( = m10' + x*m00' ) |
|
|
|
|
dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm; |
|
|
|
|
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; |
|
|
|
|
|
|
|
|
|
// + m01 ( = m01' + y*m00' ) |
|
|
|
|
dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym; |
|
|
|
|
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; |
|
|
|
|
|
|
|
|
|
// + m20 ( = m20' + 2*x*m10' + x*x*m00' ) |
|
|
|
|
dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm); |
|
|
|
|
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); |
|
|
|
|
|
|
|
|
|
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) |
|
|
|
|
dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1]; |
|
|
|
|
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; |
|
|
|
|
|
|
|
|
|
// + m02 ( = m02' + 2*y*m01' + y*y*m00' ) |
|
|
|
|
dst_m02[wgidy*dst_cols+wgidx]= mom[5] + y * (mom[2] * 2 + ym); |
|
|
|
|
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); |
|
|
|
|
|
|
|
|
|
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) |
|
|
|
|
dst_m30[wgidy*dst_cols+wgidx]= mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); |
|
|
|
|
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); |
|
|
|
|
|
|
|
|
|
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') |
|
|
|
|
dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; |
|
|
|
|
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; |
|
|
|
|
|
|
|
|
|
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') |
|
|
|
|
dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; |
|
|
|
|
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; |
|
|
|
|
|
|
|
|
|
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) |
|
|
|
|
dst_m03[wgidy*dst_cols+wgidx]= mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); |
|
|
|
|
}*/ |
|
|
|
|
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
//#endif |
|
|
|
|
//#if defined (DOUBLE_SUPPORT) |
|
|
|
|
__kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, |
|
|
|
|
__global double* dst_m00, |
|
|
|
|
__global double* dst_m10, |
|
|
|
|
__global double* dst_m01, |
|
|
|
|
__global double* dst_m20, |
|
|
|
|
__global double* dst_m11, |
|
|
|
|
__global double* dst_m02, |
|
|
|
|
__global double* dst_m30, |
|
|
|
|
__global double* dst_m21, |
|
|
|
|
__global double* dst_m12, |
|
|
|
|
__global double* dst_m03, |
|
|
|
|
int dst_cols, int dst_step, |
|
|
|
|
|
|
|
|
|
__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, |
|
|
|
|
__global F* dst_m, |
|
|
|
|
int dst_cols, int dst_step, int blocky, |
|
|
|
|
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE) |
|
|
|
|
{ |
|
|
|
|
double tmp_coi[4]; // get the coi data |
|
|
|
|
double4 tmp[64]; |
|
|
|
|
F tmp_coi[4]; // get the coi data |
|
|
|
|
F4 tmp[64]; |
|
|
|
|
int VLEN_D = 4; // length of vetor |
|
|
|
|
int gidy = get_global_id(0); |
|
|
|
|
int gidx = get_global_id(1); |
|
|
|
@ -820,39 +769,39 @@ __kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_col |
|
|
|
|
|
|
|
|
|
if(tileSize_width < TILE_SIZE) |
|
|
|
|
for(int i = tileSize_width; i < rstep; i++ ) |
|
|
|
|
*((__global double*)src_data+(y+lidy)*src_step/8+x+i) = 0; |
|
|
|
|
*((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0; |
|
|
|
|
if( coi > 0 ) |
|
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_D) |
|
|
|
|
{ |
|
|
|
|
for(int j=0; j<4; j++) |
|
|
|
|
tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1); |
|
|
|
|
tmp[i/VLEN_D] = (double4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); |
|
|
|
|
tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_D) |
|
|
|
|
tmp[i/VLEN_D] = (double4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3)); |
|
|
|
|
double4 zero = (double4)(0); |
|
|
|
|
double4 full = (double4)(255); |
|
|
|
|
tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3)); |
|
|
|
|
F4 zero = (F4)(0); |
|
|
|
|
F4 full = (F4)(255); |
|
|
|
|
if( binary ) |
|
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_D) |
|
|
|
|
tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero; |
|
|
|
|
double mom[10]; |
|
|
|
|
__local double m[10][128]; |
|
|
|
|
F mom[10]; |
|
|
|
|
__local F m[10][128]; |
|
|
|
|
if(lidy == 0) |
|
|
|
|
for(int i=0; i<10; i++) |
|
|
|
|
for(int j=0; j<128; j++) |
|
|
|
|
m[i][j]=0; |
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
double lm[10] = {0}; |
|
|
|
|
double4 x0 = (double4)(0); |
|
|
|
|
double4 x1 = (double4)(0); |
|
|
|
|
double4 x2 = (double4)(0); |
|
|
|
|
double4 x3 = (double4)(0); |
|
|
|
|
F lm[10] = {0}; |
|
|
|
|
F4 x0 = (F4)(0); |
|
|
|
|
F4 x1 = (F4)(0); |
|
|
|
|
F4 x2 = (F4)(0); |
|
|
|
|
F4 x3 = (F4)(0); |
|
|
|
|
for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D ) |
|
|
|
|
{ |
|
|
|
|
double4 v_xt = (double4)(xt, xt+1, xt+2, xt+3); |
|
|
|
|
double4 p = tmp[xt/VLEN_D]; |
|
|
|
|
double4 xp = v_xt * p, xxp = xp * v_xt; |
|
|
|
|
F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3); |
|
|
|
|
F4 p = tmp[xt/VLEN_D]; |
|
|
|
|
F4 xp = v_xt * p, xxp = xp * v_xt; |
|
|
|
|
x0 += p; |
|
|
|
|
x1 += xp; |
|
|
|
|
x2 += xxp; |
|
|
|
@ -863,13 +812,13 @@ __kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_col |
|
|
|
|
x2.s0 += x2.s1 + x2.s2 + x2.s3; |
|
|
|
|
x3.s0 += x3.s1 + x3.s2 + x3.s3; |
|
|
|
|
|
|
|
|
|
double py = lidy * x0.s0, sy = lidy*lidy; |
|
|
|
|
F py = lidy * x0.s0, sy = lidy*lidy; |
|
|
|
|
int bheight = min(tileSize_height, TILE_SIZE/2); |
|
|
|
|
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) |
|
|
|
|
{ |
|
|
|
|
m[9][lidy-bheight] = ((double)py) * sy; // m03 |
|
|
|
|
m[8][lidy-bheight] = ((double)x1.s0) * sy; // m12 |
|
|
|
|
m[7][lidy-bheight] = ((double)x2.s0) * lidy; // m21 |
|
|
|
|
m[9][lidy-bheight] = ((F)py) * sy; // m03 |
|
|
|
|
m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12 |
|
|
|
|
m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21 |
|
|
|
|
m[6][lidy-bheight] = x3.s0; // m30 |
|
|
|
|
m[5][lidy-bheight] = x0.s0 * sy; // m02 |
|
|
|
|
m[4][lidy-bheight] = x1.s0 * lidy; // m11 |
|
|
|
@ -881,9 +830,9 @@ __kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_col |
|
|
|
|
|
|
|
|
|
else if(lidy < bheight) |
|
|
|
|
{ |
|
|
|
|
lm[9] = ((double)py) * sy; // m03 |
|
|
|
|
lm[8] = ((double)x1.s0) * sy; // m12 |
|
|
|
|
lm[7] = ((double)x2.s0) * lidy; // m21 |
|
|
|
|
lm[9] = ((F)py) * sy; // m03 |
|
|
|
|
lm[8] = ((F)x1.s0) * sy; // m12 |
|
|
|
|
lm[7] = ((F)x2.s0) * lidy; // m21 |
|
|
|
|
lm[6] = x3.s0; // m30 |
|
|
|
|
lm[5] = x0.s0 * sy; // m02 |
|
|
|
|
lm[4] = x1.s0 * lidy; // m11 |
|
|
|
@ -907,47 +856,47 @@ __kernel void CvMoments_D6(__global double* src_data, int src_rows, int src_col |
|
|
|
|
if(lidy == 0&&lidx == 0) |
|
|
|
|
{ |
|
|
|
|
for( int mt = 0; mt < 10; mt++ ) |
|
|
|
|
mom[mt] = (double)lm[mt]; |
|
|
|
|
mom[mt] = (F)lm[mt]; |
|
|
|
|
if(binary) |
|
|
|
|
{ |
|
|
|
|
double s = 1./255; |
|
|
|
|
F s = 1./255; |
|
|
|
|
for( int mt = 0; mt < 10; mt++ ) |
|
|
|
|
mom[mt] *= s; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
double xm = x * mom[0], ym = y * mom[0]; |
|
|
|
|
F xm = x * mom[0], ym = y * mom[0]; |
|
|
|
|
|
|
|
|
|
// accumulate moments computed in each tile |
|
|
|
|
dst_step /= sizeof(F); |
|
|
|
|
|
|
|
|
|
// + m00 ( = m00' ) |
|
|
|
|
dst_m00[wgidy*dst_cols+wgidx] = mom[0]; |
|
|
|
|
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; |
|
|
|
|
|
|
|
|
|
// + m10 ( = m10' + x*m00' ) |
|
|
|
|
dst_m10[wgidy*dst_cols+wgidx] = mom[1] + xm; |
|
|
|
|
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; |
|
|
|
|
|
|
|
|
|
// + m01 ( = m01' + y*m00' ) |
|
|
|
|
dst_m01[wgidy*dst_cols+wgidx] = mom[2] + ym; |
|
|
|
|
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; |
|
|
|
|
|
|
|
|
|
// + m20 ( = m20' + 2*x*m10' + x*x*m00' ) |
|
|
|
|
dst_m20[wgidy*dst_cols+wgidx] = mom[3] + x * (mom[1] * 2 + xm); |
|
|
|
|
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); |
|
|
|
|
|
|
|
|
|
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) |
|
|
|
|
dst_m11[wgidy*dst_cols+wgidx] = mom[4] + x * (mom[2] + ym) + y * mom[1]; |
|
|
|
|
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; |
|
|
|
|
|
|
|
|
|
// + m02 ( = m02' + 2*y*m01' + y*y*m00' ) |
|
|
|
|
dst_m02[wgidy*dst_cols+wgidx] = mom[5] + y * (mom[2] * 2 + ym); |
|
|
|
|
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); |
|
|
|
|
|
|
|
|
|
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) |
|
|
|
|
dst_m30[wgidy*dst_cols+wgidx] = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); |
|
|
|
|
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); |
|
|
|
|
|
|
|
|
|
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') |
|
|
|
|
dst_m21[wgidy*dst_cols+wgidx] = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; |
|
|
|
|
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; |
|
|
|
|
|
|
|
|
|
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') |
|
|
|
|
dst_m12[wgidy*dst_cols+wgidx] = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; |
|
|
|
|
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; |
|
|
|
|
|
|
|
|
|
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) |
|
|
|
|
dst_m03[wgidy*dst_cols+wgidx] = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); |
|
|
|
|
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
//#endif |
|
|
|
|
} |