|
|
@ -173,10 +173,10 @@ __kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_widt |
|
|
|
sum[i] = dst_sum[i][0]; |
|
|
|
sum[i] = dst_sum[i][0]; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, |
|
|
|
__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, |
|
|
|
__global F* dst_m, |
|
|
|
__global F* dst_m, |
|
|
|
int dst_cols, int dst_step, int blocky, |
|
|
|
int dst_cols, int dst_step, int blocky, |
|
|
|
int type, int depth, int cn, int coi, int binary, int TILE_SIZE) |
|
|
|
int depth, int cn, int coi, int binary, int TILE_SIZE) |
|
|
|
{ |
|
|
|
{ |
|
|
|
uchar tmp_coi[16]; // get the coi data |
|
|
|
uchar tmp_coi[16]; // get the coi data |
|
|
|
uchar16 tmp[16]; |
|
|
|
uchar16 tmp[16]; |
|
|
@ -192,35 +192,43 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col |
|
|
|
int x = wgidx*TILE_SIZE; // vector length of uchar |
|
|
|
int x = wgidx*TILE_SIZE; // vector length of uchar |
|
|
|
int kcn = (cn==2)?2:4; |
|
|
|
int kcn = (cn==2)?2:4; |
|
|
|
int rstep = min(src_step, TILE_SIZE); |
|
|
|
int rstep = min(src_step, TILE_SIZE); |
|
|
|
tileSize_height = min(TILE_SIZE, src_rows - y); |
|
|
|
int tileSize_height = min(TILE_SIZE, src_rows - y); |
|
|
|
tileSize_width = min(TILE_SIZE, src_cols - x); |
|
|
|
int tileSize_width = min(TILE_SIZE, src_cols - x); |
|
|
|
|
|
|
|
|
|
|
|
if( tileSize_width < TILE_SIZE ) |
|
|
|
if ( y+lidy < src_rows ) |
|
|
|
for(int i = tileSize_width; i < rstep; i++ ) |
|
|
|
{ |
|
|
|
*((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0; |
|
|
|
if( tileSize_width < TILE_SIZE ) |
|
|
|
if( coi > 0 ) //channel of interest |
|
|
|
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) |
|
|
|
for(int i = 0; i < tileSize_width; i += VLEN_C) |
|
|
|
*((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0; |
|
|
|
{ |
|
|
|
|
|
|
|
for(int j=0; j<VLEN_C; j++) |
|
|
|
if( coi > 0 ) //channel of interest |
|
|
|
tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1); |
|
|
|
for(int i = 0; i < tileSize_width; i += VLEN_C) |
|
|
|
tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7], |
|
|
|
{ |
|
|
|
tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]); |
|
|
|
for(int j=0; j<VLEN_C; j++) |
|
|
|
} |
|
|
|
tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1); |
|
|
|
else |
|
|
|
tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7], |
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_C) |
|
|
|
tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]); |
|
|
|
tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C); |
|
|
|
} |
|
|
|
|
|
|
|
else |
|
|
|
|
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_C) |
|
|
|
|
|
|
|
tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
uchar16 zero = (uchar16)(0); |
|
|
|
uchar16 zero = (uchar16)(0); |
|
|
|
uchar16 full = (uchar16)(255); |
|
|
|
uchar16 full = (uchar16)(255); |
|
|
|
if( binary ) |
|
|
|
if( binary ) |
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_C) |
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_C) |
|
|
|
tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero; |
|
|
|
tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero; |
|
|
|
|
|
|
|
|
|
|
|
F mom[10]; |
|
|
|
F mom[10]; |
|
|
|
__local int m[10][128]; |
|
|
|
__local int m[10][128]; |
|
|
|
if(lidy == 0) |
|
|
|
if(lidy < 128) |
|
|
|
|
|
|
|
{ |
|
|
|
for(int i=0; i<10; i++) |
|
|
|
for(int i=0; i<10; i++) |
|
|
|
for(int j=0; j<128; j++) |
|
|
|
m[i][lidy]=0; |
|
|
|
m[i][j]=0; |
|
|
|
} |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
|
|
int lm[10] = {0}; |
|
|
|
int lm[10] = {0}; |
|
|
|
int16 x0 = (int16)(0); |
|
|
|
int16 x0 = (int16)(0); |
|
|
|
int16 x1 = (int16)(0); |
|
|
|
int16 x1 = (int16)(0); |
|
|
@ -281,6 +289,7 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col |
|
|
|
m[i][lidy-j/2] = lm[i]; |
|
|
|
m[i][lidy-j/2] = lm[i]; |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
if(lidy == 0&&lidx == 0) |
|
|
|
if(lidy == 0&&lidx == 0) |
|
|
|
{ |
|
|
|
{ |
|
|
|
for( int mt = 0; mt < 10; mt++ ) |
|
|
|
for( int mt = 0; mt < 10; mt++ ) |
|
|
@ -328,10 +337,10 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, |
|
|
|
__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, |
|
|
|
__global F* dst_m, |
|
|
|
__global F* dst_m, |
|
|
|
int dst_cols, int dst_step, int blocky, |
|
|
|
int dst_cols, int dst_step, int blocky, |
|
|
|
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE) |
|
|
|
int depth, int cn, int coi, int binary, const int TILE_SIZE) |
|
|
|
{ |
|
|
|
{ |
|
|
|
ushort tmp_coi[8]; // get the coi data |
|
|
|
ushort tmp_coi[8]; // get the coi data |
|
|
|
ushort8 tmp[32]; |
|
|
|
ushort8 tmp[32]; |
|
|
@ -346,21 +355,26 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col |
|
|
|
int x = wgidx*TILE_SIZE; // real X index of pixel |
|
|
|
int x = wgidx*TILE_SIZE; // real X index of pixel |
|
|
|
int kcn = (cn==2)?2:4; |
|
|
|
int kcn = (cn==2)?2:4; |
|
|
|
int rstep = min(src_step/2, TILE_SIZE); |
|
|
|
int rstep = min(src_step/2, TILE_SIZE); |
|
|
|
tileSize_height = min(TILE_SIZE, src_rows - y); |
|
|
|
int tileSize_height = min(TILE_SIZE, src_rows - y); |
|
|
|
tileSize_width = min(TILE_SIZE, src_cols -x); |
|
|
|
int tileSize_width = min(TILE_SIZE, src_cols -x); |
|
|
|
if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE) |
|
|
|
|
|
|
|
for(int i=tileSize_width; i < rstep; i++ ) |
|
|
|
if ( y+lidy < src_rows ) |
|
|
|
*((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0; |
|
|
|
{ |
|
|
|
if( coi > 0 ) |
|
|
|
if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE) |
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_US) |
|
|
|
for(int i=tileSize_width; i < rstep && (x+i) < src_cols; i++ ) |
|
|
|
{ |
|
|
|
*((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0; |
|
|
|
for(int j=0; j<VLEN_US; j++) |
|
|
|
if( coi > 0 ) |
|
|
|
tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1); |
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_US) |
|
|
|
tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]); |
|
|
|
{ |
|
|
|
} |
|
|
|
for(int j=0; j<VLEN_US; j++) |
|
|
|
else |
|
|
|
tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1); |
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_US) |
|
|
|
tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]); |
|
|
|
tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US); |
|
|
|
} |
|
|
|
|
|
|
|
else |
|
|
|
|
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_US) |
|
|
|
|
|
|
|
tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
ushort8 zero = (ushort8)(0); |
|
|
|
ushort8 zero = (ushort8)(0); |
|
|
|
ushort8 full = (ushort8)(255); |
|
|
|
ushort8 full = (ushort8)(255); |
|
|
|
if( binary ) |
|
|
|
if( binary ) |
|
|
@ -368,11 +382,11 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col |
|
|
|
tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero; |
|
|
|
tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero; |
|
|
|
F mom[10]; |
|
|
|
F mom[10]; |
|
|
|
__local long m[10][128]; |
|
|
|
__local long m[10][128]; |
|
|
|
if(lidy == 0) |
|
|
|
if(lidy < 128) |
|
|
|
for(int i=0; i<10; i++) |
|
|
|
for(int i=0; i<10; i++) |
|
|
|
for(int j=0; j<128; j++) |
|
|
|
m[i][lidy]=0; |
|
|
|
m[i][j]=0; |
|
|
|
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
|
|
long lm[10] = {0}; |
|
|
|
long lm[10] = {0}; |
|
|
|
int8 x0 = (int8)(0); |
|
|
|
int8 x0 = (int8)(0); |
|
|
|
int8 x1 = (int8)(0); |
|
|
|
int8 x1 = (int8)(0); |
|
|
@ -422,17 +436,22 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col |
|
|
|
lm[0] = x0.s0; // m00 |
|
|
|
lm[0] = x0.s0; // m00 |
|
|
|
} |
|
|
|
} |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
|
|
for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) |
|
|
|
for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
if(lidy < j) |
|
|
|
if(lidy < j) |
|
|
|
for( int i = 0; i < 10; i++ ) |
|
|
|
for( int i = 0; i < 10; i++ ) |
|
|
|
lm[i] = lm[i] + m[i][lidy]; |
|
|
|
lm[i] = lm[i] + m[i][lidy]; |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
} |
|
|
|
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) |
|
|
|
|
|
|
|
{ |
|
|
|
if(lidy >= j/2&&lidy < j) |
|
|
|
if(lidy >= j/2&&lidy < j) |
|
|
|
for( int i = 0; i < 10; i++ ) |
|
|
|
for( int i = 0; i < 10; i++ ) |
|
|
|
m[i][lidy-j/2] = lm[i]; |
|
|
|
m[i][lidy-j/2] = lm[i]; |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
|
|
if(lidy == 0&&lidx == 0) |
|
|
|
if(lidy == 0&&lidx == 0) |
|
|
|
{ |
|
|
|
{ |
|
|
|
for(int mt = 0; mt < 10; mt++ ) |
|
|
|
for(int mt = 0; mt < 10; mt++ ) |
|
|
@ -482,10 +501,10 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, |
|
|
|
__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, |
|
|
|
__global F* dst_m, |
|
|
|
__global F* dst_m, |
|
|
|
int dst_cols, int dst_step, int blocky, |
|
|
|
int dst_cols, int dst_step, int blocky, |
|
|
|
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE) |
|
|
|
int depth, int cn, int coi, int binary, const int TILE_SIZE) |
|
|
|
{ |
|
|
|
{ |
|
|
|
short tmp_coi[8]; // get the coi data |
|
|
|
short tmp_coi[8]; // get the coi data |
|
|
|
short8 tmp[32]; |
|
|
|
short8 tmp[32]; |
|
|
@ -500,21 +519,26 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols |
|
|
|
int x = wgidx*TILE_SIZE; // real X index of pixel |
|
|
|
int x = wgidx*TILE_SIZE; // real X index of pixel |
|
|
|
int kcn = (cn==2)?2:4; |
|
|
|
int kcn = (cn==2)?2:4; |
|
|
|
int rstep = min(src_step/2, TILE_SIZE); |
|
|
|
int rstep = min(src_step/2, TILE_SIZE); |
|
|
|
tileSize_height = min(TILE_SIZE, src_rows - y); |
|
|
|
int tileSize_height = min(TILE_SIZE, src_rows - y); |
|
|
|
tileSize_width = min(TILE_SIZE, src_cols -x); |
|
|
|
int tileSize_width = min(TILE_SIZE, src_cols -x); |
|
|
|
if(tileSize_width < TILE_SIZE) |
|
|
|
|
|
|
|
for(int i = tileSize_width; i < rstep; i++ ) |
|
|
|
if ( y+lidy < src_rows ) |
|
|
|
*((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0; |
|
|
|
{ |
|
|
|
if( coi > 0 ) |
|
|
|
if(tileSize_width < TILE_SIZE) |
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_S) |
|
|
|
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) |
|
|
|
{ |
|
|
|
*((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0; |
|
|
|
for(int j=0; j<VLEN_S; j++) |
|
|
|
if( coi > 0 ) |
|
|
|
tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1); |
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_S) |
|
|
|
tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]); |
|
|
|
{ |
|
|
|
} |
|
|
|
for(int j=0; j<VLEN_S; j++) |
|
|
|
else |
|
|
|
tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1); |
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_S) |
|
|
|
tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]); |
|
|
|
tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S); |
|
|
|
} |
|
|
|
|
|
|
|
else |
|
|
|
|
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_S) |
|
|
|
|
|
|
|
tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
short8 zero = (short8)(0); |
|
|
|
short8 zero = (short8)(0); |
|
|
|
short8 full = (short8)(255); |
|
|
|
short8 full = (short8)(255); |
|
|
|
if( binary ) |
|
|
|
if( binary ) |
|
|
@ -523,10 +547,9 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols |
|
|
|
|
|
|
|
|
|
|
|
F mom[10]; |
|
|
|
F mom[10]; |
|
|
|
__local long m[10][128]; |
|
|
|
__local long m[10][128]; |
|
|
|
if(lidy == 0) |
|
|
|
if(lidy < 128) |
|
|
|
for(int i=0; i<10; i++) |
|
|
|
for(int i=0; i<10; i++) |
|
|
|
for(int j=0; j<128; j++) |
|
|
|
m[i][lidy]=0; |
|
|
|
m[i][j]=0; |
|
|
|
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
long lm[10] = {0}; |
|
|
|
long lm[10] = {0}; |
|
|
|
int8 x0 = (int8)(0); |
|
|
|
int8 x0 = (int8)(0); |
|
|
@ -637,10 +660,10 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, |
|
|
|
__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, |
|
|
|
__global F* dst_m, |
|
|
|
__global F* dst_m, |
|
|
|
int dst_cols, int dst_step, int blocky, |
|
|
|
int dst_cols, int dst_step, int blocky, |
|
|
|
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE) |
|
|
|
int depth, int cn, int coi, int binary, const int TILE_SIZE) |
|
|
|
{ |
|
|
|
{ |
|
|
|
float tmp_coi[4]; // get the coi data |
|
|
|
float tmp_coi[4]; // get the coi data |
|
|
|
float4 tmp[64] ; |
|
|
|
float4 tmp[64] ; |
|
|
@ -654,33 +677,30 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols |
|
|
|
int y = wgidy*TILE_SIZE; // real Y index of pixel |
|
|
|
int y = wgidy*TILE_SIZE; // real Y index of pixel |
|
|
|
int x = wgidx*TILE_SIZE; // real X index of pixel |
|
|
|
int x = wgidx*TILE_SIZE; // real X index of pixel |
|
|
|
int kcn = (cn==2)?2:4; |
|
|
|
int kcn = (cn==2)?2:4; |
|
|
|
src_step /= sizeof(*src_data); |
|
|
|
int rstep = min(src_step/4, TILE_SIZE); |
|
|
|
int rstep = min(src_step, TILE_SIZE); |
|
|
|
int tileSize_height = min(TILE_SIZE, src_rows - y); |
|
|
|
tileSize_height = min(TILE_SIZE, src_rows - y); |
|
|
|
int tileSize_width = min(TILE_SIZE, src_cols -x); |
|
|
|
tileSize_width = min(TILE_SIZE, src_cols -x); |
|
|
|
|
|
|
|
int maxIdx = mul24(src_rows, src_cols); |
|
|
|
int maxIdx = mul24(src_rows, src_cols); |
|
|
|
int yOff = (y+lidy)*src_step; |
|
|
|
int yOff = (y+lidy)*src_step; |
|
|
|
int index; |
|
|
|
int index; |
|
|
|
if(tileSize_width < TILE_SIZE && yOff < src_rows) |
|
|
|
|
|
|
|
for(int i = tileSize_width; i < rstep && (yOff+x+i) < maxIdx; i++ ) |
|
|
|
if ( y+lidy < src_rows ) |
|
|
|
*(src_data+yOff+x+i) = 0; |
|
|
|
{ |
|
|
|
if( coi > 0 ) |
|
|
|
if(tileSize_width < TILE_SIZE) |
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_F) |
|
|
|
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) |
|
|
|
{ |
|
|
|
*((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0; |
|
|
|
#pragma unroll |
|
|
|
if( coi > 0 ) |
|
|
|
for(int j=0; j<4; j++) |
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_F) |
|
|
|
{ |
|
|
|
{ |
|
|
|
index = yOff+(x+i+j)*kcn+coi-1; |
|
|
|
for(int j=0; j<4; j++) |
|
|
|
if (index < maxIdx) |
|
|
|
tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1); |
|
|
|
tmp_coi[j] = *(src_data+index); |
|
|
|
tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); |
|
|
|
else |
|
|
|
|
|
|
|
tmp_coi[j] = 0; |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); |
|
|
|
else |
|
|
|
} |
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_F) |
|
|
|
else |
|
|
|
tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3)); |
|
|
|
for(int i=0; i < tileSize_width && (yOff+x+i) < maxIdx; i+=VLEN_F) |
|
|
|
} |
|
|
|
tmp[i/VLEN_F] = (*(__global float4 *)(src_data+yOff+x+i)); |
|
|
|
|
|
|
|
float4 zero = (float4)(0); |
|
|
|
float4 zero = (float4)(0); |
|
|
|
float4 full = (float4)(255); |
|
|
|
float4 full = (float4)(255); |
|
|
|
if( binary ) |
|
|
|
if( binary ) |
|
|
@ -688,10 +708,9 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols |
|
|
|
tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero; |
|
|
|
tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero; |
|
|
|
F mom[10]; |
|
|
|
F mom[10]; |
|
|
|
__local F m[10][128]; |
|
|
|
__local F m[10][128]; |
|
|
|
if(lidy == 0) |
|
|
|
if(lidy < 128) |
|
|
|
for(int i = 0; i < 10; i ++) |
|
|
|
for(int i = 0; i < 10; i ++) |
|
|
|
for(int j = 0; j < 128; j ++) |
|
|
|
m[i][lidy] = 0; |
|
|
|
m[i][j] = 0; |
|
|
|
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
F lm[10] = {0}; |
|
|
|
F lm[10] = {0}; |
|
|
|
F4 x0 = (F4)(0); |
|
|
|
F4 x0 = (F4)(0); |
|
|
@ -770,66 +789,42 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols |
|
|
|
// accumulate moments computed in each tile |
|
|
|
// accumulate moments computed in each tile |
|
|
|
dst_step /= sizeof(F); |
|
|
|
dst_step /= sizeof(F); |
|
|
|
|
|
|
|
|
|
|
|
int dst_x_off = mad24(wgidy, dst_cols, wgidx); |
|
|
|
|
|
|
|
int dst_off = 0; |
|
|
|
|
|
|
|
int max_dst_index = 10 * blocky * get_global_size(1); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// + m00 ( = m00' ) |
|
|
|
// + m00 ( = m00' ) |
|
|
|
dst_off = mad24(DST_ROW_00 * blocky, dst_step, dst_x_off); |
|
|
|
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; |
|
|
|
if (dst_off < max_dst_index) |
|
|
|
|
|
|
|
*(dst_m + dst_off) = mom[0]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// + m10 ( = m10' + x*m00' ) |
|
|
|
// + m10 ( = m10' + x*m00' ) |
|
|
|
dst_off = mad24(DST_ROW_10 * blocky, dst_step, dst_x_off); |
|
|
|
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; |
|
|
|
if (dst_off < max_dst_index) |
|
|
|
|
|
|
|
*(dst_m + dst_off) = mom[1] + xm; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// + m01 ( = m01' + y*m00' ) |
|
|
|
// + m01 ( = m01' + y*m00' ) |
|
|
|
dst_off = mad24(DST_ROW_01 * blocky, dst_step, dst_x_off); |
|
|
|
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; |
|
|
|
if (dst_off < max_dst_index) |
|
|
|
|
|
|
|
*(dst_m + dst_off) = mom[2] + ym; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// + m20 ( = m20' + 2*x*m10' + x*x*m00' ) |
|
|
|
// + m20 ( = m20' + 2*x*m10' + x*x*m00' ) |
|
|
|
dst_off = mad24(DST_ROW_20 * blocky, dst_step, dst_x_off); |
|
|
|
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); |
|
|
|
if (dst_off < max_dst_index) |
|
|
|
|
|
|
|
*(dst_m + dst_off) = mom[3] + x * (mom[1] * 2 + xm); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) |
|
|
|
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) |
|
|
|
dst_off = mad24(DST_ROW_11 * blocky, dst_step, dst_x_off); |
|
|
|
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; |
|
|
|
if (dst_off < max_dst_index) |
|
|
|
|
|
|
|
*(dst_m + dst_off) = mom[4] + x * (mom[2] + ym) + y * mom[1]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// + m02 ( = m02' + 2*y*m01' + y*y*m00' ) |
|
|
|
// + m02 ( = m02' + 2*y*m01' + y*y*m00' ) |
|
|
|
dst_off = mad24(DST_ROW_02 * blocky, dst_step, dst_x_off); |
|
|
|
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); |
|
|
|
if (dst_off < max_dst_index) |
|
|
|
|
|
|
|
*(dst_m + dst_off) = mom[5] + y * (mom[2] * 2 + ym); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) |
|
|
|
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) |
|
|
|
dst_off = mad24(DST_ROW_30 * blocky, dst_step, dst_x_off); |
|
|
|
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); |
|
|
|
if (dst_off < max_dst_index) |
|
|
|
|
|
|
|
*(dst_m + dst_off) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') |
|
|
|
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') |
|
|
|
dst_off = mad24(DST_ROW_21 * blocky, dst_step, dst_x_off); |
|
|
|
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; |
|
|
|
if (dst_off < max_dst_index) |
|
|
|
|
|
|
|
*(dst_m + dst_off) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') |
|
|
|
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') |
|
|
|
dst_off = mad24(DST_ROW_12 * blocky, dst_step, dst_x_off); |
|
|
|
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; |
|
|
|
if (dst_off < max_dst_index) |
|
|
|
|
|
|
|
*(dst_m + dst_off) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) |
|
|
|
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) |
|
|
|
dst_off = mad24(DST_ROW_03 * blocky, dst_step, dst_x_off); |
|
|
|
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); |
|
|
|
if (dst_off < max_dst_index) |
|
|
|
|
|
|
|
*(dst_m + dst_off) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, |
|
|
|
__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step, |
|
|
|
__global F* dst_m, |
|
|
|
__global F* dst_m, |
|
|
|
int dst_cols, int dst_step, int blocky, |
|
|
|
int dst_cols, int dst_step, int blocky, |
|
|
|
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE) |
|
|
|
int depth, int cn, int coi, int binary, const int TILE_SIZE) |
|
|
|
{ |
|
|
|
{ |
|
|
|
F tmp_coi[4]; // get the coi data |
|
|
|
F tmp_coi[4]; // get the coi data |
|
|
|
F4 tmp[64]; |
|
|
|
F4 tmp[64]; |
|
|
@ -844,22 +839,26 @@ __kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, in |
|
|
|
int x = wgidx*TILE_SIZE; // real X index of pixel |
|
|
|
int x = wgidx*TILE_SIZE; // real X index of pixel |
|
|
|
int kcn = (cn==2)?2:4; |
|
|
|
int kcn = (cn==2)?2:4; |
|
|
|
int rstep = min(src_step/8, TILE_SIZE); |
|
|
|
int rstep = min(src_step/8, TILE_SIZE); |
|
|
|
tileSize_height = min(TILE_SIZE, src_rows - y); |
|
|
|
int tileSize_height = min(TILE_SIZE, src_rows - y); |
|
|
|
tileSize_width = min(TILE_SIZE, src_cols - x); |
|
|
|
int tileSize_width = min(TILE_SIZE, src_cols - x); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if ( y+lidy < src_rows ) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
if(tileSize_width < TILE_SIZE) |
|
|
|
|
|
|
|
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) |
|
|
|
|
|
|
|
*((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0; |
|
|
|
|
|
|
|
if( coi > 0 ) |
|
|
|
|
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_D) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
for(int j=0; j<4 && ((x+i+j)*kcn+coi-1)<src_cols; j++) |
|
|
|
|
|
|
|
tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1); |
|
|
|
|
|
|
|
tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
else |
|
|
|
|
|
|
|
for(int i=0; i < tileSize_width && (x+i+3) < src_cols; i+=VLEN_D) |
|
|
|
|
|
|
|
tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3)); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
if(tileSize_width < TILE_SIZE) |
|
|
|
|
|
|
|
for(int i = tileSize_width; i < rstep; i++ ) |
|
|
|
|
|
|
|
*((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0; |
|
|
|
|
|
|
|
if( coi > 0 ) |
|
|
|
|
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_D) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
for(int j=0; j<4; j++) |
|
|
|
|
|
|
|
tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1); |
|
|
|
|
|
|
|
tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
else |
|
|
|
|
|
|
|
for(int i=0; i < tileSize_width; i+=VLEN_D) |
|
|
|
|
|
|
|
tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3)); |
|
|
|
|
|
|
|
F4 zero = (F4)(0); |
|
|
|
F4 zero = (F4)(0); |
|
|
|
F4 full = (F4)(255); |
|
|
|
F4 full = (F4)(255); |
|
|
|
if( binary ) |
|
|
|
if( binary ) |
|
|
@ -867,10 +866,9 @@ __kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, in |
|
|
|
tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero; |
|
|
|
tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero; |
|
|
|
F mom[10]; |
|
|
|
F mom[10]; |
|
|
|
__local F m[10][128]; |
|
|
|
__local F m[10][128]; |
|
|
|
if(lidy == 0) |
|
|
|
if(lidy < 128) |
|
|
|
for(int i=0; i<10; i++) |
|
|
|
for(int i=0; i<10; i++) |
|
|
|
for(int j=0; j<128; j++) |
|
|
|
m[i][lidy]=0; |
|
|
|
m[i][j]=0; |
|
|
|
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
F lm[10] = {0}; |
|
|
|
F lm[10] = {0}; |
|
|
|
F4 x0 = (F4)(0); |
|
|
|
F4 x0 = (F4)(0); |
|
|
@ -907,7 +905,6 @@ __kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, in |
|
|
|
m[1][lidy-bheight] = x1.s0; // m10 |
|
|
|
m[1][lidy-bheight] = x1.s0; // m10 |
|
|
|
m[0][lidy-bheight] = x0.s0; // m00 |
|
|
|
m[0][lidy-bheight] = x0.s0; // m00 |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
else if(lidy < bheight) |
|
|
|
else if(lidy < bheight) |
|
|
|
{ |
|
|
|
{ |
|
|
|
lm[9] = ((F)py) * sy; // m03 |
|
|
|
lm[9] = ((F)py) * sy; // m03 |
|
|
@ -922,6 +919,7 @@ __kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, in |
|
|
|
lm[0] = x0.s0; // m00 |
|
|
|
lm[0] = x0.s0; // m00 |
|
|
|
} |
|
|
|
} |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
|
|
|
|
|
|
|
|
for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) |
|
|
|
for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
if(lidy < j) |
|
|
|
if(lidy < j) |
|
|
|