Merge pull request #1045 from bitwangyaoyao:2.4_fix

pull/1053/merge
Roman Donchenko 12 years ago committed by OpenCV Buildbot
commit bf6b119a3f
  1. 61
      modules/ocl/perf/main.cpp
  2. 2
      modules/ocl/src/mcwutil.cpp
  3. 43
      modules/ocl/src/moments.cpp
  4. 290
      modules/ocl/src/opencl/moments.cl
  5. 4
      modules/ocl/src/opencl/stereobm.cl
  6. 8
      modules/ocl/test/test_moments.cpp

@ -44,43 +44,21 @@
int main(int argc, const char *argv[]) int main(int argc, const char *argv[])
{ {
vector<ocl::Info> oclinfo;
int num_devices = getDevice(oclinfo);
if (num_devices < 1)
{
cerr << "no device found\n";
return -1;
}
// set this to overwrite binary cache every time the test starts
ocl::setBinaryDiskCache(ocl::CACHE_UPDATE);
int devidx = 0;
for (size_t i = 0; i < oclinfo.size(); i++)
{
for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++)
{
printf("device %d: %s\n", devidx++, oclinfo[i].DeviceName[j].c_str());
}
}
redirectError(cvErrorCallback);
const char *keys = const char *keys =
"{ h | help | false | print help message }" "{ h | help | false | print help message }"
"{ f | filter | | filter for test }" "{ f | filter | | filter for test }"
"{ w | workdir | | set working directory }" "{ w | workdir | | set working directory }"
"{ l | list | false | show all tests }" "{ l | list | false | show all tests }"
"{ d | device | 0 | device id }" "{ d | device | 0 | device id }"
"{ c | cpu_ocl | false | use cpu as ocl device}"
"{ i | iters | 10 | iteration count }" "{ i | iters | 10 | iteration count }"
"{ m | warmup | 1 | gpu warm up iteration count}" "{ m | warmup | 1 | gpu warm up iteration count}"
"{ t | xtop | 1.1 | xfactor top boundary}" "{ t | xtop | 1.1 | xfactor top boundary}"
"{ b | xbottom | 0.9 | xfactor bottom boundary}" "{ b | xbottom | 0.9 | xfactor bottom boundary}"
"{ v | verify | false | only run gpu once to verify if problems occur}"; "{ v | verify | false | only run gpu once to verify if problems occur}";
redirectError(cvErrorCallback);
CommandLineParser cmd(argc, argv, keys); CommandLineParser cmd(argc, argv, keys);
if (cmd.get<bool>("help")) if (cmd.get<bool>("help"))
{ {
cout << "Avaible options:" << endl; cout << "Avaible options:" << endl;
@ -88,14 +66,40 @@ int main(int argc, const char *argv[])
return 0; return 0;
} }
int device = cmd.get<int>("device"); // get ocl devices
bool use_cpu = cmd.get<bool>("c");
vector<ocl::Info> oclinfo;
int num_devices = 0;
if(use_cpu)
num_devices = getDevice(oclinfo, ocl::CVCL_DEVICE_TYPE_CPU);
else
num_devices = getDevice(oclinfo);
if (num_devices < 1)
{
cerr << "no device found\n";
return -1;
}
// show device info
int devidx = 0;
for (size_t i = 0; i < oclinfo.size(); i++)
{
for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++)
{
cout << "device " << devidx++ << ": " << oclinfo[i].DeviceName[j] << endl;
}
}
int device = cmd.get<int>("device");
if (device < 0 || device >= num_devices) if (device < 0 || device >= num_devices)
{ {
cerr << "Invalid device ID" << endl; cerr << "Invalid device ID" << endl;
return -1; return -1;
} }
// set this to overwrite binary cache every time the test starts
ocl::setBinaryDiskCache(ocl::CACHE_UPDATE);
if (cmd.get<bool>("verify")) if (cmd.get<bool>("verify"))
{ {
TestSystem::instance().setNumIters(1); TestSystem::instance().setNumIters(1);
@ -104,7 +108,6 @@ int main(int argc, const char *argv[])
} }
devidx = 0; devidx = 0;
for (size_t i = 0; i < oclinfo.size(); i++) for (size_t i = 0; i < oclinfo.size(); i++)
{ {
for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++, devidx++) for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++, devidx++)
@ -113,7 +116,7 @@ int main(int argc, const char *argv[])
{ {
ocl::setDevice(oclinfo[i], (int)j); ocl::setDevice(oclinfo[i], (int)j);
TestSystem::instance().setRecordName(oclinfo[i].DeviceName[j]); TestSystem::instance().setRecordName(oclinfo[i].DeviceName[j]);
printf("\nuse %d: %s\n", devidx, oclinfo[i].DeviceName[j].c_str()); cout << "use " << devidx << ": " <<oclinfo[i].DeviceName[j] << endl;
goto END_DEV; goto END_DEV;
} }
} }

@ -149,7 +149,7 @@ namespace cv
cl_image_format format; cl_image_format format;
int err; int err;
int depth = mat.depth(); int depth = mat.depth();
int channels = mat.channels(); int channels = mat.oclchannels();
switch(depth) switch(depth)
{ {

@ -16,7 +16,7 @@
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// @Authors // @Authors
// Sen Liu, sen@multicorewareinc.com // Sen Liu, swjtuls1987@126.com
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met: // are permitted provided that the following conditions are met:
@ -277,8 +277,8 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
blocky = size.height/TILE_SIZE; blocky = size.height/TILE_SIZE;
else else
blocky = size.height/TILE_SIZE + 1; blocky = size.height/TILE_SIZE + 1;
cv::ocl::oclMat dst_m(blocky * 10, blockx, CV_64FC1); oclMat dst_m(blocky * 10, blockx, CV_64FC1);
cl_mem sum = openCLCreateBuffer(src.clCxt,CL_MEM_READ_WRITE,10*sizeof(double)); oclMat sum(1, 10, CV_64FC1);
int tile_width = std::min(size.width,TILE_SIZE); int tile_width = std::min(size.width,TILE_SIZE);
int tile_height = std::min(size.height,TILE_SIZE); int tile_height = std::min(size.height,TILE_SIZE);
size_t localThreads[3] = { tile_height, 1, 1}; size_t localThreads[3] = { tile_height, 1, 1};
@ -288,19 +288,16 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows )); args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols )); args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step )); args.push_back( make_pair( sizeof(cl_int) , (void *)&src.step ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.width ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&tileSize.height ));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data )); args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.cols )); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.cols ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step )); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&blocky )); args.push_back( make_pair( sizeof(cl_int) , (void *)&blocky ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&type ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&depth )); args.push_back( make_pair( sizeof(cl_int) , (void *)&depth ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&cn )); args.push_back( make_pair( sizeof(cl_int) , (void *)&cn ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&coi )); args.push_back( make_pair( sizeof(cl_int) , (void *)&coi ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&binary )); args.push_back( make_pair( sizeof(cl_int) , (void *)&binary ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE )); args.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
openCLExecuteKernel(dst_m.clCxt, &moments, "CvMoments", globalThreads, localThreads, args, -1, depth); openCLExecuteKernel(Context::getContext(), &moments, "CvMoments", globalThreads, localThreads, args, -1, depth);
size_t localThreadss[3] = { 128, 1, 1}; size_t localThreadss[3] = { 128, 1, 1};
size_t globalThreadss[3] = { 128, 1, 1}; size_t globalThreadss[3] = { 128, 1, 1};
@ -309,25 +306,23 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_height )); args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_height ));
args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_width )); args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&tile_width ));
args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE )); args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&TILE_SIZE ));
args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&sum )); args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&sum.data ));
args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data )); args_sum.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data ));
args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step )); args_sum.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.step ));
openCLExecuteKernel(dst_m.clCxt, &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1); openCLExecuteKernel(Context::getContext(), &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1);
double* dstsum = new double[10];
memset(dstsum,0,10*sizeof(double)); Mat dstsum(sum);
openCLReadBuffer(dst_m.clCxt,sum,(void *)dstsum,10*sizeof(double)); mom->m00 = dstsum.at<double>(0, 0);
mom->m00 = dstsum[0]; mom->m10 = dstsum.at<double>(0, 1);
mom->m10 = dstsum[1]; mom->m01 = dstsum.at<double>(0, 2);
mom->m01 = dstsum[2]; mom->m20 = dstsum.at<double>(0, 3);
mom->m20 = dstsum[3]; mom->m11 = dstsum.at<double>(0, 4);
mom->m11 = dstsum[4]; mom->m02 = dstsum.at<double>(0, 5);
mom->m02 = dstsum[5]; mom->m30 = dstsum.at<double>(0, 6);
mom->m30 = dstsum[6]; mom->m21 = dstsum.at<double>(0, 7);
mom->m21 = dstsum[7]; mom->m12 = dstsum.at<double>(0, 8);
mom->m12 = dstsum[8]; mom->m03 = dstsum.at<double>(0, 9);
mom->m03 = dstsum[9];
delete [] dstsum;
openCLSafeCall(clReleaseMemObject(sum));
icvCompleteMomentState( mom ); icvCompleteMomentState( mom );
} }

@ -173,10 +173,10 @@ __kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_widt
sum[i] = dst_sum[i][0]; sum[i] = dst_sum[i][0];
} }
__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step,
__global F* dst_m, __global F* dst_m,
int dst_cols, int dst_step, int blocky, int dst_cols, int dst_step, int blocky,
int type, int depth, int cn, int coi, int binary, int TILE_SIZE) int depth, int cn, int coi, int binary, int TILE_SIZE)
{ {
uchar tmp_coi[16]; // get the coi data uchar tmp_coi[16]; // get the coi data
uchar16 tmp[16]; uchar16 tmp[16];
@ -192,35 +192,43 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
int x = wgidx*TILE_SIZE; // vector length of uchar int x = wgidx*TILE_SIZE; // vector length of uchar
int kcn = (cn==2)?2:4; int kcn = (cn==2)?2:4;
int rstep = min(src_step, TILE_SIZE); int rstep = min(src_step, TILE_SIZE);
tileSize_height = min(TILE_SIZE, src_rows - y); int tileSize_height = min(TILE_SIZE, src_rows - y);
tileSize_width = min(TILE_SIZE, src_cols - x); int tileSize_width = min(TILE_SIZE, src_cols - x);
if( tileSize_width < TILE_SIZE ) if ( y+lidy < src_rows )
for(int i = tileSize_width; i < rstep; i++ ) {
*((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0; if( tileSize_width < TILE_SIZE )
if( coi > 0 ) //channel of interest for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
for(int i = 0; i < tileSize_width; i += VLEN_C) *((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0;
{
for(int j=0; j<VLEN_C; j++) if( coi > 0 ) //channel of interest
tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1); for(int i = 0; i < tileSize_width; i += VLEN_C)
tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7], {
tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]); for(int j=0; j<VLEN_C; j++)
} tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1);
else tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
for(int i=0; i < tileSize_width; i+=VLEN_C) tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]);
tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C); }
else
for(int i=0; i < tileSize_width; i+=VLEN_C)
tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C);
}
uchar16 zero = (uchar16)(0); uchar16 zero = (uchar16)(0);
uchar16 full = (uchar16)(255); uchar16 full = (uchar16)(255);
if( binary ) if( binary )
for(int i=0; i < tileSize_width; i+=VLEN_C) for(int i=0; i < tileSize_width; i+=VLEN_C)
tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero; tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero;
F mom[10]; F mom[10];
__local int m[10][128]; __local int m[10][128];
if(lidy == 0) if(lidy < 128)
{
for(int i=0; i<10; i++) for(int i=0; i<10; i++)
for(int j=0; j<128; j++) m[i][lidy]=0;
m[i][j]=0; }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
int lm[10] = {0}; int lm[10] = {0};
int16 x0 = (int16)(0); int16 x0 = (int16)(0);
int16 x1 = (int16)(0); int16 x1 = (int16)(0);
@ -281,6 +289,7 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
m[i][lidy-j/2] = lm[i]; m[i][lidy-j/2] = lm[i];
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
if(lidy == 0&&lidx == 0) if(lidy == 0&&lidx == 0)
{ {
for( int mt = 0; mt < 10; mt++ ) for( int mt = 0; mt < 10; mt++ )
@ -328,10 +337,10 @@ __kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_col
} }
} }
__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step,
__global F* dst_m, __global F* dst_m,
int dst_cols, int dst_step, int blocky, int dst_cols, int dst_step, int blocky,
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE) int depth, int cn, int coi, int binary, const int TILE_SIZE)
{ {
ushort tmp_coi[8]; // get the coi data ushort tmp_coi[8]; // get the coi data
ushort8 tmp[32]; ushort8 tmp[32];
@ -346,21 +355,26 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
int x = wgidx*TILE_SIZE; // real X index of pixel int x = wgidx*TILE_SIZE; // real X index of pixel
int kcn = (cn==2)?2:4; int kcn = (cn==2)?2:4;
int rstep = min(src_step/2, TILE_SIZE); int rstep = min(src_step/2, TILE_SIZE);
tileSize_height = min(TILE_SIZE, src_rows - y); int tileSize_height = min(TILE_SIZE, src_rows - y);
tileSize_width = min(TILE_SIZE, src_cols -x); int tileSize_width = min(TILE_SIZE, src_cols -x);
if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
for(int i=tileSize_width; i < rstep; i++ ) if ( y+lidy < src_rows )
*((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0; {
if( coi > 0 ) if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
for(int i=0; i < tileSize_width; i+=VLEN_US) for(int i=tileSize_width; i < rstep && (x+i) < src_cols; i++ )
{ *((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0;
for(int j=0; j<VLEN_US; j++) if( coi > 0 )
tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1); for(int i=0; i < tileSize_width; i+=VLEN_US)
tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]); {
} for(int j=0; j<VLEN_US; j++)
else tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1);
for(int i=0; i < tileSize_width; i+=VLEN_US) tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US); }
else
for(int i=0; i < tileSize_width; i+=VLEN_US)
tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US);
}
ushort8 zero = (ushort8)(0); ushort8 zero = (ushort8)(0);
ushort8 full = (ushort8)(255); ushort8 full = (ushort8)(255);
if( binary ) if( binary )
@ -368,11 +382,11 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero; tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero;
F mom[10]; F mom[10];
__local long m[10][128]; __local long m[10][128];
if(lidy == 0) if(lidy < 128)
for(int i=0; i<10; i++) for(int i=0; i<10; i++)
for(int j=0; j<128; j++) m[i][lidy]=0;
m[i][j]=0;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
long lm[10] = {0}; long lm[10] = {0};
int8 x0 = (int8)(0); int8 x0 = (int8)(0);
int8 x1 = (int8)(0); int8 x1 = (int8)(0);
@ -422,17 +436,22 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
lm[0] = x0.s0; // m00 lm[0] = x0.s0; // m00
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
{ {
if(lidy < j) if(lidy < j)
for( int i = 0; i < 10; i++ ) for( int i = 0; i < 10; i++ )
lm[i] = lm[i] + m[i][lidy]; lm[i] = lm[i] + m[i][lidy];
barrier(CLK_LOCAL_MEM_FENCE); }
barrier(CLK_LOCAL_MEM_FENCE);
for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
{
if(lidy >= j/2&&lidy < j) if(lidy >= j/2&&lidy < j)
for( int i = 0; i < 10; i++ ) for( int i = 0; i < 10; i++ )
m[i][lidy-j/2] = lm[i]; m[i][lidy-j/2] = lm[i];
barrier(CLK_LOCAL_MEM_FENCE);
} }
barrier(CLK_LOCAL_MEM_FENCE);
if(lidy == 0&&lidx == 0) if(lidy == 0&&lidx == 0)
{ {
for(int mt = 0; mt < 10; mt++ ) for(int mt = 0; mt < 10; mt++ )
@ -482,10 +501,10 @@ __kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_col
} }
} }
__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step,
__global F* dst_m, __global F* dst_m,
int dst_cols, int dst_step, int blocky, int dst_cols, int dst_step, int blocky,
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE) int depth, int cn, int coi, int binary, const int TILE_SIZE)
{ {
short tmp_coi[8]; // get the coi data short tmp_coi[8]; // get the coi data
short8 tmp[32]; short8 tmp[32];
@ -500,21 +519,26 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
int x = wgidx*TILE_SIZE; // real X index of pixel int x = wgidx*TILE_SIZE; // real X index of pixel
int kcn = (cn==2)?2:4; int kcn = (cn==2)?2:4;
int rstep = min(src_step/2, TILE_SIZE); int rstep = min(src_step/2, TILE_SIZE);
tileSize_height = min(TILE_SIZE, src_rows - y); int tileSize_height = min(TILE_SIZE, src_rows - y);
tileSize_width = min(TILE_SIZE, src_cols -x); int tileSize_width = min(TILE_SIZE, src_cols -x);
if(tileSize_width < TILE_SIZE)
for(int i = tileSize_width; i < rstep; i++ ) if ( y+lidy < src_rows )
*((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0; {
if( coi > 0 ) if(tileSize_width < TILE_SIZE)
for(int i=0; i < tileSize_width; i+=VLEN_S) for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
{ *((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0;
for(int j=0; j<VLEN_S; j++) if( coi > 0 )
tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1); for(int i=0; i < tileSize_width; i+=VLEN_S)
tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]); {
} for(int j=0; j<VLEN_S; j++)
else tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
for(int i=0; i < tileSize_width; i+=VLEN_S) tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S); }
else
for(int i=0; i < tileSize_width; i+=VLEN_S)
tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S);
}
short8 zero = (short8)(0); short8 zero = (short8)(0);
short8 full = (short8)(255); short8 full = (short8)(255);
if( binary ) if( binary )
@ -523,10 +547,9 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
F mom[10]; F mom[10];
__local long m[10][128]; __local long m[10][128];
if(lidy == 0) if(lidy < 128)
for(int i=0; i<10; i++) for(int i=0; i<10; i++)
for(int j=0; j<128; j++) m[i][lidy]=0;
m[i][j]=0;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
long lm[10] = {0}; long lm[10] = {0};
int8 x0 = (int8)(0); int8 x0 = (int8)(0);
@ -637,10 +660,10 @@ __kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols
} }
} }
__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step,
__global F* dst_m, __global F* dst_m,
int dst_cols, int dst_step, int blocky, int dst_cols, int dst_step, int blocky,
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE) int depth, int cn, int coi, int binary, const int TILE_SIZE)
{ {
float tmp_coi[4]; // get the coi data float tmp_coi[4]; // get the coi data
float4 tmp[64] ; float4 tmp[64] ;
@ -654,33 +677,30 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
int y = wgidy*TILE_SIZE; // real Y index of pixel int y = wgidy*TILE_SIZE; // real Y index of pixel
int x = wgidx*TILE_SIZE; // real X index of pixel int x = wgidx*TILE_SIZE; // real X index of pixel
int kcn = (cn==2)?2:4; int kcn = (cn==2)?2:4;
src_step /= sizeof(*src_data); int rstep = min(src_step/4, TILE_SIZE);
int rstep = min(src_step, TILE_SIZE); int tileSize_height = min(TILE_SIZE, src_rows - y);
tileSize_height = min(TILE_SIZE, src_rows - y); int tileSize_width = min(TILE_SIZE, src_cols -x);
tileSize_width = min(TILE_SIZE, src_cols -x);
int maxIdx = mul24(src_rows, src_cols); int maxIdx = mul24(src_rows, src_cols);
int yOff = (y+lidy)*src_step; int yOff = (y+lidy)*src_step;
int index; int index;
if(tileSize_width < TILE_SIZE && yOff < src_rows)
for(int i = tileSize_width; i < rstep && (yOff+x+i) < maxIdx; i++ ) if ( y+lidy < src_rows )
*(src_data+yOff+x+i) = 0; {
if( coi > 0 ) if(tileSize_width < TILE_SIZE)
for(int i=0; i < tileSize_width; i+=VLEN_F) for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
{ *((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0;
#pragma unroll if( coi > 0 )
for(int j=0; j<4; j++) for(int i=0; i < tileSize_width; i+=VLEN_F)
{ {
index = yOff+(x+i+j)*kcn+coi-1; for(int j=0; j<4; j++)
if (index < maxIdx) tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1);
tmp_coi[j] = *(src_data+index); tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
else
tmp_coi[j] = 0;
} }
tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); else
} for(int i=0; i < tileSize_width; i+=VLEN_F)
else tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3));
for(int i=0; i < tileSize_width && (yOff+x+i) < maxIdx; i+=VLEN_F) }
tmp[i/VLEN_F] = (*(__global float4 *)(src_data+yOff+x+i));
float4 zero = (float4)(0); float4 zero = (float4)(0);
float4 full = (float4)(255); float4 full = (float4)(255);
if( binary ) if( binary )
@ -688,10 +708,9 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero; tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero;
F mom[10]; F mom[10];
__local F m[10][128]; __local F m[10][128];
if(lidy == 0) if(lidy < 128)
for(int i = 0; i < 10; i ++) for(int i = 0; i < 10; i ++)
for(int j = 0; j < 128; j ++) m[i][lidy] = 0;
m[i][j] = 0;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
F lm[10] = {0}; F lm[10] = {0};
F4 x0 = (F4)(0); F4 x0 = (F4)(0);
@ -770,66 +789,42 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
// accumulate moments computed in each tile // accumulate moments computed in each tile
dst_step /= sizeof(F); dst_step /= sizeof(F);
int dst_x_off = mad24(wgidy, dst_cols, wgidx);
int dst_off = 0;
int max_dst_index = 10 * blocky * get_global_size(1);
// + m00 ( = m00' ) // + m00 ( = m00' )
dst_off = mad24(DST_ROW_00 * blocky, dst_step, dst_x_off); *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[0];
// + m10 ( = m10' + x*m00' ) // + m10 ( = m10' + x*m00' )
dst_off = mad24(DST_ROW_10 * blocky, dst_step, dst_x_off); *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[1] + xm;
// + m01 ( = m01' + y*m00' ) // + m01 ( = m01' + y*m00' )
dst_off = mad24(DST_ROW_01 * blocky, dst_step, dst_x_off); *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[2] + ym;
// + m20 ( = m20' + 2*x*m10' + x*x*m00' ) // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
dst_off = mad24(DST_ROW_20 * blocky, dst_step, dst_x_off); *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[3] + x * (mom[1] * 2 + xm);
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
dst_off = mad24(DST_ROW_11 * blocky, dst_step, dst_x_off); *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[4] + x * (mom[2] + ym) + y * mom[1];
// + m02 ( = m02' + 2*y*m01' + y*y*m00' ) // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
dst_off = mad24(DST_ROW_02 * blocky, dst_step, dst_x_off); *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[5] + y * (mom[2] * 2 + ym);
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
dst_off = mad24(DST_ROW_30 * blocky, dst_step, dst_x_off); *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
dst_off = mad24(DST_ROW_21 * blocky, dst_step, dst_x_off); *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
dst_off = mad24(DST_ROW_12 * blocky, dst_step, dst_x_off); *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
dst_off = mad24(DST_ROW_03 * blocky, dst_step, dst_x_off); *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
} }
} }
__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step, int tileSize_width, int tileSize_height, __kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step,
__global F* dst_m, __global F* dst_m,
int dst_cols, int dst_step, int blocky, int dst_cols, int dst_step, int blocky,
int type, int depth, int cn, int coi, int binary, const int TILE_SIZE) int depth, int cn, int coi, int binary, const int TILE_SIZE)
{ {
F tmp_coi[4]; // get the coi data F tmp_coi[4]; // get the coi data
F4 tmp[64]; F4 tmp[64];
@ -844,22 +839,26 @@ __kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, in
int x = wgidx*TILE_SIZE; // real X index of pixel int x = wgidx*TILE_SIZE; // real X index of pixel
int kcn = (cn==2)?2:4; int kcn = (cn==2)?2:4;
int rstep = min(src_step/8, TILE_SIZE); int rstep = min(src_step/8, TILE_SIZE);
tileSize_height = min(TILE_SIZE, src_rows - y); int tileSize_height = min(TILE_SIZE, src_rows - y);
tileSize_width = min(TILE_SIZE, src_cols - x); int tileSize_width = min(TILE_SIZE, src_cols - x);
if ( y+lidy < src_rows )
{
if(tileSize_width < TILE_SIZE)
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
*((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0;
if( coi > 0 )
for(int i=0; i < tileSize_width; i+=VLEN_D)
{
for(int j=0; j<4 && ((x+i+j)*kcn+coi-1)<src_cols; j++)
tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
}
else
for(int i=0; i < tileSize_width && (x+i+3) < src_cols; i+=VLEN_D)
tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
}
if(tileSize_width < TILE_SIZE)
for(int i = tileSize_width; i < rstep; i++ )
*((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0;
if( coi > 0 )
for(int i=0; i < tileSize_width; i+=VLEN_D)
{
for(int j=0; j<4; j++)
tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
}
else
for(int i=0; i < tileSize_width; i+=VLEN_D)
tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
F4 zero = (F4)(0); F4 zero = (F4)(0);
F4 full = (F4)(255); F4 full = (F4)(255);
if( binary ) if( binary )
@ -867,10 +866,9 @@ __kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, in
tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero; tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
F mom[10]; F mom[10];
__local F m[10][128]; __local F m[10][128];
if(lidy == 0) if(lidy < 128)
for(int i=0; i<10; i++) for(int i=0; i<10; i++)
for(int j=0; j<128; j++) m[i][lidy]=0;
m[i][j]=0;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
F lm[10] = {0}; F lm[10] = {0};
F4 x0 = (F4)(0); F4 x0 = (F4)(0);
@ -907,7 +905,6 @@ __kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, in
m[1][lidy-bheight] = x1.s0; // m10 m[1][lidy-bheight] = x1.s0; // m10
m[0][lidy-bheight] = x0.s0; // m00 m[0][lidy-bheight] = x0.s0; // m00
} }
else if(lidy < bheight) else if(lidy < bheight)
{ {
lm[9] = ((F)py) * sy; // m03 lm[9] = ((F)py) * sy; // m03
@ -922,6 +919,7 @@ __kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, in
lm[0] = x0.s0; // m00 lm[0] = x0.s0; // m00
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
{ {
if(lidy < j) if(lidy < j)

@ -162,8 +162,8 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
int y_tex; int y_tex;
int x_tex = X - radius; int x_tex = X - radius;
if (x_tex >= cwidth) //if (x_tex >= cwidth)
return; // return;
for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP) for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
{ {

@ -45,12 +45,12 @@ TEST_P(MomentsTest, Mat)
{ {
if(test_contours) if(test_contours)
{ {
Mat src = imread( workdir + "../cpp/pic3.png", 1 ); Mat src = imread( workdir + "../cpp/pic3.png", IMREAD_GRAYSCALE );
Mat src_gray, canny_output; ASSERT_FALSE(src.empty());
cvtColor( src, src_gray, CV_BGR2GRAY ); Mat canny_output;
vector<vector<Point> > contours; vector<vector<Point> > contours;
vector<Vec4i> hierarchy; vector<Vec4i> hierarchy;
Canny( src_gray, canny_output, 100, 200, 3 ); Canny( src, canny_output, 100, 200, 3 );
findContours( canny_output, contours, hierarchy, CV_RETR_TREE, CV_CHAIN_APPROX_SIMPLE, Point(0, 0) ); findContours( canny_output, contours, hierarchy, CV_RETR_TREE, CV_CHAIN_APPROX_SIMPLE, Point(0, 0) );
for( size_t i = 0; i < contours.size(); i++ ) for( size_t i = 0; i < contours.size(); i++ )
{ {

Loading…
Cancel
Save