|
|
|
@ -58,12 +58,13 @@ using namespace std; |
|
|
|
|
//////////////////////////////// oclMat ////////////////////////////////
|
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
|
|
//helper routines
|
|
|
|
|
// helper routines
|
|
|
|
|
namespace cv |
|
|
|
|
{ |
|
|
|
|
namespace ocl |
|
|
|
|
{ |
|
|
|
|
///////////////////////////OpenCL kernel strings///////////////////////////
|
|
|
|
|
/////////////////////////// OpenCL kernel strings ///////////////////////////
|
|
|
|
|
|
|
|
|
|
extern const char *operator_copyToM; |
|
|
|
|
extern const char *operator_convertTo; |
|
|
|
|
extern const char *operator_setTo; |
|
|
|
@ -74,42 +75,18 @@ namespace cv |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
|
|
|
// convert_C3C4
|
|
|
|
|
|
|
|
|
|
static void convert_C3C4(const cl_mem &src, oclMat &dst) |
|
|
|
|
{ |
|
|
|
|
int dstStep_in_pixel = dst.step1() / dst.oclchannels(); |
|
|
|
|
int pixel_end = dst.wholecols * dst.wholerows - 1; |
|
|
|
|
Context *clCxt = dst.clCxt; |
|
|
|
|
string kernelName = "convertC3C4"; |
|
|
|
|
char compile_option[32]; |
|
|
|
|
switch(dst.depth()) |
|
|
|
|
{ |
|
|
|
|
case 0: |
|
|
|
|
sprintf(compile_option, "-D GENTYPE4=uchar4"); |
|
|
|
|
break; |
|
|
|
|
case 1: |
|
|
|
|
sprintf(compile_option, "-D GENTYPE4=char4"); |
|
|
|
|
break; |
|
|
|
|
case 2: |
|
|
|
|
sprintf(compile_option, "-D GENTYPE4=ushort4"); |
|
|
|
|
break; |
|
|
|
|
case 3: |
|
|
|
|
sprintf(compile_option, "-D GENTYPE4=short4"); |
|
|
|
|
break; |
|
|
|
|
case 4: |
|
|
|
|
sprintf(compile_option, "-D GENTYPE4=int4"); |
|
|
|
|
break; |
|
|
|
|
case 5: |
|
|
|
|
sprintf(compile_option, "-D GENTYPE4=float4"); |
|
|
|
|
break; |
|
|
|
|
case 6: |
|
|
|
|
sprintf(compile_option, "-D GENTYPE4=double4"); |
|
|
|
|
break; |
|
|
|
|
default: |
|
|
|
|
CV_Error(CV_StsUnsupportedFormat, "unknown depth"); |
|
|
|
|
} |
|
|
|
|
int pixel_end = dst.wholecols * dst.wholerows - 1; |
|
|
|
|
int dstStep_in_pixel = dst.step1() / dst.oclchannels(); |
|
|
|
|
|
|
|
|
|
const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; |
|
|
|
|
std::string buildOptions = format("-D GENTYPE4=%s4", typeMap[dst.depth()]); |
|
|
|
|
|
|
|
|
|
vector< pair<size_t, const void *> > args; |
|
|
|
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&src)); |
|
|
|
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); |
|
|
|
@ -118,46 +95,24 @@ static void convert_C3C4(const cl_mem &src, oclMat &dst) |
|
|
|
|
args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep_in_pixel)); |
|
|
|
|
args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end)); |
|
|
|
|
|
|
|
|
|
size_t globalThreads[3] = {((dst.wholecols * dst.wholerows + 3) / 4 + 255) / 256 * 256, 1, 1}; |
|
|
|
|
size_t localThreads[3] = {256, 1, 1}; |
|
|
|
|
size_t globalThreads[3] = { divUp(dst.wholecols * dst.wholerows, 4), 1, 1 }; |
|
|
|
|
size_t localThreads[3] = { 256, 1, 1 }; |
|
|
|
|
|
|
|
|
|
openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1, compile_option); |
|
|
|
|
openCLExecuteKernel(clCxt, &convertC3C4, "convertC3C4", globalThreads, localThreads, |
|
|
|
|
args, -1, -1, buildOptions.c_str()); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
|
|
|
// convert_C4C3
|
|
|
|
|
|
|
|
|
|
static void convert_C4C3(const oclMat &src, cl_mem &dst) |
|
|
|
|
{ |
|
|
|
|
int srcStep_in_pixel = src.step1() / src.oclchannels(); |
|
|
|
|
int pixel_end = src.wholecols * src.wholerows - 1; |
|
|
|
|
Context *clCxt = src.clCxt; |
|
|
|
|
string kernelName = "convertC4C3"; |
|
|
|
|
char compile_option[32]; |
|
|
|
|
switch(src.depth()) |
|
|
|
|
{ |
|
|
|
|
case 0: |
|
|
|
|
sprintf(compile_option, "-D GENTYPE4=uchar4"); |
|
|
|
|
break; |
|
|
|
|
case 1: |
|
|
|
|
sprintf(compile_option, "-D GENTYPE4=char4"); |
|
|
|
|
break; |
|
|
|
|
case 2: |
|
|
|
|
sprintf(compile_option, "-D GENTYPE4=ushort4"); |
|
|
|
|
break; |
|
|
|
|
case 3: |
|
|
|
|
sprintf(compile_option, "-D GENTYPE4=short4"); |
|
|
|
|
break; |
|
|
|
|
case 4: |
|
|
|
|
sprintf(compile_option, "-D GENTYPE4=int4"); |
|
|
|
|
break; |
|
|
|
|
case 5: |
|
|
|
|
sprintf(compile_option, "-D GENTYPE4=float4"); |
|
|
|
|
break; |
|
|
|
|
case 6: |
|
|
|
|
sprintf(compile_option, "-D GENTYPE4=double4"); |
|
|
|
|
break; |
|
|
|
|
default: |
|
|
|
|
CV_Error(CV_StsUnsupportedFormat, "unknown depth"); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; |
|
|
|
|
std::string buildOptions = format("-D GENTYPE4=%s4", typeMap[src.depth()]); |
|
|
|
|
|
|
|
|
|
vector< pair<size_t, const void *> > args; |
|
|
|
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); |
|
|
|
@ -167,10 +122,10 @@ static void convert_C4C3(const oclMat &src, cl_mem &dst) |
|
|
|
|
args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep_in_pixel)); |
|
|
|
|
args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end)); |
|
|
|
|
|
|
|
|
|
size_t globalThreads[3] = {((src.wholecols * src.wholerows + 3) / 4 + 255) / 256 * 256, 1, 1}; |
|
|
|
|
size_t localThreads[3] = {256, 1, 1}; |
|
|
|
|
size_t globalThreads[3] = { divUp(src.wholecols * src.wholerows, 4), 1, 1}; |
|
|
|
|
size_t localThreads[3] = { 256, 1, 1 }; |
|
|
|
|
|
|
|
|
|
openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1, compile_option); |
|
|
|
|
openCLExecuteKernel(clCxt, &convertC3C4, "convertC4C3", globalThreads, localThreads, args, -1, -1, buildOptions.c_str()); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
void cv::ocl::oclMat::upload(const Mat &m) |
|
|
|
@ -179,14 +134,10 @@ void cv::ocl::oclMat::upload(const Mat &m) |
|
|
|
|
Size wholeSize; |
|
|
|
|
Point ofs; |
|
|
|
|
m.locateROI(wholeSize, ofs); |
|
|
|
|
// int type = m.type();
|
|
|
|
|
// if(m.oclchannels() == 3)
|
|
|
|
|
//{
|
|
|
|
|
// type = CV_MAKETYPE(m.depth(), 4);
|
|
|
|
|
//}
|
|
|
|
|
|
|
|
|
|
create(wholeSize, m.type()); |
|
|
|
|
|
|
|
|
|
if(m.channels() == 3) |
|
|
|
|
if (m.channels() == 3) |
|
|
|
|
{ |
|
|
|
|
int pitch = wholeSize.width * 3 * m.elemSize1(); |
|
|
|
|
int tail_padding = m.elemSize1() * 3072; |
|
|
|
@ -197,35 +148,15 @@ void cv::ocl::oclMat::upload(const Mat &m) |
|
|
|
|
|
|
|
|
|
openCLMemcpy2D(clCxt, temp, pitch, m.datastart, m.step, wholeSize.width * m.elemSize(), wholeSize.height, clMemcpyHostToDevice, 3); |
|
|
|
|
convert_C3C4(temp, *this); |
|
|
|
|
//int* cputemp=new int[wholeSize.height*wholeSize.width * 3];
|
|
|
|
|
//int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
|
|
|
|
|
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE,
|
|
|
|
|
// 0, wholeSize.height*wholeSize.width * 3* sizeof(int), cputemp, 0, NULL, NULL));
|
|
|
|
|
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE,
|
|
|
|
|
// 0, this->step*this->wholerows, cpudata, 0, NULL, NULL));
|
|
|
|
|
//for(int i=0;i<wholeSize.height;i++)
|
|
|
|
|
//{
|
|
|
|
|
// int *a = cputemp+i*wholeSize.width * 3,*b = cpudata + i*this->step/sizeof(int);
|
|
|
|
|
// for(int j=0;j<wholeSize.width;j++)
|
|
|
|
|
// {
|
|
|
|
|
// if((a[3*j] != b[4*j])||(a[3*j+1] != b[4*j+1])||(a[3*j+2] != b[4*j+2]))
|
|
|
|
|
// printf("rows=%d,cols=%d,cputtemp=%d,%d,%d;cpudata=%d,%d,%d\n",
|
|
|
|
|
// i,j,a[3*j],a[3*j+1],a[3*j+2],b[4*j],b[4*j+1],b[4*j+2]);
|
|
|
|
|
// }
|
|
|
|
|
//}
|
|
|
|
|
//delete []cputemp;
|
|
|
|
|
//delete []cpudata;
|
|
|
|
|
|
|
|
|
|
openCLSafeCall(clReleaseMemObject(temp)); |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
{ |
|
|
|
|
openCLMemcpy2D(clCxt, data, step, m.datastart, m.step, wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
rows = m.rows; |
|
|
|
|
cols = m.cols; |
|
|
|
|
offset = ofs.y * step + ofs.x * elemSize(); |
|
|
|
|
//download_channels = m.channels();
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
cv::ocl::oclMat::operator cv::_InputArray() |
|
|
|
@ -259,11 +190,6 @@ cv::ocl::oclMat& cv::ocl::getOclMatRef(OutputArray src) |
|
|
|
|
void cv::ocl::oclMat::download(cv::Mat &m) const |
|
|
|
|
{ |
|
|
|
|
CV_DbgAssert(!this->empty()); |
|
|
|
|
// int t = type();
|
|
|
|
|
// if(download_channels == 3)
|
|
|
|
|
//{
|
|
|
|
|
// t = CV_MAKETYPE(depth(), 3);
|
|
|
|
|
//}
|
|
|
|
|
m.create(wholerows, wholecols, type()); |
|
|
|
|
|
|
|
|
|
if(m.channels() == 3) |
|
|
|
@ -277,30 +203,14 @@ void cv::ocl::oclMat::download(cv::Mat &m) const |
|
|
|
|
|
|
|
|
|
convert_C4C3(*this, temp); |
|
|
|
|
openCLMemcpy2D(clCxt, m.data, m.step, temp, pitch, wholecols * m.elemSize(), wholerows, clMemcpyDeviceToHost, 3); |
|
|
|
|
//int* cputemp=new int[wholecols*wholerows * 3];
|
|
|
|
|
//int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
|
|
|
|
|
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE,
|
|
|
|
|
// 0, wholecols*wholerows * 3* sizeof(int), cputemp, 0, NULL, NULL));
|
|
|
|
|
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE,
|
|
|
|
|
// 0, this->step*this->wholerows, cpudata, 0, NULL, NULL));
|
|
|
|
|
//for(int i=0;i<wholerows;i++)
|
|
|
|
|
//{
|
|
|
|
|
// int *a = cputemp+i*wholecols * 3,*b = cpudata + i*this->step/sizeof(int);
|
|
|
|
|
// for(int j=0;j<wholecols;j++)
|
|
|
|
|
// {
|
|
|
|
|
// if((a[3*j] != b[4*j])||(a[3*j+1] != b[4*j+1])||(a[3*j+2] != b[4*j+2]))
|
|
|
|
|
// printf("rows=%d,cols=%d,cputtemp=%d,%d,%d;cpudata=%d,%d,%d\n",
|
|
|
|
|
// i,j,a[3*j],a[3*j+1],a[3*j+2],b[4*j],b[4*j+1],b[4*j+2]);
|
|
|
|
|
// }
|
|
|
|
|
//}
|
|
|
|
|
//delete []cputemp;
|
|
|
|
|
//delete []cpudata;
|
|
|
|
|
|
|
|
|
|
openCLSafeCall(clReleaseMemObject(temp)); |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
{ |
|
|
|
|
openCLMemcpy2D(clCxt, m.data, m.step, data, step, wholecols * elemSize(), wholerows, clMemcpyDeviceToHost); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
Size wholesize; |
|
|
|
|
Point ofs; |
|
|
|
|
locateROI(wholesize, ofs); |
|
|
|
@ -323,6 +233,7 @@ static void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask |
|
|
|
|
{"uchar3", "char3", "ushort3", "short3", "int3", "float3", "double3"}, |
|
|
|
|
{"uchar4", "char4", "ushort4", "short4", "int4", "float4", "double4"} |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
char compile_option[32]; |
|
|
|
|
sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str()); |
|
|
|
|
size_t localThreads[3] = {16, 16, 1}; |
|
|
|
@ -366,6 +277,7 @@ void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const |
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
|
//////////////////////////////// ConvertTo ////////////////////////////////
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
|
|
static void convert_run(const oclMat &src, oclMat &dst, double alpha, double beta) |
|
|
|
|
{ |
|
|
|
|
string kernelName = "convert_to"; |
|
|
|
@ -404,6 +316,7 @@ static void convert_run(const oclMat &src, oclMat &dst, double alpha, double bet |
|
|
|
|
openCLExecuteKernel(dst.clCxt , &operator_convertTo, kernelName, globalThreads, |
|
|
|
|
localThreads, args, -1, -1, buildOptions); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double beta ) const |
|
|
|
|
{ |
|
|
|
|
if (!clCxt->supportsFeature(Context::CL_DOUBLE) && |
|
|
|
|