|
|
|
@ -612,7 +612,7 @@ static void* initOpenCLAndLoad(const char* funcname) |
|
|
|
|
return 0; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return funcname ? dlsym(handle, funcname) : 0; |
|
|
|
|
return funcname && handle ? dlsym(handle, funcname) : 0; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#elif defined WIN32 || defined _WIN32 |
|
|
|
@ -2002,7 +2002,7 @@ void* Queue::ptr() const |
|
|
|
|
Queue& Queue::getDefault() |
|
|
|
|
{ |
|
|
|
|
Queue& q = TLSData::get()->oclQueue; |
|
|
|
|
if( !q.p ) |
|
|
|
|
if( !q.p && haveOpenCL() ) |
|
|
|
|
q.create(Context2::getDefault()); |
|
|
|
|
return q; |
|
|
|
|
} |
|
|
|
@ -2251,22 +2251,32 @@ int Kernel::set(int i, const KernelArg& arg) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool Kernel::run(int dims, size_t globalsize[], size_t localsize[], |
|
|
|
|
bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[], |
|
|
|
|
bool sync, const Queue& q) |
|
|
|
|
{ |
|
|
|
|
if(!p || !p->handle || p->e != 0) |
|
|
|
|
return false; |
|
|
|
|
|
|
|
|
|
AutoBuffer<size_t> _globalSize(dims); |
|
|
|
|
size_t * globalSizePtr = (size_t *)_globalSize; |
|
|
|
|
for (int i = 0; i < dims; ++i) |
|
|
|
|
globalSizePtr[i] = localsize == NULL ? globalsize[i] : |
|
|
|
|
((globalsize[i] + localsize[i] - 1) / localsize[i]) * localsize[i]; |
|
|
|
|
|
|
|
|
|
cl_command_queue qq = getQueue(q); |
|
|
|
|
size_t offset[CV_MAX_DIM] = {0}; |
|
|
|
|
size_t offset[CV_MAX_DIM] = {0}, globalsize[CV_MAX_DIM] = {1,1,1}, localsize[CV_MAX_DIM] = {1,1,1}; |
|
|
|
|
size_t total = 1; |
|
|
|
|
for (int i = 0; i < dims; i++) |
|
|
|
|
{ |
|
|
|
|
size_t val0 = _localsize ? _localsize[i] : |
|
|
|
|
dims == 1 ? 64 : dims == 2 ? 16>>i : dims == 3 ? 8>>(i>0) : 1; |
|
|
|
|
size_t val = 1; |
|
|
|
|
while( val*2 < val0 ) |
|
|
|
|
val *= 2; |
|
|
|
|
if( _localsize ) |
|
|
|
|
localsize[i] = val; |
|
|
|
|
CV_Assert(_globalsize && _globalsize[i] >= 0); |
|
|
|
|
total *= _globalsize[i]; |
|
|
|
|
globalsize[i] = ((_globalsize[i] + val - 1)/val)*val; |
|
|
|
|
} |
|
|
|
|
if( total == 0 ) |
|
|
|
|
return true; |
|
|
|
|
cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims, |
|
|
|
|
offset, globalSizePtr, localsize, 0, 0, |
|
|
|
|
offset, globalsize, _localsize ? localsize : 0, 0, 0, |
|
|
|
|
sync ? 0 : &p->e); |
|
|
|
|
if( sync || retval < 0 ) |
|
|
|
|
{ |
|
|
|
@ -2361,14 +2371,23 @@ struct Program::Impl |
|
|
|
|
retval = clBuildProgram(handle, n, |
|
|
|
|
(const cl_device_id*)deviceList, |
|
|
|
|
buildflags.c_str(), 0, 0); |
|
|
|
|
if( retval == CL_BUILD_PROGRAM_FAILURE ) |
|
|
|
|
if( retval < 0 ) |
|
|
|
|
{ |
|
|
|
|
char buf[1<<16]; |
|
|
|
|
size_t retsz = 0; |
|
|
|
|
clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0], CL_PROGRAM_BUILD_LOG, |
|
|
|
|
sizeof(buf)-16, buf, &retsz); |
|
|
|
|
errmsg = String(buf); |
|
|
|
|
CV_Error_(Error::StsAssert, ("OpenCL program can not be built: %s", errmsg.c_str())); |
|
|
|
|
retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0], |
|
|
|
|
CL_PROGRAM_BUILD_LOG, 0, 0, &retsz); |
|
|
|
|
if( retval >= 0 && retsz > 0 ) |
|
|
|
|
{ |
|
|
|
|
AutoBuffer<char> bufbuf(retsz + 16); |
|
|
|
|
char* buf = bufbuf; |
|
|
|
|
retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0], |
|
|
|
|
CL_PROGRAM_BUILD_LOG, retsz+1, buf, &retsz); |
|
|
|
|
if( retval >= 0 ) |
|
|
|
|
{ |
|
|
|
|
errmsg = String(buf); |
|
|
|
|
CV_Error_(Error::StsAssert, ("OpenCL program can not be built: %s", errmsg.c_str())); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
CV_Assert(retval >= 0); |
|
|
|
|
} |
|
|
|
@ -2608,17 +2627,17 @@ ProgramSource2::hash_t ProgramSource2::hash() const |
|
|
|
|
class OpenCLAllocator : public MatAllocator |
|
|
|
|
{ |
|
|
|
|
public: |
|
|
|
|
OpenCLAllocator() {} |
|
|
|
|
OpenCLAllocator() { matStdAllocator = Mat::getStdAllocator(); } |
|
|
|
|
|
|
|
|
|
UMatData* defaultAllocate(int dims, const int* sizes, int type, size_t* step) const |
|
|
|
|
UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step, int flags) const |
|
|
|
|
{ |
|
|
|
|
UMatData* u = Mat::getStdAllocator()->allocate(dims, sizes, type, step); |
|
|
|
|
UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags); |
|
|
|
|
u->urefcount = 1; |
|
|
|
|
u->refcount = 0; |
|
|
|
|
return u; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
void getBestFlags(const Context2& ctx, int& createFlags, int& flags0) const |
|
|
|
|
void getBestFlags(const Context2& ctx, int /*flags*/, int& createFlags, int& flags0) const |
|
|
|
|
{ |
|
|
|
|
const Device& dev = ctx.device(0); |
|
|
|
|
createFlags = CL_MEM_READ_WRITE; |
|
|
|
@ -2629,10 +2648,12 @@ public: |
|
|
|
|
flags0 = UMatData::COPY_ON_MAP; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
UMatData* allocate(int dims, const int* sizes, int type, size_t* step) const |
|
|
|
|
UMatData* allocate(int dims, const int* sizes, int type, |
|
|
|
|
void* data, size_t* step, int flags) const |
|
|
|
|
{ |
|
|
|
|
if(!useOpenCL()) |
|
|
|
|
return defaultAllocate(dims, sizes, type, step); |
|
|
|
|
return defaultAllocate(dims, sizes, type, data, step, flags); |
|
|
|
|
CV_Assert(data == 0); |
|
|
|
|
size_t total = CV_ELEM_SIZE(type); |
|
|
|
|
for( int i = dims-1; i >= 0; i-- ) |
|
|
|
|
{ |
|
|
|
@ -2643,13 +2664,13 @@ public: |
|
|
|
|
|
|
|
|
|
Context2& ctx = Context2::getDefault(); |
|
|
|
|
int createFlags = 0, flags0 = 0; |
|
|
|
|
getBestFlags(ctx, createFlags, flags0); |
|
|
|
|
getBestFlags(ctx, flags, createFlags, flags0); |
|
|
|
|
|
|
|
|
|
cl_int retval = 0; |
|
|
|
|
void* handle = clCreateBuffer((cl_context)ctx.ptr(), |
|
|
|
|
createFlags, total, 0, &retval); |
|
|
|
|
if( !handle || retval < 0 ) |
|
|
|
|
return defaultAllocate(dims, sizes, type, step); |
|
|
|
|
return defaultAllocate(dims, sizes, type, data, step, flags); |
|
|
|
|
UMatData* u = new UMatData(this); |
|
|
|
|
u->data = 0; |
|
|
|
|
u->size = total; |
|
|
|
@ -2672,7 +2693,7 @@ public: |
|
|
|
|
CV_Assert(u->origdata != 0); |
|
|
|
|
Context2& ctx = Context2::getDefault(); |
|
|
|
|
int createFlags = 0, flags0 = 0; |
|
|
|
|
getBestFlags(ctx, createFlags, flags0); |
|
|
|
|
getBestFlags(ctx, accessFlags, createFlags, flags0); |
|
|
|
|
|
|
|
|
|
cl_context ctx_handle = (cl_context)ctx.ptr(); |
|
|
|
|
cl_int retval = 0; |
|
|
|
@ -2697,19 +2718,41 @@ public: |
|
|
|
|
return true; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
void sync(UMatData* u) const |
|
|
|
|
{ |
|
|
|
|
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr(); |
|
|
|
|
clFinish(q); |
|
|
|
|
|
|
|
|
|
if( u->hostCopyObsolete() && u->handle && |
|
|
|
|
u->tempCopiedUMat() && u->refcount > 0 && u->origdata) |
|
|
|
|
{ |
|
|
|
|
UMatDataAutoLock lock(u); |
|
|
|
|
clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, |
|
|
|
|
u->size, u->origdata, 0, 0, 0); |
|
|
|
|
u->markHostCopyObsolete(false); |
|
|
|
|
} |
|
|
|
|
else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data ) |
|
|
|
|
{ |
|
|
|
|
UMatDataAutoLock lock(u); |
|
|
|
|
clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, |
|
|
|
|
u->size, u->data, 0, 0, 0); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
void deallocate(UMatData* u) const |
|
|
|
|
{ |
|
|
|
|
if(!u) |
|
|
|
|
return; |
|
|
|
|
UMatDataAutoLock lock(u); |
|
|
|
|
|
|
|
|
|
// TODO: !!! when we add Shared Virtual Memory Support,
|
|
|
|
|
// this function (as well as the others should be corrected)
|
|
|
|
|
// this function (as well as the others) should be corrected
|
|
|
|
|
CV_Assert(u->handle != 0 && u->urefcount == 0); |
|
|
|
|
if(u->tempUMat()) |
|
|
|
|
{ |
|
|
|
|
if( u->hostCopyObsolete() && u->refcount > 0 && u->tempCopiedUMat() ) |
|
|
|
|
{ |
|
|
|
|
clEnqueueWriteBuffer((cl_command_queue)Queue::getDefault().ptr(), |
|
|
|
|
clEnqueueReadBuffer((cl_command_queue)Queue::getDefault().ptr(), |
|
|
|
|
(cl_mem)u->handle, CL_TRUE, 0, |
|
|
|
|
u->size, u->origdata, 0, 0, 0); |
|
|
|
|
} |
|
|
|
@ -2717,7 +2760,7 @@ public: |
|
|
|
|
clReleaseMemObject((cl_mem)u->handle); |
|
|
|
|
u->handle = 0; |
|
|
|
|
u->currAllocator = u->prevAllocator; |
|
|
|
|
if(u->data && u->copyOnMap()) |
|
|
|
|
if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED)) |
|
|
|
|
fastFree(u->data); |
|
|
|
|
u->data = u->origdata; |
|
|
|
|
if(u->refcount == 0) |
|
|
|
@ -2725,8 +2768,11 @@ public: |
|
|
|
|
} |
|
|
|
|
else |
|
|
|
|
{ |
|
|
|
|
if(u->data && u->copyOnMap()) |
|
|
|
|
if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED)) |
|
|
|
|
{ |
|
|
|
|
fastFree(u->data); |
|
|
|
|
u->data = 0; |
|
|
|
|
} |
|
|
|
|
clReleaseMemObject((cl_mem)u->handle); |
|
|
|
|
u->handle = 0; |
|
|
|
|
delete u; |
|
|
|
@ -2793,15 +2839,18 @@ public: |
|
|
|
|
UMatDataAutoLock autolock(u); |
|
|
|
|
|
|
|
|
|
cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr(); |
|
|
|
|
cl_int retval = 0; |
|
|
|
|
if( !u->copyOnMap() && u->data ) |
|
|
|
|
{ |
|
|
|
|
CV_Assert( clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0) >= 0 ); |
|
|
|
|
CV_Assert( (retval = clEnqueueUnmapMemObject(q, |
|
|
|
|
(cl_mem)u->handle, u->data, 0, 0, 0)) >= 0 ); |
|
|
|
|
clFinish(q); |
|
|
|
|
u->data = 0; |
|
|
|
|
} |
|
|
|
|
else if( u->copyOnMap() && u->deviceCopyObsolete() ) |
|
|
|
|
{ |
|
|
|
|
CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, |
|
|
|
|
u->size, u->data, 0, 0, 0) >= 0 ); |
|
|
|
|
CV_Assert( (retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, |
|
|
|
|
u->size, u->data, 0, 0, 0)) >= 0 ); |
|
|
|
|
} |
|
|
|
|
u->markDeviceCopyObsolete(false); |
|
|
|
|
u->markHostCopyObsolete(false); |
|
|
|
@ -3033,6 +3082,8 @@ public: |
|
|
|
|
if( sync ) |
|
|
|
|
clFinish(q); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
MatAllocator* matStdAllocator; |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
MatAllocator* getOpenCLAllocator() |
|
|
|
|